Some checks failed
/ build (push) Failing after 5s
174 tests covering URL normalization, FTS5 query sanitization, SSRF/CSRF guards, sharing-mode logic, DB schema and upsert paths, handler end-to-end flows, and gateway body-size / mesh-whitelist guards. Each recent bug-fix commit (6ffd38d, 1bc695f, 8dffd8c) has an explicit regression test in test_regressions.py. One xfail documents a minor latent bug in clean_url where port 80 is not stripped from upgraded https URLs. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
101 lines
3.7 KiB
Python
101 lines
3.7 KiB
Python
"""Tests for `clean_url` — URL normalization and tracking-param stripping.

Clean URLs are the deduplication key in the pages table, so any change to
this function can silently cause duplicate rows or mask legitimate saves.
"""
|
|
import pytest
|
|
|
|
from conftest import patch_dns_ok, patch_dns_fail
|
|
from db import clean_url, TRACKING_PARAMS
|
|
|
|
|
|
def test_strips_fragment(monkeypatch):
    """A trailing `#fragment` is removed from the cleaned URL."""
    patch_dns_ok(monkeypatch)
    cleaned = clean_url("https://example.com/page#section")
    expected = "https://example.com/page"
    assert cleaned == expected
|
|
|
|
|
|
def test_prefers_https(monkeypatch):
    """An `http://` URL comes back with the `https://` scheme."""
    patch_dns_ok(monkeypatch)
    cleaned = clean_url("http://example.com/page")
    expected = "https://example.com/page"
    assert cleaned == expected
|
|
|
|
|
|
def test_lowercases_hostname(monkeypatch):
    """An upper-case hostname is normalized to lower case."""
    patch_dns_ok(monkeypatch)
    cleaned = clean_url("https://EXAMPLE.COM/page")
    expected = "https://example.com/page"
    assert cleaned == expected
|
|
|
|
|
|
def test_preserves_path_case(monkeypatch):
    """Mixed-case path segments survive cleaning unchanged.

    Paths are case-sensitive, so they must not be lowercased.
    """
    patch_dns_ok(monkeypatch)
    url = "https://example.com/Foo/Bar"
    # Input is already canonical, so it must round-trip untouched.
    assert clean_url(url) == url
|
|
|
|
|
|
def test_strips_default_https_port(monkeypatch):
    """An explicit `:443` on an https URL is dropped."""
    patch_dns_ok(monkeypatch)
    cleaned = clean_url("https://example.com:443/page")
    expected = "https://example.com/page"
    assert cleaned == expected
|
|
|
|
|
|
@pytest.mark.xfail(
    reason=(
        "clean_url upgrades http->https before the port-default check, "
        "so port 80 is not stripped. Minor dedup bug — harmless but worth fixing."
    )
)
def test_strips_http_port_80(monkeypatch):
    """Desired behavior: http://foo:80 → https://foo.

    That requires both the scheme upgrade and the default-port strip. It is
    marked xfail because the scheme has already become https by the time the
    port check runs, so the `scheme == "http" and port == 80` condition can
    no longer match.
    """
    patch_dns_ok(monkeypatch)
    cleaned = clean_url("http://example.com:80/page")
    expected = "https://example.com/page"
    assert cleaned == expected
|
|
|
|
|
|
def test_preserves_non_default_port(monkeypatch):
    """A port other than the scheme default is kept in the cleaned URL."""
    patch_dns_ok(monkeypatch)
    url = "https://example.com:8443/page"
    # Nothing here should be normalized away, so the URL round-trips.
    assert clean_url(url) == url
|
|
|
|
|
|
def test_strips_trailing_slash(monkeypatch):
    """A trailing slash on a non-root path is removed."""
    patch_dns_ok(monkeypatch)
    cleaned = clean_url("https://example.com/page/")
    expected = "https://example.com/page"
    assert cleaned == expected
|
|
|
|
|
|
def test_root_slash_preserved(monkeypatch):
    """The lone `/` of a root URL is kept, unlike other trailing slashes."""
    patch_dns_ok(monkeypatch)
    root_url = "https://example.com/"
    assert clean_url(root_url) == root_url
|
|
|
|
|
|
@pytest.mark.parametrize("param", sorted(TRACKING_PARAMS))
def test_tracking_params_stripped(monkeypatch, param):
    """Each name in TRACKING_PARAMS is removed while unrelated params survive.

    The cleaned URL's query string is parsed and inspected by key instead of
    substring-matching the whole URL. A substring check would fail spuriously
    for any tracking-param name that also happens to occur in the scheme,
    host, or path (for example a one-letter name), even though the parameter
    itself was stripped correctly.
    """
    # Local import keeps this block self-contained.
    from urllib.parse import parse_qs, urlsplit

    patch_dns_ok(monkeypatch)
    result = clean_url(f"https://example.com/page?{param}=value&keep=yes")

    # keep_blank_values=True so a leftover "param=" would still be detected.
    query = parse_qs(urlsplit(result).query, keep_blank_values=True)
    assert param not in query
    assert query.get("keep") == ["yes"]
|
|
|
|
|
|
def test_strips_www_when_nonwww_resolves(monkeypatch):
    """Standard case: the `www.` prefix is dropped to canonicalize the host."""
    patch_dns_ok(monkeypatch)
    cleaned = clean_url("https://www.example.com/page")
    expected = "https://example.com/page"
    assert cleaned == expected
|
|
|
|
|
|
def test_preserves_www_when_nonwww_does_not_resolve(monkeypatch):
    """Regression for 6ffd38d.

    Some sites serve content only at `www.domain.tld`, and the bare domain
    does not resolve. Stripping `www.` in that situation yielded a URL that
    could never be fetched or deduplicated against the real one, so the
    prefix must be kept.
    """
    patch_dns_fail(monkeypatch)
    url = "https://www.example.com/page"
    # With the failing-DNS patch applied, the URL must come back unchanged.
    assert clean_url(url) == url
|
|
|
|
|
|
def test_query_params_sorted_for_stable_ordering(monkeypatch):
    """Two URLs differing only in query-param order clean to the same value."""
    patch_dns_ok(monkeypatch)
    orderings = (
        "https://example.com/page?b=2&a=1",
        "https://example.com/page?a=1&b=2",
    )
    reversed_order, natural_order = [clean_url(url) for url in orderings]
    assert reversed_order == natural_order
|
|
|
|
|
|
def test_path_and_query_preserved_through_cleanup(monkeypatch):
    """A multi-segment path and a normal param survive; `utm_source` is dropped."""
    patch_dns_ok(monkeypatch)
    source = "https://example.com/path/to/page?id=42&utm_source=twitter"
    expected = "https://example.com/path/to/page?id=42"
    assert clean_url(source) == expected
|