"""Tests for `clean_url` — URL normalization and tracking-param stripping.

Clean URLs are the deduplication key in the pages table, so any change to this
function can silently cause duplicate rows or mask legitimate saves.

Every test first calls `patch_dns_ok` / `patch_dns_fail` from conftest; judging
by their names and the `www.` tests below, these stub the DNS lookup that
decides whether the bare (non-www) domain resolves — confirm in conftest.
"""

from urllib.parse import parse_qs, urlsplit

import pytest

from conftest import patch_dns_ok, patch_dns_fail
from db import clean_url, TRACKING_PARAMS


def test_strips_fragment(monkeypatch):
    """The `#fragment` part is dropped from the cleaned URL."""
    patch_dns_ok(monkeypatch)
    assert clean_url("https://example.com/page#section") == "https://example.com/page"


def test_prefers_https(monkeypatch):
    """An `http://` URL is rewritten to `https://`."""
    patch_dns_ok(monkeypatch)
    assert clean_url("http://example.com/page") == "https://example.com/page"


def test_lowercases_hostname(monkeypatch):
    """The hostname is lowercased."""
    patch_dns_ok(monkeypatch)
    assert clean_url("https://EXAMPLE.COM/page") == "https://example.com/page"


def test_preserves_path_case(monkeypatch):
    """Paths are case-sensitive and should not be lowercased."""
    patch_dns_ok(monkeypatch)
    assert clean_url("https://example.com/Foo/Bar") == "https://example.com/Foo/Bar"


def test_strips_default_https_port(monkeypatch):
    """An explicit `:443` on an https URL is removed."""
    patch_dns_ok(monkeypatch)
    assert clean_url("https://example.com:443/page") == "https://example.com/page"


@pytest.mark.xfail(reason="clean_url upgrades http->https before the port-default check, "
                          "so port 80 is not stripped. Minor dedup bug — harmless but worth fixing.")
def test_strips_http_port_80(monkeypatch):
    """Expected: http://foo:80 → https://foo (both scheme-upgrade and port-strip).

    Currently fails because scheme is upgraded to https *before* the port check,
    so `scheme == "http" and port == 80` is never true by the time the check runs.
    """
    patch_dns_ok(monkeypatch)
    assert clean_url("http://example.com:80/page") == "https://example.com/page"


def test_preserves_non_default_port(monkeypatch):
    """A non-default port such as `:8443` is kept."""
    patch_dns_ok(monkeypatch)
    assert clean_url("https://example.com:8443/page") == "https://example.com:8443/page"


def test_strips_trailing_slash(monkeypatch):
    """A trailing slash on a non-root path is removed."""
    patch_dns_ok(monkeypatch)
    assert clean_url("https://example.com/page/") == "https://example.com/page"


def test_root_slash_preserved(monkeypatch):
    """The root path `/` is left in place."""
    patch_dns_ok(monkeypatch)
    assert clean_url("https://example.com/") == "https://example.com/"


@pytest.mark.parametrize("param", sorted(TRACKING_PARAMS))
def test_tracking_params_stripped(monkeypatch, param):
    """Each tracking param is removed while an unrelated param survives.

    The cleaned URL's query string is parsed and checked by key. A plain
    substring check (`param not in result`) fails spuriously for any tracking
    param whose name happens to occur in the scheme, host, path or the kept
    pair — e.g. a param named `s` would match `https`.
    """
    patch_dns_ok(monkeypatch)
    result = clean_url(f"https://example.com/page?{param}=value&keep=yes")
    # keep_blank_values=True so a leftover `param=` (value stripped, key kept)
    # is still detected instead of being silently dropped by the parser.
    query = parse_qs(urlsplit(result).query, keep_blank_values=True)
    assert param not in query
    assert query.get("keep") == ["yes"]


def test_strips_www_when_nonwww_resolves(monkeypatch):
    """Standard case: strip `www.` prefix to canonicalize."""
    patch_dns_ok(monkeypatch)
    assert clean_url("https://www.example.com/page") == "https://example.com/page"


def test_preserves_www_when_nonwww_does_not_resolve(monkeypatch):
    """Regression for 6ffd38d.

    Some sites only serve their content at `www.domain.tld`; the bare domain
    doesn't resolve. Stripping `www.` in that case produced a URL that we could
    never actually fetch or dedupe against the real one.
    """
    patch_dns_fail(monkeypatch)
    assert clean_url("https://www.example.com/page") == "https://www.example.com/page"


def test_query_params_sorted_for_stable_ordering(monkeypatch):
    """Same URL with different param orderings should produce the same clean URL."""
    patch_dns_ok(monkeypatch)
    a = clean_url("https://example.com/page?b=2&a=1")
    b = clean_url("https://example.com/page?a=1&b=2")
    assert a == b


def test_path_and_query_preserved_through_cleanup(monkeypatch):
    """Path and non-tracking query params survive; `utm_source` is removed."""
    patch_dns_ok(monkeypatch)
    result = clean_url("https://example.com/path/to/page?id=42&utm_source=twitter")
    assert result == "https://example.com/path/to/page?id=42"