174 tests covering URL normalization, FTS5 query sanitization, SSRF/CSRF guards, sharing-mode logic, DB schema and upsert paths, handler end-to-end flows, and gateway body-size / mesh-whitelist guards. Each recent bug-fix commit (6ffd38d, 1bc695f, 8dffd8c) has an explicit regression test in test_regressions.py. One xfail documents a minor latent bug in `clean_url`, where port 80 is not stripped from URLs that were upgraded from http to https. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
8dffd8ccea
commit
44a16dea98
18 changed files with 1673 additions and 0 deletions
101
tests/test_url_cleanup.py
Normal file
101
tests/test_url_cleanup.py
Normal file
|
|
@ -0,0 +1,101 @@
|
|||
"""Tests for `clean_url` — URL normalization and tracking-param stripping.
|
||||
|
||||
Clean URLs are the deduplication key in the pages table, so any change to
|
||||
this function can silently cause duplicate rows or mask legitimate saves.
|
||||
"""
|
||||
import pytest
|
||||
|
||||
from conftest import patch_dns_ok, patch_dns_fail
|
||||
from db import clean_url, TRACKING_PARAMS
|
||||
|
||||
|
||||
def test_strips_fragment(monkeypatch):
    """A trailing `#fragment` is removed from the cleaned URL."""
    patch_dns_ok(monkeypatch)
    cleaned = clean_url("https://example.com/page#section")
    assert cleaned == "https://example.com/page"
|
||||
|
||||
|
||||
def test_prefers_https(monkeypatch):
    """An `http://` URL comes back with the `https://` scheme."""
    patch_dns_ok(monkeypatch)
    cleaned = clean_url("http://example.com/page")
    assert cleaned == "https://example.com/page"
|
||||
|
||||
|
||||
def test_lowercases_hostname(monkeypatch):
    """An upper-case hostname is folded to lower case."""
    patch_dns_ok(monkeypatch)
    cleaned = clean_url("https://EXAMPLE.COM/page")
    assert cleaned == "https://example.com/page"
|
||||
|
||||
|
||||
def test_preserves_path_case(monkeypatch):
    """Path segments keep their original case; paths are case-sensitive."""
    patch_dns_ok(monkeypatch)
    mixed_case_url = "https://example.com/Foo/Bar"
    # Input is already canonical, so it must round-trip unchanged.
    assert clean_url(mixed_case_url) == "https://example.com/Foo/Bar"
|
||||
|
||||
|
||||
def test_strips_default_https_port(monkeypatch):
    """An explicit `:443` on an https URL is dropped."""
    patch_dns_ok(monkeypatch)
    cleaned = clean_url("https://example.com:443/page")
    assert cleaned == "https://example.com/page"
|
||||
|
||||
|
||||
@pytest.mark.xfail(
    reason=(
        "clean_url upgrades http->https before the port-default check, "
        "so port 80 is not stripped. Minor dedup bug — harmless but worth fixing."
    )
)
def test_strips_http_port_80(monkeypatch):
    """Desired behavior: `http://foo:80` normalizes to `https://foo`.

    Both the scheme upgrade and the default-port strip should apply. This is
    marked xfail because the scheme is switched to https *before* the port
    check runs, so the `scheme == "http" and port == 80` condition can no
    longer match at that point.
    """
    patch_dns_ok(monkeypatch)
    cleaned = clean_url("http://example.com:80/page")
    assert cleaned == "https://example.com/page"
|
||||
|
||||
|
||||
def test_preserves_non_default_port(monkeypatch):
    """A non-default port such as `:8443` is kept in the cleaned URL."""
    patch_dns_ok(monkeypatch)
    cleaned = clean_url("https://example.com:8443/page")
    assert cleaned == "https://example.com:8443/page"
|
||||
|
||||
|
||||
def test_strips_trailing_slash(monkeypatch):
    """A trailing slash on a non-root path is removed."""
    patch_dns_ok(monkeypatch)
    cleaned = clean_url("https://example.com/page/")
    assert cleaned == "https://example.com/page"
|
||||
|
||||
|
||||
def test_root_slash_preserved(monkeypatch):
    """The lone `/` of a root URL is kept rather than stripped."""
    patch_dns_ok(monkeypatch)
    cleaned = clean_url("https://example.com/")
    assert cleaned == "https://example.com/"
|
||||
|
||||
|
||||
@pytest.mark.parametrize("param", sorted(TRACKING_PARAMS))
def test_tracking_params_stripped(monkeypatch, param):
    """Each tracking parameter is removed while an unrelated one survives.

    The cleaned URL's query string is parsed and checked by key instead of
    substring-matching the whole URL. A plain `param not in result` check
    fails spuriously for any short parameter name that happens to occur in
    the scheme, host, path, or the `keep=yes` pair.
    """
    # Local import keeps this block self-contained.
    from urllib.parse import parse_qs, urlsplit

    patch_dns_ok(monkeypatch)
    result = clean_url(f"https://example.com/page?{param}=value&keep=yes")
    # keep_blank_values so a half-stripped `param=` would still be detected.
    query = parse_qs(urlsplit(result).query, keep_blank_values=True)
    assert param not in query
    assert query.get("keep") == ["yes"]
|
||||
|
||||
|
||||
def test_strips_www_when_nonwww_resolves(monkeypatch):
    """With DNS patched to succeed, the `www.` prefix is dropped."""
    patch_dns_ok(monkeypatch)
    cleaned = clean_url("https://www.example.com/page")
    assert cleaned == "https://example.com/page"
|
||||
|
||||
|
||||
def test_preserves_www_when_nonwww_does_not_resolve(monkeypatch):
    """Regression for 6ffd38d: keep `www.` when the bare domain is dead.

    Some sites are only reachable at `www.domain.tld` and the bare domain
    does not resolve. Dropping `www.` there yielded a URL that could never
    be fetched or deduplicated against the real one.
    """
    patch_dns_fail(monkeypatch)
    www_url = "https://www.example.com/page"
    # With DNS patched to fail, the URL must come back unchanged.
    assert clean_url(www_url) == "https://www.example.com/page"
|
||||
|
||||
|
||||
def test_query_params_sorted_for_stable_ordering(monkeypatch):
    """Two URLs differing only in parameter order clean to the same string."""
    patch_dns_ok(monkeypatch)
    reversed_order = clean_url("https://example.com/page?b=2&a=1")
    natural_order = clean_url("https://example.com/page?a=1&b=2")
    assert reversed_order == natural_order
|
||||
|
||||
|
||||
def test_path_and_query_preserved_through_cleanup(monkeypatch):
    """A multi-segment path and the `id` parameter survive; `utm_source` is dropped."""
    patch_dns_ok(monkeypatch)
    cleaned = clean_url("https://example.com/path/to/page?id=42&utm_source=twitter")
    expected = "https://example.com/path/to/page?id=42"
    assert cleaned == expected
|
||||
Loading…
Add table
Add a link
Reference in a new issue