tinyweb/tests/test_url_cleanup.py
Derick Phan 44a16dea98
Some checks failed
/ build (push) Failing after 5s
Add pytest test suite
174 tests covering URL normalization, FTS5 query sanitization, SSRF/CSRF
guards, sharing-mode logic, DB schema and upsert paths, handler
end-to-end flows, and gateway body-size / mesh-whitelist guards. Each
recent bug-fix commit (6ffd38d, 1bc695f, 8dffd8c) has an explicit
regression test in test_regressions.py. One xfail documents a minor
latent bug in clean_url where port 80 is not stripped from upgraded
https URLs.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-24 15:03:29 -07:00

101 lines
3.7 KiB
Python

"""Tests for `clean_url` — URL normalization and tracking-param stripping.
Clean URLs are the deduplication key in the pages table, so any change to
this function can silently cause duplicate rows or mask legitimate saves.
"""
import pytest
from conftest import patch_dns_ok, patch_dns_fail
from db import clean_url, TRACKING_PARAMS
def test_strips_fragment(monkeypatch):
    """The `#fragment` part of a URL must not survive cleanup."""
    patch_dns_ok(monkeypatch)
    cleaned = clean_url("https://example.com/page#section")
    assert cleaned == "https://example.com/page"
def test_prefers_https(monkeypatch):
    """An `http://` input is expected to come back with the `https` scheme."""
    patch_dns_ok(monkeypatch)
    cleaned = clean_url("http://example.com/page")
    assert cleaned == "https://example.com/page"
def test_lowercases_hostname(monkeypatch):
    """An upper-case hostname is expected to be folded to lower case."""
    patch_dns_ok(monkeypatch)
    cleaned = clean_url("https://EXAMPLE.COM/page")
    assert cleaned == "https://example.com/page"
def test_preserves_path_case(monkeypatch):
    """Path segments are case-sensitive, so cleanup must leave their case alone."""
    patch_dns_ok(monkeypatch)
    mixed_case_url = "https://example.com/Foo/Bar"
    # Input is already canonical apart from path case, so it must round-trip unchanged.
    assert clean_url(mixed_case_url) == "https://example.com/Foo/Bar"
def test_strips_default_https_port(monkeypatch):
    """An explicit `:443` on an https URL is redundant and should be removed."""
    patch_dns_ok(monkeypatch)
    cleaned = clean_url("https://example.com:443/page")
    assert cleaned == "https://example.com/page"
@pytest.mark.xfail(
    reason=(
        "clean_url upgrades http->https before the port-default check, "
        "so port 80 is not stripped. Minor dedup bug — harmless but worth fixing."
    ),
)
def test_strips_http_port_80(monkeypatch):
    """Known gap: `http://foo:80` should clean to `https://foo` with no port.

    The desired result needs both the scheme upgrade and the default-port
    strip. As the xfail reason explains, the scheme has already become https
    by the time the default-port check runs, so its
    `scheme == "http" and port == 80` condition never holds and `:80` stays.
    """
    patch_dns_ok(monkeypatch)
    cleaned = clean_url("http://example.com:80/page")
    assert cleaned == "https://example.com/page"
def test_preserves_non_default_port(monkeypatch):
    """A port other than the scheme default is significant and must be kept."""
    patch_dns_ok(monkeypatch)
    custom_port_url = "https://example.com:8443/page"
    assert clean_url(custom_port_url) == "https://example.com:8443/page"
def test_strips_trailing_slash(monkeypatch):
    """A trailing `/` on a non-root path should be dropped."""
    patch_dns_ok(monkeypatch)
    cleaned = clean_url("https://example.com/page/")
    assert cleaned == "https://example.com/page"
def test_root_slash_preserved(monkeypatch):
    """The bare root path `/` is the exception to trailing-slash stripping."""
    patch_dns_ok(monkeypatch)
    root_url = "https://example.com/"
    assert clean_url(root_url) == "https://example.com/"
@pytest.mark.parametrize("param", sorted(TRACKING_PARAMS))
def test_tracking_params_stripped(monkeypatch, param):
    """Each name in TRACKING_PARAMS is removed from the query; other params stay.

    The cleaned URL's query string is parsed and checked by parameter name
    rather than by substring search over the whole URL. A substring check
    fails spuriously for any short tracking-param name that also occurs in
    the scheme, host, path, or the retained `keep=yes` pair.
    """
    # Local import keeps this test self-contained; only stdlib is used.
    from urllib.parse import parse_qs, urlsplit

    patch_dns_ok(monkeypatch)
    result = clean_url(f"https://example.com/page?{param}=value&keep=yes")

    # keep_blank_values=True so a param left behind as `name=` still counts.
    query = parse_qs(urlsplit(result).query, keep_blank_values=True)
    assert param not in query
    assert query.get("keep") == ["yes"]
def test_strips_www_when_nonwww_resolves(monkeypatch):
    """Common case: the `www.` prefix is removed to canonicalize the host."""
    # Presumably makes the bare-domain DNS lookup succeed — see conftest.
    patch_dns_ok(monkeypatch)
    cleaned = clean_url("https://www.example.com/page")
    assert cleaned == "https://example.com/page"
def test_preserves_www_when_nonwww_does_not_resolve(monkeypatch):
    """Regression for 6ffd38d: keep `www.` when the bare domain does not resolve.

    Some sites serve content only at `www.domain.tld`. Stripping the prefix
    for such a site produced a URL that could never be fetched, nor deduped
    against the real one.
    """
    # Presumably makes the bare-domain DNS lookup fail — see conftest.
    patch_dns_fail(monkeypatch)
    www_only_url = "https://www.example.com/page"
    assert clean_url(www_only_url) == "https://www.example.com/page"
def test_query_params_sorted_for_stable_ordering(monkeypatch):
    """The order of query parameters in the input must not affect the clean URL."""
    patch_dns_ok(monkeypatch)
    # Same parameters, two orderings; cleaned in the order listed.
    orderings = (
        "https://example.com/page?b=2&a=1",
        "https://example.com/page?a=1&b=2",
    )
    first, second = [clean_url(url) for url in orderings]
    assert first == second
def test_path_and_query_preserved_through_cleanup(monkeypatch):
    """A multi-segment path and a non-tracking param survive; `utm_source` is dropped."""
    patch_dns_ok(monkeypatch)
    expected = "https://example.com/path/to/page?id=42"
    cleaned = clean_url("https://example.com/path/to/page?id=42&utm_source=twitter")
    assert cleaned == expected