Add pytest test suite
Some checks failed
/ build (push) Failing after 5s

174 tests covering URL normalization, FTS5 query sanitization, SSRF/CSRF
guards, sharing-mode logic, DB schema and upsert paths, handler
end-to-end flows, and gateway body-size / mesh-whitelist guards. Each
recent bug-fix commit (6ffd38d, 1bc695f, 8dffd8c) has an explicit
regression test in test_regressions.py. One xfail documents a minor
latent bug in clean_url where port 80 is not stripped from upgraded
https URLs.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Derick Phan 2026-04-24 15:03:29 -07:00
parent 8dffd8ccea
commit 44a16dea98
No known key found for this signature in database
18 changed files with 1673 additions and 0 deletions

View file

@@ -0,0 +1,138 @@
"""Tests for link extraction inside `fetch_page`.
Link extraction powers the "trusted sites" fallback on empty searches and
feeds the `links` table. Rules: same-domain only, skip binary extensions,
skip Wikipedia special pages, resolve relatives via urljoin.
"""
from unittest.mock import patch
from conftest import patch_dns_ok
import db as db_module
class FakeResponse:
    """Minimal stand-in for the response object returned by `requests.get`.

    Exposes `text`, `status_code`, `is_redirect`, `headers` and
    `raise_for_status()`. Presumably this is the subset that `fetch_page`
    reads; confirm against `db.fetch_page` before relying on it.

    Args:
        text: Response body to expose as `.text`.
        status_code: HTTP status to expose as `.status_code` (default 200).
        is_redirect: Value for `.is_redirect` (default False). Keyword-only.
        headers: Mapping to expose as `.headers`. When omitted, each instance
            gets its own fresh empty dict. Keyword-only.
    """

    def __init__(self, text, status_code=200, *, is_redirect=False, headers=None):
        self.text = text
        self.status_code = status_code
        self.is_redirect = is_redirect
        # A fresh dict per instance so tests cannot leak headers into each other.
        self.headers = {} if headers is None else headers

    def raise_for_status(self):
        """Raise `Exception("status <code>")` when the status is 400 or above."""
        if self.status_code >= 400:
            raise Exception(f"status {self.status_code}")
def _fetch_with_html(monkeypatch, url, html):
    """Call `db_module.fetch_page(url)` with the HTTP layer replaced by a stub.

    `patch_dns_ok(monkeypatch)` is applied first (presumably so hostname
    checks succeed without real DNS; see conftest). The `requests` name
    inside `db_module` is then swapped for a mock whose `get` always returns
    a `FakeResponse` carrying `html`. Returns whatever `fetch_page` returns.
    """
    patch_dns_ok(monkeypatch)
    canned = FakeResponse(html)
    with patch.object(db_module, "requests") as stub:
        stub.get.return_value = canned
        result = db_module.fetch_page(url)
    return result
def test_only_same_domain_links_kept(monkeypatch):
    """Links to other hosts, subdomains included, must not be returned."""
    page = """
<html><body>
<a href="https://example.com/a">same</a>
<a href="https://other.com/b">cross</a>
<a href="https://sub.example.com/c">subdomain</a>
</body></html>
"""
    _title, _body, links, _meta = _fetch_with_html(
        monkeypatch, "https://example.com/", page
    )
    hrefs = [href for href, _text in links]

    # The page's own host survives; a foreign host and a subdomain do not.
    assert "https://example.com/a" in hrefs
    assert "https://other.com/b" not in hrefs
    assert "https://sub.example.com/c" not in hrefs
def test_binary_extensions_skipped(monkeypatch):
    """Links ending in .png/.pdf/.zip/.mp3/.css are dropped; normal pages stay."""
    page = """
<html><body>
<a href="/real-page">keep</a>
<a href="/image.png">skip</a>
<a href="/doc.pdf">skip</a>
<a href="/archive.zip">skip</a>
<a href="/song.mp3">skip</a>
<a href="/styles.css">skip</a>
</body></html>
"""
    _title, _body, links, _meta = _fetch_with_html(
        monkeypatch, "https://example.com/", page
    )
    hrefs = [href for href, _text in links]

    assert "https://example.com/real-page" in hrefs
    for ext in (".png", ".pdf", ".zip", ".mp3", ".css"):
        offenders = [href for href in hrefs if href.endswith(ext)]
        assert not offenders, f"{ext} leaked through"
def test_wikipedia_special_pages_skipped(monkeypatch):
    """Special:/Talk:/User:/Category: wiki links are dropped; articles stay."""
    page = """
<html><body>
<a href="/wiki/Main_Page">keep</a>
<a href="/wiki/Special:Random">skip</a>
<a href="/wiki/Talk:Foo">skip</a>
<a href="/wiki/User:Jimbo">skip</a>
<a href="/wiki/Category:Bar">skip</a>
</body></html>
"""
    _title, _body, links, _meta = _fetch_with_html(
        monkeypatch, "https://example.com/", page
    )
    hrefs = [href for href, _text in links]

    assert "https://example.com/wiki/Main_Page" in hrefs
    for skip in ("Special:Random", "Talk:Foo", "User:Jimbo", "Category:Bar"):
        leaked = [href for href in hrefs if skip in href]
        assert not leaked, f"wiki {skip!r} leaked"
def test_relative_urls_resolved(monkeypatch):
    """A root-relative href comes back as an absolute URL on the page's host."""
    page = """<html><body><a href="/relative/path">r</a></body></html>"""
    _title, _body, links, _meta = _fetch_with_html(
        monkeypatch, "https://example.com/start", page
    )
    hrefs = [href for href, _text in links]
    assert "https://example.com/relative/path" in hrefs
def test_fragment_stripped_from_extracted_links(monkeypatch):
    """The `#fragment` part of an href is removed from the extracted URL."""
    page = """<html><body><a href="/page#section">r</a></body></html>"""
    _title, _body, links, _meta = _fetch_with_html(
        monkeypatch, "https://example.com/", page
    )
    hrefs = [href for href, _text in links]
    assert "https://example.com/page" in hrefs
    # No extracted URL may still carry a fragment marker.
    assert all("#" not in href for href in hrefs)
def test_duplicate_links_deduped(monkeypatch):
    """Three anchors pointing at the same href yield exactly one link entry."""
    page = """
<html><body>
<a href="/a">first</a>
<a href="/a">second</a>
<a href="/a">third</a>
</body></html>
"""
    _title, _body, links, _meta = _fetch_with_html(
        monkeypatch, "https://example.com/", page
    )
    matches = [href for href, _text in links if href == "https://example.com/a"]
    assert len(matches) == 1
def test_label_truncated_to_200(monkeypatch):
    """A 500-character anchor text yields a non-empty label of at most 200 chars.

    The lower bound matters: without it the check would pass even if the
    label were dropped entirely, which is not truncation.
    """
    long_text = "x" * 500
    html = f'<html><body><a href="/p">{long_text}</a></body></html>'
    _, _, links, _ = _fetch_with_html(monkeypatch, "https://example.com/", html)
    assert len(links) == 1
    _, label = links[0]
    assert 0 < len(label) <= 200
def test_meta_description_extracted(monkeypatch):
    """The content of `<meta name="description">` is returned as the fourth value."""
    html = """
<html><head>
<meta name="description" content="the real description">
</head><body><p>body content</p></body></html>
"""
    # Only the meta description matters here; discard the other three values.
    _, _, _, meta = _fetch_with_html(monkeypatch, "https://example.com/", html)
    assert meta == "the real description"
def test_og_description_fallback(monkeypatch):
    """With no `<meta name=description>` present, `og:description` is used."""
    page = """
<html><head>
<meta property="og:description" content="open graph fallback">
</head><body><p>body</p></body></html>
"""
    _title, _body, _links, description = _fetch_with_html(
        monkeypatch, "https://example.com/", page
    )
    assert description == "open graph fallback"