174 tests covering URL normalization, FTS5 query sanitization, SSRF/CSRF guards, sharing-mode logic, DB schema and upsert paths, handler end-to-end flows, and gateway body-size / mesh-whitelist guards. Each recent bug-fix commit (6ffd38d, 1bc695f, 8dffd8c) has an explicit regression test in test_regressions.py. One xfail documents a minor latent bug in clean_url where port 80 is not stripped from upgraded https URLs.
138 lines
4.7 KiB
Python
138 lines
4.7 KiB
Python
"""Tests for link extraction inside `fetch_page`.

Link extraction powers the "trusted sites" fallback on empty searches and
feeds the `links` table. Rules: same-domain only, skip binary extensions,
skip Wikipedia special pages, resolve relatives via urljoin.
"""
|
|
from unittest.mock import patch
|
|
|
|
from conftest import patch_dns_ok
|
|
import db as db_module
|
|
|
|
|
|
class FakeResponse:
    """Minimal stand-in for the object returned by the mocked ``requests.get``.

    Only the attributes set here are provided: ``text``, ``status_code``,
    ``is_redirect`` (always ``False``) and ``headers`` (always empty).
    """

    def __init__(self, text: str, status_code: int = 200) -> None:
        self.text = text
        self.status_code = status_code
        self.is_redirect = False
        # A fresh dict per instance so tests never share header state.
        self.headers = {}

    def raise_for_status(self) -> None:
        """Raise ``Exception`` when the status code is 400 or above."""
        if self.status_code >= 400:
            raise Exception(f"status {self.status_code}")
|
|
|
|
|
|
def _fetch_with_html(monkeypatch, url, html):
    """Invoke fetch_page against `url` with `html` as the mocked response body.

    Applies ``patch_dns_ok`` first (presumably so hostname checks succeed
    without real DNS -- see conftest), then swaps ``db_module.requests`` for a
    mock whose ``get`` returns a ``FakeResponse`` wrapping ``html``.

    Returns whatever ``db_module.fetch_page(url)`` returns; the tests in this
    module unpack it as ``(title, body, links, meta)``.
    """
    patch_dns_ok(monkeypatch)
    with patch.object(db_module, "requests") as mock_requests:
        mock_requests.get.return_value = FakeResponse(html)
        return db_module.fetch_page(url)
|
|
|
|
|
|
def test_only_same_domain_links_kept(monkeypatch):
    """Links to other hosts, including subdomains of the page host, are dropped."""
    html = """
    <html><body>
      <a href="https://example.com/a">same</a>
      <a href="https://other.com/b">cross</a>
      <a href="https://sub.example.com/c">subdomain</a>
    </body></html>
    """
    _, _, links, _ = _fetch_with_html(monkeypatch, "https://example.com/", html)
    urls = [u for u, _label in links]
    assert "https://example.com/a" in urls
    assert "https://other.com/b" not in urls
    assert "https://sub.example.com/c" not in urls
|
|
|
|
|
|
def test_binary_extensions_skipped(monkeypatch):
    """Links ending in non-page extensions are excluded; plain pages are kept."""
    html = """
    <html><body>
      <a href="/real-page">keep</a>
      <a href="/image.png">skip</a>
      <a href="/doc.pdf">skip</a>
      <a href="/archive.zip">skip</a>
      <a href="/song.mp3">skip</a>
      <a href="/styles.css">skip</a>
    </body></html>
    """
    _, _, links, _ = _fetch_with_html(monkeypatch, "https://example.com/", html)
    urls = [u for u, _label in links]
    assert "https://example.com/real-page" in urls
    for ext in (".png", ".pdf", ".zip", ".mp3", ".css"):
        assert not any(u.endswith(ext) for u in urls), f"{ext} leaked through"
|
|
|
|
|
|
def test_wikipedia_special_pages_skipped(monkeypatch):
    """Wiki namespace pages (Special:, Talk:, User:, Category:) are dropped.

    An ordinary ``/wiki/`` article link must still be kept.
    """
    html = """
    <html><body>
      <a href="/wiki/Main_Page">keep</a>
      <a href="/wiki/Special:Random">skip</a>
      <a href="/wiki/Talk:Foo">skip</a>
      <a href="/wiki/User:Jimbo">skip</a>
      <a href="/wiki/Category:Bar">skip</a>
    </body></html>
    """
    _, _, links, _ = _fetch_with_html(monkeypatch, "https://example.com/", html)
    urls = [u for u, _label in links]
    assert "https://example.com/wiki/Main_Page" in urls
    for skip in ("Special:Random", "Talk:Foo", "User:Jimbo", "Category:Bar"):
        assert not any(skip in u for u in urls), f"wiki {skip!r} leaked"
|
|
|
|
|
|
def test_relative_urls_resolved(monkeypatch):
    """A root-relative href is resolved against the fetched page's URL."""
    html = """<html><body><a href="/relative/path">r</a></body></html>"""
    _, _, links, _ = _fetch_with_html(monkeypatch, "https://example.com/start", html)
    urls = [u for u, _label in links]
    assert "https://example.com/relative/path" in urls
|
|
|
|
|
|
def test_fragment_stripped_from_extracted_links(monkeypatch):
    """The ``#fragment`` part of an href does not appear in extracted URLs."""
    html = """<html><body><a href="/page#section">r</a></body></html>"""
    _, _, links, _ = _fetch_with_html(monkeypatch, "https://example.com/", html)
    urls = [u for u, _label in links]
    assert "https://example.com/page" in urls
    assert not any("#" in u for u in urls)
|
|
|
|
|
|
def test_duplicate_links_deduped(monkeypatch):
    """Several anchors pointing at the same URL yield a single link entry."""
    html = """
    <html><body>
      <a href="/a">first</a>
      <a href="/a">second</a>
      <a href="/a">third</a>
    </body></html>
    """
    _, _, links, _ = _fetch_with_html(monkeypatch, "https://example.com/", html)
    urls = [u for u, _label in links]
    assert urls.count("https://example.com/a") == 1
|
|
|
|
|
|
def test_label_truncated_to_200(monkeypatch):
    """A 500-character anchor text produces a label of at most 200 characters."""
    long_text = "x" * 500
    html = f'<html><body><a href="/p">{long_text}</a></body></html>'
    _, _, links, _ = _fetch_with_html(monkeypatch, "https://example.com/", html)
    assert len(links) == 1
    _, label = links[0]
    assert len(label) <= 200
|
|
|
|
|
|
def test_meta_description_extracted(monkeypatch):
    """The content of ``<meta name="description">`` is returned as the meta value."""
    html = """
    <html><head>
      <meta name="description" content="the real description">
    </head><body><p>body content</p></body></html>
    """
    # Only the fourth element is under test; discard the rest like the sibling tests do.
    _, _, _, meta = _fetch_with_html(monkeypatch, "https://example.com/", html)
    assert meta == "the real description"
|
|
|
|
|
|
def test_og_description_fallback(monkeypatch):
    """When there's no <meta name=description>, og:description wins."""
    html = """
    <html><head>
      <meta property="og:description" content="open graph fallback">
    </head><body><p>body</p></body></html>
    """
    _, _, _, meta = _fetch_with_html(monkeypatch, "https://example.com/", html)
    assert meta == "open graph fallback"
|