"""Tests for link extraction inside `fetch_page`. Link extraction powers the "trusted sites" fallback on empty searches and feeds the `links` table. Rules: same-domain only, skip binary extensions, skip Wikipedia special pages, resolve relatives via urljoin. """ from unittest.mock import patch from conftest import patch_dns_ok import db as db_module class FakeResponse: def __init__(self, text, status_code=200): self.text = text self.status_code = status_code self.is_redirect = False self.headers = {} def raise_for_status(self): if self.status_code >= 400: raise Exception(f"status {self.status_code}") def _fetch_with_html(monkeypatch, url, html): """Invoke fetch_page against `url` with `html` as the mocked response body.""" patch_dns_ok(monkeypatch) with patch.object(db_module, "requests") as mock_requests: mock_requests.get.return_value = FakeResponse(html) return db_module.fetch_page(url) def test_only_same_domain_links_kept(monkeypatch): html = """
same cross subdomain """ _, _, links, _ = _fetch_with_html(monkeypatch, "https://example.com/", html) urls = [u for u, _label in links] assert "https://example.com/a" in urls assert "https://other.com/b" not in urls assert "https://sub.example.com/c" not in urls def test_binary_extensions_skipped(monkeypatch): html = """ keep skip skip skip skip skip """ _, _, links, _ = _fetch_with_html(monkeypatch, "https://example.com/", html) urls = [u for u, _label in links] assert "https://example.com/real-page" in urls for ext in (".png", ".pdf", ".zip", ".mp3", ".css"): assert not any(u.endswith(ext) for u in urls), f"{ext} leaked through" def test_wikipedia_special_pages_skipped(monkeypatch): html = """ keep skip skip skip skip """ _, _, links, _ = _fetch_with_html(monkeypatch, "https://example.com/", html) urls = [u for u, _label in links] assert "https://example.com/wiki/Main_Page" in urls for skip in ("Special:Random", "Talk:Foo", "User:Jimbo", "Category:Bar"): assert not any(skip in u for u in urls), f"wiki {skip!r} leaked" def test_relative_urls_resolved(monkeypatch): html = """r""" _, _, links, _ = _fetch_with_html(monkeypatch, "https://example.com/start", html) urls = [u for u, _label in links] assert "https://example.com/relative/path" in urls def test_fragment_stripped_from_extracted_links(monkeypatch): html = """r""" _, _, links, _ = _fetch_with_html(monkeypatch, "https://example.com/", html) urls = [u for u, _label in links] assert "https://example.com/page" in urls assert not any("#" in u for u in urls) def test_duplicate_links_deduped(monkeypatch): html = """ first second third """ _, _, links, _ = _fetch_with_html(monkeypatch, "https://example.com/", html) urls = [u for u, _label in links] assert urls.count("https://example.com/a") == 1 def test_label_truncated_to_200(monkeypatch): long_text = "x" * 500 html = f'{long_text}' _, _, links, _ = _fetch_with_html(monkeypatch, "https://example.com/", html) assert len(links) == 1 _, label = links[0] assert len(label) <= 200 def test_meta_description_extracted(monkeypatch): html = """body content
""" title, body, links, meta = _fetch_with_html(monkeypatch, "https://example.com/", html) assert meta == "the real description" def test_og_description_fallback(monkeypatch): """When there's no , og:description wins.""" html = """body
""" _, _, _, meta = _fetch_with_html(monkeypatch, "https://example.com/", html) assert meta == "open graph fallback"