tinyweb/tests/test_link_extraction.py

"""Tests for link extraction inside `fetch_page`.

Link extraction powers the "trusted sites" fallback on empty searches and
feeds the `links` table. Rules: same-domain only, skip binary extensions,
skip Wikipedia special pages, resolve relative URLs via urljoin.
"""
from unittest.mock import patch

from conftest import patch_dns_ok
import db as db_module


class FakeResponse:
    """Minimal response double handed back by the mocked `requests.get`.

    Exposes only `text`, `status_code`, `is_redirect` (always False on
    construction), `headers` (an empty dict on construction) and
    `raise_for_status`.
    """

    def __init__(self, text, status_code=200):
        self.status_code = status_code
        self.text = text
        self.headers = {}
        self.is_redirect = False

    def raise_for_status(self):
        """Raise a plain `Exception` when the status code is 400 or higher."""
        if self.status_code < 400:
            return
        raise Exception(f"status {self.status_code}")


def _fetch_with_html(monkeypatch, url, html):
    """Call `db_module.fetch_page(url)` with the HTTP layer mocked out.

    `patch_dns_ok(monkeypatch)` is applied first (presumably so DNS checks
    succeed — see conftest), then `db_module.requests` is swapped for a mock
    whose `get` returns a `FakeResponse` carrying `html` as its body.
    Returns whatever `fetch_page` returns.
    """
    patch_dns_ok(monkeypatch)
    canned = FakeResponse(html)
    with patch.object(db_module, "requests") as fake_requests:
        fake_requests.get.return_value = canned
        result = db_module.fetch_page(url)
    return result


def test_only_same_domain_links_kept(monkeypatch):
    """Links on the page's own host are kept; other domains and subdomains are not."""
    markup = """
    <html><body>
      <a href="https://example.com/a">same</a>
      <a href="https://other.com/b">cross</a>
      <a href="https://sub.example.com/c">subdomain</a>
    </body></html>
    """
    _title, _body, extracted, _meta = _fetch_with_html(
        monkeypatch, "https://example.com/", markup
    )
    found = [target for target, _text in extracted]
    assert "https://example.com/a" in found
    # Both the foreign domain and the subdomain must be filtered out.
    for rejected in ("https://other.com/b", "https://sub.example.com/c"):
        assert rejected not in found


def test_binary_extensions_skipped(monkeypatch):
    """Links ending in .png/.pdf/.zip/.mp3/.css are dropped; ordinary pages are kept."""
    markup = """
    <html><body>
      <a href="/real-page">keep</a>
      <a href="/image.png">skip</a>
      <a href="/doc.pdf">skip</a>
      <a href="/archive.zip">skip</a>
      <a href="/song.mp3">skip</a>
      <a href="/styles.css">skip</a>
    </body></html>
    """
    _title, _body, extracted, _meta = _fetch_with_html(
        monkeypatch, "https://example.com/", markup
    )
    found = [target for target, _text in extracted]
    assert "https://example.com/real-page" in found
    for ext in (".png", ".pdf", ".zip", ".mp3", ".css"):
        offenders = [target for target in found if target.endswith(ext)]
        assert not offenders, f"{ext} leaked through"


def test_wikipedia_special_pages_skipped(monkeypatch):
    """Special:/Talk:/User:/Category: wiki links are dropped; a normal wiki page is kept."""
    markup = """
    <html><body>
      <a href="/wiki/Main_Page">keep</a>
      <a href="/wiki/Special:Random">skip</a>
      <a href="/wiki/Talk:Foo">skip</a>
      <a href="/wiki/User:Jimbo">skip</a>
      <a href="/wiki/Category:Bar">skip</a>
    </body></html>
    """
    _title, _body, extracted, _meta = _fetch_with_html(
        monkeypatch, "https://example.com/", markup
    )
    found = [target for target, _text in extracted]
    assert "https://example.com/wiki/Main_Page" in found
    for skip in ("Special:Random", "Talk:Foo", "User:Jimbo", "Category:Bar"):
        assert all(skip not in target for target in found), f"wiki {skip!r} leaked"


def test_relative_urls_resolved(monkeypatch):
    """A root-relative href is returned as an absolute URL on the page's host."""
    markup = """<html><body><a href="/relative/path">r</a></body></html>"""
    _title, _body, extracted, _meta = _fetch_with_html(
        monkeypatch, "https://example.com/start", markup
    )
    found = [target for target, _text in extracted]
    assert "https://example.com/relative/path" in found


def test_fragment_stripped_from_extracted_links(monkeypatch):
    """The `#fragment` part of an href does not survive into the extracted URL."""
    markup = """<html><body><a href="/page#section">r</a></body></html>"""
    _title, _body, extracted, _meta = _fetch_with_html(
        monkeypatch, "https://example.com/", markup
    )
    found = [target for target, _text in extracted]
    assert "https://example.com/page" in found
    assert all("#" not in target for target in found)


def test_duplicate_links_deduped(monkeypatch):
    """Three anchors pointing at the same URL yield exactly one extracted link."""
    markup = """
    <html><body>
      <a href="/a">first</a>
      <a href="/a">second</a>
      <a href="/a">third</a>
    </body></html>
    """
    _title, _body, extracted, _meta = _fetch_with_html(
        monkeypatch, "https://example.com/", markup
    )
    occurrences = sum(
        1 for target, _text in extracted if target == "https://example.com/a"
    )
    assert occurrences == 1


def test_label_truncated_to_200(monkeypatch):
    """A 500-character anchor text comes back as a label of at most 200 characters."""
    long_text = "x" * 500
    markup = f'<html><body><a href="/p">{long_text}</a></body></html>'
    _title, _body, extracted, _meta = _fetch_with_html(
        monkeypatch, "https://example.com/", markup
    )
    assert len(extracted) == 1
    only_link = extracted[0]
    _target, label = only_link
    assert len(label) <= 200


def test_meta_description_extracted(monkeypatch):
    """The content of `<meta name="description">` is returned as the meta value."""
    html = """
    <html><head>
      <meta name="description" content="the real description">
    </head><body><p>body content</p></body></html>
    """
    # Only the fourth element of the result is under test; the first three
    # are discarded explicitly rather than bound to unused names.
    _, _, _, meta = _fetch_with_html(monkeypatch, "https://example.com/", html)
    assert meta == "the real description"


def test_og_description_fallback(monkeypatch):
    """With no `<meta name=description>` present, the og:description content is used."""
    markup = """
    <html><head>
      <meta property="og:description" content="open graph fallback">
    </head><body><p>body</p></body></html>
    """
    _title, _body, _links, meta = _fetch_with_html(
        monkeypatch, "https://example.com/", markup
    )
    assert meta == "open graph fallback"