tinyweb/tests/test_db_index_url.py
Derick Phan 44a16dea98
Some checks failed
/ build (push) Failing after 5s
Add pytest test suite
174 tests covering URL normalization, FTS5 query sanitization, SSRF/CSRF
guards, sharing-mode logic, DB schema and upsert paths, handler
end-to-end flows, and gateway body-size / mesh-whitelist guards. Each
recent bug-fix commit (6ffd38d, 1bc695f, 8dffd8c) has an explicit
regression test in test_regressions.py. One xfail documents a minor
latent bug in clean_url where port 80 is not stripped from upgraded
https URLs.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-24 15:03:29 -07:00

155 lines
5.2 KiB
Python

"""Tests for `index_url` — the main write path.
Covers UPSERT behavior, links being replaced on re-index, URL cleaning before
insert, summary handling for the meta description, the FTS index staying
in sync via triggers, and the connection pool returning clean connections.
"""
from unittest.mock import patch
from conftest import patch_dns_ok
import db as db_module
from db import get_db, return_db, index_url
def _mock_fetch_page(title="Test Page", body="test body text", links=None, meta=""):
"""Return a replacement for db.fetch_page that yields canned data."""
links = links or []
def fake(url):
return (title, body, links, meta)
return fake
def test_insert_creates_page_row_and_fts_entry(temp_db, monkeypatch):
    """A first-time index writes one `pages` row and a matching FTS entry."""
    patch_dns_ok(monkeypatch)
    fake_fetch = _mock_fetch_page(
        title="Rust Intro",
        body="ownership and borrowing basics",
        links=[],
    )
    monkeypatch.setattr(db_module, "fetch_page", fake_fetch)

    index_url("https://example.com/rust")

    conn = get_db()
    try:
        page = conn.execute("SELECT id, title, body FROM pages").fetchone()
        assert page is not None
        assert page["title"] == "Rust Intro"
        assert "ownership" in page["body"]

        # The full-text index should have picked up the new row on its own
        # (the module relies on triggers to keep it in sync).
        fts_query = "SELECT rowid FROM pages_fts WHERE pages_fts MATCH 'ownership*'"
        matches = conn.execute(fts_query).fetchall()
        assert len(matches) == 1
        assert matches[0]["rowid"] == page["id"]
    finally:
        return_db(conn)
def test_re_indexing_same_url_updates_in_place(temp_db, monkeypatch):
    """Indexing the same URL twice must update the existing row, not add one."""
    patch_dns_ok(monkeypatch)
    monkeypatch.setattr(db_module, "fetch_page", _mock_fetch_page(
        title="First Title", body="first body", links=[],
    ))
    index_url("https://example.com/page")

    # Same URL, different content: the second pass should overwrite the first.
    monkeypatch.setattr(db_module, "fetch_page", _mock_fetch_page(
        title="Second Title", body="second body", links=[],
    ))
    index_url("https://example.com/page")

    db = get_db()
    try:
        rows = db.execute("SELECT title, body FROM pages").fetchall()
    finally:
        return_db(db)

    assert len(rows) == 1, "re-indexing should UPDATE not INSERT"
    assert rows[0]["title"] == "Second Title"
    # The body is selected too, so check it was refreshed along with the title.
    # Substring match rather than equality, in case index_url normalizes the
    # stored body text.
    assert "second body" in rows[0]["body"], "re-indexing should refresh the body"
def test_links_replaced_on_reindex(temp_db, monkeypatch):
    """Re-indexing a page must drop the links recorded by the previous pass."""
    patch_dns_ok(monkeypatch)

    source_url = "https://example.com/src"
    # First pass reports two links; the second pass reports a single new one.
    link_batches = (
        [("https://example.com/a", "first"), ("https://example.com/b", "second")],
        [("https://example.com/c", "third-only")],
    )
    for batch in link_batches:
        fake_fetch = _mock_fetch_page(title="T", body="b", links=batch)
        monkeypatch.setattr(db_module, "fetch_page", fake_fetch)
        index_url(source_url)

    conn = get_db()
    try:
        link_rows = conn.execute("SELECT url FROM links").fetchall()
    finally:
        return_db(conn)

    stored = set()
    for link_row in link_rows:
        stored.add(link_row["url"])
    assert stored == {"https://example.com/c"}, "old links should be deleted on reindex"
def test_url_cleaned_before_insert(temp_db, monkeypatch):
    """The URL must go through `clean_url` before it reaches the database.

    A URL carrying a tracking parameter and a fragment should be stored in its
    cleaned form, so such variants cannot create duplicate rows.
    """
    patch_dns_ok(monkeypatch)
    fake_fetch = _mock_fetch_page(title="T", body="b")
    monkeypatch.setattr(db_module, "fetch_page", fake_fetch)

    index_url("https://example.com/page?utm_source=twitter#frag")

    conn = get_db()
    try:
        stored = conn.execute("SELECT url FROM pages").fetchall()
    finally:
        return_db(conn)

    assert len(stored) == 1
    only_row = stored[0]
    assert only_row["url"] == "https://example.com/page"
def test_summary_populated_from_meta_description(temp_db, monkeypatch):
    """A sufficiently long meta description is stored verbatim as the summary."""
    patch_dns_ok(monkeypatch)
    # NOTE(review): the wording suggests a minimum length of about twenty
    # characters for a usable description; confirm against index_url.
    long_meta = "A thoughtful description that exceeds twenty chars"
    monkeypatch.setattr(db_module, "fetch_page", _mock_fetch_page(
        title="T", body="b", meta=long_meta,
    ))
    index_url("https://example.com/page")

    db = get_db()
    try:
        row = db.execute("SELECT summary FROM pages").fetchone()
    finally:
        return_db(db)

    # Fail with a clear assertion rather than a TypeError on `None[...]`
    # if the page was never written.
    assert row is not None, "index_url should have inserted a pages row"
    assert row["summary"] == long_meta
def test_short_meta_description_not_stored_as_summary(temp_db, monkeypatch):
    """A very short meta description is ignored, leaving the summary empty."""
    patch_dns_ok(monkeypatch)
    monkeypatch.setattr(db_module, "fetch_page", _mock_fetch_page(
        title="T", body="b", meta="too short",
    ))
    index_url("https://example.com/page")

    db = get_db()
    try:
        row = db.execute("SELECT summary FROM pages").fetchone()
    finally:
        return_db(db)

    # Fail with a clear assertion rather than a TypeError on `None[...]`
    # if the page was never written.
    assert row is not None, "index_url should have inserted a pages row"
    assert row["summary"] == ""
def test_pool_returns_clean_connection(temp_db, monkeypatch):
    """Regression for 1bc695f — `return_db` should roll back uncommitted work
    so the next consumer doesn't see stale state."""
    patch_dns_ok(monkeypatch)
    monkeypatch.setattr(db_module, "fetch_page", _mock_fetch_page(title="T", body="b"))
    index_url("https://example.com/one")

    # Take a connection, make a dirty uncommitted change, return it.
    db = get_db()
    try:
        db.execute("INSERT INTO pages (url, title, body) VALUES (?, ?, ?)",
                   ("https://dirty.example.com/", "dirty", "dirty"))
        # NOTE: no commit here — this is the dirty state we want rolled back.
    finally:
        # Always hand the connection back, even if the INSERT itself fails,
        # so a broken setup step cannot leak a pooled connection.
        return_db(db)

    # A later consumer must not see the dirty row.
    # NOTE(review): this only exercises the rollback if the pool hands the
    # same connection back out; confirm against the pool implementation.
    db2 = get_db()
    try:
        urls = {r["url"] for r in db2.execute("SELECT url FROM pages").fetchall()}
    finally:
        return_db(db2)

    assert "https://dirty.example.com/" not in urls