"""Tests for `index_url` — the main write path. Covers UPSERT behavior, links being replaced on re-index, FTS index staying in sync via triggers, and the connection pool returning clean connections. """ from unittest.mock import patch from conftest import patch_dns_ok import db as db_module from db import get_db, return_db, index_url def _mock_fetch_page(title="Test Page", body="test body text", links=None, meta=""): """Return a replacement for db.fetch_page that yields canned data.""" links = links or [] def fake(url): return (title, body, links, meta) return fake def test_insert_creates_page_row_and_fts_entry(temp_db, monkeypatch): patch_dns_ok(monkeypatch) monkeypatch.setattr(db_module, "fetch_page", _mock_fetch_page( title="Rust Intro", body="ownership and borrowing basics", links=[], )) index_url("https://example.com/rust") db = get_db() try: row = db.execute("SELECT id, title, body FROM pages").fetchone() assert row is not None assert row["title"] == "Rust Intro" assert "ownership" in row["body"] # Verify FTS trigger fired. fts_hits = db.execute( "SELECT rowid FROM pages_fts WHERE pages_fts MATCH 'ownership*'" ).fetchall() assert len(fts_hits) == 1 assert fts_hits[0]["rowid"] == row["id"] finally: return_db(db) def test_re_indexing_same_url_updates_in_place(temp_db, monkeypatch): patch_dns_ok(monkeypatch) monkeypatch.setattr(db_module, "fetch_page", _mock_fetch_page( title="First Title", body="first body", links=[], )) index_url("https://example.com/page") monkeypatch.setattr(db_module, "fetch_page", _mock_fetch_page( title="Second Title", body="second body", links=[], )) index_url("https://example.com/page") db = get_db() try: rows = db.execute("SELECT title, body FROM pages").fetchall() finally: return_db(db) assert len(rows) == 1, "re-indexing should UPDATE not INSERT" assert rows[0]["title"] == "Second Title" def test_links_replaced_on_reindex(temp_db, monkeypatch): patch_dns_ok(monkeypatch) monkeypatch.setattr(db_module, "fetch_page", _mock_fetch_page( title="T", body="b", links=[("https://example.com/a", "first"), ("https://example.com/b", "second")], )) index_url("https://example.com/src") monkeypatch.setattr(db_module, "fetch_page", _mock_fetch_page( title="T", body="b", links=[("https://example.com/c", "third-only")], )) index_url("https://example.com/src") db = get_db() try: rows = db.execute("SELECT url FROM links").fetchall() finally: return_db(db) urls = {r["url"] for r in rows} assert urls == {"https://example.com/c"}, "old links should be deleted on reindex" def test_url_cleaned_before_insert(temp_db, monkeypatch): """index_url should apply clean_url before touching the DB, so tracking params don't create duplicate rows.""" patch_dns_ok(monkeypatch) monkeypatch.setattr(db_module, "fetch_page", _mock_fetch_page(title="T", body="b")) index_url("https://example.com/page?utm_source=twitter#frag") db = get_db() try: rows = db.execute("SELECT url FROM pages").fetchall() finally: return_db(db) assert len(rows) == 1 assert rows[0]["url"] == "https://example.com/page" def test_summary_populated_from_meta_description(temp_db, monkeypatch): patch_dns_ok(monkeypatch) long_meta = "A thoughtful description that exceeds twenty chars" monkeypatch.setattr(db_module, "fetch_page", _mock_fetch_page( title="T", body="b", meta=long_meta, )) index_url("https://example.com/page") db = get_db() try: row = db.execute("SELECT summary FROM pages").fetchone() finally: return_db(db) assert row["summary"] == long_meta def test_short_meta_description_not_stored_as_summary(temp_db, monkeypatch): patch_dns_ok(monkeypatch) monkeypatch.setattr(db_module, "fetch_page", _mock_fetch_page( title="T", body="b", meta="too short", )) index_url("https://example.com/page") db = get_db() try: row = db.execute("SELECT summary FROM pages").fetchone() finally: return_db(db) assert row["summary"] == "" def test_pool_returns_clean_connection(temp_db, monkeypatch): """Regression for 1bc695f — `return_db` should roll back uncommitted work so the next consumer doesn't see stale state.""" patch_dns_ok(monkeypatch) monkeypatch.setattr(db_module, "fetch_page", _mock_fetch_page(title="T", body="b")) index_url("https://example.com/one") # Take a connection, make a dirty uncommitted change, return it. db = get_db() db.execute("INSERT INTO pages (url, title, body) VALUES (?, ?, ?)", ("https://dirty.example.com/", "dirty", "dirty")) # NOTE: no commit here — this is the dirty state we want rolled back. return_db(db) # A later consumer must not see the dirty row. db2 = get_db() try: urls = {r["url"] for r in db2.execute("SELECT url FROM pages").fetchall()} finally: return_db(db2) assert "https://dirty.example.com/" not in urls