added pytest test suite (174 tests)
174 tests covering URL normalization, FTS5 query sanitization, SSRF/CSRF guards, sharing-mode logic, DB schema and upsert paths, handler end-to-end flows, and gateway body-size / mesh-whitelist guards. Each recent bug-fix commit (6ffd38d, 1bc695f, 8dffd8c) has an explicit regression test in test_regressions.py. One xfail documents a minor latent bug in clean_url where port 80 is not stripped from upgraded https URLs.
This commit is contained in:
parent
55c6619ba3
commit
4d522ce62c
18 changed files with 1673 additions and 0 deletions
155
tests/test_db_index_url.py
Normal file
155
tests/test_db_index_url.py
Normal file
|
|
@ -0,0 +1,155 @@
|
|||
"""Tests for `index_url` — the main write path.
|
||||
|
||||
Covers UPSERT behavior, links being replaced on re-index, FTS index staying
|
||||
in sync via triggers, and the connection pool returning clean connections.
|
||||
"""
|
||||
from unittest.mock import patch
|
||||
|
||||
from conftest import patch_dns_ok
|
||||
import db as db_module
|
||||
from db import get_db, return_db, index_url
|
||||
|
||||
|
||||
def _mock_fetch_page(title="Test Page", body="test body text", links=None, meta=""):
|
||||
"""Return a replacement for db.fetch_page that yields canned data."""
|
||||
links = links or []
|
||||
def fake(url):
|
||||
return (title, body, links, meta)
|
||||
return fake
|
||||
|
||||
|
||||
def test_insert_creates_page_row_and_fts_entry(temp_db, monkeypatch):
    """Indexing a new URL stores a page row and a matching FTS entry."""
    patch_dns_ok(monkeypatch)
    stub = _mock_fetch_page(
        title="Rust Intro",
        body="ownership and borrowing basics",
        links=[],
    )
    monkeypatch.setattr(db_module, "fetch_page", stub)
    index_url("https://example.com/rust")

    conn = get_db()
    try:
        page = conn.execute("SELECT id, title, body FROM pages").fetchone()
        assert page is not None
        assert page["title"] == "Rust Intro"
        assert "ownership" in page["body"]
        # A prefix match on a body word only succeeds if the FTS table was
        # populated for the freshly inserted page.
        matches = conn.execute(
            "SELECT rowid FROM pages_fts WHERE pages_fts MATCH 'ownership*'"
        ).fetchall()
        assert len(matches) == 1
        assert matches[0]["rowid"] == page["id"]
    finally:
        return_db(conn)
|
||||
|
||||
|
||||
def test_re_indexing_same_url_updates_in_place(temp_db, monkeypatch):
    """Indexing one URL twice leaves a single row carrying the newer title."""
    patch_dns_ok(monkeypatch)
    versions = (
        ("First Title", "first body"),
        ("Second Title", "second body"),
    )
    # Index the same URL once per canned version, in order.
    for page_title, page_body in versions:
        monkeypatch.setattr(
            db_module,
            "fetch_page",
            _mock_fetch_page(title=page_title, body=page_body, links=[]),
        )
        index_url("https://example.com/page")

    conn = get_db()
    try:
        stored = conn.execute("SELECT title, body FROM pages").fetchall()
    finally:
        return_db(conn)

    assert len(stored) == 1, "re-indexing should UPDATE not INSERT"
    assert stored[0]["title"] == "Second Title"
|
||||
|
||||
|
||||
def test_links_replaced_on_reindex(temp_db, monkeypatch):
    """Re-indexing a page keeps only the links from the latest fetch."""
    patch_dns_ok(monkeypatch)
    link_sets = (
        [("https://example.com/a", "first"), ("https://example.com/b", "second")],
        [("https://example.com/c", "third-only")],
    )
    # Same source URL each time; only the outgoing links change.
    for outgoing in link_sets:
        monkeypatch.setattr(
            db_module,
            "fetch_page",
            _mock_fetch_page(title="T", body="b", links=outgoing),
        )
        index_url("https://example.com/src")

    conn = get_db()
    try:
        link_rows = conn.execute("SELECT url FROM links").fetchall()
    finally:
        return_db(conn)

    urls = {link["url"] for link in link_rows}
    assert urls == {"https://example.com/c"}, "old links should be deleted on reindex"
|
||||
|
||||
|
||||
def test_url_cleaned_before_insert(temp_db, monkeypatch):
    """The stored URL has its tracking parameter and fragment removed.

    index_url should apply clean_url before touching the DB, so tracking
    params don't create duplicate rows.
    """
    patch_dns_ok(monkeypatch)
    stub = _mock_fetch_page(title="T", body="b")
    monkeypatch.setattr(db_module, "fetch_page", stub)
    index_url("https://example.com/page?utm_source=twitter#frag")

    conn = get_db()
    try:
        stored = conn.execute("SELECT url FROM pages").fetchall()
    finally:
        return_db(conn)

    assert len(stored) == 1
    assert stored[0]["url"] == "https://example.com/page"
|
||||
|
||||
|
||||
def test_summary_populated_from_meta_description(temp_db, monkeypatch):
    """A sufficiently long meta description is stored verbatim as the summary."""
    patch_dns_ok(monkeypatch)
    long_meta = "A thoughtful description that exceeds twenty chars"
    stub = _mock_fetch_page(title="T", body="b", meta=long_meta)
    monkeypatch.setattr(db_module, "fetch_page", stub)
    index_url("https://example.com/page")

    conn = get_db()
    try:
        page = conn.execute("SELECT summary FROM pages").fetchone()
    finally:
        return_db(conn)

    assert page["summary"] == long_meta
|
||||
|
||||
|
||||
def test_short_meta_description_not_stored_as_summary(temp_db, monkeypatch):
    """A very short meta description leaves the stored summary empty."""
    patch_dns_ok(monkeypatch)
    stub = _mock_fetch_page(title="T", body="b", meta="too short")
    monkeypatch.setattr(db_module, "fetch_page", stub)
    index_url("https://example.com/page")

    conn = get_db()
    try:
        page = conn.execute("SELECT summary FROM pages").fetchone()
    finally:
        return_db(conn)

    assert page["summary"] == ""
|
||||
|
||||
|
||||
def test_pool_returns_clean_connection(temp_db, monkeypatch):
    """Regression for 1bc695f — `return_db` should roll back uncommitted work
    so the next consumer doesn't see stale state.

    NOTE(review): this presumably relies on the pool handing the same
    connection back to the second `get_db()` call; with a different
    connection the dirty row would likely be invisible regardless of any
    rollback — confirm against the pool implementation.
    """
    patch_dns_ok(monkeypatch)
    monkeypatch.setattr(db_module, "fetch_page", _mock_fetch_page(title="T", body="b"))
    index_url("https://example.com/one")

    # Take a connection, make a dirty uncommitted change, return it.
    dirty_conn = get_db()
    try:
        dirty_conn.execute(
            "INSERT INTO pages (url, title, body) VALUES (?, ?, ?)",
            ("https://dirty.example.com/", "dirty", "dirty"),
        )
        # NOTE: no commit here — this is the dirty state we want rolled back.
    finally:
        # Always hand the connection back, even if the INSERT itself raises,
        # so a failure here cannot leak a pooled connection into later tests.
        return_db(dirty_conn)

    # A later consumer must not see the dirty row.
    later_conn = get_db()
    try:
        urls = {
            r["url"]
            for r in later_conn.execute("SELECT url FROM pages").fetchall()
        }
    finally:
        return_db(later_conn)

    assert "https://dirty.example.com/" not in urls
|
||||
Loading…
Add table
Add a link
Reference in a new issue