Add pytest test suite (174 tests)

174 tests covering URL normalization, FTS5 query sanitization, SSRF/CSRF
guards, sharing-mode logic, DB schema and upsert paths, handler
end-to-end flows, and gateway body-size / mesh-whitelist guards. Each
recent bug-fix commit (6ffd38d, 1bc695f, 8dffd8c) has an explicit
regression test in test_regressions.py. One xfail documents a minor
latent bug in clean_url where port 80 is not stripped from upgraded
https URLs.
This commit is contained in:
lichenblankie 2026-04-24 15:03:29 -07:00
parent 55c6619ba3
commit 4d522ce62c
18 changed files with 1673 additions and 0 deletions

113
tests/test_fts_sanitizer.py Normal file
View file

@ -0,0 +1,113 @@
"""Tests for `_sanitize_fts_query`.
The sanitizer is the boundary between user input and FTS5 MATCH syntax.
Commit 1bc695f tightened it after noticing that colons and operator words
could escape the quoting. These tests keep that regression dead.
"""
import pytest
from handlers import _sanitize_fts_query
def test_empty_query_returns_no_match_token():
    """Empty and single-space input must both sanitize to the `""` token."""
    for blank_query in ("", " "):
        assert _sanitize_fts_query(blank_query) == '""'
def test_single_word_becomes_prefix_match():
    """A lone word comes back unquoted with a trailing `*`."""
    sanitized = _sanitize_fts_query("rust")
    assert sanitized == "rust*"
def test_multi_word_quotes_all_but_last():
    """Every word but the final one is double-quoted; the final one gets `*`."""
    expected = '"rust" "borrow" checker*'
    assert _sanitize_fts_query("rust borrow checker") == expected
def test_stopwords_are_dropped():
    """Stopwords are removed from the query before the output is built."""
    # With "the" and "a" discarded, "cat" is the only token left, so it is
    # emitted unquoted with the prefix star.
    sanitized = _sanitize_fts_query("the a cat")
    assert sanitized == "cat*"
def test_all_stopwords_returns_no_match_token():
    """A query made up solely of stopwords sanitizes to the `""` token."""
    only_stopwords = "the and or"
    sanitized = _sanitize_fts_query(only_stopwords)
    assert sanitized == '""'
@pytest.mark.parametrize(
    "bad_char",
    ["'", "(", ")", "+", "-", "^", "~", ":"],
)
def test_fts5_operators_stripped_from_tokens(bad_char):
    """Regression for 1bc695f: FTS5 special characters must not survive.

    Each parametrized character is embedded between two words and the
    sanitized result is checked for any occurrence of it.

    `"` and `*` are deliberately absent from the parameter list: the
    sanitizer itself emits `"` around tokens and a trailing `*` for prefix
    matching, so their presence in the output is legitimate.
    """
    sanitized = _sanitize_fts_query(f"foo{bad_char}bar")
    leaked = bad_char in sanitized
    assert not leaked, f"{bad_char!r} leaked into {sanitized!r}"
def test_asterisk_only_appears_as_trailing_prefix():
    """A user-supplied `*` must never end up inside a token.

    The only asterisk allowed in the output is the single trailing one the
    sanitizer appends for prefix matching.
    """
    sanitized = _sanitize_fts_query("foo*bar")
    star_count = sanitized.count("*")
    assert star_count <= 1
    if star_count:
        # The one permitted star has to be the very last character.
        assert sanitized.endswith("*")
def test_quote_in_input_does_not_break_out_of_quoted_token():
    """A `"` in user input must not close the sanitizer's protective quoting.

    The sanitizer wraps each non-last token in double quotes; if a stray `"`
    from the user slipped through, the resulting FTS5 expression would be
    interpreted as broken syntax or, worse, a column filter.

    Two properties are checked on the sanitized output:

    * the total number of `"` characters is even, and
    * every whitespace-separated piece is either a fully quoted token with no
      interior quote or a bare token with no quote at all.

    The second check replaces an earlier loop over
    ``re.findall(r'"[^"]*"', out)``, which could never fail because that
    pattern cannot match text containing an interior quote.
    """
    import re

    out = _sanitize_fts_query('foo"bar baz"qux')

    # Quotes must come in pairs.
    assert out.count('"') % 2 == 0

    # Assumes the sanitizer emits one token per whitespace-separated piece
    # (as in '"rust" "borrow" checker*'). A piece such as `"foo"bar"` or
    # `baz"qux*` means a user quote leaked through.
    well_formed_piece = re.compile(r'"[^"]*"|[^"]+')
    for piece in out.split():
        assert well_formed_piece.fullmatch(piece), (
            f"stray quote in {piece!r} (full output {out!r})"
        )
@pytest.mark.parametrize("op", ["AND", "OR", "NOT", "NEAR", "and", "or", "not", "near"])
def test_fts5_operator_words_dropped(op):
    """AND/OR/NOT/NEAR must not survive sanitization in any form.

    An operator word that reaches the output unquoted would be interpreted by
    FTS5 as an operator. The check therefore looks at every word in the
    output, quoted or not, case-insensitively.

    Splitting the output on `"` (the previous approach) only noticed an
    operator that survived as an entire quoted token; an unquoted survivor
    such as `"foo" OR bar*` went undetected.
    """
    import re

    out = _sanitize_fts_query(f"foo {op} bar")
    # Extract bare words so quoting and the trailing `*` do not hide a match.
    surviving_words = re.findall(r"\w+", out.upper())
    assert op.upper() not in surviving_words, f"operator {op!r} survived in {out!r}"
def test_injection_payload_produces_valid_fts5():
    """End-to-end: realistic injection payloads must yield executable FTS5.

    Each payload is sanitized and then used as a bound MATCH parameter
    against a throwaway in-memory FTS5 table. The test passes as long as no
    query raises; it does not inspect the returned rows.

    The connection is wrapped in `contextlib.closing` so it is released even
    when a payload makes the MATCH raise and the test fails part-way through.
    """
    import sqlite3
    from contextlib import closing

    payloads = [
        'foo": OR bar NOT baz AND qux*()',
        '" OR 1=1 --',
        "title:secret AND public",
        "(((",
        "^^^~~~",
    ]

    with closing(sqlite3.connect(":memory:")) as conn:
        conn.execute("CREATE VIRTUAL TABLE t USING fts5(body)")
        conn.execute("INSERT INTO t (body) VALUES ('hello world')")
        for payload in payloads:
            q = _sanitize_fts_query(payload)
            # Must not raise — leaked operator characters would make FTS5
            # reject the expression as a syntax error.
            conn.execute("SELECT * FROM t WHERE t MATCH ?", (q,)).fetchall()
def test_whitespace_only_tokens_dropped():
    """Tokens made only of quote characters must not yield extra bare quotes."""
    # Every token here consists solely of `"` characters; the expected
    # result is the single `""` token rather than a series of empty quotes.
    quotes_only = '""" "" ""'
    assert _sanitize_fts_query(quotes_only) == '""'
def test_colon_stripped():
    """Regression for 1bc695f — colon is an FTS5 column filter and must be stripped."""
    sanitized = _sanitize_fts_query("title:secret")
    # str.find returns -1 exactly when the character is absent.
    assert sanitized.find(":") == -1