added pytest test suite (174 tests)
174 tests covering URL normalization, FTS5 query sanitization, SSRF/CSRF guards, sharing-mode logic, DB schema and upsert paths, handler end-to-end flows, and gateway body-size / mesh-whitelist guards. Each recent bug-fix commit (6ffd38d, 1bc695f, 8dffd8c) has an explicit regression test in test_regressions.py. One xfail documents a minor latent bug in clean_url where port 80 is not stripped from upgraded HTTPS URLs.
This commit is contained in:
parent
55c6619ba3
commit
4d522ce62c
18 changed files with 1673 additions and 0 deletions
113
tests/test_fts_sanitizer.py
Normal file
113
tests/test_fts_sanitizer.py
Normal file
|
|
@ -0,0 +1,113 @@
|
|||
"""Tests for `_sanitize_fts_query`.
|
||||
|
||||
The sanitizer is the boundary between user input and FTS5 MATCH syntax.
|
||||
Commit 1bc695f tightened it after noticing that colons and operator words
|
||||
could escape the quoting. These tests keep that regression dead.
|
||||
"""
|
||||
import pytest
|
||||
|
||||
from handlers import _sanitize_fts_query
|
||||
|
||||
|
||||
def test_empty_query_returns_no_match_token():
    """Empty and blank queries both sanitize to the `""` token."""
    for blank_query in ("", " "):
        assert _sanitize_fts_query(blank_query) == '""'
|
||||
|
||||
|
||||
def test_single_word_becomes_prefix_match():
    """A lone word comes back unquoted with a trailing `*`."""
    sanitized = _sanitize_fts_query("rust")
    assert sanitized == "rust*"
|
||||
|
||||
|
||||
def test_multi_word_quotes_all_but_last():
    """Every token except the last is double-quoted; the last one gets a trailing `*`."""
    expected = '"rust" "borrow" checker*'
    assert _sanitize_fts_query("rust borrow checker") == expected
|
||||
|
||||
|
||||
def test_stopwords_are_dropped():
    """Stopwords are removed before the query is assembled."""
    # With "the" and "a" gone, "cat" is the only (and therefore last) token,
    # so it carries the prefix star.
    sanitized = _sanitize_fts_query("the a cat")
    assert sanitized == "cat*"
|
||||
|
||||
|
||||
def test_all_stopwords_returns_no_match_token():
    """A query in which every word is dropped sanitizes to `""`."""
    sanitized = _sanitize_fts_query("the and or")
    assert sanitized == '""'
|
||||
|
||||
|
||||
@pytest.mark.parametrize("bad_char", ["'", "(", ")", "+", "-", "^", "~", ":"])
def test_fts5_operators_stripped_from_tokens(bad_char):
    """Regression for 1bc695f: FTS5 special characters inside a user token must be removed.

    Double quotes and `*` are deliberately absent from the parameter list,
    because the sanitizer itself emits them (token quoting and the trailing
    prefix marker).
    """
    out = _sanitize_fts_query(f"foo{bad_char}bar")
    leaked = bad_char in out
    assert not leaked, f"{bad_char!r} leaked into {out!r}"
|
||||
|
||||
|
||||
def test_asterisk_only_appears_as_trailing_prefix():
    """A `*` typed by the user must not end up inside a token.

    At most one asterisk may appear in the output, and if it does it has to be
    the final character (the sanitizer's own prefix marker).
    """
    sanitized = _sanitize_fts_query("foo*bar")
    star_count = sanitized.count("*")
    assert star_count <= 1
    if star_count:
        assert sanitized.endswith("*")
|
||||
|
||||
|
||||
def test_quote_in_input_does_not_break_out_of_quoted_token():
    """A `"` in user input must not close the sanitizer's protective quoting.

    The sanitizer wraps each non-last token in double quotes; if a stray `"` from
    the user slipped through, the resulting FTS5 expression would be interpreted
    as broken syntax or, worse, a column filter.

    Two properties are checked on the output:

    * the number of double quotes is even (every opened quote is closed);
    * no quoted region contains whitespace. A user quote that leaked through
      re-pairs the quotes so that a region spans a token boundary (for example
      `"foo"bar" baz"qux*` has a balanced count but a region holding ` baz`),
      which a balance check alone cannot see.
    """
    import re

    out = _sanitize_fts_query('foo"bar baz"qux')

    # Each pair of quotes in the output should be balanced...
    assert out.count('"') % 2 == 0, f"unbalanced quotes in {out!r}"

    # ...and wrap exactly one clean token. Checking for an embedded `"` here
    # would be vacuous (the pattern cannot capture one), so look for whitespace,
    # the sign that a quoted region straddles two tokens.
    for inner in re.findall(r'"([^"]*)"', out):
        assert not re.search(r"\s", inner), (
            f"quoted region {inner!r} spans a token boundary in {out!r}"
        )
|
||||
|
||||
|
||||
@pytest.mark.parametrize("op", ["AND", "OR", "NOT", "NEAR", "and", "or", "not", "near"])
def test_fts5_operator_words_dropped(op):
    """AND/OR/NOT/NEAR would be interpreted as operators on the unquoted last token.

    The output is broken into word tokens (quotes, whitespace and the trailing
    `*` are ignored) and the operator must not be among them, in any case. The
    comparison is per token, so an operator that survives *unquoted*, such as
    `"foo" OR bar*`, is caught as well as one that survives inside quotes.
    """
    import re

    out = _sanitize_fts_query(f"foo {op} bar")
    # Splitting on `"` would only expose an operator that forms an entire
    # quoted segment; extracting word tokens also exposes bare operators.
    surviving_words = re.findall(r"\w+", out.upper())
    assert op.upper() not in surviving_words, f"operator {op!r} survived in {out!r}"
|
||||
|
||||
|
||||
def test_injection_payload_produces_valid_fts5():
    """End-to-end: realistic injection payloads must sanitize to syntactically valid FTS5.

    Each sanitized query is run as a MATCH against a throwaway in-memory FTS5
    table; the test passes as long as no query raises. The connection is
    closed in a `finally` block so that a failing payload does not leave it open.
    """
    import sqlite3

    payloads = (
        'foo": OR bar NOT baz AND qux*()',
        '" OR 1=1 --',
        "title:secret AND public",
        "(((",
        "^^^~~~",
    )

    conn = sqlite3.connect(":memory:")
    try:
        conn.execute("CREATE VIRTUAL TABLE t USING fts5(body)")
        conn.execute("INSERT INTO t (body) VALUES ('hello world')")

        for payload in payloads:
            q = _sanitize_fts_query(payload)
            # Must not raise — if operators leaked, FTS5 would error or mis-parse.
            conn.execute("SELECT * FROM t WHERE t MATCH ?", (q,)).fetchall()
    finally:
        conn.close()
|
||||
|
||||
|
||||
def test_whitespace_only_tokens_dropped():
    """Tokens made up solely of quote characters collapse to the `""` token."""
    # Tokens that become empty once special characters are stripped should not
    # be emitted as bare quotes.
    quote_only_query = '""" "" ""'
    assert _sanitize_fts_query(quote_only_query) == '""'
|
||||
|
||||
|
||||
def test_colon_stripped():
    """Regression for 1bc695f: `:` is an FTS5 column filter, so it must not survive."""
    sanitized = _sanitize_fts_query("title:secret")
    assert sanitized.find(":") == -1
|
||||
Loading…
Add table
Add a link
Reference in a new issue