174 tests covering URL normalization, FTS5 query sanitization, SSRF/CSRF guards, sharing-mode logic, DB schema and upsert paths, handler end-to-end flows, and gateway body-size / mesh-whitelist guards. Each recent bug-fix commit (6ffd38d, 1bc695f, 8dffd8c) has an explicit regression test in test_regressions.py. One xfail documents a minor latent bug in clean_url where port 80 is not stripped from upgraded https URLs.
113 lines
4 KiB
Python
113 lines
4 KiB
Python
"""Tests for `_sanitize_fts_query`.
|
|
|
|
The sanitizer is the boundary between user input and FTS5 MATCH syntax.
|
|
Commit 1bc695f tightened it after noticing that colons and operator words
|
|
could escape the quoting. These tests keep that regression dead.
|
|
"""
|
|
import pytest
|
|
|
|
from handlers import _sanitize_fts_query
|
|
|
|
|
|
def test_empty_query_returns_no_match_token():
    """Empty and blank input must both sanitize to the literal `""` token."""
    no_match = '""'
    for blank in ("", " "):
        assert _sanitize_fts_query(blank) == no_match
|
|
|
|
|
|
def test_single_word_becomes_prefix_match():
    """A lone word is returned unquoted with a trailing `*`."""
    sanitized = _sanitize_fts_query("rust")
    assert sanitized == "rust*"
|
|
|
|
|
|
def test_multi_word_quotes_all_but_last():
    """Every word except the last is double-quoted; the last gets a trailing `*`."""
    expected = '"rust" "borrow" checker*'
    assert _sanitize_fts_query("rust borrow checker") == expected
|
|
|
|
|
|
def test_stopwords_are_dropped():
    """Stopwords are removed, so the one remaining word is treated as the last token."""
    # "the" and "a" disappear; "cat" is all that is left and carries the prefix star.
    sanitized = _sanitize_fts_query("the a cat")
    assert sanitized == "cat*"
|
|
|
|
|
|
def test_all_stopwords_returns_no_match_token():
    """A query made only of stopwords sanitizes to the literal `""` token."""
    sanitized = _sanitize_fts_query("the and or")
    assert sanitized == '""'
|
|
|
|
|
|
@pytest.mark.parametrize("bad_char", ["'", "(", ")", "+", "-", "^", "~", ":"])
def test_fts5_operators_stripped_from_tokens(bad_char):
    """Regression for 1bc695f: FTS5 special characters embedded in a token must be removed.

    `"` and `*` are deliberately not in the parameter list, because the
    sanitizer itself emits them (quoting and the trailing prefix star).
    """
    # Embed the character between two harmless word fragments.
    out = _sanitize_fts_query(f"foo{bad_char}bar")
    leaked = bad_char in out
    assert not leaked, f"{bad_char!r} leaked into {out!r}"
|
|
|
|
|
|
def test_asterisk_only_appears_as_trailing_prefix():
    """A `*` typed by the user must not end up inside a token.

    At most one asterisk may appear in the output, and if it does it has to be
    the final character (the sanitizer's own prefix marker).
    """
    sanitized = _sanitize_fts_query("foo*bar")
    star_count = sanitized.count("*")
    assert star_count <= 1
    # Either no star at all, or the single star sits at the very end.
    assert star_count == 0 or sanitized.endswith("*")
|
|
|
|
|
|
def test_quote_in_input_does_not_break_out_of_quoted_token():
    """A `"` in user input must not close the sanitizer's protective quoting.

    The sanitizer wraps each non-last token in double quotes; if a stray `"` from
    the user slipped through, the resulting FTS5 expression would be interpreted
    as broken syntax or, worse, a column filter.
    """
    import re

    out = _sanitize_fts_query('foo"bar baz"qux')

    # Quotes in the output must come in pairs.
    assert out.count('"') % 2 == 0

    # A balanced count alone is not sufficient: an output such as
    # `"foo"bar" baz"qux*` has four quotes yet is clearly broken. The exact-match
    # tests in this file show the output is a whitespace-separated token list,
    # so require every piece to be either one fully quoted token or a bare
    # token containing no quote character at all.
    well_formed = re.compile(r'"[^"\s]*"|[^"\s]+')
    for piece in out.split():
        assert well_formed.fullmatch(piece), f"malformed token {piece!r} in {out!r}"
|
|
|
|
|
|
@pytest.mark.parametrize("op", ["AND", "OR", "NOT", "NEAR", "and", "or", "not", "near"])
def test_fts5_operator_words_dropped(op):
    """AND/OR/NOT/NEAR would be interpreted as operators on the unquoted last token.

    The operator word is placed between two ordinary words and must not appear
    as a token of the sanitized output, whether quoted or bare.
    """
    out = _sanitize_fts_query(f"foo {op} bar")

    # Compare whole tokens with the sanitizer's own `"` wrapping and trailing `*`
    # removed. Splitting on `"` instead would only notice an operator that
    # survived *quoted*; a bare survivor (the case FTS5 actually treats as an
    # operator) would hide inside a larger fragment like ` AND BAR*`.
    surviving = [piece.strip('"*').upper() for piece in out.split()]
    assert op.upper() not in surviving, f"operator {op!r} survived in {out!r}"
|
|
|
|
|
|
def test_injection_payload_produces_valid_fts5():
    """End-to-end: a realistic injection payload must produce syntactically valid FTS5.

    We run the sanitized output through a throwaway FTS5 table; if the sanitizer
    leaks operator characters the MATCH either raises or interprets malicious syntax.
    Only the "raises" outcome is detected here: the query results are fetched
    but not inspected.
    """
    import sqlite3

    payloads = [
        'foo": OR bar NOT baz AND qux*()',
        '" OR 1=1 --',
        "title:secret AND public",
        "(((",
        "^^^~~~",
    ]

    conn = sqlite3.connect(":memory:")
    try:
        conn.execute("CREATE VIRTUAL TABLE t USING fts5(body)")
        conn.execute("INSERT INTO t (body) VALUES ('hello world')")

        for payload in payloads:
            q = _sanitize_fts_query(payload)
            # Must not raise — if operators leaked, FTS5 would error or mis-parse.
            try:
                conn.execute("SELECT * FROM t WHERE t MATCH ?", (q,)).fetchall()
            except sqlite3.Error as exc:
                # Name the offending payload so a failure is actionable.
                raise AssertionError(
                    f"payload {payload!r} sanitized to {q!r} is not valid FTS5: {exc}"
                ) from exc
    finally:
        # Close the connection even when a statement above raises.
        conn.close()
|
|
|
|
|
|
def test_whitespace_only_tokens_dropped():
    """Input consisting only of quote characters sanitizes to the literal `""` token."""
    # Tokens left empty once their special characters are stripped must not
    # show up in the output as bare quote pairs.
    quotes_only = '""" "" ""'
    assert _sanitize_fts_query(quotes_only) == '""'
|
|
|
|
|
|
def test_colon_stripped():
    """Regression for 1bc695f — colon is an FTS5 column filter and must be stripped."""
    sanitized = _sanitize_fts_query("title:secret")
    assert sanitized.count(":") == 0
|