"""Tests for `_sanitize_fts_query`. The sanitizer is the boundary between user input and FTS5 MATCH syntax. Commit 1bc695f tightened it after noticing that colons and operator words could escape the quoting. These tests keep that regression dead. """ import pytest from handlers import _sanitize_fts_query def test_empty_query_returns_no_match_token(): assert _sanitize_fts_query("") == '""' assert _sanitize_fts_query(" ") == '""' def test_single_word_becomes_prefix_match(): assert _sanitize_fts_query("rust") == "rust*" def test_multi_word_quotes_all_but_last(): result = _sanitize_fts_query("rust borrow checker") assert result == '"rust" "borrow" checker*' def test_stopwords_are_dropped(): # "the" and "a" should vanish; only "cat" remains (and gets prefix star). assert _sanitize_fts_query("the a cat") == "cat*" def test_all_stopwords_returns_no_match_token(): assert _sanitize_fts_query("the and or") == '""' @pytest.mark.parametrize("bad_char", ["'", "(", ")", "+", "-", "^", "~", ":"]) def test_fts5_operators_stripped_from_tokens(bad_char): """FTS5 special chars inside user tokens must not survive — regression for 1bc695f. The sanitizer legitimately adds `"` around tokens and a trailing `*` for prefix matching; both are excluded from this check. """ payload = f"foo{bad_char}bar" out = _sanitize_fts_query(payload) assert bad_char not in out, f"{bad_char!r} leaked into {out!r}" def test_asterisk_only_appears_as_trailing_prefix(): """Input `*` should not become an in-token asterisk; the sanitizer's trailing `*` is fine.""" out = _sanitize_fts_query("foo*bar") assert out.count("*") <= 1 if "*" in out: assert out.endswith("*") def test_quote_in_input_does_not_break_out_of_quoted_token(): """A `"` in user input must not close the sanitizer's protective quoting. The sanitizer wraps each non-last token in double quotes; if a stray `"` from the user slipped through, the resulting FTS5 expression would be interpreted as broken syntax or, worse, a column filter. """ out = _sanitize_fts_query('foo"bar baz"qux') # Each pair of quotes in the output should be balanced and around a clean token. assert out.count('"') % 2 == 0 # No embedded quotes inside a quoted region. import re for match in re.findall(r'"[^"]*"', out): inner = match[1:-1] assert '"' not in inner @pytest.mark.parametrize("op", ["AND", "OR", "NOT", "NEAR", "and", "or", "not", "near"]) def test_fts5_operator_words_dropped(op): """AND/OR/NOT/NEAR would be interpreted as operators on the unquoted last token.""" out = _sanitize_fts_query(f"foo {op} bar") # the operator word itself should not appear assert op.upper() not in out.upper().split('"'), f"operator {op!r} survived in {out!r}" def test_injection_payload_produces_valid_fts5(): """End-to-end: a realistic injection payload must produce syntactically valid FTS5. We run the sanitized output through a throwaway FTS5 table; if the sanitizer leaks operator characters the MATCH either raises or interprets malicious syntax. """ import sqlite3 conn = sqlite3.connect(":memory:") conn.execute("CREATE VIRTUAL TABLE t USING fts5(body)") conn.execute("INSERT INTO t (body) VALUES ('hello world')") for payload in [ 'foo": OR bar NOT baz AND qux*()', '" OR 1=1 --', "title:secret AND public", "(((", "^^^~~~", ]: q = _sanitize_fts_query(payload) # Must not raise — if operators leaked, FTS5 would error or mis-parse. conn.execute("SELECT * FROM t WHERE t MATCH ?", (q,)).fetchall() conn.close() def test_whitespace_only_tokens_dropped(): # tokens that become empty after stripping special chars should not produce bare quotes out = _sanitize_fts_query('""" "" ""') assert out == '""' def test_colon_stripped(): """Regression for 1bc695f — colon is an FTS5 column filter and must be stripped.""" out = _sanitize_fts_query("title:secret") assert ":" not in out