added hybrid semantic search with reranking

Implements a three-stage search pipeline:
1. BM25 keyword search via FTS5 with column weights
2. Semantic search via Snowflake arctic-embed-s bi-encoder + HNSW index
3. Optional cross-encoder reranking (on by default, toggleable in settings)

The top 20 results are reranked for precision and the next 10 are appended
in RRF order for coverage, giving 30 results in total across 3 pages.

- New embeddings.py with ONNX Runtime inference, text chunking, HNSW
  index management, reciprocal rank fusion (RRF), and cross-encoder reranking
- Meta description extraction for authentic page snippets with centroid
  extractive fallback
- Stopword filtering in FTS5 queries to avoid overly strict matching
- /reindex page for batch embedding of existing pages
- Semantic embedding of remote pages during subscription sync
- ~125MB dependency footprint (onnxruntime, tokenizers, hnswlib, numpy)
- Models: 34MB bi-encoder + 22MB cross-encoder (downloaded on first use)
This commit is contained in:
lichenblankie 2026-03-27 03:24:41 -07:00
parent 212e9a017d
commit 5ded9f1339
6 changed files with 839 additions and 17 deletions

View file

@ -1,4 +1,5 @@
import json
import re
import secrets
import threading
from datetime import datetime
@ -27,10 +28,41 @@ def _check_csrf(body):
return secrets.compare_digest(token, expected)
# Common English function words that are dropped from FTS5 queries. Every
# remaining token must match (implicit AND), so keeping these would make
# natural-language queries needlessly strict.
_STOPWORDS = frozenset({
    "a", "an", "the", "and", "or", "but", "is", "are", "was", "were",
    "in", "on", "at", "to", "for", "of", "with", "by", "from", "as",
    "into", "about", "how", "what", "which", "who", "where", "when",
    "do", "does", "did", "be", "been", "being", "have", "has", "had",
    "it", "its", "this", "that", "not", "no", "so", "if", "can", "will",
    "my", "your", "i", "me", "we", "you", "he", "she", "they",
})


def _sanitize_fts_query(query):
    """Build a safe FTS5 MATCH expression from raw user input.

    The query is split on whitespace. FTS5 operator characters are
    stripped from each word, stopwords are dropped, and every surviving
    word is emitted as a double-quoted FTS5 string. The strings are
    joined with spaces (implicit AND), so all words must appear but in
    any order. The last surviving token gets a trailing ``*`` so a
    partially typed word still matches.

    Every token is quoted, including the prefix token (``"tok"*``).
    An unquoted bareword containing characters such as ``.`` or ``:``,
    or spelling an operator such as ``NEAR``, would otherwise be an
    FTS5 syntax error or change the meaning of the query.

    Args:
        query: Raw search text typed by the user.

    Returns:
        An FTS5 query string, or ``'""'`` (an empty phrase) when no
        usable token remains.
    """
    kept = []
    for word in query.split():
        # Strip FTS5 special characters to prevent injection. Double
        # quotes are removed here, so the quoting below needs no escaping.
        cleaned = re.sub(r'["\'\(\)\*\+\-\^~]', '', word).strip()
        if cleaned and cleaned.lower() not in _STOPWORDS:
            kept.append(cleaned)

    if not kept:
        return '""'

    quoted = [f'"{token}"' for token in kept]
    # Prefix-match the last token that survived filtering, so the prefix
    # is not lost when the final raw word was a stopword or punctuation.
    quoted[-1] += "*"
    return " ".join(quoted)
def _get_bookmark_token():
@ -155,20 +187,46 @@ def handle_search(query):
result_html = ""
trusted_html = ""
if q:
# BM25 keyword search with column weights: title=10, body=1, url=5, note=3
try:
total_results = db.execute(
"SELECT count(*) FROM pages_fts WHERE pages_fts MATCH ?",
(_sanitize_fts_query(q),),
).fetchone()[0]
rows = db.execute(
fts_q = _sanitize_fts_query(q)
bm25_rows = db.execute(
"SELECT p.id, p.url, p.title, p.body, p.note "
"FROM pages_fts f JOIN pages p ON f.rowid = p.id "
"WHERE pages_fts MATCH ? ORDER BY rank LIMIT ? OFFSET ?",
(_sanitize_fts_query(q), PER_PAGE, offset),
"WHERE pages_fts MATCH ? "
"ORDER BY bm25(pages_fts, 10.0, 1.0, 5.0, 3.0) LIMIT 100",
(fts_q,),
).fetchall()
except Exception:
bm25_rows = []
# Hybrid search: merge BM25 + semantic via RRF
bm25_ids = [r["id"] for r in bm25_rows]
chunk_snippets = {} # page_id -> best chunk text
try:
from embeddings import hybrid_search
use_reranker = get_setting("use_reranker", "1") == "1"
fused = hybrid_search(q, bm25_ids, limit=100, db=db, use_reranker=use_reranker)
fused_ids = [pid for pid, _ in fused]
chunk_snippets = {pid: text for pid, text in fused if text}
except Exception:
fused_ids = bm25_ids
total_results = len(fused_ids)
page_ids = fused_ids[offset:offset + PER_PAGE]
if page_ids:
# Fetch rows in fused order
placeholders = ",".join("?" * len(page_ids))
all_rows = db.execute(
f"SELECT id, url, title, body, note, summary FROM pages WHERE id IN ({placeholders})",
page_ids,
).fetchall()
row_map = {r["id"]: r for r in all_rows}
rows = [row_map[pid] for pid in page_ids if pid in row_map]
else:
rows = []
total_results = 0
if rows:
for r in rows:
note_html = ""
@ -179,11 +237,13 @@ def handle_search(query):
if tags:
tag_links = " ".join(f'<a href="/tags/{esc(t)}" class="tag">[{esc(t)}]</a>' for t in tags)
tags_html = f'<div class="tags">{tag_links}</div>'
# Use page summary as snippet (meta description or centroid sentence)
snip = r["summary"] if r["summary"] else snippet(r["body"], q)
result_html += (
f'<div class="result">'
f'<a href="{esc(r["url"])}">{esc(r["title"])}</a><br>'
f'<small>{esc(r["url"])}</small><br>'
f'{esc(snippet(r["body"], q))}'
f'{esc(snip)}'
f'{note_html}{tags_html}'
f'</div>'
)
@ -495,6 +555,8 @@ def handle_style_form(msg=""):
name = get_site_name()
sharing = get_setting("sharing_enabled", "0")
checked = " checked" if sharing == "1" else ""
reranker = get_setting("use_reranker", "1")
reranker_checked = " checked" if reranker == "1" else ""
return _respond(
f"<h1>customize</h1>"
f"<h2>name your search engine</h2>"
@ -504,6 +566,10 @@ def handle_style_form(msg=""):
f"<h2>sharing</h2>"
f'<label><input type="checkbox" name="sharing_enabled" value="1"{checked}>'
f" share your site list publicly at /api/sites</label><br><br>"
f"<h2>search</h2>"
f'<label><input type="checkbox" name="use_reranker" value="1"{reranker_checked}>'
f" cross-encoder reranking (more accurate, on by default)</label><br>"
f"<small>Uses a 22MB model. Adds ~50ms per search. Disable for faster results.</small><br><br>"
f"<h2>custom html</h2>"
f"<p>Edit the full page template. Use <code>{esc('{{content}}')}</code> "
f"where page content should appear.</p>"
@ -528,9 +594,11 @@ def handle_style_submit(body):
template = body.get("template", [""])[0].replace("\r\n", "\n").replace("\r", "\n")
name = body.get("site_name", ["tinyweb"])[0].strip()
sharing = "1" if body.get("sharing_enabled") else "0"
reranker = "1" if body.get("use_reranker") else "0"
set_setting("custom_template", template if template.strip() != DEFAULT_TEMPLATE.strip() else "")
set_setting("site_name", name or "tinyweb")
set_setting("sharing_enabled", sharing)
set_setting("use_reranker", reranker)
return handle_style_form("Saved.")
@ -904,6 +972,16 @@ def handle_subscription_sync(sub_id):
"ON CONFLICT(subscription_id, url) DO UPDATE SET title=excluded.title, note=excluded.note, tags=excluded.tags",
(sub_id, s["url"], s["title"], s.get("note", ""), tags_str),
)
# Embed remote page for semantic search
try:
from embeddings import store_remote_embeddings
rp_id = db.execute(
"SELECT id FROM remote_pages WHERE subscription_id = ? AND url = ?",
(sub_id, s["url"]),
).fetchone()["id"]
store_remote_embeddings(rp_id, s["title"], s.get("note", ""), db)
except Exception:
pass
synced += 1
except Exception:
pass
@ -970,6 +1048,15 @@ def handle_subscription_syncall():
"ON CONFLICT(subscription_id, url) DO UPDATE SET title=excluded.title, note=excluded.note, tags=excluded.tags",
(sub["id"], s["url"], s["title"], s.get("note", ""), tags_str),
)
try:
from embeddings import store_remote_embeddings
rp_id = db.execute(
"SELECT id FROM remote_pages WHERE subscription_id = ? AND url = ?",
(sub["id"], s["url"]),
).fetchone()["id"]
store_remote_embeddings(rp_id, s["title"], s.get("note", ""), db)
except Exception:
pass
except Exception:
pass
now = datetime.now().strftime("%Y-%m-%dT%H:%M:%S")
@ -983,6 +1070,60 @@ def handle_subscription_syncall():
return handle_subscriptions(f"Synced {total} subscription(s).")
# --- Reindex (semantic search) ---
# Background thread running the reindex job started by handle_reindex_submit;
# None until a reindex has been started in this process.
_reindex_thread = None
def handle_reindex_form():
    """Render the /reindex page.

    Shows how many pages already have embedding chunks, a status line
    when a reindex is in progress, and a CSRF-protected form that posts
    back to /reindex to start a new run.
    """
    conn = get_db()
    try:
        page_count = conn.execute("SELECT count(*) FROM pages").fetchone()[0]
        embedded_count = conn.execute(
            "SELECT count(DISTINCT page_id) FROM chunks WHERE page_id IS NOT NULL"
        ).fetchone()[0]
    finally:
        # Always hand the connection back, even if a count query fails.
        return_db(conn)

    # A stored progress value takes precedence over the thread check.
    current_progress = get_setting("reindex_progress", "")
    if current_progress:
        status_html = f'<p class="meta">Reindex in progress: {esc(current_progress)}</p>'
    elif _reindex_thread and _reindex_thread.is_alive():
        status_html = '<p class="meta">Reindex running...</p>'
    else:
        status_html = ""

    fragments = [
        "<h2>semantic search index</h2>",
        f"<p>{embedded_count} of {page_count} pages have embeddings.</p>",
        status_html,
        '<form method="post" action="/reindex">',
        _csrf_field(),
        '<button type="submit">reindex all pages</button>',
        '</form>',
        '<p><a href="/">back to search</a></p>',
    ]
    return _respond("".join(fragments))
def handle_reindex_submit(body):
    """Start a background reindex of all pages unless one is already running.

    If a reindex thread from this process is still alive, the status
    page is re-rendered and no second job is started. Otherwise a daemon
    thread is launched that calls ``embeddings.reindex_all`` and the
    client is redirected to /reindex.

    Args:
        body: Parsed POST form data. It is not read by this handler.

    Returns:
        The rendered reindex page if a job is already running, otherwise
        a redirect to /reindex.
    """
    # NOTE(review): the form embeds a CSRF field but this handler does not
    # call _check_csrf(body) -- confirm the dispatcher validates it.
    import logging

    global _reindex_thread
    if _reindex_thread and _reindex_thread.is_alive():
        return handle_reindex_form()

    def _run():
        try:
            # Imported lazily so the embedding dependencies are only
            # loaded when a reindex is actually requested.
            from embeddings import reindex_all

            def progress(current, total):
                set_setting("reindex_progress", f"{current}/{total}")

            reindex_all(progress_callback=progress)
        except Exception:
            # Best-effort job: never let it escape the thread, but record
            # the traceback instead of silently discarding the failure.
            logging.getLogger(__name__).exception("semantic reindex failed")
        finally:
            # Clear the marker so the status page stops reporting progress.
            set_setting("reindex_progress", "")

    _reindex_thread = threading.Thread(target=_run, daemon=True)
    _reindex_thread.start()
    return _redirect("/reindex")
# --- Dispatcher ---
@ -1027,6 +1168,8 @@ def _dispatch_inner(data):
elif path.startswith("/tags/"):
tag_name = unquote(path[len("/tags/"):])
return handle_tag_browse(tag_name, query) if tag_name else _error(400)
elif path == "/reindex":
return handle_reindex_form()
elif path == "/api/sites":
return handle_api_sites(query)
elif path == "/subscriptions":
@ -1052,6 +1195,8 @@ def _dispatch_inner(data):
return handle_style_form("Template reset to default.")
elif path == "/import":
return handle_import_submit(body)
elif path == "/reindex":
return handle_reindex_submit(body)
elif path == "/subscriptions/add":
return handle_subscription_add(body)
elif path == "/subscriptions/pick":