added hybrid semantic search with reranking
Implements a three-stage search pipeline: 1. BM25 keyword search via FTS5 with column weights 2. Semantic search via Snowflake arctic-embed-s bi-encoder + HNSW index 3. Optional cross-encoder reranking (on by default, toggleable in settings) Top 20 results are reranked for precision, next 10 appended from RRF for coverage, giving 30 total results across 3 pages. - New embeddings.py with ONNX Runtime inference, text chunking, HNSW index management, RRF fusion, and cross-encoder reranking - Meta description extraction for authentic page snippets with centroid extractive fallback - Stopword filtering in FTS5 queries to avoid overly strict matching - /reindex page for batch embedding of existing pages - Semantic embedding of remote pages during subscription sync - ~125MB dependency footprint (onnxruntime, tokenizers, hnswlib, numpy) - Models: 34MB bi-encoder + 22MB cross-encoder (downloaded on first use)
This commit is contained in:
parent
212e9a017d
commit
5ded9f1339
6 changed files with 839 additions and 17 deletions
169
handlers.py
169
handlers.py
|
|
@ -1,4 +1,5 @@
|
|||
import json
|
||||
import re
|
||||
import secrets
|
||||
import threading
|
||||
from datetime import datetime
|
||||
|
|
@ -27,10 +28,41 @@ def _check_csrf(body):
|
|||
return secrets.compare_digest(token, expected)
|
||||
|
||||
|
||||
_STOPWORDS = frozenset({
|
||||
"a", "an", "the", "and", "or", "but", "is", "are", "was", "were",
|
||||
"in", "on", "at", "to", "for", "of", "with", "by", "from", "as",
|
||||
"into", "about", "how", "what", "which", "who", "where", "when",
|
||||
"do", "does", "did", "be", "been", "being", "have", "has", "had",
|
||||
"it", "its", "this", "that", "not", "no", "so", "if", "can", "will",
|
||||
"my", "your", "i", "me", "we", "you", "he", "she", "they",
|
||||
})
|
||||
|
||||
|
||||
def _sanitize_fts_query(query):
|
||||
"""Escape user input for safe use in FTS5 MATCH."""
|
||||
escaped = query.replace('"', '""')
|
||||
return f'"{escaped}"'
|
||||
"""Escape user input for safe use in FTS5 MATCH.
|
||||
|
||||
Splits into individual quoted tokens joined by implicit AND,
|
||||
so all words must appear but in any order. Appends * to the
|
||||
last token for prefix matching. Stopwords are dropped to avoid
|
||||
overly strict matching.
|
||||
"""
|
||||
words = query.split()
|
||||
if not words:
|
||||
return '""'
|
||||
tokens = []
|
||||
for i, w in enumerate(words):
|
||||
# Strip FTS5 special characters to prevent injection
|
||||
cleaned = re.sub(r'["\'\(\)\*\+\-\^~]', '', w).strip()
|
||||
if not cleaned:
|
||||
continue
|
||||
if cleaned.lower() in _STOPWORDS:
|
||||
continue
|
||||
if i == len(words) - 1:
|
||||
# Prefix match on the last token for partial word matching
|
||||
tokens.append(f"{cleaned}*")
|
||||
else:
|
||||
tokens.append(f'"{cleaned}"')
|
||||
return " ".join(tokens) if tokens else '""'
|
||||
|
||||
|
||||
def _get_bookmark_token():
|
||||
|
|
@ -155,20 +187,46 @@ def handle_search(query):
|
|||
result_html = ""
|
||||
trusted_html = ""
|
||||
if q:
|
||||
# BM25 keyword search with column weights: title=10, body=1, url=5, note=3
|
||||
try:
|
||||
total_results = db.execute(
|
||||
"SELECT count(*) FROM pages_fts WHERE pages_fts MATCH ?",
|
||||
(_sanitize_fts_query(q),),
|
||||
).fetchone()[0]
|
||||
rows = db.execute(
|
||||
fts_q = _sanitize_fts_query(q)
|
||||
bm25_rows = db.execute(
|
||||
"SELECT p.id, p.url, p.title, p.body, p.note "
|
||||
"FROM pages_fts f JOIN pages p ON f.rowid = p.id "
|
||||
"WHERE pages_fts MATCH ? ORDER BY rank LIMIT ? OFFSET ?",
|
||||
(_sanitize_fts_query(q), PER_PAGE, offset),
|
||||
"WHERE pages_fts MATCH ? "
|
||||
"ORDER BY bm25(pages_fts, 10.0, 1.0, 5.0, 3.0) LIMIT 100",
|
||||
(fts_q,),
|
||||
).fetchall()
|
||||
except Exception:
|
||||
bm25_rows = []
|
||||
|
||||
# Hybrid search: merge BM25 + semantic via RRF
|
||||
bm25_ids = [r["id"] for r in bm25_rows]
|
||||
chunk_snippets = {} # page_id -> best chunk text
|
||||
try:
|
||||
from embeddings import hybrid_search
|
||||
use_reranker = get_setting("use_reranker", "1") == "1"
|
||||
fused = hybrid_search(q, bm25_ids, limit=100, db=db, use_reranker=use_reranker)
|
||||
fused_ids = [pid for pid, _ in fused]
|
||||
chunk_snippets = {pid: text for pid, text in fused if text}
|
||||
except Exception:
|
||||
fused_ids = bm25_ids
|
||||
|
||||
total_results = len(fused_ids)
|
||||
page_ids = fused_ids[offset:offset + PER_PAGE]
|
||||
|
||||
if page_ids:
|
||||
# Fetch rows in fused order
|
||||
placeholders = ",".join("?" * len(page_ids))
|
||||
all_rows = db.execute(
|
||||
f"SELECT id, url, title, body, note, summary FROM pages WHERE id IN ({placeholders})",
|
||||
page_ids,
|
||||
).fetchall()
|
||||
row_map = {r["id"]: r for r in all_rows}
|
||||
rows = [row_map[pid] for pid in page_ids if pid in row_map]
|
||||
else:
|
||||
rows = []
|
||||
total_results = 0
|
||||
|
||||
if rows:
|
||||
for r in rows:
|
||||
note_html = ""
|
||||
|
|
@ -179,11 +237,13 @@ def handle_search(query):
|
|||
if tags:
|
||||
tag_links = " ".join(f'<a href="/tags/{esc(t)}" class="tag">[{esc(t)}]</a>' for t in tags)
|
||||
tags_html = f'<div class="tags">{tag_links}</div>'
|
||||
# Use page summary as snippet (meta description or centroid sentence)
|
||||
snip = r["summary"] if r["summary"] else snippet(r["body"], q)
|
||||
result_html += (
|
||||
f'<div class="result">'
|
||||
f'<a href="{esc(r["url"])}">{esc(r["title"])}</a><br>'
|
||||
f'<small>{esc(r["url"])}</small><br>'
|
||||
f'{esc(snippet(r["body"], q))}'
|
||||
f'{esc(snip)}'
|
||||
f'{note_html}{tags_html}'
|
||||
f'</div>'
|
||||
)
|
||||
|
|
@ -495,6 +555,8 @@ def handle_style_form(msg=""):
|
|||
name = get_site_name()
|
||||
sharing = get_setting("sharing_enabled", "0")
|
||||
checked = " checked" if sharing == "1" else ""
|
||||
reranker = get_setting("use_reranker", "1")
|
||||
reranker_checked = " checked" if reranker == "1" else ""
|
||||
return _respond(
|
||||
f"<h1>customize</h1>"
|
||||
f"<h2>name your search engine</h2>"
|
||||
|
|
@ -504,6 +566,10 @@ def handle_style_form(msg=""):
|
|||
f"<h2>sharing</h2>"
|
||||
f'<label><input type="checkbox" name="sharing_enabled" value="1"{checked}>'
|
||||
f" share your site list publicly at /api/sites</label><br><br>"
|
||||
f"<h2>search</h2>"
|
||||
f'<label><input type="checkbox" name="use_reranker" value="1"{reranker_checked}>'
|
||||
f" cross-encoder reranking (more accurate, on by default)</label><br>"
|
||||
f"<small>Uses a 22MB model. Adds ~50ms per search. Disable for faster results.</small><br><br>"
|
||||
f"<h2>custom html</h2>"
|
||||
f"<p>Edit the full page template. Use <code>{esc('{{content}}')}</code> "
|
||||
f"where page content should appear.</p>"
|
||||
|
|
@ -528,9 +594,11 @@ def handle_style_submit(body):
|
|||
template = body.get("template", [""])[0].replace("\r\n", "\n").replace("\r", "\n")
|
||||
name = body.get("site_name", ["tinyweb"])[0].strip()
|
||||
sharing = "1" if body.get("sharing_enabled") else "0"
|
||||
reranker = "1" if body.get("use_reranker") else "0"
|
||||
set_setting("custom_template", template if template.strip() != DEFAULT_TEMPLATE.strip() else "")
|
||||
set_setting("site_name", name or "tinyweb")
|
||||
set_setting("sharing_enabled", sharing)
|
||||
set_setting("use_reranker", reranker)
|
||||
return handle_style_form("Saved.")
|
||||
|
||||
|
||||
|
|
@ -904,6 +972,16 @@ def handle_subscription_sync(sub_id):
|
|||
"ON CONFLICT(subscription_id, url) DO UPDATE SET title=excluded.title, note=excluded.note, tags=excluded.tags",
|
||||
(sub_id, s["url"], s["title"], s.get("note", ""), tags_str),
|
||||
)
|
||||
# Embed remote page for semantic search
|
||||
try:
|
||||
from embeddings import store_remote_embeddings
|
||||
rp_id = db.execute(
|
||||
"SELECT id FROM remote_pages WHERE subscription_id = ? AND url = ?",
|
||||
(sub_id, s["url"]),
|
||||
).fetchone()["id"]
|
||||
store_remote_embeddings(rp_id, s["title"], s.get("note", ""), db)
|
||||
except Exception:
|
||||
pass
|
||||
synced += 1
|
||||
except Exception:
|
||||
pass
|
||||
|
|
@ -970,6 +1048,15 @@ def handle_subscription_syncall():
|
|||
"ON CONFLICT(subscription_id, url) DO UPDATE SET title=excluded.title, note=excluded.note, tags=excluded.tags",
|
||||
(sub["id"], s["url"], s["title"], s.get("note", ""), tags_str),
|
||||
)
|
||||
try:
|
||||
from embeddings import store_remote_embeddings
|
||||
rp_id = db.execute(
|
||||
"SELECT id FROM remote_pages WHERE subscription_id = ? AND url = ?",
|
||||
(sub["id"], s["url"]),
|
||||
).fetchone()["id"]
|
||||
store_remote_embeddings(rp_id, s["title"], s.get("note", ""), db)
|
||||
except Exception:
|
||||
pass
|
||||
except Exception:
|
||||
pass
|
||||
now = datetime.now().strftime("%Y-%m-%dT%H:%M:%S")
|
||||
|
|
@ -983,6 +1070,60 @@ def handle_subscription_syncall():
|
|||
return handle_subscriptions(f"Synced {total} subscription(s).")
|
||||
|
||||
|
||||
# --- Reindex (semantic search) ---
|
||||
|
||||
|
||||
_reindex_thread = None
|
||||
|
||||
|
||||
def handle_reindex_form():
|
||||
db = get_db()
|
||||
try:
|
||||
total_pages = db.execute("SELECT count(*) FROM pages").fetchone()[0]
|
||||
pages_with_chunks = db.execute(
|
||||
"SELECT count(DISTINCT page_id) FROM chunks WHERE page_id IS NOT NULL"
|
||||
).fetchone()[0]
|
||||
finally:
|
||||
return_db(db)
|
||||
progress = get_setting("reindex_progress", "")
|
||||
status_html = ""
|
||||
if progress:
|
||||
status_html = f'<p class="meta">Reindex in progress: {esc(progress)}</p>'
|
||||
elif _reindex_thread and _reindex_thread.is_alive():
|
||||
status_html = '<p class="meta">Reindex running...</p>'
|
||||
return _respond(
|
||||
f"<h2>semantic search index</h2>"
|
||||
f"<p>{pages_with_chunks} of {total_pages} pages have embeddings.</p>"
|
||||
f'{status_html}'
|
||||
f'<form method="post" action="/reindex">'
|
||||
f'{_csrf_field()}'
|
||||
f'<button type="submit">reindex all pages</button>'
|
||||
f'</form>'
|
||||
f'<p><a href="/">back to search</a></p>'
|
||||
)
|
||||
|
||||
|
||||
def handle_reindex_submit(body):
|
||||
global _reindex_thread
|
||||
if _reindex_thread and _reindex_thread.is_alive():
|
||||
return handle_reindex_form()
|
||||
|
||||
def _run():
|
||||
try:
|
||||
from embeddings import reindex_all
|
||||
def progress(current, total):
|
||||
set_setting("reindex_progress", f"{current}/{total}")
|
||||
reindex_all(progress_callback=progress)
|
||||
except Exception:
|
||||
pass
|
||||
finally:
|
||||
set_setting("reindex_progress", "")
|
||||
|
||||
_reindex_thread = threading.Thread(target=_run, daemon=True)
|
||||
_reindex_thread.start()
|
||||
return _redirect("/reindex")
|
||||
|
||||
|
||||
# --- Dispatcher ---
|
||||
|
||||
|
||||
|
|
@ -1027,6 +1168,8 @@ def _dispatch_inner(data):
|
|||
elif path.startswith("/tags/"):
|
||||
tag_name = unquote(path[len("/tags/"):])
|
||||
return handle_tag_browse(tag_name, query) if tag_name else _error(400)
|
||||
elif path == "/reindex":
|
||||
return handle_reindex_form()
|
||||
elif path == "/api/sites":
|
||||
return handle_api_sites(query)
|
||||
elif path == "/subscriptions":
|
||||
|
|
@ -1052,6 +1195,8 @@ def _dispatch_inner(data):
|
|||
return handle_style_form("Template reset to default.")
|
||||
elif path == "/import":
|
||||
return handle_import_submit(body)
|
||||
elif path == "/reindex":
|
||||
return handle_reindex_submit(body)
|
||||
elif path == "/subscriptions/add":
|
||||
return handle_subscription_add(body)
|
||||
elif path == "/subscriptions/pick":
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue