added hybrid semantic search with reranking

Implements a three-stage search pipeline:

1. BM25 keyword search via FTS5 with column weights
2. Semantic search via Snowflake arctic-embed-s bi-encoder + HNSW index
3. Optional cross-encoder reranking (on by default, toggleable in settings)

The top 20 results are reranked for precision, and the next 10 are appended from RRF for coverage, giving 30 total results across 3 pages.

- New embeddings.py with ONNX Runtime inference, text chunking, HNSW index management, RRF fusion, and cross-encoder reranking
- Meta description extraction for authentic page snippets, with centroid extractive fallback
- Stopword filtering in FTS5 queries to avoid overly strict matching
- /reindex page for batch embedding of existing pages
- Semantic embedding of remote pages during subscription sync
- ~125MB dependency footprint (onnxruntime, tokenizers, hnswlib, numpy)
- Models: 34MB bi-encoder + 22MB cross-encoder (downloaded on first use)
2026-03-27 03:24:41 -07:00 · 2026-03-27 03:24:41 -07:00 · 5ded9f1339
commit 5ded9f1339
parent 212e9a017d
6 changed files with 839 additions and 17 deletions
--- a/handlers.py
+++ b/handlers.py
@ -1,4 +1,5 @@
 import json
+import re
 import secrets
 import threading
 from datetime import datetime
@ -27,10 +28,41 @@ def _check_csrf(body):
    return secrets.compare_digest(token, expected)


+# Lower-cased words that are dropped from keyword queries before they are
+# handed to FTS5 (see _sanitize_fts_query).
+_STOPWORDS = frozenset({
+    "a", "an", "the", "and", "or", "but", "is", "are", "was", "were",
+    "in", "on", "at", "to", "for", "of", "with", "by", "from", "as",
+    "into", "about", "how", "what", "which", "who", "where", "when",
+    "do", "does", "did", "be", "been", "being", "have", "has", "had",
+    "it", "its", "this", "that", "not", "no", "so", "if", "can", "will",
+    "my", "your", "i", "me", "we", "you", "he", "she", "they",
+})
+
+
 def _sanitize_fts_query(query):
-    """Escape user input for safe use in FTS5 MATCH."""
-    escaped = query.replace('"', '""')
-    return f'"{escaped}"'
+    """Build a safe FTS5 MATCH expression from free-form user input.
+
+    The query is split on whitespace. Each word has FTS5 operator
+    characters stripped, stopwords are dropped, and every surviving word
+    is emitted as a double-quoted string. The strings are joined by
+    spaces (implicit AND), so all words must appear but in any order.
+
+    Every token is quoted, including the prefix token: an unquoted
+    bareword that still contains punctuation such as "." or ":" is
+    rejected by FTS5 as a syntax error or parsed as a column filter.
+    The final word of the input, when it survives filtering, is emitted
+    as "word"* so that a partially typed word still matches.
+
+    Args:
+        query: Raw search text typed by the user.
+
+    Returns:
+        A MATCH expression string, or '""' (the empty phrase) when no
+        usable token remains.
+    """
+    words = query.split()
+    last = len(words) - 1
+    tokens = []
+    for i, word in enumerate(words):
+        # Strip FTS5 special characters to prevent query-syntax injection.
+        cleaned = re.sub(r'["\'\(\)\*\+\-\^~]', '', word).strip()
+        if not cleaned or cleaned.lower() in _STOPWORDS:
+            continue
+        # Only the word the user may still be typing gets prefix matching.
+        # The "*" has to sit outside the quotes to act as a prefix marker.
+        suffix = "*" if i == last else ""
+        tokens.append(f'"{cleaned}"{suffix}')
+    return " ".join(tokens) if tokens else '""'


 def _get_bookmark_token():
@ -155,20 +187,46 @@ def handle_search(query):
        result_html = ""
        trusted_html = ""
        if q:
+            # BM25 keyword search with column weights: title=10, body=1, url=5, note=3
            try:
-                total_results = db.execute(
-                    "SELECT count(*) FROM pages_fts WHERE pages_fts MATCH ?",
-                    (_sanitize_fts_query(q),),
-                ).fetchone()[0]
-                rows = db.execute(
+                fts_q = _sanitize_fts_query(q)
+                bm25_rows = db.execute(
                    "SELECT p.id, p.url, p.title, p.body, p.note "
                    "FROM pages_fts f JOIN pages p ON f.rowid = p.id "
-                    "WHERE pages_fts MATCH ? ORDER BY rank LIMIT ? OFFSET ?",
-                    (_sanitize_fts_query(q), PER_PAGE, offset),
+                    "WHERE pages_fts MATCH ? "
+                    "ORDER BY bm25(pages_fts, 10.0, 1.0, 5.0, 3.0) LIMIT 100",
+                    (fts_q,),
                ).fetchall()
            except Exception:
+                bm25_rows = []
+
+            # Hybrid search: merge BM25 + semantic via RRF
+            bm25_ids = [r["id"] for r in bm25_rows]
+            chunk_snippets = {}  # page_id -> best chunk text
+            try:
+                from embeddings import hybrid_search
+                use_reranker = get_setting("use_reranker", "1") == "1"
+                fused = hybrid_search(q, bm25_ids, limit=100, db=db, use_reranker=use_reranker)
+                fused_ids = [pid for pid, _ in fused]
+                chunk_snippets = {pid: text for pid, text in fused if text}
+            except Exception:
+                fused_ids = bm25_ids
+
+            total_results = len(fused_ids)
+            page_ids = fused_ids[offset:offset + PER_PAGE]
+
+            if page_ids:
+                # Fetch rows in fused order
+                placeholders = ",".join("?" * len(page_ids))
+                all_rows = db.execute(
+                    f"SELECT id, url, title, body, note, summary FROM pages WHERE id IN ({placeholders})",
+                    page_ids,
+                ).fetchall()
+                row_map = {r["id"]: r for r in all_rows}
+                rows = [row_map[pid] for pid in page_ids if pid in row_map]
+            else:
                rows = []
-                total_results = 0
+
            if rows:
                for r in rows:
                    note_html = ""
@ -179,11 +237,13 @@ def handle_search(query):
                    if tags:
                        tag_links = " ".join(f'<a href="/tags/{esc(t)}" class="tag">[{esc(t)}]</a>' for t in tags)
                        tags_html = f'<div class="tags">{tag_links}</div>'
+                    # Use page summary as snippet (meta description or centroid sentence)
+                    snip = r["summary"] if r["summary"] else snippet(r["body"], q)
                    result_html += (
                        f'<div class="result">'
                        f'<a href="{esc(r["url"])}">{esc(r["title"])}</a><br>'
                        f'<small>{esc(r["url"])}</small><br>'
-                        f'{esc(snippet(r["body"], q))}'
+                        f'{esc(snip)}'
                        f'{note_html}{tags_html}'
                        f'</div>'
                    )
@ -495,6 +555,8 @@ def handle_style_form(msg=""):
    name = get_site_name()
    sharing = get_setting("sharing_enabled", "0")
    checked = " checked" if sharing == "1" else ""
+    reranker = get_setting("use_reranker", "1")
+    reranker_checked = " checked" if reranker == "1" else ""
    return _respond(
        f"<h1>customize</h1>"
        f"<h2>name your search engine</h2>"
@ -504,6 +566,10 @@ def handle_style_form(msg=""):
        f"<h2>sharing</h2>"
        f'<label><input type="checkbox" name="sharing_enabled" value="1"{checked}>'
        f" share your site list publicly at /api/sites</label><br><br>"
+        f"<h2>search</h2>"
+        f'<label><input type="checkbox" name="use_reranker" value="1"{reranker_checked}>'
+        f" cross-encoder reranking (more accurate, on by default)</label><br>"
+        f"<small>Uses a 22MB model. Adds ~50ms per search. Disable for faster results.</small><br><br>"
        f"<h2>custom html</h2>"
        f"<p>Edit the full page template. Use <code>{esc('{{content}}')}</code> "
        f"where page content should appear.</p>"
@ -528,9 +594,11 @@ def handle_style_submit(body):
    template = body.get("template", [""])[0].replace("\r\n", "\n").replace("\r", "\n")
    name = body.get("site_name", ["tinyweb"])[0].strip()
    sharing = "1" if body.get("sharing_enabled") else "0"
+    reranker = "1" if body.get("use_reranker") else "0"
    set_setting("custom_template", template if template.strip() != DEFAULT_TEMPLATE.strip() else "")
    set_setting("site_name", name or "tinyweb")
    set_setting("sharing_enabled", sharing)
+    set_setting("use_reranker", reranker)
    return handle_style_form("Saved.")


@ -904,6 +972,16 @@ def handle_subscription_sync(sub_id):
                    "ON CONFLICT(subscription_id, url) DO UPDATE SET title=excluded.title, note=excluded.note, tags=excluded.tags",
                    (sub_id, s["url"], s["title"], s.get("note", ""), tags_str),
                )
+                # Embed remote page for semantic search
+                try:
+                    from embeddings import store_remote_embeddings
+                    rp_id = db.execute(
+                        "SELECT id FROM remote_pages WHERE subscription_id = ? AND url = ?",
+                        (sub_id, s["url"]),
+                    ).fetchone()["id"]
+                    store_remote_embeddings(rp_id, s["title"], s.get("note", ""), db)
+                except Exception:
+                    pass
                synced += 1
            except Exception:
                pass
@ -970,6 +1048,15 @@ def handle_subscription_syncall():
                            "ON CONFLICT(subscription_id, url) DO UPDATE SET title=excluded.title, note=excluded.note, tags=excluded.tags",
                            (sub["id"], s["url"], s["title"], s.get("note", ""), tags_str),
                        )
+                        try:
+                            from embeddings import store_remote_embeddings
+                            rp_id = db.execute(
+                                "SELECT id FROM remote_pages WHERE subscription_id = ? AND url = ?",
+                                (sub["id"], s["url"]),
+                            ).fetchone()["id"]
+                            store_remote_embeddings(rp_id, s["title"], s.get("note", ""), db)
+                        except Exception:
+                            pass
                    except Exception:
                        pass
                now = datetime.now().strftime("%Y-%m-%dT%H:%M:%S")
@ -983,6 +1070,60 @@ def handle_subscription_syncall():
    return handle_subscriptions(f"Synced {total} subscription(s).")


+# --- Reindex (semantic search) ---
+
+
+# Background reindex worker thread; None until handle_reindex_submit starts one.
+_reindex_thread = None
+
+
+def handle_reindex_form():
+    """Render the /reindex page.
+
+    Shows how many pages have at least one chunk row, an optional status
+    line for a running reindex, and a CSRF-protected form that posts to
+    /reindex.
+    """
+    conn = get_db()
+    try:
+        page_count = conn.execute("SELECT count(*) FROM pages").fetchone()[0]
+        embedded_count = conn.execute(
+            "SELECT count(DISTINCT page_id) FROM chunks WHERE page_id IS NOT NULL"
+        ).fetchone()[0]
+    finally:
+        return_db(conn)
+
+    # Prefer the stored progress string; fall back to thread liveness when
+    # no progress value is recorded.
+    progress = get_setting("reindex_progress", "")
+    if progress:
+        status_html = f'<p class="meta">Reindex in progress: {esc(progress)}</p>'
+    elif _reindex_thread and _reindex_thread.is_alive():
+        status_html = '<p class="meta">Reindex running...</p>'
+    else:
+        status_html = ""
+
+    parts = [
+        "<h2>semantic search index</h2>",
+        f"<p>{embedded_count} of {page_count} pages have embeddings.</p>",
+        f"{status_html}",
+        '<form method="post" action="/reindex">',
+        f"{_csrf_field()}",
+        '<button type="submit">reindex all pages</button>',
+        "</form>",
+        '<p><a href="/">back to search</a></p>',
+    ]
+    return _respond("".join(parts))
+
+
+def handle_reindex_submit(body):
+    """Start a background reindex of all pages unless one is already running.
+
+    Args:
+        body: Parsed POST form data. It is not read here.
+            NOTE(review): no CSRF check is visible in this handler; confirm
+            that the token is validated before POSTs are routed here.
+
+    Returns:
+        The reindex status page when a worker thread is already alive,
+        otherwise a redirect to /reindex once the worker has been started.
+    """
+    global _reindex_thread
+    if _reindex_thread and _reindex_thread.is_alive():
+        return handle_reindex_form()
+
+    def _run():
+        import logging
+
+        try:
+            from embeddings import reindex_all
+
+            def progress(current, total):
+                set_setting("reindex_progress", f"{current}/{total}")
+
+            reindex_all(progress_callback=progress)
+        except Exception:
+            # Best effort: the worker must never raise, but a failed reindex
+            # should leave a traceback instead of vanishing silently.
+            logging.getLogger(__name__).exception("reindex failed")
+        finally:
+            # An empty progress value is what the status page reads as
+            # "no progress to report".
+            set_setting("reindex_progress", "")
+
+    _reindex_thread = threading.Thread(target=_run, daemon=True)
+    _reindex_thread.start()
+    return _redirect("/reindex")
+
+
 # --- Dispatcher ---


@ -1027,6 +1168,8 @@ def _dispatch_inner(data):
        elif path.startswith("/tags/"):
            tag_name = unquote(path[len("/tags/"):])
            return handle_tag_browse(tag_name, query) if tag_name else _error(400)
+        elif path == "/reindex":
+            return handle_reindex_form()
        elif path == "/api/sites":
            return handle_api_sites(query)
        elif path == "/subscriptions":
@ -1052,6 +1195,8 @@ def _dispatch_inner(data):
            return handle_style_form("Template reset to default.")
        elif path == "/import":
            return handle_import_submit(body)
+        elif path == "/reindex":
+            return handle_reindex_submit(body)
        elif path == "/subscriptions/add":
            return handle_subscription_add(body)
        elif path == "/subscriptions/pick":