Add hybrid semantic search with optional cross-encoder reranking

Implements a three-stage search pipeline:

1. BM25 keyword search via FTS5 with column weights
2. Semantic search via Snowflake arctic-embed-s bi-encoder + HNSW index
3. Optional cross-encoder reranking (on by default, toggleable in settings)

The top 20 results are reranked for precision, and the next 10 are appended from RRF for coverage, giving 30 total results across 3 pages.

- New embeddings.py with ONNX Runtime inference, text chunking, HNSW index management, RRF fusion, and cross-encoder reranking
- Meta description extraction for authentic page snippets, with a centroid extractive fallback
- Stopword filtering in FTS5 queries to avoid overly strict matching
- /reindex page for batch embedding of existing pages
- Semantic embedding of remote pages during subscription sync
- ~125MB dependency footprint (onnxruntime, tokenizers, hnswlib, numpy)
- Models: 34MB bi-encoder + 22MB cross-encoder (downloaded on first use)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-27 03:24:41 -07:00 · 2026-03-27 03:24:41 -07:00 · 395fc17092
commit 395fc17092
parent 2df92752b6
6 changed files with 839 additions and 17 deletions
--- a/db.py
+++ b/db.py
@@ -226,6 +226,27 @@ def init_db():
        db.execute("UPDATE pages SET last_modified = strftime('%Y-%m-%dT%H:%M:%S','now') WHERE last_modified = ''")
        db.commit()

+    # Migrate pages: add summary column if missing
+    if "summary" not in page_cols:
+        db.execute("ALTER TABLE pages ADD COLUMN summary TEXT DEFAULT ''")
+        db.commit()
+
+    # Chunks table for semantic search embeddings
+    db.execute(
+        "CREATE TABLE IF NOT EXISTS chunks ("
+        "  id INTEGER PRIMARY KEY AUTOINCREMENT,"
+        "  page_id INTEGER,"
+        "  remote_page_id INTEGER,"
+        "  chunk_index INTEGER NOT NULL,"
+        "  chunk_text TEXT NOT NULL,"
+        "  embedding BLOB NOT NULL,"
+        "  FOREIGN KEY (page_id) REFERENCES pages(id) ON DELETE CASCADE,"
+        "  FOREIGN KEY (remote_page_id) REFERENCES remote_pages(id) ON DELETE CASCADE"
+        ")"
+    )
+    db.execute("CREATE INDEX IF NOT EXISTS idx_chunks_page ON chunks(page_id)")
+    db.execute("CREATE INDEX IF NOT EXISTS idx_chunks_remote ON chunks(remote_page_id)")
+
    db.execute("PRAGMA journal_mode=WAL")
    db.commit()
    db.close()
@@ -296,24 +317,96 @@ def fetch_page(url):
        label = a.get_text(strip=True) or href
        links.append((href, label[:200]))

+    # Extract meta description before stripping tags
+    meta_desc = ""
+    meta_tag = soup.find("meta", attrs={"name": "description"})
+    if meta_tag and meta_tag.get("content"):
+        meta_desc = meta_tag["content"].strip()
+    if not meta_desc:
+        # Try og:description as fallback
+        og_tag = soup.find("meta", attrs={"property": "og:description"})
+        if og_tag and og_tag.get("content"):
+            meta_desc = og_tag["content"].strip()
+
    for tag in soup(["script", "style", "nav", "footer", "header"]):
        tag.decompose()
    title = soup.title.string.strip() if soup.title and soup.title.string else url
    body = soup.get_text(separator=" ", strip=True)
-    return title, body, links
+    return title, body, links, meta_desc
+
+
+def _generate_summary(title, body):
+    """Generate a summary from body text using centroid extractive method.
+
+    Filters out UI debris, embeds up to 50 of the remaining sentences, and returns the one closest
+    to their centroid (capped at 200 chars). `title` is unused; on any failure the first sentence is used.
+    """
+    import re
+    # Split after '.', '!' or '?' when followed by whitespace (the punctuation stays with its sentence)
+    raw = re.split(r'(?<=[.!?])\s+', body)
+    sentences = []
+    noise_patterns = re.compile(
+        r'arrow-|fedilink|message-square|link-external|'
+        r'skip to|cookie|subscribe|sign up|log in|'
+        r'privacy policy|terms of|©|\bads?\b',
+        re.IGNORECASE
+    )
+    for s in raw:
+        s = s.strip()
+        if len(s) < 40:  # too short to be a useful summary sentence
+            continue
+        words = s.split()
+        if len(words) < 7:  # long enough in characters but too few words
+            continue
+        # Skip if letters and spaces make up less than 60% of the characters (icons, arrows, encoded chars)
+        alpha_chars = sum(1 for c in s if c.isalpha() or c == ' ')
+        if alpha_chars < len(s) * 0.6:
+            continue
+        # Skip nav/menu patterns: more than two '|' or '·' separators, or any '►'
+        if s.count('|') > 2 or s.count('·') > 2 or s.count('►') > 0:
+            continue
+        # Skip UI debris matched (case-insensitively) by noise_patterns
+        if noise_patterns.search(s):
+            continue
+        sentences.append(s)
+
+    if not sentences:
+        # Last resort: collapse whitespace and return at most the first 160 chars of body (plus "...")
+        clean = re.sub(r'\s+', ' ', body).strip()
+        return clean[:160] + "..." if len(clean) > 160 else clean
+    if len(sentences) == 1:  # nothing to rank: return the only candidate, capped at 200 chars
+        s = sentences[0]
+        return s[:200] if len(s) > 200 else s
+    try:
+        from embeddings import embed  # project-local; imported lazily, so an import failure hits the except below
+        import numpy as np
+        embs = embed(sentences[:50])  # cap to avoid embedding too many; assumes a 2-D array, one row per sentence — TODO confirm
+        centroid = embs.mean(axis=0, keepdims=True)
+        centroid = centroid / max(np.linalg.norm(centroid), 1e-12)  # unit-normalize; the floor avoids division by zero
+        scores = (embs @ centroid.T).flatten()  # dot product with the unit centroid; cosine similarity only if rows are unit-norm — TODO confirm
+        best_idx = int(np.argmax(scores))
+        result = sentences[best_idx]
+        # Append the next kept sentence if the pair fits in 200 chars (it may not be adjacent in the page: filtered ones are gone)
+        if best_idx + 1 < len(sentences) and len(result) + len(sentences[best_idx + 1]) + 1 <= 200:
+            result += " " + sentences[best_idx + 1]
+        return result[:200] if len(result) > 200 else result  # hard cap at 200 chars
+    except Exception:  # any import or embedding failure: fall back to the first kept sentence
+        return sentences[0][:200]


 def index_url(url, note=""):
    url = clean_url(url)
-    title, body, links = fetch_page(url)
+    title, body, links, meta_desc = fetch_page(url)
+    # Use meta description if available, otherwise generate from body
+    summary = meta_desc if meta_desc else _generate_summary(title, body)
    db = get_db()
    try:
        now = __import__("datetime").datetime.now().strftime("%Y-%m-%dT%H:%M:%S")
        db.execute(
-            "INSERT INTO pages (url, title, body, note, last_modified) VALUES (?, ?, ?, ?, ?) "
+            "INSERT INTO pages (url, title, body, note, last_modified, summary) VALUES (?, ?, ?, ?, ?, ?) "
            "ON CONFLICT(url) DO UPDATE SET title=excluded.title, body=excluded.body, "
-            "note=excluded.note, last_modified=excluded.last_modified",
-            (url, title, body, note, now),
+            "note=excluded.note, last_modified=excluded.last_modified, summary=excluded.summary",
+            (url, title, body, note, now, summary),
        )
        page_id = db.execute("SELECT id FROM pages WHERE url = ?", (url,)).fetchone()[0]
        db.execute("DELETE FROM links WHERE page_id = ?", (page_id,))
@@ -323,6 +416,11 @@ def index_url(url, note=""):
                (page_id, href, label),
            )
        db.commit()
+        try:
+            from embeddings import store_embeddings
+            store_embeddings(page_id, title, body, db)
+        except Exception:
+            pass  # embedding generation is best-effort
    finally:
        return_db(db)
    return title