Make semantic search and reranking optional, use site meta descriptions for snippets

- Add semantic_search setting to toggle AI-powered search on/off
- Skip embedding generation, hybrid search, and model preloading when disabled
- Use site owner's meta description as snippet instead of heuristic extraction
- Remove _generate_summary() and snippet() - no more generated snippets
- Show reranker/reindex controls grayed out when semantic search is off
- AI dependencies (onnxruntime, hnswlib, etc.) are now fully optional

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-28 20:58:04 -07:00 · 2026-03-28 20:58:04 -07:00 · c959ee98ae
commit c959ee98ae
parent c9a8cba9d1
5 changed files with 70 additions and 118 deletions
--- a/app.py
+++ b/app.py
@ -74,6 +74,9 @@ def ensure_rns_config(config_dir):

 def _preload_embeddings():
    """Pre-load the embedding model and build the HNSW index in background."""
+    if get_setting("semantic_search", "1") != "1":
+        print("Semantic search disabled.")
+        return
    try:
        from embeddings import _get_session, _get_reranker, build_index
        _get_session()  # downloads model on first run, loads ONNX session
--- a/db.py
+++ b/db.py
@ -334,80 +334,16 @@ def fetch_page(url):
        tag.decompose()
    title = soup.title.string.strip() if soup.title and soup.title.string else url

-    # Extract paragraph text for better summary generation
-    paragraphs = []
-    for p in soup.find_all("p"):
-        text = p.get_text(strip=True)
-        if len(text) >= 40:
-            paragraphs.append(text)
-
    body = soup.get_text(separator=" ", strip=True)
-    return title, body, links, meta_desc, paragraphs
+    return title, body, links, meta_desc


-def _generate_summary(title, body, paragraphs=None):
-    """Generate a summary by extracting the best sentence from the page.
-
-    Priority: sentence mentioning the site name > first paragraph sentence
-    > first body sentence > title.
-    """
-    import re
-    noise_patterns = re.compile(
-        r'arrow-|fedilink|message-square|link-external|'
-        r'skip to|cookie|subscribe|sign up|log in|'
-        r'privacy policy|terms of|©|\bads?\b',
-        re.IGNORECASE
-    )
-
-    def _filter_sentences(raw):
-        result = []
-        for s in raw:
-            s = s.strip()
-            if len(s) < 40 or len(s.split()) < 7:
-                continue
-            alpha_chars = sum(1 for c in s if c.isalpha() or c == ' ')
-            if alpha_chars < len(s) * 0.6:
-                continue
-            if s.count('|') > 2 or s.count('·') > 2 or s.count('►') > 0:
-                continue
-            if noise_patterns.search(s):
-                continue
-            result.append(s)
-        return result
-
-    # Prefer sentences from <p> tags (actual content, not UI)
-    sentences = []
-    if paragraphs:
-        raw = []
-        for p in paragraphs:
-            raw.extend(re.split(r'(?<=[.!?])\s+', p))
-        sentences = _filter_sentences(raw)
-
-    # Fall back to full body text
-    if not sentences:
-        raw = re.split(r'(?<=[.!?])\s+', body)
-        sentences = _filter_sentences(raw)
-
-    if not sentences:
-        return title[:200] if title else ""
-
-    # Prefer a sentence that mentions the site name
-    if title:
-        title_words = [w.lower() for w in re.split(r'\W+', title) if len(w) >= 3]
-        for s in sentences:
-            s_lower = s.lower()
-            if sum(1 for w in title_words if w in s_lower) >= max(1, len(title_words) // 2):
-                return s[:200]
-
-    # Otherwise use the first quality sentence
-    return sentences[0][:200]
-

 def index_url(url, note=""):
    url = clean_url(url)
-    title, body, links, meta_desc, paragraphs = fetch_page(url)
+    title, body, links, meta_desc = fetch_page(url)
    # Use meta description if available and meaningful, otherwise generate from body
-    summary = meta_desc if meta_desc and len(meta_desc) > 20 else _generate_summary(title, body, paragraphs)
+    summary = meta_desc if meta_desc and len(meta_desc) > 20 else ""
    db = get_db()
    try:
        now = __import__("datetime").datetime.now().strftime("%Y-%m-%dT%H:%M:%S")
@ -425,11 +361,12 @@ def index_url(url, note=""):
                (page_id, href, label),
            )
        db.commit()
-        try:
-            from embeddings import store_embeddings
-            store_embeddings(page_id, title, body, db)
-        except Exception:
-            pass  # embedding generation is best-effort
+        if get_setting("semantic_search", "1") == "1":
+            try:
+                from embeddings import store_embeddings
+                store_embeddings(page_id, title, body, db)
+            except Exception:
+                pass  # embedding generation is best-effort
    finally:
        return_db(db)
    return title
--- a/embeddings.py
+++ b/embeddings.py
@ -507,7 +507,7 @@ def hybrid_search(query_text, bm25_ranked_ids, limit=10, db=None, use_reranker=F

 def reindex_all(db=None, progress_callback=None):
    """Re-embed all pages and regenerate all summaries. Rebuilds HNSW index."""
-    from db import get_db, return_db, _generate_summary
+    from db import get_db, return_db
    own_db = db is None
    if own_db:
        db = get_db()
@ -523,11 +523,6 @@ def reindex_all(db=None, progress_callback=None):
        total = len(rows)
        for i, row in enumerate(rows):
            store_embeddings(row["id"], row["title"], row["body"], db)
-            # Only regenerate summary if missing
-            if not row["summary"]:
-                summary = _generate_summary(row["title"], row["body"])
-                db.execute("UPDATE pages SET summary = ? WHERE id = ?", (summary, row["id"]))
-                db.commit()
            if progress_callback:
                progress_callback(i + 1, total)

--- a/handlers.py
+++ b/handlers.py
@ -6,7 +6,7 @@ from datetime import datetime
 from urllib.parse import unquote

 from db import get_db, return_db, get_setting, set_setting, get_site_name, index_url, clean_url
-from templates import esc, snippet, wrap_page, DEFAULT_TEMPLATE
+from templates import esc, wrap_page, DEFAULT_TEMPLATE
 from rns_client import fetch_remote_sites

 _request_local = threading.local()
@ -205,13 +205,16 @@ def handle_search(query):
            # Hybrid search: merge BM25 + semantic via RRF
            bm25_ids = [r["id"] for r in bm25_rows]
            chunk_snippets = {}  # page_id -> best chunk text
-            try:
-                from embeddings import hybrid_search
-                use_reranker = get_setting("use_reranker", "1") == "1"
-                fused = hybrid_search(q, bm25_ids, limit=100, db=db, use_reranker=use_reranker)
-                fused_ids = [pid for pid, _ in fused]
-                chunk_snippets = {pid: text for pid, text in fused if text}
-            except Exception:
+            if get_setting("semantic_search", "1") == "1":
+                try:
+                    from embeddings import hybrid_search
+                    use_reranker = get_setting("use_reranker", "1") == "1"
+                    fused = hybrid_search(q, bm25_ids, limit=100, db=db, use_reranker=use_reranker)
+                    fused_ids = [pid for pid, _ in fused]
+                    chunk_snippets = {pid: text for pid, text in fused if text}
+                except Exception:
+                    fused_ids = bm25_ids
+            else:
                fused_ids = bm25_ids

            total_results = len(fused_ids)
@ -239,13 +242,12 @@ def handle_search(query):
                    if tags:
                        tag_links = " ".join(f'<a href="/tags/{esc(t)}" class="tag">[{esc(t)}]</a>' for t in tags)
                        tags_html = f'<div class="tags">{tag_links}</div>'
-                    # Use page summary as snippet (meta description or centroid sentence)
-                    snip = r["summary"] if r["summary"] else snippet(r["body"], q)
+                    snip_html = f'<br>{esc(r["summary"])}' if r["summary"] else ""
                    result_html += (
                        f'<div class="result">'
                        f'<a href="{esc(r["url"])}">{esc(r["title"])}</a><br>'
-                        f'<small>{esc(r["url"])}</small><br>'
-                        f'{esc(snip)}'
+                        f'<small>{esc(r["url"])}</small>'
+                        f'{snip_html}'
                        f'{note_html}{tags_html}'
                        f'</div>'
                    )
@ -557,8 +559,12 @@ def handle_style_form(msg=""):
    name = get_site_name()
    sharing = get_setting("sharing_enabled", "0")
    checked = " checked" if sharing == "1" else ""
+    semantic = get_setting("semantic_search", "1")
+    semantic_checked = " checked" if semantic == "1" else ""
    reranker = get_setting("use_reranker", "1")
    reranker_checked = " checked" if reranker == "1" else ""
+    disabled = "" if semantic == "1" else " disabled"
+    dimmed = ' style="opacity:0.4"' if semantic != "1" else ""
    return _respond(
        f"<h1>customize</h1>"
        f"<h2>name your search engine</h2>"
@ -569,9 +575,18 @@ def handle_style_form(msg=""):
        f'<label><input type="checkbox" name="sharing_enabled" value="1"{checked}>'
        f" share your site list publicly at /api/sites</label><br><br>"
        f"<h2>search</h2>"
-        f'<label><input type="checkbox" name="use_reranker" value="1"{reranker_checked}>'
-        f" cross-encoder reranking (more accurate, on by default)</label><br>"
+        f"<h3>ai</h3>"
+        f'<label><input type="checkbox" name="semantic_search" value="1"{semantic_checked} '
+        f'onchange="var d=!this.checked;document.getElementById(\'reranker\').disabled=d;'
+        f'document.getElementById(\'ai-extras\').style.opacity=d?\'0.4\':\'1\'">'
+        f" semantic search (similarity matching)</label><br>"
+        f"<small>Requires onnxruntime, tokenizers, hnswlib. Downloads ~30MB of models on first use.</small><br><br>"
+        f'<div id="ai-extras"{dimmed}>'
+        f'<label><input type="checkbox" id="reranker" name="use_reranker" value="1"{reranker_checked}{disabled}>'
+        f" cross-encoder reranking (more accurate)</label><br>"
        f"<small>Uses a 22MB model. Adds ~50ms per search. Disable for faster results.</small><br><br>"
+        f'<a href="/reindex">manage semantic index</a><br><br>'
+        f"</div>"
        f"<h2>custom html</h2>"
        f"<p>Edit the full page template. Use <code>{esc('{{content}}')}</code> "
        f"where page content should appear.</p>"
@ -596,10 +611,12 @@ def handle_style_submit(body):
    template = body.get("template", [""])[0].replace("\r\n", "\n").replace("\r", "\n")
    name = body.get("site_name", ["tinyweb"])[0].strip()
    sharing = "1" if body.get("sharing_enabled") else "0"
+    semantic = "1" if body.get("semantic_search") else "0"
    reranker = "1" if body.get("use_reranker") else "0"
    set_setting("custom_template", template if template.strip() != DEFAULT_TEMPLATE.strip() else "")
    set_setting("site_name", name or "tinyweb")
    set_setting("sharing_enabled", sharing)
+    set_setting("semantic_search", semantic)
    set_setting("use_reranker", reranker)
    return handle_style_form("Saved.")

@ -975,15 +992,16 @@ def handle_subscription_sync(sub_id):
                    (sub_id, s["url"], s["title"], s.get("note", ""), tags_str),
                )
                # Embed remote page for semantic search
-                try:
-                    from embeddings import store_remote_embeddings
-                    rp_id = db.execute(
-                        "SELECT id FROM remote_pages WHERE subscription_id = ? AND url = ?",
-                        (sub_id, s["url"]),
-                    ).fetchone()["id"]
-                    store_remote_embeddings(rp_id, s["title"], s.get("note", ""), db)
-                except Exception:
-                    pass
+                if get_setting("semantic_search", "1") == "1":
+                    try:
+                        from embeddings import store_remote_embeddings
+                        rp_id = db.execute(
+                            "SELECT id FROM remote_pages WHERE subscription_id = ? AND url = ?",
+                            (sub_id, s["url"]),
+                        ).fetchone()["id"]
+                        store_remote_embeddings(rp_id, s["title"], s.get("note", ""), db)
+                    except Exception:
+                        pass
                synced += 1
            except Exception:
                pass
@ -1050,15 +1068,16 @@ def handle_subscription_syncall():
                            "ON CONFLICT(subscription_id, url) DO UPDATE SET title=excluded.title, note=excluded.note, tags=excluded.tags",
                            (sub["id"], s["url"], s["title"], s.get("note", ""), tags_str),
                        )
-                        try:
-                            from embeddings import store_remote_embeddings
-                            rp_id = db.execute(
-                                "SELECT id FROM remote_pages WHERE subscription_id = ? AND url = ?",
-                                (sub["id"], s["url"]),
-                            ).fetchone()["id"]
-                            store_remote_embeddings(rp_id, s["title"], s.get("note", ""), db)
-                        except Exception:
-                            pass
+                        if get_setting("semantic_search", "1") == "1":
+                            try:
+                                from embeddings import store_remote_embeddings
+                                rp_id = db.execute(
+                                    "SELECT id FROM remote_pages WHERE subscription_id = ? AND url = ?",
+                                    (sub["id"], s["url"]),
+                                ).fetchone()["id"]
+                                store_remote_embeddings(rp_id, s["title"], s.get("note", ""), db)
+                            except Exception:
+                                pass
                    except Exception:
                        pass
                now = datetime.now().strftime("%Y-%m-%dT%H:%M:%S")
@ -1079,6 +1098,12 @@ _reindex_thread = None


 def handle_reindex_form():
+    if get_setting("semantic_search", "1") != "1":
+        return _respond(
+            f"<h2>semantic search index</h2>"
+            f"<p>Semantic search is disabled. Enable it in <a href=\"/style\">settings</a> to use embeddings.</p>"
+            f'<p><a href="/">back to search</a></p>'
+        )
    db = get_db()
    try:
        total_pages = db.execute("SELECT count(*) FROM pages").fetchone()[0]
--- a/templates.py
+++ b/templates.py
@ -6,14 +6,6 @@ def esc(s):
    return html.escape(str(s))


-def snippet(text, query, ctx=80):
-    pos = text.lower().find(query.lower())
-    if pos == -1:
-        return text[:200]
-    start = max(0, pos - ctx)
-    end = min(len(text), pos + len(query) + ctx)
-    return ("..." if start > 0 else "") + text[start:end] + ("..." if end < len(text) else "")
-

 DEFAULT_TEMPLATE = "<html>\n<head>\n</head>\n<body>\n{{content}}\n</body>\n</html>"