made semantic search optional, use meta snippets

- Add semantic_search setting to toggle AI-powered search on/off
- Skip embedding generation, hybrid search, and model preloading when disabled
- Use site owner's meta description as snippet instead of heuristic extraction
- Remove _generate_summary() and snippet() - no more generated snippets
- Show reranker/reindex controls grayed out when semantic search is off
- AI dependencies (onnxruntime, hnswlib, etc.) are now fully optional
2026-03-28 20:58:04 -07:00 · 2026-03-28 20:58:04 -07:00 · 9bc5abd32f
commit 9bc5abd32f
parent e72afbb22e
5 changed files with 70 additions and 118 deletions
--- a/db.py
+++ b/db.py
@@ -334,80 +334,16 @@ def fetch_page(url):
        tag.decompose()
    title = soup.title.string.strip() if soup.title and soup.title.string else url

-    # Extract paragraph text for better summary generation
-    paragraphs = []
-    for p in soup.find_all("p"):
-        text = p.get_text(strip=True)
-        if len(text) >= 40:
-            paragraphs.append(text)
-
    body = soup.get_text(separator=" ", strip=True)
-    return title, body, links, meta_desc, paragraphs
+    return title, body, links, meta_desc


-def _generate_summary(title, body, paragraphs=None):
-    """Generate a summary by extracting the best sentence from the page.
-
-    Priority: sentence mentioning the site name > first paragraph sentence
-    > first body sentence > title.
-    """
-    import re
-    noise_patterns = re.compile(
-        r'arrow-|fedilink|message-square|link-external|'
-        r'skip to|cookie|subscribe|sign up|log in|'
-        r'privacy policy|terms of|©|\bads?\b',
-        re.IGNORECASE
-    )
-
-    def _filter_sentences(raw):
-        result = []
-        for s in raw:
-            s = s.strip()
-            if len(s) < 40 or len(s.split()) < 7:
-                continue
-            alpha_chars = sum(1 for c in s if c.isalpha() or c == ' ')
-            if alpha_chars < len(s) * 0.6:
-                continue
-            if s.count('|') > 2 or s.count('·') > 2 or s.count('►') > 0:
-                continue
-            if noise_patterns.search(s):
-                continue
-            result.append(s)
-        return result
-
-    # Prefer sentences from <p> tags (actual content, not UI)
-    sentences = []
-    if paragraphs:
-        raw = []
-        for p in paragraphs:
-            raw.extend(re.split(r'(?<=[.!?])\s+', p))
-        sentences = _filter_sentences(raw)
-
-    # Fall back to full body text
-    if not sentences:
-        raw = re.split(r'(?<=[.!?])\s+', body)
-        sentences = _filter_sentences(raw)
-
-    if not sentences:
-        return title[:200] if title else ""
-
-    # Prefer a sentence that mentions the site name
-    if title:
-        title_words = [w.lower() for w in re.split(r'\W+', title) if len(w) >= 3]
-        for s in sentences:
-            s_lower = s.lower()
-            if sum(1 for w in title_words if w in s_lower) >= max(1, len(title_words) // 2):
-                return s[:200]
-
-    # Otherwise use the first quality sentence
-    return sentences[0][:200]
-

 def index_url(url, note=""):
    url = clean_url(url)
-    title, body, links, meta_desc, paragraphs = fetch_page(url)
+    title, body, links, meta_desc = fetch_page(url)
    # Use the meta description if available and meaningful; otherwise leave the summary empty
-    summary = meta_desc if meta_desc and len(meta_desc) > 20 else _generate_summary(title, body, paragraphs)
+    summary = meta_desc if meta_desc and len(meta_desc) > 20 else ""
    db = get_db()
    try:
        now = __import__("datetime").datetime.now().strftime("%Y-%m-%dT%H:%M:%S")
@@ -425,11 +361,12 @@ def index_url(url, note=""):
                (page_id, href, label),
            )
        db.commit()
-        try:
-            from embeddings import store_embeddings
-            store_embeddings(page_id, title, body, db)
-        except Exception:
-            pass  # embedding generation is best-effort
+        if get_setting("semantic_search", "1") == "1":
+            try:
+                from embeddings import store_embeddings
+                store_embeddings(page_id, title, body, db)
+            except Exception:
+                pass  # embedding generation is best-effort
    finally:
        return_db(db)
    return title