Make semantic search optional; use meta descriptions as snippets

- Add semantic_search setting to toggle AI-powered search on/off
- Skip embedding generation, hybrid search, and model preloading when disabled
- Use site owner's meta description as snippet instead of heuristic extraction
- Remove _generate_summary() and snippet() - no more generated snippets
- Show reranker/reindex controls grayed out when semantic search is off
- AI dependencies (onnxruntime, hnswlib, etc.) are now fully optional
This commit is contained in:
lichenblankie 2026-03-28 20:58:04 -07:00
parent e72afbb22e
commit 9bc5abd32f
5 changed files with 70 additions and 118 deletions

81
db.py
View file

@ -334,80 +334,16 @@ def fetch_page(url):
tag.decompose()
title = soup.title.string.strip() if soup.title and soup.title.string else url
# Extract paragraph text for better summary generation
paragraphs = []
for p in soup.find_all("p"):
text = p.get_text(strip=True)
if len(text) >= 40:
paragraphs.append(text)
body = soup.get_text(separator=" ", strip=True)
return title, body, links, meta_desc, paragraphs
return title, body, links, meta_desc
def _generate_summary(title, body, paragraphs=None):
"""Generate a summary by extracting the best sentence from the page.
Priority: sentence mentioning the site name > first paragraph sentence
> first body sentence > title.
"""
import re
noise_patterns = re.compile(
r'arrow-|fedilink|message-square|link-external|'
r'skip to|cookie|subscribe|sign up|log in|'
r'privacy policy|terms of|©|\bads?\b',
re.IGNORECASE
)
def _filter_sentences(raw):
result = []
for s in raw:
s = s.strip()
if len(s) < 40 or len(s.split()) < 7:
continue
alpha_chars = sum(1 for c in s if c.isalpha() or c == ' ')
if alpha_chars < len(s) * 0.6:
continue
if s.count('|') > 2 or s.count('·') > 2 or s.count('') > 0:
continue
if noise_patterns.search(s):
continue
result.append(s)
return result
# Prefer sentences from <p> tags (actual content, not UI)
sentences = []
if paragraphs:
raw = []
for p in paragraphs:
raw.extend(re.split(r'(?<=[.!?])\s+', p))
sentences = _filter_sentences(raw)
# Fall back to full body text
if not sentences:
raw = re.split(r'(?<=[.!?])\s+', body)
sentences = _filter_sentences(raw)
if not sentences:
return title[:200] if title else ""
# Prefer a sentence that mentions the site name
if title:
title_words = [w.lower() for w in re.split(r'\W+', title) if len(w) >= 3]
for s in sentences:
s_lower = s.lower()
if sum(1 for w in title_words if w in s_lower) >= max(1, len(title_words) // 2):
return s[:200]
# Otherwise use the first quality sentence
return sentences[0][:200]
def index_url(url, note=""):
url = clean_url(url)
title, body, links, meta_desc, paragraphs = fetch_page(url)
title, body, links, meta_desc = fetch_page(url)
# Use meta description if available and meaningful, otherwise generate from body
summary = meta_desc if meta_desc and len(meta_desc) > 20 else _generate_summary(title, body, paragraphs)
summary = meta_desc if meta_desc and len(meta_desc) > 20 else ""
db = get_db()
try:
now = __import__("datetime").datetime.now().strftime("%Y-%m-%dT%H:%M:%S")
@ -425,11 +361,12 @@ def index_url(url, note=""):
(page_id, href, label),
)
db.commit()
try:
from embeddings import store_embeddings
store_embeddings(page_id, title, body, db)
except Exception:
pass # embedding generation is best-effort
if get_setting("semantic_search", "1") == "1":
try:
from embeddings import store_embeddings
store_embeddings(page_id, title, body, db)
except Exception:
pass # embedding generation is best-effort
finally:
return_db(db)
return title