Make semantic search and reranking optional, use site meta descriptions for snippets

- Add semantic_search setting to toggle AI-powered search on/off
- Skip embedding generation, hybrid search, and model preloading when disabled
- Use site owner's meta description as snippet instead of heuristic extraction
- Remove _generate_summary() and snippet() - no more generated snippets
- Show reranker/reindex controls grayed out when semantic search is off
- AI dependencies (onnxruntime, hnswlib, etc.) are now fully optional

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Derick Phan 2026-03-28 20:58:04 -07:00
parent c9a8cba9d1
commit c959ee98ae
No known key found for this signature in database
5 changed files with 70 additions and 118 deletions

3
app.py
View file

@ -74,6 +74,9 @@ def ensure_rns_config(config_dir):
def _preload_embeddings():
"""Pre-load the embedding model and build the HNSW index in background."""
if get_setting("semantic_search", "1") != "1":
print("Semantic search disabled.")
return
try:
from embeddings import _get_session, _get_reranker, build_index
_get_session() # downloads model on first run, loads ONNX session

81
db.py
View file

@ -334,80 +334,16 @@ def fetch_page(url):
tag.decompose()
title = soup.title.string.strip() if soup.title and soup.title.string else url
# Extract paragraph text for better summary generation
paragraphs = []
for p in soup.find_all("p"):
text = p.get_text(strip=True)
if len(text) >= 40:
paragraphs.append(text)
body = soup.get_text(separator=" ", strip=True)
return title, body, links, meta_desc, paragraphs
return title, body, links, meta_desc
def _generate_summary(title, body, paragraphs=None):
"""Generate a summary by extracting the best sentence from the page.
Priority: sentence mentioning the site name > first paragraph sentence
> first body sentence > title.
"""
import re
noise_patterns = re.compile(
r'arrow-|fedilink|message-square|link-external|'
r'skip to|cookie|subscribe|sign up|log in|'
r'privacy policy|terms of|©|\bads?\b',
re.IGNORECASE
)
def _filter_sentences(raw):
result = []
for s in raw:
s = s.strip()
if len(s) < 40 or len(s.split()) < 7:
continue
alpha_chars = sum(1 for c in s if c.isalpha() or c == ' ')
if alpha_chars < len(s) * 0.6:
continue
if s.count('|') > 2 or s.count('·') > 2 or s.count('') > 0:
continue
if noise_patterns.search(s):
continue
result.append(s)
return result
# Prefer sentences from <p> tags (actual content, not UI)
sentences = []
if paragraphs:
raw = []
for p in paragraphs:
raw.extend(re.split(r'(?<=[.!?])\s+', p))
sentences = _filter_sentences(raw)
# Fall back to full body text
if not sentences:
raw = re.split(r'(?<=[.!?])\s+', body)
sentences = _filter_sentences(raw)
if not sentences:
return title[:200] if title else ""
# Prefer a sentence that mentions the site name
if title:
title_words = [w.lower() for w in re.split(r'\W+', title) if len(w) >= 3]
for s in sentences:
s_lower = s.lower()
if sum(1 for w in title_words if w in s_lower) >= max(1, len(title_words) // 2):
return s[:200]
# Otherwise use the first quality sentence
return sentences[0][:200]
def index_url(url, note=""):
url = clean_url(url)
title, body, links, meta_desc, paragraphs = fetch_page(url)
title, body, links, meta_desc = fetch_page(url)
# Use meta description if available and meaningful; otherwise leave the summary empty
summary = meta_desc if meta_desc and len(meta_desc) > 20 else _generate_summary(title, body, paragraphs)
summary = meta_desc if meta_desc and len(meta_desc) > 20 else ""
db = get_db()
try:
now = __import__("datetime").datetime.now().strftime("%Y-%m-%dT%H:%M:%S")
@ -425,11 +361,12 @@ def index_url(url, note=""):
(page_id, href, label),
)
db.commit()
try:
from embeddings import store_embeddings
store_embeddings(page_id, title, body, db)
except Exception:
pass # embedding generation is best-effort
if get_setting("semantic_search", "1") == "1":
try:
from embeddings import store_embeddings
store_embeddings(page_id, title, body, db)
except Exception:
pass # embedding generation is best-effort
finally:
return_db(db)
return title

View file

@ -507,7 +507,7 @@ def hybrid_search(query_text, bm25_ranked_ids, limit=10, db=None, use_reranker=F
def reindex_all(db=None, progress_callback=None):
"""Re-embed all pages and regenerate all summaries. Rebuilds HNSW index."""
from db import get_db, return_db, _generate_summary
from db import get_db, return_db
own_db = db is None
if own_db:
db = get_db()
@ -523,11 +523,6 @@ def reindex_all(db=None, progress_callback=None):
total = len(rows)
for i, row in enumerate(rows):
store_embeddings(row["id"], row["title"], row["body"], db)
# Only regenerate summary if missing
if not row["summary"]:
summary = _generate_summary(row["title"], row["body"])
db.execute("UPDATE pages SET summary = ? WHERE id = ?", (summary, row["id"]))
db.commit()
if progress_callback:
progress_callback(i + 1, total)

View file

@ -6,7 +6,7 @@ from datetime import datetime
from urllib.parse import unquote
from db import get_db, return_db, get_setting, set_setting, get_site_name, index_url, clean_url
from templates import esc, snippet, wrap_page, DEFAULT_TEMPLATE
from templates import esc, wrap_page, DEFAULT_TEMPLATE
from rns_client import fetch_remote_sites
_request_local = threading.local()
@ -205,13 +205,16 @@ def handle_search(query):
# Hybrid search: merge BM25 + semantic via RRF
bm25_ids = [r["id"] for r in bm25_rows]
chunk_snippets = {} # page_id -> best chunk text
try:
from embeddings import hybrid_search
use_reranker = get_setting("use_reranker", "1") == "1"
fused = hybrid_search(q, bm25_ids, limit=100, db=db, use_reranker=use_reranker)
fused_ids = [pid for pid, _ in fused]
chunk_snippets = {pid: text for pid, text in fused if text}
except Exception:
if get_setting("semantic_search", "1") == "1":
try:
from embeddings import hybrid_search
use_reranker = get_setting("use_reranker", "1") == "1"
fused = hybrid_search(q, bm25_ids, limit=100, db=db, use_reranker=use_reranker)
fused_ids = [pid for pid, _ in fused]
chunk_snippets = {pid: text for pid, text in fused if text}
except Exception:
fused_ids = bm25_ids
else:
fused_ids = bm25_ids
total_results = len(fused_ids)
@ -239,13 +242,12 @@ def handle_search(query):
if tags:
tag_links = " ".join(f'<a href="/tags/{esc(t)}" class="tag">[{esc(t)}]</a>' for t in tags)
tags_html = f'<div class="tags">{tag_links}</div>'
# Use page summary as snippet (meta description or centroid sentence)
snip = r["summary"] if r["summary"] else snippet(r["body"], q)
snip_html = f'<br>{esc(r["summary"])}' if r["summary"] else ""
result_html += (
f'<div class="result">'
f'<a href="{esc(r["url"])}">{esc(r["title"])}</a><br>'
f'<small>{esc(r["url"])}</small><br>'
f'{esc(snip)}'
f'<small>{esc(r["url"])}</small>'
f'{snip_html}'
f'{note_html}{tags_html}'
f'</div>'
)
@ -557,8 +559,12 @@ def handle_style_form(msg=""):
name = get_site_name()
sharing = get_setting("sharing_enabled", "0")
checked = " checked" if sharing == "1" else ""
semantic = get_setting("semantic_search", "1")
semantic_checked = " checked" if semantic == "1" else ""
reranker = get_setting("use_reranker", "1")
reranker_checked = " checked" if reranker == "1" else ""
disabled = "" if semantic == "1" else " disabled"
dimmed = ' style="opacity:0.4"' if semantic != "1" else ""
return _respond(
f"<h1>customize</h1>"
f"<h2>name your search engine</h2>"
@ -569,9 +575,18 @@ def handle_style_form(msg=""):
f'<label><input type="checkbox" name="sharing_enabled" value="1"{checked}>'
f" share your site list publicly at /api/sites</label><br><br>"
f"<h2>search</h2>"
f'<label><input type="checkbox" name="use_reranker" value="1"{reranker_checked}>'
f" cross-encoder reranking (more accurate, on by default)</label><br>"
f"<h3>ai</h3>"
f'<label><input type="checkbox" name="semantic_search" value="1"{semantic_checked} '
f'onchange="var d=!this.checked;document.getElementById(\'reranker\').disabled=d;'
f'document.getElementById(\'ai-extras\').style.opacity=d?\'0.4\':\'1\'">'
f" semantic search (similarity matching)</label><br>"
f"<small>Requires onnxruntime, tokenizers, hnswlib. Downloads ~30MB of models on first use.</small><br><br>"
f'<div id="ai-extras"{dimmed}>'
f'<label><input type="checkbox" id="reranker" name="use_reranker" value="1"{reranker_checked}{disabled}>'
f" cross-encoder reranking (more accurate)</label><br>"
f"<small>Uses a 22MB model. Adds ~50ms per search. Disable for faster results.</small><br><br>"
f'<a href="/reindex">manage semantic index</a><br><br>'
f"</div>"
f"<h2>custom html</h2>"
f"<p>Edit the full page template. Use <code>{esc('{{content}}')}</code> "
f"where page content should appear.</p>"
@ -596,10 +611,12 @@ def handle_style_submit(body):
template = body.get("template", [""])[0].replace("\r\n", "\n").replace("\r", "\n")
name = body.get("site_name", ["tinyweb"])[0].strip()
sharing = "1" if body.get("sharing_enabled") else "0"
semantic = "1" if body.get("semantic_search") else "0"
reranker = "1" if body.get("use_reranker") else "0"
set_setting("custom_template", template if template.strip() != DEFAULT_TEMPLATE.strip() else "")
set_setting("site_name", name or "tinyweb")
set_setting("sharing_enabled", sharing)
set_setting("semantic_search", semantic)
set_setting("use_reranker", reranker)
return handle_style_form("Saved.")
@ -975,15 +992,16 @@ def handle_subscription_sync(sub_id):
(sub_id, s["url"], s["title"], s.get("note", ""), tags_str),
)
# Embed remote page for semantic search
try:
from embeddings import store_remote_embeddings
rp_id = db.execute(
"SELECT id FROM remote_pages WHERE subscription_id = ? AND url = ?",
(sub_id, s["url"]),
).fetchone()["id"]
store_remote_embeddings(rp_id, s["title"], s.get("note", ""), db)
except Exception:
pass
if get_setting("semantic_search", "1") == "1":
try:
from embeddings import store_remote_embeddings
rp_id = db.execute(
"SELECT id FROM remote_pages WHERE subscription_id = ? AND url = ?",
(sub_id, s["url"]),
).fetchone()["id"]
store_remote_embeddings(rp_id, s["title"], s.get("note", ""), db)
except Exception:
pass
synced += 1
except Exception:
pass
@ -1050,15 +1068,16 @@ def handle_subscription_syncall():
"ON CONFLICT(subscription_id, url) DO UPDATE SET title=excluded.title, note=excluded.note, tags=excluded.tags",
(sub["id"], s["url"], s["title"], s.get("note", ""), tags_str),
)
try:
from embeddings import store_remote_embeddings
rp_id = db.execute(
"SELECT id FROM remote_pages WHERE subscription_id = ? AND url = ?",
(sub["id"], s["url"]),
).fetchone()["id"]
store_remote_embeddings(rp_id, s["title"], s.get("note", ""), db)
except Exception:
pass
if get_setting("semantic_search", "1") == "1":
try:
from embeddings import store_remote_embeddings
rp_id = db.execute(
"SELECT id FROM remote_pages WHERE subscription_id = ? AND url = ?",
(sub["id"], s["url"]),
).fetchone()["id"]
store_remote_embeddings(rp_id, s["title"], s.get("note", ""), db)
except Exception:
pass
except Exception:
pass
now = datetime.now().strftime("%Y-%m-%dT%H:%M:%S")
@ -1079,6 +1098,12 @@ _reindex_thread = None
def handle_reindex_form():
if get_setting("semantic_search", "1") != "1":
return _respond(
f"<h2>semantic search index</h2>"
f"<p>Semantic search is disabled. Enable it in <a href=\"/style\">settings</a> to use embeddings.</p>"
f'<p><a href="/">back to search</a></p>'
)
db = get_db()
try:
total_pages = db.execute("SELECT count(*) FROM pages").fetchone()[0]

View file

@ -6,14 +6,6 @@ def esc(s):
return html.escape(str(s))
def snippet(text, query, ctx=80):
    """Return an excerpt of ``text`` centred on the first match of ``query``.

    The match is case-insensitive. When ``query`` does not occur, the first
    200 characters of ``text`` are returned instead.

    Args:
        text: Text to excerpt. None is treated as an empty string.
        query: Literal search string (not a pattern). None is treated as "".
        ctx: Number of characters of context kept on each side of the match.

    Returns:
        The excerpt, with "..." prepended and/or appended when it does not
        reach the start and/or end of ``text``.
    """
    import re

    text = text or ""
    query = query or ""
    # Search the original text rather than text.lower(): lowercasing can
    # change the string length (e.g. "İ"), which would shift the offsets
    # used to slice the original text.
    match = re.search(re.escape(query), text, re.IGNORECASE)
    if match is None:
        return text[:200]
    start = max(0, match.start() - ctx)
    end = min(len(text), match.end() + ctx)
    prefix = "..." if start > 0 else ""
    suffix = "..." if end < len(text) else ""
    return prefix + text[start:end] + suffix
DEFAULT_TEMPLATE = "<html>\n<head>\n</head>\n<body>\n{{content}}\n</body>\n</html>"