Make semantic search optional; use meta description snippets

- Add semantic_search setting to toggle AI-powered search on/off
- Skip embedding generation, hybrid search, and model preloading when disabled
- Use site owner's meta description as snippet instead of heuristic extraction
- Remove _generate_summary() and snippet() - no more generated snippets
- Show reranker/reindex controls grayed out when semantic search is off
- AI dependencies (onnxruntime, hnswlib, etc.) are now fully optional
This commit is contained in:
lichenblankie 2026-03-28 20:58:04 -07:00
parent e72afbb22e
commit 9bc5abd32f
5 changed files with 70 additions and 118 deletions

View file

@ -6,7 +6,7 @@ from datetime import datetime
from urllib.parse import unquote
from db import get_db, return_db, get_setting, set_setting, get_site_name, index_url, clean_url
from templates import esc, snippet, wrap_page, DEFAULT_TEMPLATE
from templates import esc, wrap_page, DEFAULT_TEMPLATE
from rns_client import fetch_remote_sites
_request_local = threading.local()
@ -205,13 +205,16 @@ def handle_search(query):
# Hybrid search: merge BM25 + semantic via RRF
bm25_ids = [r["id"] for r in bm25_rows]
chunk_snippets = {} # page_id -> best chunk text
try:
from embeddings import hybrid_search
use_reranker = get_setting("use_reranker", "1") == "1"
fused = hybrid_search(q, bm25_ids, limit=100, db=db, use_reranker=use_reranker)
fused_ids = [pid for pid, _ in fused]
chunk_snippets = {pid: text for pid, text in fused if text}
except Exception:
if get_setting("semantic_search", "1") == "1":
try:
from embeddings import hybrid_search
use_reranker = get_setting("use_reranker", "1") == "1"
fused = hybrid_search(q, bm25_ids, limit=100, db=db, use_reranker=use_reranker)
fused_ids = [pid for pid, _ in fused]
chunk_snippets = {pid: text for pid, text in fused if text}
except Exception:
fused_ids = bm25_ids
else:
fused_ids = bm25_ids
total_results = len(fused_ids)
@ -239,13 +242,12 @@ def handle_search(query):
if tags:
tag_links = " ".join(f'<a href="/tags/{esc(t)}" class="tag">[{esc(t)}]</a>' for t in tags)
tags_html = f'<div class="tags">{tag_links}</div>'
# Use page summary as snippet (meta description or centroid sentence)
snip = r["summary"] if r["summary"] else snippet(r["body"], q)
snip_html = f'<br>{esc(r["summary"])}' if r["summary"] else ""
result_html += (
f'<div class="result">'
f'<a href="{esc(r["url"])}">{esc(r["title"])}</a><br>'
f'<small>{esc(r["url"])}</small><br>'
f'{esc(snip)}'
f'<small>{esc(r["url"])}</small>'
f'{snip_html}'
f'{note_html}{tags_html}'
f'</div>'
)
@ -557,8 +559,12 @@ def handle_style_form(msg=""):
name = get_site_name()
sharing = get_setting("sharing_enabled", "0")
checked = " checked" if sharing == "1" else ""
semantic = get_setting("semantic_search", "1")
semantic_checked = " checked" if semantic == "1" else ""
reranker = get_setting("use_reranker", "1")
reranker_checked = " checked" if reranker == "1" else ""
disabled = "" if semantic == "1" else " disabled"
dimmed = ' style="opacity:0.4"' if semantic != "1" else ""
return _respond(
f"<h1>customize</h1>"
f"<h2>name your search engine</h2>"
@ -569,9 +575,18 @@ def handle_style_form(msg=""):
f'<label><input type="checkbox" name="sharing_enabled" value="1"{checked}>'
f" share your site list publicly at /api/sites</label><br><br>"
f"<h2>search</h2>"
f'<label><input type="checkbox" name="use_reranker" value="1"{reranker_checked}>'
f" cross-encoder reranking (more accurate, on by default)</label><br>"
f"<h3>ai</h3>"
f'<label><input type="checkbox" name="semantic_search" value="1"{semantic_checked} '
f'onchange="var d=!this.checked;document.getElementById(\'reranker\').disabled=d;'
f'document.getElementById(\'ai-extras\').style.opacity=d?\'0.4\':\'1\'">'
f" semantic search (similarity matching)</label><br>"
f"<small>Requires onnxruntime, tokenizers, hnswlib. Downloads ~30MB of models on first use.</small><br><br>"
f'<div id="ai-extras"{dimmed}>'
f'<label><input type="checkbox" id="reranker" name="use_reranker" value="1"{reranker_checked}{disabled}>'
f" cross-encoder reranking (more accurate)</label><br>"
f"<small>Uses a 22MB model. Adds ~50ms per search. Disable for faster results.</small><br><br>"
f'<a href="/reindex">manage semantic index</a><br><br>'
f"</div>"
f"<h2>custom html</h2>"
f"<p>Edit the full page template. Use <code>{esc('{{content}}')}</code> "
f"where page content should appear.</p>"
@ -596,10 +611,12 @@ def handle_style_submit(body):
template = body.get("template", [""])[0].replace("\r\n", "\n").replace("\r", "\n")
name = body.get("site_name", ["tinyweb"])[0].strip()
sharing = "1" if body.get("sharing_enabled") else "0"
semantic = "1" if body.get("semantic_search") else "0"
reranker = "1" if body.get("use_reranker") else "0"
set_setting("custom_template", template if template.strip() != DEFAULT_TEMPLATE.strip() else "")
set_setting("site_name", name or "tinyweb")
set_setting("sharing_enabled", sharing)
set_setting("semantic_search", semantic)
set_setting("use_reranker", reranker)
return handle_style_form("Saved.")
@ -975,15 +992,16 @@ def handle_subscription_sync(sub_id):
(sub_id, s["url"], s["title"], s.get("note", ""), tags_str),
)
# Embed remote page for semantic search
try:
from embeddings import store_remote_embeddings
rp_id = db.execute(
"SELECT id FROM remote_pages WHERE subscription_id = ? AND url = ?",
(sub_id, s["url"]),
).fetchone()["id"]
store_remote_embeddings(rp_id, s["title"], s.get("note", ""), db)
except Exception:
pass
if get_setting("semantic_search", "1") == "1":
try:
from embeddings import store_remote_embeddings
rp_id = db.execute(
"SELECT id FROM remote_pages WHERE subscription_id = ? AND url = ?",
(sub_id, s["url"]),
).fetchone()["id"]
store_remote_embeddings(rp_id, s["title"], s.get("note", ""), db)
except Exception:
pass
synced += 1
except Exception:
pass
@ -1050,15 +1068,16 @@ def handle_subscription_syncall():
"ON CONFLICT(subscription_id, url) DO UPDATE SET title=excluded.title, note=excluded.note, tags=excluded.tags",
(sub["id"], s["url"], s["title"], s.get("note", ""), tags_str),
)
try:
from embeddings import store_remote_embeddings
rp_id = db.execute(
"SELECT id FROM remote_pages WHERE subscription_id = ? AND url = ?",
(sub["id"], s["url"]),
).fetchone()["id"]
store_remote_embeddings(rp_id, s["title"], s.get("note", ""), db)
except Exception:
pass
if get_setting("semantic_search", "1") == "1":
try:
from embeddings import store_remote_embeddings
rp_id = db.execute(
"SELECT id FROM remote_pages WHERE subscription_id = ? AND url = ?",
(sub["id"], s["url"]),
).fetchone()["id"]
store_remote_embeddings(rp_id, s["title"], s.get("note", ""), db)
except Exception:
pass
except Exception:
pass
now = datetime.now().strftime("%Y-%m-%dT%H:%M:%S")
@ -1079,6 +1098,12 @@ _reindex_thread = None
def handle_reindex_form():
if get_setting("semantic_search", "1") != "1":
return _respond(
f"<h2>semantic search index</h2>"
f"<p>Semantic search is disabled. Enable it in <a href=\"/style\">settings</a> to use embeddings.</p>"
f'<p><a href="/">back to search</a></p>'
)
db = get_db()
try:
total_pages = db.execute("SELECT count(*) FROM pages").fetchone()[0]