Make semantic search optional; use meta descriptions as snippets
- Add a semantic_search setting to toggle AI-powered search on/off
- Skip embedding generation, hybrid search, and model preloading when it is disabled
- Use the site owner's meta description as the snippet instead of heuristic extraction
- Remove _generate_summary() and snippet() - no more generated snippets
- Show reranker/reindex controls grayed out when semantic search is off
- AI dependencies (onnxruntime, hnswlib, etc.) are now fully optional
This commit is contained in:
parent
e72afbb22e
commit
9bc5abd32f
5 changed files with 70 additions and 118 deletions
89
handlers.py
89
handlers.py
|
|
@ -6,7 +6,7 @@ from datetime import datetime
|
|||
from urllib.parse import unquote
|
||||
|
||||
from db import get_db, return_db, get_setting, set_setting, get_site_name, index_url, clean_url
|
||||
from templates import esc, snippet, wrap_page, DEFAULT_TEMPLATE
|
||||
from templates import esc, wrap_page, DEFAULT_TEMPLATE
|
||||
from rns_client import fetch_remote_sites
|
||||
|
||||
_request_local = threading.local()
|
||||
|
|
@ -205,13 +205,16 @@ def handle_search(query):
|
|||
# Hybrid search: merge BM25 + semantic via RRF
|
||||
bm25_ids = [r["id"] for r in bm25_rows]
|
||||
chunk_snippets = {} # page_id -> best chunk text
|
||||
try:
|
||||
from embeddings import hybrid_search
|
||||
use_reranker = get_setting("use_reranker", "1") == "1"
|
||||
fused = hybrid_search(q, bm25_ids, limit=100, db=db, use_reranker=use_reranker)
|
||||
fused_ids = [pid for pid, _ in fused]
|
||||
chunk_snippets = {pid: text for pid, text in fused if text}
|
||||
except Exception:
|
||||
if get_setting("semantic_search", "1") == "1":
|
||||
try:
|
||||
from embeddings import hybrid_search
|
||||
use_reranker = get_setting("use_reranker", "1") == "1"
|
||||
fused = hybrid_search(q, bm25_ids, limit=100, db=db, use_reranker=use_reranker)
|
||||
fused_ids = [pid for pid, _ in fused]
|
||||
chunk_snippets = {pid: text for pid, text in fused if text}
|
||||
except Exception:
|
||||
fused_ids = bm25_ids
|
||||
else:
|
||||
fused_ids = bm25_ids
|
||||
|
||||
total_results = len(fused_ids)
|
||||
|
|
@ -239,13 +242,12 @@ def handle_search(query):
|
|||
if tags:
|
||||
tag_links = " ".join(f'<a href="/tags/{esc(t)}" class="tag">[{esc(t)}]</a>' for t in tags)
|
||||
tags_html = f'<div class="tags">{tag_links}</div>'
|
||||
# Use the stored page summary (the site owner's meta description) as the snippet; show no snippet when it is empty
|
||||
snip = r["summary"] if r["summary"] else snippet(r["body"], q)
|
||||
snip_html = f'<br>{esc(r["summary"])}' if r["summary"] else ""
|
||||
result_html += (
|
||||
f'<div class="result">'
|
||||
f'<a href="{esc(r["url"])}">{esc(r["title"])}</a><br>'
|
||||
f'<small>{esc(r["url"])}</small><br>'
|
||||
f'{esc(snip)}'
|
||||
f'<small>{esc(r["url"])}</small>'
|
||||
f'{snip_html}'
|
||||
f'{note_html}{tags_html}'
|
||||
f'</div>'
|
||||
)
|
||||
|
|
@ -557,8 +559,12 @@ def handle_style_form(msg=""):
|
|||
name = get_site_name()
|
||||
sharing = get_setting("sharing_enabled", "0")
|
||||
checked = " checked" if sharing == "1" else ""
|
||||
semantic = get_setting("semantic_search", "1")
|
||||
semantic_checked = " checked" if semantic == "1" else ""
|
||||
reranker = get_setting("use_reranker", "1")
|
||||
reranker_checked = " checked" if reranker == "1" else ""
|
||||
disabled = "" if semantic == "1" else " disabled"
|
||||
dimmed = ' style="opacity:0.4"' if semantic != "1" else ""
|
||||
return _respond(
|
||||
f"<h1>customize</h1>"
|
||||
f"<h2>name your search engine</h2>"
|
||||
|
|
@ -569,9 +575,18 @@ def handle_style_form(msg=""):
|
|||
f'<label><input type="checkbox" name="sharing_enabled" value="1"{checked}>'
|
||||
f" share your site list publicly at /api/sites</label><br><br>"
|
||||
f"<h2>search</h2>"
|
||||
f'<label><input type="checkbox" name="use_reranker" value="1"{reranker_checked}>'
|
||||
f" cross-encoder reranking (more accurate, on by default)</label><br>"
|
||||
f"<h3>ai</h3>"
|
||||
f'<label><input type="checkbox" name="semantic_search" value="1"{semantic_checked} '
|
||||
f'onchange="var d=!this.checked;document.getElementById(\'reranker\').disabled=d;'
|
||||
f'document.getElementById(\'ai-extras\').style.opacity=d?\'0.4\':\'1\'">'
|
||||
f" semantic search (similarity matching)</label><br>"
|
||||
f"<small>Requires onnxruntime, tokenizers, hnswlib. Downloads ~30MB of models on first use.</small><br><br>"
|
||||
f'<div id="ai-extras"{dimmed}>'
|
||||
f'<label><input type="checkbox" id="reranker" name="use_reranker" value="1"{reranker_checked}{disabled}>'
|
||||
f" cross-encoder reranking (more accurate)</label><br>"
|
||||
f"<small>Uses a 22MB model. Adds ~50ms per search. Disable for faster results.</small><br><br>"
|
||||
f'<a href="/reindex">manage semantic index</a><br><br>'
|
||||
f"</div>"
|
||||
f"<h2>custom html</h2>"
|
||||
f"<p>Edit the full page template. Use <code>{esc('{{content}}')}</code> "
|
||||
f"where page content should appear.</p>"
|
||||
|
|
@ -596,10 +611,12 @@ def handle_style_submit(body):
|
|||
template = body.get("template", [""])[0].replace("\r\n", "\n").replace("\r", "\n")
|
||||
name = body.get("site_name", ["tinyweb"])[0].strip()
|
||||
sharing = "1" if body.get("sharing_enabled") else "0"
|
||||
semantic = "1" if body.get("semantic_search") else "0"
|
||||
reranker = "1" if body.get("use_reranker") else "0"
|
||||
set_setting("custom_template", template if template.strip() != DEFAULT_TEMPLATE.strip() else "")
|
||||
set_setting("site_name", name or "tinyweb")
|
||||
set_setting("sharing_enabled", sharing)
|
||||
set_setting("semantic_search", semantic)
|
||||
set_setting("use_reranker", reranker)
|
||||
return handle_style_form("Saved.")
|
||||
|
||||
|
|
@ -975,15 +992,16 @@ def handle_subscription_sync(sub_id):
|
|||
(sub_id, s["url"], s["title"], s.get("note", ""), tags_str),
|
||||
)
|
||||
# Embed remote page for semantic search
|
||||
try:
|
||||
from embeddings import store_remote_embeddings
|
||||
rp_id = db.execute(
|
||||
"SELECT id FROM remote_pages WHERE subscription_id = ? AND url = ?",
|
||||
(sub_id, s["url"]),
|
||||
).fetchone()["id"]
|
||||
store_remote_embeddings(rp_id, s["title"], s.get("note", ""), db)
|
||||
except Exception:
|
||||
pass
|
||||
if get_setting("semantic_search", "1") == "1":
|
||||
try:
|
||||
from embeddings import store_remote_embeddings
|
||||
rp_id = db.execute(
|
||||
"SELECT id FROM remote_pages WHERE subscription_id = ? AND url = ?",
|
||||
(sub_id, s["url"]),
|
||||
).fetchone()["id"]
|
||||
store_remote_embeddings(rp_id, s["title"], s.get("note", ""), db)
|
||||
except Exception:
|
||||
pass
|
||||
synced += 1
|
||||
except Exception:
|
||||
pass
|
||||
|
|
@ -1050,15 +1068,16 @@ def handle_subscription_syncall():
|
|||
"ON CONFLICT(subscription_id, url) DO UPDATE SET title=excluded.title, note=excluded.note, tags=excluded.tags",
|
||||
(sub["id"], s["url"], s["title"], s.get("note", ""), tags_str),
|
||||
)
|
||||
try:
|
||||
from embeddings import store_remote_embeddings
|
||||
rp_id = db.execute(
|
||||
"SELECT id FROM remote_pages WHERE subscription_id = ? AND url = ?",
|
||||
(sub["id"], s["url"]),
|
||||
).fetchone()["id"]
|
||||
store_remote_embeddings(rp_id, s["title"], s.get("note", ""), db)
|
||||
except Exception:
|
||||
pass
|
||||
if get_setting("semantic_search", "1") == "1":
|
||||
try:
|
||||
from embeddings import store_remote_embeddings
|
||||
rp_id = db.execute(
|
||||
"SELECT id FROM remote_pages WHERE subscription_id = ? AND url = ?",
|
||||
(sub["id"], s["url"]),
|
||||
).fetchone()["id"]
|
||||
store_remote_embeddings(rp_id, s["title"], s.get("note", ""), db)
|
||||
except Exception:
|
||||
pass
|
||||
except Exception:
|
||||
pass
|
||||
now = datetime.now().strftime("%Y-%m-%dT%H:%M:%S")
|
||||
|
|
@ -1079,6 +1098,12 @@ _reindex_thread = None
|
|||
|
||||
|
||||
def handle_reindex_form():
|
||||
if get_setting("semantic_search", "1") != "1":
|
||||
return _respond(
|
||||
f"<h2>semantic search index</h2>"
|
||||
f"<p>Semantic search is disabled. Enable it in <a href=\"/style\">settings</a> to use embeddings.</p>"
|
||||
f'<p><a href="/">back to search</a></p>'
|
||||
)
|
||||
db = get_db()
|
||||
try:
|
||||
total_pages = db.execute("SELECT count(*) FROM pages").fetchone()[0]
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue