Make semantic search and reranking optional, use site meta descriptions for snippets
- Add semantic_search setting to toggle AI-powered search on/off
- Skip embedding generation, hybrid search, and model preloading when disabled
- Use the site owner's meta description as the snippet instead of heuristic extraction
- Remove _generate_summary() and snippet(); snippets are no longer generated
- Show reranker/reindex controls grayed out when semantic search is off
- AI dependencies (onnxruntime, hnswlib, etc.) are now fully optional

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
c9a8cba9d1
commit
c959ee98ae
5 changed files with 70 additions and 118 deletions
3
app.py
3
app.py
|
|
@ -74,6 +74,9 @@ def ensure_rns_config(config_dir):
|
|||
|
||||
def _preload_embeddings():
|
||||
"""Pre-load the embedding model and build the HNSW index in background."""
|
||||
if get_setting("semantic_search", "1") != "1":
|
||||
print("Semantic search disabled.")
|
||||
return
|
||||
try:
|
||||
from embeddings import _get_session, _get_reranker, build_index
|
||||
_get_session() # downloads model on first run, loads ONNX session
|
||||
|
|
|
|||
81
db.py
81
db.py
|
|
@ -334,80 +334,16 @@ def fetch_page(url):
|
|||
tag.decompose()
|
||||
title = soup.title.string.strip() if soup.title and soup.title.string else url
|
||||
|
||||
# Extract paragraph text for better summary generation
|
||||
paragraphs = []
|
||||
for p in soup.find_all("p"):
|
||||
text = p.get_text(strip=True)
|
||||
if len(text) >= 40:
|
||||
paragraphs.append(text)
|
||||
|
||||
body = soup.get_text(separator=" ", strip=True)
|
||||
return title, body, links, meta_desc, paragraphs
|
||||
return title, body, links, meta_desc
|
||||
|
||||
|
||||
def _generate_summary(title, body, paragraphs=None):
|
||||
"""Generate a summary by extracting the best sentence from the page.
|
||||
|
||||
Priority: sentence mentioning the site name > first paragraph sentence
|
||||
> first body sentence > title.
|
||||
"""
|
||||
import re
|
||||
noise_patterns = re.compile(
|
||||
r'arrow-|fedilink|message-square|link-external|'
|
||||
r'skip to|cookie|subscribe|sign up|log in|'
|
||||
r'privacy policy|terms of|©|\bads?\b',
|
||||
re.IGNORECASE
|
||||
)
|
||||
|
||||
def _filter_sentences(raw):
|
||||
result = []
|
||||
for s in raw:
|
||||
s = s.strip()
|
||||
if len(s) < 40 or len(s.split()) < 7:
|
||||
continue
|
||||
alpha_chars = sum(1 for c in s if c.isalpha() or c == ' ')
|
||||
if alpha_chars < len(s) * 0.6:
|
||||
continue
|
||||
if s.count('|') > 2 or s.count('·') > 2 or s.count('►') > 0:
|
||||
continue
|
||||
if noise_patterns.search(s):
|
||||
continue
|
||||
result.append(s)
|
||||
return result
|
||||
|
||||
# Prefer sentences from <p> tags (actual content, not UI)
|
||||
sentences = []
|
||||
if paragraphs:
|
||||
raw = []
|
||||
for p in paragraphs:
|
||||
raw.extend(re.split(r'(?<=[.!?])\s+', p))
|
||||
sentences = _filter_sentences(raw)
|
||||
|
||||
# Fall back to full body text
|
||||
if not sentences:
|
||||
raw = re.split(r'(?<=[.!?])\s+', body)
|
||||
sentences = _filter_sentences(raw)
|
||||
|
||||
if not sentences:
|
||||
return title[:200] if title else ""
|
||||
|
||||
# Prefer a sentence that mentions the site name
|
||||
if title:
|
||||
title_words = [w.lower() for w in re.split(r'\W+', title) if len(w) >= 3]
|
||||
for s in sentences:
|
||||
s_lower = s.lower()
|
||||
if sum(1 for w in title_words if w in s_lower) >= max(1, len(title_words) // 2):
|
||||
return s[:200]
|
||||
|
||||
# Otherwise use the first quality sentence
|
||||
return sentences[0][:200]
|
||||
|
||||
|
||||
def index_url(url, note=""):
|
||||
url = clean_url(url)
|
||||
title, body, links, meta_desc, paragraphs = fetch_page(url)
|
||||
title, body, links, meta_desc = fetch_page(url)
|
||||
# Use meta description if available and meaningful, otherwise generate from body
|
||||
summary = meta_desc if meta_desc and len(meta_desc) > 20 else _generate_summary(title, body, paragraphs)
|
||||
summary = meta_desc if meta_desc and len(meta_desc) > 20 else ""
|
||||
db = get_db()
|
||||
try:
|
||||
now = __import__("datetime").datetime.now().strftime("%Y-%m-%dT%H:%M:%S")
|
||||
|
|
@ -425,11 +361,12 @@ def index_url(url, note=""):
|
|||
(page_id, href, label),
|
||||
)
|
||||
db.commit()
|
||||
try:
|
||||
from embeddings import store_embeddings
|
||||
store_embeddings(page_id, title, body, db)
|
||||
except Exception:
|
||||
pass # embedding generation is best-effort
|
||||
if get_setting("semantic_search", "1") == "1":
|
||||
try:
|
||||
from embeddings import store_embeddings
|
||||
store_embeddings(page_id, title, body, db)
|
||||
except Exception:
|
||||
pass # embedding generation is best-effort
|
||||
finally:
|
||||
return_db(db)
|
||||
return title
|
||||
|
|
|
|||
|
|
@ -507,7 +507,7 @@ def hybrid_search(query_text, bm25_ranked_ids, limit=10, db=None, use_reranker=F
|
|||
|
||||
def reindex_all(db=None, progress_callback=None):
|
||||
"""Re-embed all pages and regenerate all summaries. Rebuilds HNSW index."""
|
||||
from db import get_db, return_db, _generate_summary
|
||||
from db import get_db, return_db
|
||||
own_db = db is None
|
||||
if own_db:
|
||||
db = get_db()
|
||||
|
|
@ -523,11 +523,6 @@ def reindex_all(db=None, progress_callback=None):
|
|||
total = len(rows)
|
||||
for i, row in enumerate(rows):
|
||||
store_embeddings(row["id"], row["title"], row["body"], db)
|
||||
# Only regenerate summary if missing
|
||||
if not row["summary"]:
|
||||
summary = _generate_summary(row["title"], row["body"])
|
||||
db.execute("UPDATE pages SET summary = ? WHERE id = ?", (summary, row["id"]))
|
||||
db.commit()
|
||||
if progress_callback:
|
||||
progress_callback(i + 1, total)
|
||||
|
||||
|
|
|
|||
89
handlers.py
89
handlers.py
|
|
@ -6,7 +6,7 @@ from datetime import datetime
|
|||
from urllib.parse import unquote
|
||||
|
||||
from db import get_db, return_db, get_setting, set_setting, get_site_name, index_url, clean_url
|
||||
from templates import esc, snippet, wrap_page, DEFAULT_TEMPLATE
|
||||
from templates import esc, wrap_page, DEFAULT_TEMPLATE
|
||||
from rns_client import fetch_remote_sites
|
||||
|
||||
_request_local = threading.local()
|
||||
|
|
@ -205,13 +205,16 @@ def handle_search(query):
|
|||
# Hybrid search: merge BM25 + semantic via RRF
|
||||
bm25_ids = [r["id"] for r in bm25_rows]
|
||||
chunk_snippets = {} # page_id -> best chunk text
|
||||
try:
|
||||
from embeddings import hybrid_search
|
||||
use_reranker = get_setting("use_reranker", "1") == "1"
|
||||
fused = hybrid_search(q, bm25_ids, limit=100, db=db, use_reranker=use_reranker)
|
||||
fused_ids = [pid for pid, _ in fused]
|
||||
chunk_snippets = {pid: text for pid, text in fused if text}
|
||||
except Exception:
|
||||
if get_setting("semantic_search", "1") == "1":
|
||||
try:
|
||||
from embeddings import hybrid_search
|
||||
use_reranker = get_setting("use_reranker", "1") == "1"
|
||||
fused = hybrid_search(q, bm25_ids, limit=100, db=db, use_reranker=use_reranker)
|
||||
fused_ids = [pid for pid, _ in fused]
|
||||
chunk_snippets = {pid: text for pid, text in fused if text}
|
||||
except Exception:
|
||||
fused_ids = bm25_ids
|
||||
else:
|
||||
fused_ids = bm25_ids
|
||||
|
||||
total_results = len(fused_ids)
|
||||
|
|
@ -239,13 +242,12 @@ def handle_search(query):
|
|||
if tags:
|
||||
tag_links = " ".join(f'<a href="/tags/{esc(t)}" class="tag">[{esc(t)}]</a>' for t in tags)
|
||||
tags_html = f'<div class="tags">{tag_links}</div>'
|
||||
# Use page summary as snippet (meta description or centroid sentence)
|
||||
snip = r["summary"] if r["summary"] else snippet(r["body"], q)
|
||||
snip_html = f'<br>{esc(r["summary"])}' if r["summary"] else ""
|
||||
result_html += (
|
||||
f'<div class="result">'
|
||||
f'<a href="{esc(r["url"])}">{esc(r["title"])}</a><br>'
|
||||
f'<small>{esc(r["url"])}</small><br>'
|
||||
f'{esc(snip)}'
|
||||
f'<small>{esc(r["url"])}</small>'
|
||||
f'{snip_html}'
|
||||
f'{note_html}{tags_html}'
|
||||
f'</div>'
|
||||
)
|
||||
|
|
@ -557,8 +559,12 @@ def handle_style_form(msg=""):
|
|||
name = get_site_name()
|
||||
sharing = get_setting("sharing_enabled", "0")
|
||||
checked = " checked" if sharing == "1" else ""
|
||||
semantic = get_setting("semantic_search", "1")
|
||||
semantic_checked = " checked" if semantic == "1" else ""
|
||||
reranker = get_setting("use_reranker", "1")
|
||||
reranker_checked = " checked" if reranker == "1" else ""
|
||||
disabled = "" if semantic == "1" else " disabled"
|
||||
dimmed = ' style="opacity:0.4"' if semantic != "1" else ""
|
||||
return _respond(
|
||||
f"<h1>customize</h1>"
|
||||
f"<h2>name your search engine</h2>"
|
||||
|
|
@ -569,9 +575,18 @@ def handle_style_form(msg=""):
|
|||
f'<label><input type="checkbox" name="sharing_enabled" value="1"{checked}>'
|
||||
f" share your site list publicly at /api/sites</label><br><br>"
|
||||
f"<h2>search</h2>"
|
||||
f'<label><input type="checkbox" name="use_reranker" value="1"{reranker_checked}>'
|
||||
f" cross-encoder reranking (more accurate, on by default)</label><br>"
|
||||
f"<h3>ai</h3>"
|
||||
f'<label><input type="checkbox" name="semantic_search" value="1"{semantic_checked} '
|
||||
f'onchange="var d=!this.checked;document.getElementById(\'reranker\').disabled=d;'
|
||||
f'document.getElementById(\'ai-extras\').style.opacity=d?\'0.4\':\'1\'">'
|
||||
f" semantic search (similarity matching)</label><br>"
|
||||
f"<small>Requires onnxruntime, tokenizers, hnswlib. Downloads ~30MB of models on first use.</small><br><br>"
|
||||
f'<div id="ai-extras"{dimmed}>'
|
||||
f'<label><input type="checkbox" id="reranker" name="use_reranker" value="1"{reranker_checked}{disabled}>'
|
||||
f" cross-encoder reranking (more accurate)</label><br>"
|
||||
f"<small>Uses a 22MB model. Adds ~50ms per search. Disable for faster results.</small><br><br>"
|
||||
f'<a href="/reindex">manage semantic index</a><br><br>'
|
||||
f"</div>"
|
||||
f"<h2>custom html</h2>"
|
||||
f"<p>Edit the full page template. Use <code>{esc('{{content}}')}</code> "
|
||||
f"where page content should appear.</p>"
|
||||
|
|
@ -596,10 +611,12 @@ def handle_style_submit(body):
|
|||
template = body.get("template", [""])[0].replace("\r\n", "\n").replace("\r", "\n")
|
||||
name = body.get("site_name", ["tinyweb"])[0].strip()
|
||||
sharing = "1" if body.get("sharing_enabled") else "0"
|
||||
semantic = "1" if body.get("semantic_search") else "0"
|
||||
reranker = "1" if body.get("use_reranker") else "0"
|
||||
set_setting("custom_template", template if template.strip() != DEFAULT_TEMPLATE.strip() else "")
|
||||
set_setting("site_name", name or "tinyweb")
|
||||
set_setting("sharing_enabled", sharing)
|
||||
set_setting("semantic_search", semantic)
|
||||
set_setting("use_reranker", reranker)
|
||||
return handle_style_form("Saved.")
|
||||
|
||||
|
|
@ -975,15 +992,16 @@ def handle_subscription_sync(sub_id):
|
|||
(sub_id, s["url"], s["title"], s.get("note", ""), tags_str),
|
||||
)
|
||||
# Embed remote page for semantic search
|
||||
try:
|
||||
from embeddings import store_remote_embeddings
|
||||
rp_id = db.execute(
|
||||
"SELECT id FROM remote_pages WHERE subscription_id = ? AND url = ?",
|
||||
(sub_id, s["url"]),
|
||||
).fetchone()["id"]
|
||||
store_remote_embeddings(rp_id, s["title"], s.get("note", ""), db)
|
||||
except Exception:
|
||||
pass
|
||||
if get_setting("semantic_search", "1") == "1":
|
||||
try:
|
||||
from embeddings import store_remote_embeddings
|
||||
rp_id = db.execute(
|
||||
"SELECT id FROM remote_pages WHERE subscription_id = ? AND url = ?",
|
||||
(sub_id, s["url"]),
|
||||
).fetchone()["id"]
|
||||
store_remote_embeddings(rp_id, s["title"], s.get("note", ""), db)
|
||||
except Exception:
|
||||
pass
|
||||
synced += 1
|
||||
except Exception:
|
||||
pass
|
||||
|
|
@ -1050,15 +1068,16 @@ def handle_subscription_syncall():
|
|||
"ON CONFLICT(subscription_id, url) DO UPDATE SET title=excluded.title, note=excluded.note, tags=excluded.tags",
|
||||
(sub["id"], s["url"], s["title"], s.get("note", ""), tags_str),
|
||||
)
|
||||
try:
|
||||
from embeddings import store_remote_embeddings
|
||||
rp_id = db.execute(
|
||||
"SELECT id FROM remote_pages WHERE subscription_id = ? AND url = ?",
|
||||
(sub["id"], s["url"]),
|
||||
).fetchone()["id"]
|
||||
store_remote_embeddings(rp_id, s["title"], s.get("note", ""), db)
|
||||
except Exception:
|
||||
pass
|
||||
if get_setting("semantic_search", "1") == "1":
|
||||
try:
|
||||
from embeddings import store_remote_embeddings
|
||||
rp_id = db.execute(
|
||||
"SELECT id FROM remote_pages WHERE subscription_id = ? AND url = ?",
|
||||
(sub["id"], s["url"]),
|
||||
).fetchone()["id"]
|
||||
store_remote_embeddings(rp_id, s["title"], s.get("note", ""), db)
|
||||
except Exception:
|
||||
pass
|
||||
except Exception:
|
||||
pass
|
||||
now = datetime.now().strftime("%Y-%m-%dT%H:%M:%S")
|
||||
|
|
@ -1079,6 +1098,12 @@ _reindex_thread = None
|
|||
|
||||
|
||||
def handle_reindex_form():
|
||||
if get_setting("semantic_search", "1") != "1":
|
||||
return _respond(
|
||||
f"<h2>semantic search index</h2>"
|
||||
f"<p>Semantic search is disabled. Enable it in <a href=\"/style\">settings</a> to use embeddings.</p>"
|
||||
f'<p><a href="/">back to search</a></p>'
|
||||
)
|
||||
db = get_db()
|
||||
try:
|
||||
total_pages = db.execute("SELECT count(*) FROM pages").fetchone()[0]
|
||||
|
|
|
|||
|
|
@ -6,14 +6,6 @@ def esc(s):
|
|||
return html.escape(str(s))
|
||||
|
||||
|
||||
def snippet(text, query, ctx=80):
    """Return an excerpt of ``text`` around the first match of ``query``.

    The match is case-insensitive. When ``query`` does not occur, the first
    200 characters of ``text`` are returned instead.

    Args:
        text: The text to excerpt; ``None`` is treated as empty.
        query: The search string; ``None`` is treated as empty.
        ctx: Number of characters of context kept on each side of the match.

    Returns:
        The excerpt, with "..." prepended and/or appended when it does not
        reach the start and/or end of ``text``.
    """
    import re

    text = text or ""
    query = query or ""

    # Search the original text directly: offsets taken from text.lower() can
    # be wrong because lowercasing may change the string's length
    # (e.g. U+0130 lowercases to two code points).
    match = re.search(re.escape(query), text, re.IGNORECASE)
    if match is None:
        return text[:200]

    start = max(0, match.start() - ctx)
    end = min(len(text), match.end() + ctx)
    prefix = "..." if start > 0 else ""
    suffix = "..." if end < len(text) else ""
    return prefix + text[start:end] + suffix
|
||||
|
||||
|
||||
DEFAULT_TEMPLATE = "<html>\n<head>\n</head>\n<body>\n{{content}}\n</body>\n</html>"
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue