diff --git a/app.py b/app.py index 9f09a59..cde4d84 100644 --- a/app.py +++ b/app.py @@ -74,6 +74,9 @@ def ensure_rns_config(config_dir): def _preload_embeddings(): """Pre-load the embedding model and build the HNSW index in background.""" + if get_setting("semantic_search", "1") != "1": + print("Semantic search disabled.") + return try: from embeddings import _get_session, _get_reranker, build_index _get_session() # downloads model on first run, loads ONNX session diff --git a/db.py b/db.py index f31473f..065d65d 100644 --- a/db.py +++ b/db.py @@ -334,80 +334,16 @@ def fetch_page(url): tag.decompose() title = soup.title.string.strip() if soup.title and soup.title.string else url - # Extract paragraph text for better summary generation - paragraphs = [] - for p in soup.find_all("p"): - text = p.get_text(strip=True) - if len(text) >= 40: - paragraphs.append(text) - body = soup.get_text(separator=" ", strip=True) - return title, body, links, meta_desc, paragraphs + return title, body, links, meta_desc -def _generate_summary(title, body, paragraphs=None): - """Generate a summary by extracting the best sentence from the page. - - Priority: sentence mentioning the site name > first paragraph sentence - > first body sentence > title. - """ - import re - noise_patterns = re.compile( - r'arrow-|fedilink|message-square|link-external|' - r'skip to|cookie|subscribe|sign up|log in|' - r'privacy policy|terms of|©|\bads?\b', - re.IGNORECASE - ) - - def _filter_sentences(raw): - result = [] - for s in raw: - s = s.strip() - if len(s) < 40 or len(s.split()) < 7: - continue - alpha_chars = sum(1 for c in s if c.isalpha() or c == ' ') - if alpha_chars < len(s) * 0.6: - continue - if s.count('|') > 2 or s.count('·') > 2 or s.count('►') > 0: - continue - if noise_patterns.search(s): - continue - result.append(s) - return result - - # Prefer sentences from

tags (actual content, not UI) - sentences = [] - if paragraphs: - raw = [] - for p in paragraphs: - raw.extend(re.split(r'(?<=[.!?])\s+', p)) - sentences = _filter_sentences(raw) - - # Fall back to full body text - if not sentences: - raw = re.split(r'(?<=[.!?])\s+', body) - sentences = _filter_sentences(raw) - - if not sentences: - return title[:200] if title else "" - - # Prefer a sentence that mentions the site name - if title: - title_words = [w.lower() for w in re.split(r'\W+', title) if len(w) >= 3] - for s in sentences: - s_lower = s.lower() - if sum(1 for w in title_words if w in s_lower) >= max(1, len(title_words) // 2): - return s[:200] - - # Otherwise use the first quality sentence - return sentences[0][:200] - def index_url(url, note=""): url = clean_url(url) - title, body, links, meta_desc, paragraphs = fetch_page(url) + title, body, links, meta_desc = fetch_page(url) # Use meta description if available and meaningful, otherwise generate from body - summary = meta_desc if meta_desc and len(meta_desc) > 20 else _generate_summary(title, body, paragraphs) + summary = meta_desc if meta_desc and len(meta_desc) > 20 else "" db = get_db() try: now = __import__("datetime").datetime.now().strftime("%Y-%m-%dT%H:%M:%S") @@ -425,11 +361,12 @@ def index_url(url, note=""): (page_id, href, label), ) db.commit() - try: - from embeddings import store_embeddings - store_embeddings(page_id, title, body, db) - except Exception: - pass # embedding generation is best-effort + if get_setting("semantic_search", "1") == "1": + try: + from embeddings import store_embeddings + store_embeddings(page_id, title, body, db) + except Exception: + pass # embedding generation is best-effort finally: return_db(db) return title diff --git a/embeddings.py b/embeddings.py index 8ad1362..aa6a4ff 100644 --- a/embeddings.py +++ b/embeddings.py @@ -507,7 +507,7 @@ def hybrid_search(query_text, bm25_ranked_ids, limit=10, db=None, use_reranker=F def reindex_all(db=None, progress_callback=None): """Re-embed all pages and regenerate all summaries. Rebuilds HNSW index.""" - from db import get_db, return_db, _generate_summary + from db import get_db, return_db own_db = db is None if own_db: db = get_db() @@ -523,11 +523,6 @@ def reindex_all(db=None, progress_callback=None): total = len(rows) for i, row in enumerate(rows): store_embeddings(row["id"], row["title"], row["body"], db) - # Only regenerate summary if missing - if not row["summary"]: - summary = _generate_summary(row["title"], row["body"]) - db.execute("UPDATE pages SET summary = ? WHERE id = ?", (summary, row["id"])) - db.commit() if progress_callback: progress_callback(i + 1, total) diff --git a/handlers.py b/handlers.py index 2f6f31f..484a5ca 100644 --- a/handlers.py +++ b/handlers.py @@ -6,7 +6,7 @@ from datetime import datetime from urllib.parse import unquote from db import get_db, return_db, get_setting, set_setting, get_site_name, index_url, clean_url -from templates import esc, snippet, wrap_page, DEFAULT_TEMPLATE +from templates import esc, wrap_page, DEFAULT_TEMPLATE from rns_client import fetch_remote_sites _request_local = threading.local() @@ -205,13 +205,16 @@ def handle_search(query): # Hybrid search: merge BM25 + semantic via RRF bm25_ids = [r["id"] for r in bm25_rows] chunk_snippets = {} # page_id -> best chunk text - try: - from embeddings import hybrid_search - use_reranker = get_setting("use_reranker", "1") == "1" - fused = hybrid_search(q, bm25_ids, limit=100, db=db, use_reranker=use_reranker) - fused_ids = [pid for pid, _ in fused] - chunk_snippets = {pid: text for pid, text in fused if text} - except Exception: + if get_setting("semantic_search", "1") == "1": + try: + from embeddings import hybrid_search + use_reranker = get_setting("use_reranker", "1") == "1" + fused = hybrid_search(q, bm25_ids, limit=100, db=db, use_reranker=use_reranker) + fused_ids = [pid for pid, _ in fused] + chunk_snippets = {pid: text for pid, text in fused if text} + except Exception: + fused_ids = bm25_ids + else: fused_ids = bm25_ids total_results = len(fused_ids) @@ -239,13 +242,12 @@ def handle_search(query): if tags: tag_links = " ".join(f'[{esc(t)}]' for t in tags) tags_html = f'

{tag_links}
' - # Use page summary as snippet (meta description or centroid sentence) - snip = r["summary"] if r["summary"] else snippet(r["body"], q) + snip_html = f'
{esc(r["summary"])}' if r["summary"] else "" result_html += ( f'
' f'{esc(r["title"])}
' - f'{esc(r["url"])}
' - f'{esc(snip)}' + f'{esc(r["url"])}' + f'{snip_html}' f'{note_html}{tags_html}' f'
' ) @@ -557,8 +559,12 @@ def handle_style_form(msg=""): name = get_site_name() sharing = get_setting("sharing_enabled", "0") checked = " checked" if sharing == "1" else "" + semantic = get_setting("semantic_search", "1") + semantic_checked = " checked" if semantic == "1" else "" reranker = get_setting("use_reranker", "1") reranker_checked = " checked" if reranker == "1" else "" + disabled = "" if semantic == "1" else " disabled" + dimmed = ' style="opacity:0.4"' if semantic != "1" else "" return _respond( f"

customize

" f"

name your search engine

" @@ -569,9 +575,18 @@ def handle_style_form(msg=""): f'

" f"

search

" - f'
" + f"

ai

" + f'
" + f"Requires onnxruntime, tokenizers, hnswlib. Downloads ~30MB of models on first use.

" + f'
' + f'
" f"Uses a 22MB model. Adds ~50ms per search. Disable for faster results.

" + f'manage semantic index

' + f"
" f"

custom html

" f"

Edit the full page template. Use {esc('{{content}}')} " f"where page content should appear.

" @@ -596,10 +611,12 @@ def handle_style_submit(body): template = body.get("template", [""])[0].replace("\r\n", "\n").replace("\r", "\n") name = body.get("site_name", ["tinyweb"])[0].strip() sharing = "1" if body.get("sharing_enabled") else "0" + semantic = "1" if body.get("semantic_search") else "0" reranker = "1" if body.get("use_reranker") else "0" set_setting("custom_template", template if template.strip() != DEFAULT_TEMPLATE.strip() else "") set_setting("site_name", name or "tinyweb") set_setting("sharing_enabled", sharing) + set_setting("semantic_search", semantic) set_setting("use_reranker", reranker) return handle_style_form("Saved.") @@ -975,15 +992,16 @@ def handle_subscription_sync(sub_id): (sub_id, s["url"], s["title"], s.get("note", ""), tags_str), ) # Embed remote page for semantic search - try: - from embeddings import store_remote_embeddings - rp_id = db.execute( - "SELECT id FROM remote_pages WHERE subscription_id = ? AND url = ?", - (sub_id, s["url"]), - ).fetchone()["id"] - store_remote_embeddings(rp_id, s["title"], s.get("note", ""), db) - except Exception: - pass + if get_setting("semantic_search", "1") == "1": + try: + from embeddings import store_remote_embeddings + rp_id = db.execute( + "SELECT id FROM remote_pages WHERE subscription_id = ? AND url = ?", + (sub_id, s["url"]), + ).fetchone()["id"] + store_remote_embeddings(rp_id, s["title"], s.get("note", ""), db) + except Exception: + pass synced += 1 except Exception: pass @@ -1050,15 +1068,16 @@ def handle_subscription_syncall(): "ON CONFLICT(subscription_id, url) DO UPDATE SET title=excluded.title, note=excluded.note, tags=excluded.tags", (sub["id"], s["url"], s["title"], s.get("note", ""), tags_str), ) - try: - from embeddings import store_remote_embeddings - rp_id = db.execute( - "SELECT id FROM remote_pages WHERE subscription_id = ? AND url = ?", - (sub["id"], s["url"]), - ).fetchone()["id"] - store_remote_embeddings(rp_id, s["title"], s.get("note", ""), db) - except Exception: - pass + if get_setting("semantic_search", "1") == "1": + try: + from embeddings import store_remote_embeddings + rp_id = db.execute( + "SELECT id FROM remote_pages WHERE subscription_id = ? AND url = ?", + (sub["id"], s["url"]), + ).fetchone()["id"] + store_remote_embeddings(rp_id, s["title"], s.get("note", ""), db) + except Exception: + pass except Exception: pass now = datetime.now().strftime("%Y-%m-%dT%H:%M:%S") @@ -1079,6 +1098,12 @@ _reindex_thread = None def handle_reindex_form(): + if get_setting("semantic_search", "1") != "1": + return _respond( + f"

semantic search index

" + f"

Semantic search is disabled. Enable it in settings to use embeddings.

" + f'

back to search

' + ) db = get_db() try: total_pages = db.execute("SELECT count(*) FROM pages").fetchone()[0] diff --git a/templates.py b/templates.py index 372e736..48beace 100644 --- a/templates.py +++ b/templates.py @@ -6,14 +6,6 @@ def esc(s): return html.escape(str(s)) -def snippet(text, query, ctx=80): - pos = text.lower().find(query.lower()) - if pos == -1: - return text[:200] - start = max(0, pos - ctx) - end = min(len(text), pos + len(query) + ctx) - return ("..." if start > 0 else "") + text[start:end] + ("..." if end < len(text) else "") - DEFAULT_TEMPLATE = "\n\n\n\n{{content}}\n\n"