Fix reindex to re-embed every page and preserve existing summaries

Previously, `reindex_all` skipped pages that already had chunks, leaving
stale embeddings in place. It also overwrote good meta-description
summaries with auto-generated ones. Now it clears all chunks first so
every page is re-embedded, and generates summaries only for pages that
are missing one.
This commit is contained in:
lichenblankie 2026-03-27 14:08:04 -07:00
parent cf536a860c
commit 3f8ebdab1d

View file

@ -506,21 +506,24 @@ def hybrid_search(query_text, bm25_ranked_ids, limit=10, db=None, use_reranker=F
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
def reindex_all(db=None, progress_callback=None): def reindex_all(db=None, progress_callback=None):
"""Embed all pages that don't yet have chunks. Also generates missing summaries. Rebuilds HNSW index.""" """Re-embed all pages and regenerate all summaries. Rebuilds HNSW index."""
from db import get_db, return_db, _generate_summary from db import get_db, return_db, _generate_summary
own_db = db is None own_db = db is None
if own_db: if own_db:
db = get_db() db = get_db()
try: try:
# Clear existing chunks so everything is regenerated
db.execute("DELETE FROM chunks")
db.commit()
rows = db.execute( rows = db.execute(
"SELECT p.id, p.title, p.body, p.summary FROM pages p" "SELECT p.id, p.title, p.body, p.summary FROM pages p"
"WHERE p.id NOT IN (SELECT DISTINCT page_id FROM chunks WHERE page_id IS NOT NULL)"
).fetchall() ).fetchall()
total = len(rows) total = len(rows)
for i, row in enumerate(rows): for i, row in enumerate(rows):
store_embeddings(row["id"], row["title"], row["body"], db) store_embeddings(row["id"], row["title"], row["body"], db)
# Generate summary if missing # Only regenerate summary if missing
if not row["summary"]: if not row["summary"]:
summary = _generate_summary(row["title"], row["body"]) summary = _generate_summary(row["title"], row["body"])
db.execute("UPDATE pages SET summary = ? WHERE id = ?", (summary, row["id"])) db.execute("UPDATE pages SET summary = ? WHERE id = ?", (summary, row["id"]))
@ -528,20 +531,9 @@ def reindex_all(db=None, progress_callback=None):
if progress_callback: if progress_callback:
progress_callback(i + 1, total) progress_callback(i + 1, total)
# Generate summaries for pages that already have chunks but no summary
no_summary = db.execute(
"SELECT id, title, body FROM pages WHERE summary = '' OR summary IS NULL"
).fetchall()
for row in no_summary:
summary = _generate_summary(row["title"], row["body"])
db.execute("UPDATE pages SET summary = ? WHERE id = ?", (summary, row["id"]))
if no_summary:
db.commit()
# Also handle remote pages # Also handle remote pages
remote_rows = db.execute( remote_rows = db.execute(
"SELECT rp.id, rp.title, rp.note FROM remote_pages rp" "SELECT rp.id, rp.title, rp.note FROM remote_pages rp"
"WHERE rp.id NOT IN (SELECT DISTINCT remote_page_id FROM chunks WHERE remote_page_id IS NOT NULL)"
).fetchall() ).fetchall()
for rp in remote_rows: for rp in remote_rows: