diff --git a/embeddings.py b/embeddings.py index 5575c29..8ad1362 100644 --- a/embeddings.py +++ b/embeddings.py @@ -506,21 +506,24 @@ def hybrid_search(query_text, bm25_ranked_ids, limit=10, db=None, use_reranker=F # --------------------------------------------------------------------------- def reindex_all(db=None, progress_callback=None): - """Embed all pages that don't yet have chunks. Also generates missing summaries. Rebuilds HNSW index.""" + """Re-embed all pages and generate summaries for pages that lack one. Rebuilds HNSW index.""" from db import get_db, return_db, _generate_summary own_db = db is None if own_db: db = get_db() try: + # Clear existing chunks so everything is regenerated + db.execute("DELETE FROM chunks") + db.commit() + rows = db.execute( - "SELECT p.id, p.title, p.body, p.summary FROM pages p " - "WHERE p.id NOT IN (SELECT DISTINCT page_id FROM chunks WHERE page_id IS NOT NULL)" + "SELECT p.id, p.title, p.body, p.summary FROM pages p" ).fetchall() total = len(rows) for i, row in enumerate(rows): store_embeddings(row["id"], row["title"], row["body"], db) - # Generate summary if missing + # Only regenerate summary if missing if not row["summary"]: summary = _generate_summary(row["title"], row["body"]) db.execute("UPDATE pages SET summary = ? WHERE id = ?", (summary, row["id"])) @@ -528,20 +531,9 @@ def reindex_all(db=None, progress_callback=None): if progress_callback: progress_callback(i + 1, total) - # Generate summaries for pages that already have chunks but no summary - no_summary = db.execute( - "SELECT id, title, body FROM pages WHERE summary = '' OR summary IS NULL" - ).fetchall() - for row in no_summary: - summary = _generate_summary(row["title"], row["body"]) - db.execute("UPDATE pages SET summary = ?
WHERE id = ?", (summary, row["id"])) - if no_summary: - db.commit() - # Also handle remote pages remote_rows = db.execute( - "SELECT rp.id, rp.title, rp.note FROM remote_pages rp " - "WHERE rp.id NOT IN (SELECT DISTINCT remote_page_id FROM chunks WHERE remote_page_id IS NOT NULL)" + "SELECT rp.id, rp.title, rp.note FROM remote_pages rp" ).fetchall() for rp in remote_rows: