Fix reindex to re-embed all pages and preserve existing summaries
Previously, reindex skipped pages that already had chunks, leaving stale embeddings in place. It also overwrote good meta-description summaries with auto-generated ones. Now it clears all chunks first so that every page is re-embedded, and it generates a summary only for pages that are missing one.
This commit is contained in:
parent
cf536a860c
commit
3f8ebdab1d
1 changed file with 8 additions and 16 deletions
|
|
@ -506,21 +506,24 @@ def hybrid_search(query_text, bm25_ranked_ids, limit=10, db=None, use_reranker=F
|
|||
# ---------------------------------------------------------------------------
|
||||
|
||||
def reindex_all(db=None, progress_callback=None):
|
||||
"""Embed all pages that don't yet have chunks. Also generates missing summaries. Rebuilds HNSW index."""
|
||||
"""Re-embed all pages and regenerate all summaries. Rebuilds HNSW index."""
|
||||
from db import get_db, return_db, _generate_summary
|
||||
own_db = db is None
|
||||
if own_db:
|
||||
db = get_db()
|
||||
try:
|
||||
# Clear existing chunks so everything is regenerated
|
||||
db.execute("DELETE FROM chunks")
|
||||
db.commit()
|
||||
|
||||
rows = db.execute(
|
||||
"SELECT p.id, p.title, p.body, p.summary FROM pages p "
|
||||
"WHERE p.id NOT IN (SELECT DISTINCT page_id FROM chunks WHERE page_id IS NOT NULL)"
|
||||
"SELECT p.id, p.title, p.body, p.summary FROM pages p"
|
||||
).fetchall()
|
||||
|
||||
total = len(rows)
|
||||
for i, row in enumerate(rows):
|
||||
store_embeddings(row["id"], row["title"], row["body"], db)
|
||||
# Generate summary if missing
|
||||
# Only regenerate summary if missing
|
||||
if not row["summary"]:
|
||||
summary = _generate_summary(row["title"], row["body"])
|
||||
db.execute("UPDATE pages SET summary = ? WHERE id = ?", (summary, row["id"]))
|
||||
|
|
@ -528,20 +531,9 @@ def reindex_all(db=None, progress_callback=None):
|
|||
if progress_callback:
|
||||
progress_callback(i + 1, total)
|
||||
|
||||
# Generate summaries for pages that already have chunks but no summary
|
||||
no_summary = db.execute(
|
||||
"SELECT id, title, body FROM pages WHERE summary = '' OR summary IS NULL"
|
||||
).fetchall()
|
||||
for row in no_summary:
|
||||
summary = _generate_summary(row["title"], row["body"])
|
||||
db.execute("UPDATE pages SET summary = ? WHERE id = ?", (summary, row["id"]))
|
||||
if no_summary:
|
||||
db.commit()
|
||||
|
||||
# Also handle remote pages
|
||||
remote_rows = db.execute(
|
||||
"SELECT rp.id, rp.title, rp.note FROM remote_pages rp "
|
||||
"WHERE rp.id NOT IN (SELECT DISTINCT remote_page_id FROM chunks WHERE remote_page_id IS NOT NULL)"
|
||||
"SELECT rp.id, rp.title, rp.note FROM remote_pages rp"
|
||||
).fetchall()
|
||||
|
||||
for rp in remote_rows:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue