Fix reindex to re-embed all pages and preserve existing summaries
Previously reindex skipped pages that already had chunks, leaving stale embeddings in place. It also overwrote good meta description summaries with auto-generated ones. Now it clears all chunks first so everything is re-embedded, and only generates summaries for pages missing one.
This commit is contained in:
parent
cf536a860c
commit
3f8ebdab1d
1 changed files with 8 additions and 16 deletions
|
|
@ -506,21 +506,24 @@ def hybrid_search(query_text, bm25_ranked_ids, limit=10, db=None, use_reranker=F
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
def reindex_all(db=None, progress_callback=None):
|
def reindex_all(db=None, progress_callback=None):
|
||||||
"""Embed all pages that don't yet have chunks. Also generates missing summaries. Rebuilds HNSW index."""
|
"""Re-embed all pages and generate summaries only for pages missing one. Rebuilds HNSW index."""
|
||||||
from db import get_db, return_db, _generate_summary
|
from db import get_db, return_db, _generate_summary
|
||||||
own_db = db is None
|
own_db = db is None
|
||||||
if own_db:
|
if own_db:
|
||||||
db = get_db()
|
db = get_db()
|
||||||
try:
|
try:
|
||||||
|
# Clear existing chunks so everything is regenerated
|
||||||
|
db.execute("DELETE FROM chunks")
|
||||||
|
db.commit()
|
||||||
|
|
||||||
rows = db.execute(
|
rows = db.execute(
|
||||||
"SELECT p.id, p.title, p.body, p.summary FROM pages p "
|
"SELECT p.id, p.title, p.body, p.summary FROM pages p"
|
||||||
"WHERE p.id NOT IN (SELECT DISTINCT page_id FROM chunks WHERE page_id IS NOT NULL)"
|
|
||||||
).fetchall()
|
).fetchall()
|
||||||
|
|
||||||
total = len(rows)
|
total = len(rows)
|
||||||
for i, row in enumerate(rows):
|
for i, row in enumerate(rows):
|
||||||
store_embeddings(row["id"], row["title"], row["body"], db)
|
store_embeddings(row["id"], row["title"], row["body"], db)
|
||||||
# Generate summary if missing
|
# Only regenerate summary if missing
|
||||||
if not row["summary"]:
|
if not row["summary"]:
|
||||||
summary = _generate_summary(row["title"], row["body"])
|
summary = _generate_summary(row["title"], row["body"])
|
||||||
db.execute("UPDATE pages SET summary = ? WHERE id = ?", (summary, row["id"]))
|
db.execute("UPDATE pages SET summary = ? WHERE id = ?", (summary, row["id"]))
|
||||||
|
|
@ -528,20 +531,9 @@ def reindex_all(db=None, progress_callback=None):
|
||||||
if progress_callback:
|
if progress_callback:
|
||||||
progress_callback(i + 1, total)
|
progress_callback(i + 1, total)
|
||||||
|
|
||||||
# Generate summaries for pages that already have chunks but no summary
|
|
||||||
no_summary = db.execute(
|
|
||||||
"SELECT id, title, body FROM pages WHERE summary = '' OR summary IS NULL"
|
|
||||||
).fetchall()
|
|
||||||
for row in no_summary:
|
|
||||||
summary = _generate_summary(row["title"], row["body"])
|
|
||||||
db.execute("UPDATE pages SET summary = ? WHERE id = ?", (summary, row["id"]))
|
|
||||||
if no_summary:
|
|
||||||
db.commit()
|
|
||||||
|
|
||||||
# Also handle remote pages
|
# Also handle remote pages
|
||||||
remote_rows = db.execute(
|
remote_rows = db.execute(
|
||||||
"SELECT rp.id, rp.title, rp.note FROM remote_pages rp "
|
"SELECT rp.id, rp.title, rp.note FROM remote_pages rp"
|
||||||
"WHERE rp.id NOT IN (SELECT DISTINCT remote_page_id FROM chunks WHERE remote_page_id IS NOT NULL)"
|
|
||||||
).fetchall()
|
).fetchall()
|
||||||
|
|
||||||
for rp in remote_rows:
|
for rp in remote_rows:
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue