optimized storage, updated readme
This commit is contained in:
parent
7946225030
commit
30bc61212f
4 changed files with 177 additions and 34 deletions
61
handlers.py
61
handlers.py
|
|
@ -210,7 +210,7 @@ def handle_search(query):
|
|||
# Hybrid search: merge BM25 + semantic via RRF
|
||||
bm25_ids = [r["id"] for r in bm25_rows]
|
||||
chunk_snippets = {} # page_id -> best chunk text
|
||||
if get_setting("semantic_search", "1") == "1":
|
||||
if get_setting("semantic_search", "0") == "1":
|
||||
try:
|
||||
from embeddings import hybrid_search
|
||||
use_reranker = get_setting("use_reranker", "1") == "1"
|
||||
|
|
@ -468,7 +468,7 @@ def handle_add_manual_submit(body):
|
|||
db.commit()
|
||||
|
||||
# Generate embeddings for this page (if semantic search is enabled)
|
||||
if get_setting("semantic_search", "1") == "1":
|
||||
if get_setting("semantic_search", "0") == "1":
|
||||
try:
|
||||
from embeddings import store_embeddings
|
||||
# Pass the page_id, title, description, and db connection
|
||||
|
|
@ -684,10 +684,16 @@ def handle_bookmark(query):
|
|||
return _text_response(msg, headers={"Access-Control-Allow-Origin": "*"})
|
||||
|
||||
|
||||
# Page size for handle_export: the export query uses this as its SQL LIMIT and
# batch * MAX_EXPORT as its OFFSET, so one batch returns at most this many rows.
MAX_EXPORT = 10_000
|
||||
|
||||
def handle_export():
|
||||
batch = int((query or {}).get("batch", ["0"])[0])
|
||||
db = get_db()
|
||||
try:
|
||||
rows = db.execute("SELECT url, title, note FROM pages ORDER BY id").fetchall()
|
||||
rows = db.execute(
|
||||
"SELECT url, title, note FROM pages ORDER BY id LIMIT ? OFFSET ?",
|
||||
(MAX_EXPORT, batch * MAX_EXPORT),
|
||||
).fetchall()
|
||||
finally:
|
||||
return_db(db)
|
||||
data = [{"url": r["url"], "title": r["title"], "note": r["note"]} for r in rows]
|
||||
|
|
@ -752,6 +758,8 @@ def handle_style_form(msg=""):
|
|||
dimmed = ' style="opacity:0.4"' if semantic != "1" else ""
|
||||
transport_host = get_setting("transport_host", "reticulum.derickphan.com")
|
||||
transport_port = get_setting("transport_port", "4242")
|
||||
compress = get_setting("compress_embeddings", "0")
|
||||
compress_checked = " checked" if compress == "1" else ""
|
||||
return _respond(
|
||||
f"<h1>customize</h1>"
|
||||
f"<h2>name your search engine</h2>"
|
||||
|
|
@ -778,6 +786,9 @@ def handle_style_form(msg=""):
|
|||
f'<label><input type="checkbox" id="reranker" name="use_reranker" value="1"{reranker_checked}{disabled}>'
|
||||
f" cross-encoder reranking (more accurate)</label><br>"
|
||||
f"<small>Uses a 22MB model. Adds ~50ms per search. Disable for faster results.</small><br><br>"
|
||||
f'<label><input type="checkbox" name="compress_embeddings" value="1"{compress_checked}{disabled}>'
|
||||
f" compress embeddings (50% storage savings)</label><br>"
|
||||
f"<small>Saves ~50% on storage for embeddings. Slight quality reduction at large scale.</small><br><br>"
|
||||
f'<a href="/reindex">manage semantic index</a><br><br>'
|
||||
f"</div>"
|
||||
f"<h2>custom html</h2>"
|
||||
|
|
@ -794,6 +805,11 @@ def handle_style_form(msg=""):
|
|||
f'{_csrf_field()}'
|
||||
f'<button type="submit">reset template to default</button>'
|
||||
f"</form>"
|
||||
f"<h2>maintenance</h2>"
|
||||
f'<form method="post" action="/style/vacuum">'
|
||||
f'{_csrf_field()}'
|
||||
f'<button type="submit">vacuum database</button>'
|
||||
f"</form>"
|
||||
f"<p>{msg}</p>"
|
||||
f'<a href="/">back</a>',
|
||||
use_default=True,
|
||||
|
|
@ -806,6 +822,7 @@ def handle_style_submit(body):
|
|||
sharing = "1" if body.get("sharing_enabled") else "0"
|
||||
semantic = "1" if body.get("semantic_search") else "0"
|
||||
reranker = "1" if body.get("use_reranker") else "0"
|
||||
compress = "1" if body.get("compress_embeddings") else "0"
|
||||
transport_host = body.get("transport_host", [""])[0].strip()
|
||||
transport_port = body.get("transport_port", [""])[0].strip()
|
||||
set_setting("custom_template", template if template.strip() != DEFAULT_TEMPLATE.strip() else "")
|
||||
|
|
@ -813,6 +830,7 @@ def handle_style_submit(body):
|
|||
set_setting("sharing_enabled", sharing)
|
||||
set_setting("semantic_search", semantic)
|
||||
set_setting("use_reranker", reranker)
|
||||
set_setting("compress_embeddings", compress)
|
||||
if transport_host:
|
||||
set_setting("transport_host", transport_host)
|
||||
if transport_port:
|
||||
|
|
@ -930,6 +948,8 @@ def handle_tag_browse(tag_name, query=None):
|
|||
)
|
||||
|
||||
|
||||
# Row cap passed as the SQL LIMIT to the pages queries in handle_api_sites,
# including the URL list that is built when no `since` filter is given.
# NOTE(review): that URL-list query has no ORDER BY, so which rows are kept
# once the table holds more rows than this cap is not pinned down -- confirm
# that subscribers cannot mistake an omitted URL for a deletion.
MAX_API_SITES = 5_000
|
||||
|
||||
def handle_api_sites(query=None):
|
||||
if get_setting("sharing_enabled", "0") != "1":
|
||||
return _json_response(
|
||||
|
|
@ -943,11 +963,14 @@ def handle_api_sites(query=None):
|
|||
if since:
|
||||
rows = db.execute(
|
||||
"SELECT id, url, title, note, last_modified FROM pages "
|
||||
"WHERE last_modified > ? ORDER BY id DESC",
|
||||
(since,),
|
||||
"WHERE last_modified > ? ORDER BY id DESC LIMIT ?",
|
||||
(since, MAX_API_SITES),
|
||||
).fetchall()
|
||||
else:
|
||||
rows = db.execute("SELECT id, url, title, note, last_modified FROM pages ORDER BY id DESC").fetchall()
|
||||
rows = db.execute(
|
||||
"SELECT id, url, title, note, last_modified FROM pages ORDER BY id DESC LIMIT ?",
|
||||
(MAX_API_SITES,),
|
||||
).fetchall()
|
||||
sites = []
|
||||
for r in rows:
|
||||
tags = _get_page_tags(r["id"], db)
|
||||
|
|
@ -955,8 +978,10 @@ def handle_api_sites(query=None):
|
|||
"url": r["url"], "title": r["title"], "note": r["note"],
|
||||
"tags": tags, "last_modified": r["last_modified"] or "",
|
||||
})
|
||||
# Include list of all current URLs so subscriber can detect deletions
|
||||
all_urls = [r["url"] for r in db.execute("SELECT url FROM pages").fetchall()] if not since else None
|
||||
# Include list of all current URLs so subscriber can detect deletions (limited)
|
||||
all_urls = None
|
||||
if not since:
|
||||
all_urls = [r["url"] for r in db.execute("SELECT url FROM pages LIMIT ?", (MAX_API_SITES,)).fetchall()]
|
||||
finally:
|
||||
return_db(db)
|
||||
data = {"name": get_site_name(), "sites": sites}
|
||||
|
|
@ -1040,18 +1065,20 @@ def handle_subscription_add(body):
|
|||
return handle_subscriptions(f"Subscribed to {esc(name or dest_hash)}.")
|
||||
|
||||
|
||||
# Row cap passed as the SQL LIMIT when handle_subscription_browse loads the
# local page URLs and a subscription's remote_pages rows, and when
# handle_subscription_pick loads the local URLs for its import-all path.
# NOTE(review): with more local pages than this cap, import-all compares the
# remote rows against a partial URL set -- verify that pages which already
# exist locally are not selected for import again.
MAX_BROWSE = 5_000
|
||||
|
||||
def handle_subscription_browse(sub_id):
|
||||
db = get_db()
|
||||
try:
|
||||
sub = db.execute("SELECT * FROM subscriptions WHERE id = ?", (sub_id,)).fetchone()
|
||||
if not sub:
|
||||
return _error(404)
|
||||
local_urls = set(r["url"] for r in db.execute("SELECT url FROM pages").fetchall())
|
||||
local_urls = set(r["url"] for r in db.execute("SELECT url FROM pages LIMIT ?", (MAX_BROWSE,)).fetchall())
|
||||
|
||||
# Use locally synced data if available, otherwise fetch live
|
||||
remote_rows = db.execute(
|
||||
"SELECT url, title, note, tags FROM remote_pages WHERE subscription_id = ?",
|
||||
(sub_id,),
|
||||
"SELECT url, title, note, tags FROM remote_pages WHERE subscription_id = ? LIMIT ?",
|
||||
(sub_id, MAX_BROWSE),
|
||||
).fetchall()
|
||||
finally:
|
||||
return_db(db)
|
||||
|
|
@ -1121,7 +1148,7 @@ def handle_subscription_pick(body):
|
|||
remote_tags = {r["url"]: r["tags"] for r in remote_rows}
|
||||
|
||||
if import_all:
|
||||
local_urls = set(r["url"] for r in db.execute("SELECT url FROM pages").fetchall())
|
||||
local_urls = set(r["url"] for r in db.execute("SELECT url FROM pages LIMIT ?", (MAX_BROWSE,)).fetchall())
|
||||
urls = [r["url"] for r in remote_rows if r["url"] not in local_urls]
|
||||
else:
|
||||
urls = body.get("urls", [])
|
||||
|
|
@ -1192,7 +1219,7 @@ def handle_subscription_sync(sub_id):
|
|||
(sub_id, s["url"], s["title"], s.get("note", ""), tags_str),
|
||||
)
|
||||
# Embed remote page for semantic search
|
||||
if get_setting("semantic_search", "1") == "1":
|
||||
if get_setting("semantic_search", "0") == "1":
|
||||
try:
|
||||
from embeddings import store_remote_embeddings
|
||||
rp_id = db.execute(
|
||||
|
|
@ -1268,7 +1295,7 @@ def handle_subscription_syncall():
|
|||
"ON CONFLICT(subscription_id, url) DO UPDATE SET title=excluded.title, note=excluded.note, tags=excluded.tags",
|
||||
(sub["id"], s["url"], s["title"], s.get("note", ""), tags_str),
|
||||
)
|
||||
if get_setting("semantic_search", "1") == "1":
|
||||
if get_setting("semantic_search", "0") == "1":
|
||||
try:
|
||||
from embeddings import store_remote_embeddings
|
||||
rp_id = db.execute(
|
||||
|
|
@ -1298,7 +1325,7 @@ _reindex_thread = None
|
|||
|
||||
|
||||
def handle_reindex_form():
|
||||
if get_setting("semantic_search", "1") != "1":
|
||||
if get_setting("semantic_search", "0") != "1":
|
||||
return _respond(
|
||||
f"<h2>semantic search index</h2>"
|
||||
f"<p>Semantic search is disabled. Enable it in <a href=\"/style\">settings</a> to use embeddings.</p>"
|
||||
|
|
@ -1425,6 +1452,10 @@ def _dispatch_inner(data):
|
|||
elif path == "/style/reset":
|
||||
set_setting("custom_template", "")
|
||||
return handle_style_form("Template reset to default.")
|
||||
elif path == "/style/vacuum":
|
||||
from db import vacuum_db
|
||||
vacuum_db()
|
||||
return handle_style_form("Database vacuumed.")
|
||||
elif path == "/import":
|
||||
return handle_import_submit(body)
|
||||
elif path == "/reindex":
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue