Optimized storage and updated readme
All checks were successful
/ build (push) Successful in 2m19s

This commit is contained in:
lichenblankie 2026-04-11 21:59:55 +00:00
parent 552311b730
commit 8ecb963be4
4 changed files with 172 additions and 29 deletions

View file

@ -684,10 +684,16 @@ def handle_bookmark(query):
return _text_response(msg, headers={"Access-Control-Allow-Origin": "*"})
MAX_EXPORT = 10000
def handle_export():
batch = int((query or {}).get("batch", ["0"])[0])
db = get_db()
try:
rows = db.execute("SELECT url, title, note FROM pages ORDER BY id").fetchall()
rows = db.execute(
"SELECT url, title, note FROM pages ORDER BY id LIMIT ? OFFSET ?",
(MAX_EXPORT, batch * MAX_EXPORT),
).fetchall()
finally:
return_db(db)
data = [{"url": r["url"], "title": r["title"], "note": r["note"]} for r in rows]
@ -752,6 +758,8 @@ def handle_style_form(msg=""):
dimmed = ' style="opacity:0.4"' if semantic != "1" else ""
transport_host = get_setting("transport_host", "reticulum.derickphan.com")
transport_port = get_setting("transport_port", "4242")
compress = get_setting("compress_embeddings", "0")
compress_checked = " checked" if compress == "1" else ""
return _respond(
f"<h1>customize</h1>"
f"<h2>name your search engine</h2>"
@ -778,6 +786,9 @@ def handle_style_form(msg=""):
f'<label><input type="checkbox" id="reranker" name="use_reranker" value="1"{reranker_checked}{disabled}>'
f" cross-encoder reranking (more accurate)</label><br>"
f"<small>Uses a 22MB model. Adds ~50ms per search. Disable for faster results.</small><br><br>"
f'<label><input type="checkbox" name="compress_embeddings" value="1"{compress_checked}{disabled}>'
f" compress embeddings (50% storage savings)</label><br>"
f"<small>Saves ~50% on storage for embeddings. Slight quality reduction at large scale.</small><br><br>"
f'<a href="/reindex">manage semantic index</a><br><br>'
f"</div>"
f"<h2>custom html</h2>"
@ -794,6 +805,11 @@ def handle_style_form(msg=""):
f'{_csrf_field()}'
f'<button type="submit">reset template to default</button>'
f"</form>"
f"<h2>maintenance</h2>"
f'<form method="post" action="/style/vacuum">'
f'{_csrf_field()}'
f'<button type="submit">vacuum database</button>'
f"</form>"
f"<p>{msg}</p>"
f'<a href="/">back</a>',
use_default=True,
@ -806,6 +822,7 @@ def handle_style_submit(body):
sharing = "1" if body.get("sharing_enabled") else "0"
semantic = "1" if body.get("semantic_search") else "0"
reranker = "1" if body.get("use_reranker") else "0"
compress = "1" if body.get("compress_embeddings") else "0"
transport_host = body.get("transport_host", [""])[0].strip()
transport_port = body.get("transport_port", [""])[0].strip()
set_setting("custom_template", template if template.strip() != DEFAULT_TEMPLATE.strip() else "")
@ -813,6 +830,7 @@ def handle_style_submit(body):
set_setting("sharing_enabled", sharing)
set_setting("semantic_search", semantic)
set_setting("use_reranker", reranker)
set_setting("compress_embeddings", compress)
if transport_host:
set_setting("transport_host", transport_host)
if transport_port:
@ -930,6 +948,8 @@ def handle_tag_browse(tag_name, query=None):
)
MAX_API_SITES = 5000
def handle_api_sites(query=None):
if get_setting("sharing_enabled", "0") != "1":
return _json_response(
@ -943,11 +963,14 @@ def handle_api_sites(query=None):
if since:
rows = db.execute(
"SELECT id, url, title, note, last_modified FROM pages "
"WHERE last_modified > ? ORDER BY id DESC",
(since,),
"WHERE last_modified > ? ORDER BY id DESC LIMIT ?",
(since, MAX_API_SITES),
).fetchall()
else:
rows = db.execute("SELECT id, url, title, note, last_modified FROM pages ORDER BY id DESC").fetchall()
rows = db.execute(
"SELECT id, url, title, note, last_modified FROM pages ORDER BY id DESC LIMIT ?",
(MAX_API_SITES,),
).fetchall()
sites = []
for r in rows:
tags = _get_page_tags(r["id"], db)
@ -955,8 +978,10 @@ def handle_api_sites(query=None):
"url": r["url"], "title": r["title"], "note": r["note"],
"tags": tags, "last_modified": r["last_modified"] or "",
})
# Include list of all current URLs so subscriber can detect deletions
all_urls = [r["url"] for r in db.execute("SELECT url FROM pages").fetchall()] if not since else None
# Include the current URLs so a subscriber can detect deletions. The list is capped at
# MAX_API_SITES rows, so on larger collections a missing URL does not prove deletion.
all_urls = None
if not since:
all_urls = [r["url"] for r in db.execute("SELECT url FROM pages LIMIT ?", (MAX_API_SITES,)).fetchall()]
finally:
return_db(db)
data = {"name": get_site_name(), "sites": sites}
@ -1040,18 +1065,20 @@ def handle_subscription_add(body):
return handle_subscriptions(f"Subscribed to {esc(name or dest_hash)}.")
MAX_BROWSE = 5000
def handle_subscription_browse(sub_id):
db = get_db()
try:
sub = db.execute("SELECT * FROM subscriptions WHERE id = ?", (sub_id,)).fetchone()
if not sub:
return _error(404)
local_urls = set(r["url"] for r in db.execute("SELECT url FROM pages").fetchall())
local_urls = set(r["url"] for r in db.execute("SELECT url FROM pages LIMIT ?", (MAX_BROWSE,)).fetchall())
# Use locally synced data if available, otherwise fetch live
remote_rows = db.execute(
"SELECT url, title, note, tags FROM remote_pages WHERE subscription_id = ?",
(sub_id,),
"SELECT url, title, note, tags FROM remote_pages WHERE subscription_id = ? LIMIT ?",
(sub_id, MAX_BROWSE),
).fetchall()
finally:
return_db(db)
@ -1121,7 +1148,7 @@ def handle_subscription_pick(body):
remote_tags = {r["url"]: r["tags"] for r in remote_rows}
if import_all:
local_urls = set(r["url"] for r in db.execute("SELECT url FROM pages").fetchall())
local_urls = set(r["url"] for r in db.execute("SELECT url FROM pages LIMIT ?", (MAX_BROWSE,)).fetchall())
urls = [r["url"] for r in remote_rows if r["url"] not in local_urls]
else:
urls = body.get("urls", [])
@ -1425,6 +1452,10 @@ def _dispatch_inner(data):
elif path == "/style/reset":
set_setting("custom_template", "")
return handle_style_form("Template reset to default.")
elif path == "/style/vacuum":
from db import vacuum_db
vacuum_db()
return handle_style_form("Database vacuumed.")
elif path == "/import":
return handle_import_submit(body)
elif path == "/reindex":