diff --git a/README.md b/README.md index 91224e8..693ee29 100644 --- a/README.md +++ b/README.md @@ -12,6 +12,30 @@ A personal, decentralized search engine built on the [Reticulum](https://reticul - **Import/export** — JSON-based backup and restore - **Mesh-native** — Works over Reticulum without the internet; encrypted and decentralized by default +## Performance & Scale + +### Search Speed + +| Pages indexed | Search speed | Notes | +|--------------|-------------|-------| +| 1,000 | ~50ms | Fast local FTS5 | +| 10,000 | ~50-100ms | Full-text search | +| 100,000 | ~100-200ms | Combined BM25 + semantic | +| 500,000 | ~200-400ms | With semantic enabled | +| 1,000,000 | ~300-500ms | Hybrid search | + +*Times are estimates for combined BM25 + semantic search. Actual performance varies by hardware, storage type (SSD/HDD), and search complexity.* + +### Concurrent Connections + +- Database pool: 16 simultaneous connections +- Suitable for a single user plus a few subscriptions + +### Export + +- Paginated at 10,000 pages per request +- Use `?batch=N` (zero-based) to export in chunks: `/export?batch=0`, `/export?batch=1`, etc. + ## Download (pre-built binaries) Download the latest release for your platform from the [Releases](https://git.derickphan.com/lichenblankie/tinyweb/releases) page: @@ -55,6 +79,21 @@ volumes: Run with `docker compose up -d`. +### Storage Estimates + +Average web page content is ~15KB per page: + +| Pages | Database | Embeddings* | Total | +|-------|----------|------------|-------| +| 10,000 | 150MB | 80MB | ~250MB | +| 100,000 | 1.5GB | 800MB | ~2.5GB | +| 500,000 | 7.5GB | 4GB | ~12GB | +| 1,000,000 | 15GB | 8GB | ~25GB | + +*Embeddings require semantic search to be enabled. With compression enabled (Settings at `/style` > Search > AI), embeddings use ~50% less storage. + +To turn compression on, see [Optional Compression](#optional-compression) under Maintenance; existing pages must be re-indexed before their embeddings shrink. 
+ ## Data storage ### Local (Python/binary) @@ -139,6 +178,23 @@ TinyWeb includes several hardening measures: - **Bookmark authentication** — The bookmarklet endpoint requires a secret token - **Identity file protection** — The Reticulum identity key is restricted to owner-only permissions (0600) +## Maintenance + +### Database Vacuum + +Over time, deleted pages leave empty space in the database. Run the vacuum tool periodically to reclaim space: + +1. Go to `/style` in your browser +2. Click "vacuum database" at the bottom of the page + +### Optional Compression + +To reduce storage for semantic search embeddings (~50% savings): + +1. Go to `/style` > Search > AI +2. Enable "compress embeddings" +3. Re-index your existing pages for the compression to apply to existing embeddings + ## Dependencies - [requests](https://docs.python-requests.org/) — HTTP fetching diff --git a/db.py b/db.py index 295da86..058822c 100644 --- a/db.py +++ b/db.py @@ -97,7 +97,7 @@ def clean_url(url): _pool = [] _pool_lock = __import__("threading").Lock() -_POOL_SIZE = 4 +_POOL_SIZE = 16 def get_db(): @@ -271,8 +271,15 @@ def init_db(): ) db.execute("CREATE INDEX IF NOT EXISTS idx_chunks_page ON chunks(page_id)") db.execute("CREATE INDEX IF NOT EXISTS idx_chunks_remote ON chunks(remote_page_id)") + db.execute("CREATE INDEX IF NOT EXISTS idx_chunks_page_idx ON chunks(page_id, chunk_index)") + db.execute("CREATE INDEX IF NOT EXISTS idx_pages_url ON pages(url)") + db.execute("CREATE INDEX IF NOT EXISTS idx_pages_modified ON pages(last_modified)") + db.execute("CREATE INDEX IF NOT EXISTS idx_page_tags_page ON page_tags(page_id)") + db.execute("CREATE INDEX IF NOT EXISTS idx_page_tags_tag ON page_tags(tag_id)") db.execute("PRAGMA journal_mode=WAL") + db.execute("PRAGMA synchronous=NORMAL") + db.execute("PRAGMA cache_size=-64000") db.commit() db.close() @@ -286,6 +293,16 @@ def get_setting(key, default=""): return_db(db) +def vacuum_db(): + """Run VACUUM and WAL checkpoint to reclaim space after 
deletions.""" + db = get_db() + try: + db.execute("PRAGMA wal_checkpoint(TRUNCATE)") + db.execute("VACUUM") + finally: + return_db(db) + + def set_setting(key, value): db = get_db() try: diff --git a/embeddings.py b/embeddings.py index 302a31f..0362945 100644 --- a/embeddings.py +++ b/embeddings.py @@ -233,24 +233,42 @@ def embed(texts, is_query=False): "token_type_ids": token_type_ids, }, ) - # CLS token pooling — take the first token's hidden state emb = outputs[0][:, 0, :] all_embeddings.append(emb) embeddings = np.concatenate(all_embeddings, axis=0) - # L2 normalize norms = np.linalg.norm(embeddings, axis=1, keepdims=True) norms = np.maximum(norms, 1e-12) embeddings = embeddings / norms - return embeddings.astype(np.float32) + return _maybe_compress(embeddings.astype(np.float32)) + + +def _maybe_compress(embeddings): + """Compress embeddings to float16 if compression is enabled.""" + try: + from db import get_setting + if get_setting("compress_embeddings", "0") == "1": + return embeddings.astype(np.float16) + except Exception: + pass + return embeddings + + +def _decompress(embeddings): + """Decompress float16 embeddings to float32 if needed.""" + if embeddings.dtype == np.float16: + return embeddings.astype(np.float32) + return embeddings # --------------------------------------------------------------------------- # HNSW index management # --------------------------------------------------------------------------- +BATCH_SIZE = 50000 + def build_index(db=None): - """Load all embeddings from chunks table and build HNSW index.""" + """Load all embeddings from chunks table and build HNSW index in batches.""" import hnswlib global _hnsw_index, _hnsw_ids @@ -258,29 +276,49 @@ def build_index(db=None): own_db = db is None if own_db: db = get_db() + try: - rows = db.execute("SELECT id, embedding FROM chunks ORDER BY id").fetchall() + total = db.execute("SELECT COUNT(*) FROM chunks").fetchone()[0] + if total == 0: + with _hnsw_lock: + _hnsw_index = None + _hnsw_ids = 
[] + return + + all_ids = [] + all_embeddings = [] + + for offset in range(0, total, BATCH_SIZE): + rows = db.execute( + "SELECT id, embedding FROM chunks ORDER BY id LIMIT ? OFFSET ?", + (BATCH_SIZE, offset), + ).fetchall() + for r in rows: + emb = np.frombuffer(r["embedding"], dtype=np.float32) + if emb.dtype == np.float16: + emb = emb.astype(np.float32) + all_ids.append(r["id"]) + all_embeddings.append(emb) finally: if own_db: return_db(db) - with _hnsw_lock: - if not rows: + if not all_ids: + with _hnsw_lock: _hnsw_index = None _hnsw_ids = [] - return + return - n = len(rows) - ids = [r["id"] for r in rows] - matrix = np.frombuffer(b"".join(r["embedding"] for r in rows), dtype=np.float32).reshape(n, DIMS) + matrix = np.stack(all_embeddings) + n = len(all_ids) + ids = all_ids - index = hnswlib.Index(space="cosine", dim=DIMS) - # ef_construction and M balance build speed vs recall; - # these defaults give >99% recall at reasonable build time - index.init_index(max_elements=max(n, 1024), ef_construction=200, M=16) - index.add_items(matrix, list(range(n))) - index.set_ef(50) # query-time accuracy parameter + index = hnswlib.Index(space="cosine", dim=DIMS) + index.init_index(max_elements=max(n, 1024), ef_construction=200, M=16) + index.add_items(matrix, list(range(n))) + index.set_ef(50) + with _hnsw_lock: _hnsw_index = index _hnsw_ids = ids @@ -319,8 +357,8 @@ def store_embeddings(page_id, title, body, db): return embeddings_matrix = embed(chunks) + embeddings_matrix = _decompress(embeddings_matrix) - # Delete old chunks for this page db.execute("DELETE FROM chunks WHERE page_id = ?", (page_id,)) new_ids = [] @@ -343,6 +381,7 @@ def store_remote_embeddings(remote_page_id, title, note, db): return embeddings_matrix = embed([text]) + embeddings_matrix = _decompress(embeddings_matrix) db.execute("DELETE FROM chunks WHERE remote_page_id = ?", (remote_page_id,)) cursor = db.execute( diff --git a/handlers.py b/handlers.py index c3240ce..353d86f 100644 --- a/handlers.py 
+++ b/handlers.py @@ -684,10 +684,16 @@ def handle_bookmark(query): return _text_response(msg, headers={"Access-Control-Allow-Origin": "*"}) +MAX_EXPORT = 10000 + def handle_export(): + batch = int((query or {}).get("batch", ["0"])[0]) db = get_db() try: - rows = db.execute("SELECT url, title, note FROM pages ORDER BY id").fetchall() + rows = db.execute( + "SELECT url, title, note FROM pages ORDER BY id LIMIT ? OFFSET ?", + (MAX_EXPORT, batch * MAX_EXPORT), + ).fetchall() finally: return_db(db) data = [{"url": r["url"], "title": r["title"], "note": r["note"]} for r in rows] @@ -752,6 +758,8 @@ def handle_style_form(msg=""): dimmed = ' style="opacity:0.4"' if semantic != "1" else "" transport_host = get_setting("transport_host", "reticulum.derickphan.com") transport_port = get_setting("transport_port", "4242") + compress = get_setting("compress_embeddings", "0") + compress_checked = " checked" if compress == "1" else "" return _respond( f"
{msg}
" f'back', use_default=True, @@ -806,6 +822,7 @@ def handle_style_submit(body): sharing = "1" if body.get("sharing_enabled") else "0" semantic = "1" if body.get("semantic_search") else "0" reranker = "1" if body.get("use_reranker") else "0" + compress = "1" if body.get("compress_embeddings") else "0" transport_host = body.get("transport_host", [""])[0].strip() transport_port = body.get("transport_port", [""])[0].strip() set_setting("custom_template", template if template.strip() != DEFAULT_TEMPLATE.strip() else "") @@ -813,6 +830,7 @@ def handle_style_submit(body): set_setting("sharing_enabled", sharing) set_setting("semantic_search", semantic) set_setting("use_reranker", reranker) + set_setting("compress_embeddings", compress) if transport_host: set_setting("transport_host", transport_host) if transport_port: @@ -930,6 +948,8 @@ def handle_tag_browse(tag_name, query=None): ) +MAX_API_SITES = 5000 + def handle_api_sites(query=None): if get_setting("sharing_enabled", "0") != "1": return _json_response( @@ -943,11 +963,14 @@ def handle_api_sites(query=None): if since: rows = db.execute( "SELECT id, url, title, note, last_modified FROM pages " - "WHERE last_modified > ? ORDER BY id DESC", - (since,), + "WHERE last_modified > ? 
ORDER BY id DESC LIMIT ?", + (since, MAX_API_SITES), ).fetchall() else: - rows = db.execute("SELECT id, url, title, note, last_modified FROM pages ORDER BY id DESC").fetchall() + rows = db.execute( + "SELECT id, url, title, note, last_modified FROM pages ORDER BY id DESC LIMIT ?", + (MAX_API_SITES,), + ).fetchall() sites = [] for r in rows: tags = _get_page_tags(r["id"], db) @@ -955,8 +978,10 @@ def handle_api_sites(query=None): "url": r["url"], "title": r["title"], "note": r["note"], "tags": tags, "last_modified": r["last_modified"] or "", }) - # Include list of all current URLs so subscriber can detect deletions - all_urls = [r["url"] for r in db.execute("SELECT url FROM pages").fetchall()] if not since else None + # Include list of all current URLs so subscriber can detect deletions (limited) + all_urls = None + if not since: + all_urls = [r["url"] for r in db.execute("SELECT url FROM pages LIMIT ?", (MAX_API_SITES,)).fetchall()] finally: return_db(db) data = {"name": get_site_name(), "sites": sites} @@ -1040,18 +1065,20 @@ def handle_subscription_add(body): return handle_subscriptions(f"Subscribed to {esc(name or dest_hash)}.") +MAX_BROWSE = 5000 + def handle_subscription_browse(sub_id): db = get_db() try: sub = db.execute("SELECT * FROM subscriptions WHERE id = ?", (sub_id,)).fetchone() if not sub: return _error(404) - local_urls = set(r["url"] for r in db.execute("SELECT url FROM pages").fetchall()) + local_urls = set(r["url"] for r in db.execute("SELECT url FROM pages LIMIT ?", (MAX_BROWSE,)).fetchall()) # Use locally synced data if available, otherwise fetch live remote_rows = db.execute( - "SELECT url, title, note, tags FROM remote_pages WHERE subscription_id = ?", - (sub_id,), + "SELECT url, title, note, tags FROM remote_pages WHERE subscription_id = ? 
LIMIT ?", + (sub_id, MAX_BROWSE), ).fetchall() finally: return_db(db) @@ -1121,7 +1148,7 @@ def handle_subscription_pick(body): remote_tags = {r["url"]: r["tags"] for r in remote_rows} if import_all: - local_urls = set(r["url"] for r in db.execute("SELECT url FROM pages").fetchall()) + local_urls = set(r["url"] for r in db.execute("SELECT url FROM pages LIMIT ?", (MAX_BROWSE,)).fetchall()) urls = [r["url"] for r in remote_rows if r["url"] not in local_urls] else: urls = body.get("urls", []) @@ -1425,6 +1452,10 @@ def _dispatch_inner(data): elif path == "/style/reset": set_setting("custom_template", "") return handle_style_form("Template reset to default.") + elif path == "/style/vacuum": + from db import vacuum_db + vacuum_db() + return handle_style_form("Database vacuumed.") elif path == "/import": return handle_import_submit(body) elif path == "/reindex":