diff --git a/.dockerignore b/.dockerignore index 031c6d8..9adf733 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,5 +1,15 @@ __pycache__/ +**/__pycache__/ +*.pyc index.db* +index.hnsw tinyweb_identity .git/ +.gitignore *.md +.env +.env.* +.venv/ +venv/ +models/ +.DS_Store diff --git a/README.md b/README.md index 693ee29..de6f038 100644 --- a/README.md +++ b/README.md @@ -122,10 +122,13 @@ Or with docker-compose (see above) — data persists in the named volume. ### Command line options ```bash -./TinyWeb --version # Show version -./TinyWeb -p 9000 # Use port 9000 instead of default 8080 +./TinyWeb --version # Show version +./TinyWeb -p 9000 # Use port 9000 instead of default 8080 +./TinyWeb --bind 0.0.0.0 # Expose the web UI to your LAN (see warning below) ``` +By default, the web UI binds to `127.0.0.1` and is only reachable from the machine running TinyWeb. **The UI has no authentication** — anyone who can reach the port can read, add, and delete entries, and change settings. Only pass `--bind 0.0.0.0` if you fully trust your network, or put TinyWeb behind an authenticating reverse proxy. + ## Getting started ```bash @@ -133,7 +136,7 @@ pip install -r requirements.txt python app.py ``` -This starts the Reticulum server and an HTTP gateway on `http://localhost:8080`. Open it in your browser. +This starts the Reticulum server and an HTTP gateway on `http://127.0.0.1:8080`. Open it in your browser. The UI is localhost-only by default; see `--bind` under *Command line options* if you want to reach it from another machine. Your destination hash is printed on startup — share it with friends so they can subscribe to your index. @@ -168,7 +171,9 @@ themes/ — Saved HTML templates (e.g. kodama.html) ## Security -TinyWeb includes several hardening measures: +**The web UI has no authentication.** It is bound to `127.0.0.1` by default, so only processes on the local machine can reach it. If you pass `--bind 0.0.0.0` (or run inside a container with a published port), anyone who can reach that address can fully control your instance — reading private entries, changing settings, and modifying the HTML template (which runs in your browser). Put TinyWeb behind a reverse proxy with auth before exposing it beyond localhost. + +Other hardening measures: - **CSRF protection** — All POST forms use per-session tokens via double-submit cookies - **SSRF prevention** — URL fetching validates hostnames against private IP ranges, with redirect re-validation diff --git a/app.py b/app.py index 71d0200..b1c4fe6 100644 --- a/app.py +++ b/app.py @@ -8,7 +8,8 @@ from http.server import HTTPServer from db import init_db, get_setting, set_setting from handlers import dispatch_request -from gateway import GatewayState, GatewayHandler, GATEWAY_PORT +import gateway +from gateway import GatewayState, GatewayHandler APP_NAME = "tinyweb" ASPECTS = ["server"] @@ -24,13 +25,13 @@ def get_transport_config(): return host, int(port) -def find_available_port(start=8080, max_attempts=20): +def find_available_port(start=8080, max_attempts=20, host="127.0.0.1"): """Find an available port starting from start.""" import socket for port in range(start, start + max_attempts): try: with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: - s.bind(("0.0.0.0", port)) + s.bind((host, port)) return port except OSError: continue @@ -71,16 +72,32 @@ def load_or_create_identity(): return identity +# Remote peers on the Reticulum mesh can only reach a narrow, read-only surface. +# Any other method/path is rejected here — CSRF cannot authenticate mesh callers +# (the attacker controls both the "cookie" and the "form" side of the check), so +# gating by whitelist is the only safe option. +_RNS_ALLOWED = {("GET", "/api/sites")} + + def rns_request_handler(path, data, request_id, link_id, remote_identity, requested_at): if data is None: data = {"method": "GET", "path": "/", "query": {}, "body": {}, "gateway_host": ""} + method = data.get("method", "GET") + req_path = data.get("path", "/") + if (method, req_path) not in _RNS_ALLOWED: + return { + "status": 403, + "content_type": "text/plain; charset=utf-8", + "body": "Forbidden: this endpoint is not available over Reticulum.", + "headers": {}, + } return dispatch_request(data) -def start_gateway(reticulum): +def start_gateway(reticulum, bind_host="127.0.0.1"): GatewayState.reticulum = reticulum GatewayState.local_dispatch = dispatch_request - server = HTTPServer(("0.0.0.0", GATEWAY_PORT), GatewayHandler) + server = HTTPServer((bind_host, gateway.GATEWAY_PORT), GatewayHandler) thread = threading.Thread(target=server.serve_forever, daemon=True) thread.start() @@ -126,7 +143,21 @@ def ensure_rns_config(config_dir, transport_host=None, transport_port=None): if transport_port is None: transport_port = int(get_setting("transport_port", str(DEFAULT_TRANSPORT_PORT))) + managed_sentinel = "# managed by tinyweb" if os.path.exists(config_file): + try: + with open(config_file) as f: + existing = f.read() + except OSError: + existing = "" + if managed_sentinel not in existing: + # User-authored config — don't clobber it. + if not _config_settings_match(config_file, transport_host, transport_port): + print( + f"Warning: {config_file} was not created by tinyweb; " + "leaving it alone. Edit it manually to change transport/LoRa settings." + ) + return if _config_settings_match(config_file, transport_host, transport_port): return @@ -164,7 +195,8 @@ def ensure_rns_config(config_dir, transport_host=None, transport_port=None): os.makedirs(config_dir, exist_ok=True) with open(config_file, "w") as f: - f.write(f"""[reticulum] + f.write(f"""{managed_sentinel} +[reticulum] enable_transport = False share_instance = No @@ -201,15 +233,20 @@ def main(): parser = argparse.ArgumentParser(prog="tinyweb", description="Personal decentralized search engine") parser.add_argument("--version", "-v", action="store_true", help="Show version") parser.add_argument("--port", "-p", type=int, default=None, help="HTTP gateway port (default: 8080)") + parser.add_argument( + "--bind", "-b", default="127.0.0.1", + help="Address to bind the HTTP gateway to (default: 127.0.0.1). " + "Use 0.0.0.0 to expose to the LAN; note that the web UI has no authentication.", + ) args = parser.parse_args() if args.version: print(f"TinyWeb {get_version()}") return + bind_host = args.bind port = args.port or 8080 - import gateway - gateway.GATEWAY_PORT = find_available_port(port) + gateway.GATEWAY_PORT = find_available_port(port, host=bind_host) init_db() transport_host = get_setting("transport_host", DEFAULT_TRANSPORT_HOST) @@ -238,10 +275,15 @@ def main(): time.sleep(2) destination.announce() set_setting("dest_hash", destination.hash.hex()) - start_gateway(reticulum) + start_gateway(reticulum, bind_host=bind_host) print(f"TinyWeb running!") - print(f"Open http://localhost:{GATEWAY_PORT} in your browser") + if bind_host in ("0.0.0.0", "::"): + print(f"Open http://localhost:{gateway.GATEWAY_PORT} in your browser") + print(f"WARNING: listening on {bind_host} — the web UI has no authentication. " + "Anyone on your network can control this instance.") + else: + print(f"Open http://{bind_host}:{gateway.GATEWAY_PORT} in your browser") print(f"Destination hash: {RNS.prettyhexrep(destination.hash)} (share this so friends can subscribe)") while True: diff --git a/db.py b/db.py index 9ae6f55..8a645ab 100644 --- a/db.py +++ b/db.py @@ -123,6 +123,14 @@ def get_db(): def return_db(db): + try: + db.rollback() + except Exception: + try: + db.close() + except Exception: + pass + return with _pool_lock: if len(_pool) < _POOL_SIZE: _pool.append(db) @@ -412,7 +420,7 @@ def index_url(url, note="", reticulum_dest=""): (page_id, href, label), ) db.commit() - if get_setting("semantic_search", "1") == "1": + if get_setting("semantic_search", "0") == "1": try: from embeddings import store_embeddings store_embeddings(page_id, title, body, db) diff --git a/embeddings.py b/embeddings.py index 0362945..03f6f13 100644 --- a/embeddings.py +++ b/embeddings.py @@ -261,6 +261,13 @@ def _decompress(embeddings): return embeddings +def _blob_to_vec(buf): + """Decode a stored embedding blob to a float32 vector, inferring dtype from length.""" + if len(buf) == DIMS * 2: + return np.frombuffer(buf, dtype=np.float16).astype(np.float32) + return np.frombuffer(buf, dtype=np.float32) + + # --------------------------------------------------------------------------- # HNSW index management # --------------------------------------------------------------------------- @@ -294,9 +301,7 @@ def build_index(db=None): (BATCH_SIZE, offset), ).fetchall() for r in rows: - emb = np.frombuffer(r["embedding"], dtype=np.float32) - if emb.dtype == np.float16: - emb = emb.astype(np.float32) + emb = _blob_to_vec(r["embedding"]) all_ids.append(r["id"]) all_embeddings.append(emb) finally: diff --git a/entrypoint.sh b/entrypoint.sh index e4a9719..1f49fcb 100755 --- a/entrypoint.sh +++ b/entrypoint.sh @@ -30,4 +30,5 @@ EOF fi export RNS_CONFIG_DIR="$CONFIG_DIR" -exec python app.py +# Bind to 0.0.0.0 inside the container; isolation is handled by Docker's port mapping. +exec python app.py --bind 0.0.0.0 "$@" diff --git a/gateway.py b/gateway.py index ffafc6a..d07924d 100644 --- a/gateway.py +++ b/gateway.py @@ -1,3 +1,4 @@ +import re import sys import time import threading @@ -9,6 +10,7 @@ APP_NAME = "tinyweb" ASPECTS = ["server"] GATEWAY_PORT = 8080 REQUEST_TIMEOUT = 60 +MAX_BODY_SIZE = 16 * 1024 * 1024 # 16 MiB — covers /import and every other form class GatewayState: @@ -71,8 +73,18 @@ class GatewayHandler(BaseHTTPRequestHandler): body = {} if method == "POST": - length = int(self.headers.get("Content-Length", 0)) - raw = self.rfile.read(length).decode() + try: + length = int(self.headers.get("Content-Length", 0)) + except ValueError: + self.send_error(400, "Invalid Content-Length") + return + if length < 0: + self.send_error(400, "Invalid Content-Length") + return + if length > MAX_BODY_SIZE: + self.send_error(413, "Request body too large") + return + raw = self.rfile.read(length).decode("utf-8", errors="replace") body = parse_qs(raw) # Parse cookies @@ -152,7 +164,14 @@ class GatewayHandler(BaseHTTPRequestHandler): self._forward("POST") def log_message(self, format, *args): - print(f"[Gateway] {args[0]}") + try: + msg = format % args + except TypeError: + msg = format + # /bookmark carries a long-lived token and the URL being indexed — + # redact the query so it doesn't end up in stdout, journald, docker logs, etc. + msg = re.sub(r'(/bookmark)\?\S*', r'\1?[redacted]', msg) + print(f"[Gateway] {msg}") def main(): diff --git a/handlers.py b/handlers.py index 74f66b7..47f38e1 100644 --- a/handlers.py +++ b/handlers.py @@ -50,14 +50,18 @@ def _sanitize_fts_query(query): if not words: return '""' tokens = [] + last_idx = len(words) - 1 for i, w in enumerate(words): - # Strip FTS5 special characters to prevent injection - cleaned = re.sub(r'["\'\(\)\*\+\-\^~]', '', w).strip() + # Strip FTS5 special characters (operators, column filter colon) to prevent injection + cleaned = re.sub(r'["\'\(\)\*\+\-\^~:]', '', w).strip() if not cleaned: continue if cleaned.lower() in _STOPWORDS: continue - if i == len(words) - 1: + # Drop FTS5 operator words so they aren't parsed as operators on the unquoted last token + if cleaned.upper() in ("AND", "OR", "NOT", "NEAR"): + continue + if i == last_idx: # Prefix match on the last token for partial word matching tokens.append(f"{cleaned}*") else: @@ -688,8 +692,11 @@ def handle_bookmark(query): MAX_EXPORT = 10000 -def handle_export(): - batch = int((query or {}).get("batch", ["0"])[0]) +def handle_export(query=None): + try: + batch = int((query or {}).get("batch", ["0"])[0]) + except (TypeError, ValueError): + batch = 0 db = get_db() try: rows = db.execute( @@ -752,6 +759,10 @@ def handle_style_form(msg=""): name = get_site_name() sharing = get_setting("sharing_enabled", "0") checked = " checked" if sharing == "1" else "" + sharing_mode = get_setting("sharing_mode", "exclude_private") + exclude_checked = " checked" if sharing_mode != "require_public" else "" + require_checked = " checked" if sharing_mode == "require_public" else "" + shared_count = _count_shared_pages() semantic = get_setting("semantic_search", "0") semantic_checked = " checked" if semantic == "1" else "" reranker = get_setting("use_reranker", "0") @@ -784,7 +795,18 @@ def handle_style_form(msg=""): f"
private tag always excludes a page, even in public-only mode.'
+ f'' + f'Currently sharing {shared_count} page(s). ' + f'preview what subscribers would see' + f'
' f"Choose how to connect to the mesh. You can enable both for maximum reach.
" f"public"
+ if mode == "require_public"
+ else "all pages except those tagged private"
+ )
+ sharing_on = get_setting("sharing_enabled", "0") == "1"
+ status = (
+ 'Sharing is enabled. Subscribers see the pages listed below.
' + if sharing_on else + 'Sharing is disabled. Nothing is actually being shared right now; ' + 'this is the list that would be exposed if you enabled it.
' + ) + db = get_db() + try: + sites = _shared_sites(db) + finally: + return_db(db) + if not sites: + body = ( + "Rule: {mode_label}.
" + f"{status}" + "No pages match the current rule.
" + '' + ) + return _respond(body) + rows = "" + for s in sites: + tags_html = "" + if s["tags"]: + tags_html = " " + " ".join(f"[{esc(t)}]" for t in s["tags"]) + note_html = f' — {esc(s["note"])}' if s["note"] else "" + rows += ( + f'Rule: {mode_label}.
" + f"{status}" + f"{len(sites)} page(s) visible to subscribers.
" + f"