From a1358c1f3d7e979b739461a2b0231e85878d30d6 Mon Sep 17 00:00:00 2001 From: lichenblankie Date: Sat, 28 Mar 2026 21:24:10 -0700 Subject: [PATCH] added manual URL entry --- db.py | 20 +++++++++++++ handlers.py | 81 +++++++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 99 insertions(+), 2 deletions(-) diff --git a/db.py b/db.py index 065d65d..a0a3580 100644 --- a/db.py +++ b/db.py @@ -20,6 +20,22 @@ BLOCKED_NETWORKS = [ ] +def _is_blocked_response(html, status_code): + """Check if response is a CDN challenge/block page.""" + if status_code == 403: + return True + html_lower = html.lower() + if "just a moment" in html_lower or "cloudflare" in html_lower: + return True + if "enable javascript and cookies" in html_lower: + return True + if "request rejected" in html_lower: + return True + if "access denied" in html_lower: + return True + return False + + def _validate_url_target(url): """Resolve hostname and block private/internal IPs to prevent SSRF.""" parsed = urlparse(url) @@ -281,6 +297,10 @@ def get_site_name(): def fetch_page(url): _validate_url_target(url) resp = requests.get(url, timeout=10, headers={"User-Agent": "TinyWeb/1.0"}, allow_redirects=False) + + if _is_blocked_response(resp.text, resp.status_code): + raise Exception(f"Site blocks automated access: {resp.status_code}") + # Follow redirects manually, re-validating each target max_redirects = 5 while resp.is_redirect and max_redirects > 0: diff --git a/handlers.py b/handlers.py index 484a5ca..5903d0b 100644 --- a/handlers.py +++ b/handlers.py @@ -357,10 +357,13 @@ def handle_add_submit(body): url = clean_url(body.get("url", [""])[0].strip()) note = body.get("note", [""])[0].strip() tags = body.get("tags", [""])[0].strip() + if not url: return handle_add_form("URL is required.") if not url.startswith(("http://", "https://")): return handle_add_form("URL must start with http:// or https://") + + # Try auto-index first try: title = index_url(url, note) if tags: @@ -373,10 +376,82 @@ def 
handle_add_submit(body): finally: return_db(db) return handle_add_form(f'Indexed: {esc(title)}') + except ValueError as e: return handle_add_form(f"Error: {esc(str(e))}") - except Exception: - return handle_add_form("Error: could not fetch or index that URL.") + + except Exception as e: + error_msg = str(e).lower() + # Check if it's a block response + if "block" in error_msg or "cloudflare" in error_msg or "403" in error_msg: + # Show manual entry form for blocked sites + return _respond( + f"

add url (manual entry)

" + f"

{esc(url)} blocks automated access. " + f"You can still save it manually:

" + f'
' + f'{_csrf_field()}' + f'' + f'' + f'' + f'
' + f'

' + f'
' + f'

def handle_add_manual_submit(body):
    """Save a page from user-supplied title/description instead of fetching it.

    Args:
        body: Parsed form data mapping field names to lists of values. Reads
            ``url``, ``note``, ``tags``, ``manual_title`` and
            ``manual_description`` (first value of each, stripped).

    Returns:
        The rendered add form: with a validation message when the URL is
        missing or not http(s), or when title/description is missing;
        otherwise with a confirmation after the row has been upserted.
    """
    from datetime import datetime

    url = clean_url(body.get("url", [""])[0].strip())
    note = body.get("note", [""])[0].strip()
    tags = body.get("tags", [""])[0].strip()
    manual_title = body.get("manual_title", [""])[0].strip()
    manual_desc = body.get("manual_description", [""])[0].strip()

    if not url:
        return handle_add_form("URL is required.")
    # Same scheme rule as handle_add_submit: this endpoint can be POSTed to
    # directly, so it must not accept javascript:, data:, file:, etc.
    if not url.startswith(("http://", "https://")):
        return handle_add_form("URL must start with http:// or https://")
    if not manual_title or not manual_desc:
        return handle_add_form("Title and description are required for manual entry.")

    db = get_db()
    try:
        now = datetime.now().strftime("%Y-%m-%dT%H:%M:%S")

        # Upsert keyed on url: re-submitting an existing URL overwrites its
        # title, body, note, timestamp and summary.
        db.execute(
            "INSERT INTO pages (url, title, body, note, last_modified, summary) VALUES (?, ?, ?, ?, ?, ?) "
            "ON CONFLICT(url) DO UPDATE SET title=excluded.title, body=excluded.body, "
            "note=excluded.note, last_modified=excluded.last_modified, summary=excluded.summary",
            (url, manual_title, manual_desc, note, now, manual_desc[:200]),
        )

        # Look the id up by url because the upsert may have updated an
        # existing row rather than inserted a new one.
        page_id = db.execute("SELECT id FROM pages WHERE url = ?", (url,)).fetchone()[0]

        if tags:
            _set_page_tags(page_id, tags, db)

        db.commit()

        # Embeddings are best-effort: the page is already committed, so a
        # failure here is reported but does not fail the request.
        if get_setting("semantic_search", "1") == "1":
            try:
                from embeddings import store_embeddings

                store_embeddings(page_id, manual_title, manual_desc, db)
                db.commit()
            except Exception as e:
                print(f"Error generating embeddings: {e}")

        return handle_add_form(f'Added manually: {esc(manual_title)}')
    finally:
        return_db(db)

403 Forbidden

Invalid or missing CSRF token.

", status=403) if path == "/add": return handle_add_submit(body) + elif path == "/add/manual": + return handle_add_manual_submit(body) elif path.startswith("/edit/"): pid = extract_id("/edit/") return handle_edit_submit(pid, body) if pid is not None else _error(400)