added manual URL entry
This commit is contained in:
parent
9bc5abd32f
commit
a1358c1f3d
2 changed files with 99 additions and 2 deletions
20
db.py
20
db.py
|
|
@ -20,6 +20,22 @@ BLOCKED_NETWORKS = [
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def _is_blocked_response(html, status_code):
|
||||||
|
"""Check if response is a CDN challenge/block page."""
|
||||||
|
if status_code == 403:
|
||||||
|
return True
|
||||||
|
html_lower = html.lower()
|
||||||
|
if "just a moment" in html_lower or "cloudflare" in html_lower:
|
||||||
|
return True
|
||||||
|
if "enable javascript and cookies" in html_lower:
|
||||||
|
return True
|
||||||
|
if "request rejected" in html_lower:
|
||||||
|
return True
|
||||||
|
if "access denied" in html_lower:
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
def _validate_url_target(url):
|
def _validate_url_target(url):
|
||||||
"""Resolve hostname and block private/internal IPs to prevent SSRF."""
|
"""Resolve hostname and block private/internal IPs to prevent SSRF."""
|
||||||
parsed = urlparse(url)
|
parsed = urlparse(url)
|
||||||
|
|
@ -281,6 +297,10 @@ def get_site_name():
|
||||||
def fetch_page(url):
|
def fetch_page(url):
|
||||||
_validate_url_target(url)
|
_validate_url_target(url)
|
||||||
resp = requests.get(url, timeout=10, headers={"User-Agent": "TinyWeb/1.0"}, allow_redirects=False)
|
resp = requests.get(url, timeout=10, headers={"User-Agent": "TinyWeb/1.0"}, allow_redirects=False)
|
||||||
|
|
||||||
|
if _is_blocked_response(resp.text, resp.status_code):
|
||||||
|
raise Exception(f"Site blocks automated access: {resp.status_code}")
|
||||||
|
|
||||||
# Follow redirects manually, re-validating each target
|
# Follow redirects manually, re-validating each target
|
||||||
max_redirects = 5
|
max_redirects = 5
|
||||||
while resp.is_redirect and max_redirects > 0:
|
while resp.is_redirect and max_redirects > 0:
|
||||||
|
|
|
||||||
81
handlers.py
81
handlers.py
|
|
@ -357,10 +357,13 @@ def handle_add_submit(body):
|
||||||
url = clean_url(body.get("url", [""])[0].strip())
|
url = clean_url(body.get("url", [""])[0].strip())
|
||||||
note = body.get("note", [""])[0].strip()
|
note = body.get("note", [""])[0].strip()
|
||||||
tags = body.get("tags", [""])[0].strip()
|
tags = body.get("tags", [""])[0].strip()
|
||||||
|
|
||||||
if not url:
|
if not url:
|
||||||
return handle_add_form("URL is required.")
|
return handle_add_form("URL is required.")
|
||||||
if not url.startswith(("http://", "https://")):
|
if not url.startswith(("http://", "https://")):
|
||||||
return handle_add_form("URL must start with http:// or https://")
|
return handle_add_form("URL must start with http:// or https://")
|
||||||
|
|
||||||
|
# Try auto-index first
|
||||||
try:
|
try:
|
||||||
title = index_url(url, note)
|
title = index_url(url, note)
|
||||||
if tags:
|
if tags:
|
||||||
|
|
@ -373,10 +376,82 @@ def handle_add_submit(body):
|
||||||
finally:
|
finally:
|
||||||
return_db(db)
|
return_db(db)
|
||||||
return handle_add_form(f'Indexed: <a href="{esc(url)}">{esc(title)}</a>')
|
return handle_add_form(f'Indexed: <a href="{esc(url)}">{esc(title)}</a>')
|
||||||
|
|
||||||
except ValueError as e:
|
except ValueError as e:
|
||||||
return handle_add_form(f"Error: {esc(str(e))}")
|
return handle_add_form(f"Error: {esc(str(e))}")
|
||||||
except Exception:
|
|
||||||
return handle_add_form("Error: could not fetch or index that URL.")
|
except Exception as e:
|
||||||
|
error_msg = str(e).lower()
|
||||||
|
# Check if it's a block response
|
||||||
|
if "block" in error_msg or "cloudflare" in error_msg or "403" in error_msg:
|
||||||
|
# Show manual entry form for blocked sites
|
||||||
|
return _respond(
|
||||||
|
f"<h1>add url (manual entry)</h1>"
|
||||||
|
f"<p><strong>{esc(url)}</strong> blocks automated access. "
|
||||||
|
f"You can still save it manually:</p>"
|
||||||
|
f'<form method="post" action="/add/manual">'
|
||||||
|
f'{_csrf_field()}'
|
||||||
|
f'<input type="hidden" name="url" value="{esc(url)}">'
|
||||||
|
f'<input type="hidden" name="note" value="{esc(note)}">'
|
||||||
|
f'<input type="hidden" name="tags" value="{esc(tags)}">'
|
||||||
|
f'<label>Title:</label><br>'
|
||||||
|
f'<input name="manual_title" size="50" placeholder="page title" required><br><br>'
|
||||||
|
f'<label>Description:</label><br>'
|
||||||
|
f'<textarea name="manual_description" rows="4" cols="50" placeholder="what is this site about?" required></textarea><br><br>'
|
||||||
|
f'<button type="submit">save manually</button>'
|
||||||
|
f"</form>"
|
||||||
|
f'<a href="/">back</a>'
|
||||||
|
)
|
||||||
|
return handle_add_form(f"Error: could not fetch or index that URL. {esc(str(e)[:100])}")
|
||||||
|
|
||||||
|
|
||||||
|
def handle_add_manual_submit(body):
    """Save a page from the manual-entry form instead of fetching it.

    Args:
        body: Parsed form data mapping field names to lists of values. Reads
            ``url``, ``note``, ``tags``, ``manual_title`` and
            ``manual_description``.

    Returns:
        The response produced by ``handle_add_form``, carrying either a
        validation error or a confirmation that links to the saved URL.
    """
    from datetime import datetime

    def _field(name):
        # Form values arrive as lists; take the first one, stripped.
        return body.get(name, [""])[0].strip()

    url = clean_url(_field("url"))
    note = _field("note")
    tags = _field("tags")
    manual_title = _field("manual_title")
    manual_desc = _field("manual_description")

    if not url:
        return handle_add_form("URL is required.")
    # The URL comes from a client-controlled hidden field and is echoed back
    # inside an href below, so apply the same scheme check as handle_add_submit.
    if not url.startswith(("http://", "https://")):
        return handle_add_form("URL must start with http:// or https://")
    if not manual_title or not manual_desc:
        return handle_add_form("Title and description are required for manual entry.")

    db = get_db()
    try:
        now = datetime.now().strftime("%Y-%m-%dT%H:%M:%S")

        # Insert the page, or overwrite the existing row for the same URL.
        # The description doubles as the body; its first 200 characters
        # become the summary.
        db.execute(
            "INSERT INTO pages (url, title, body, note, last_modified, summary) VALUES (?, ?, ?, ?, ?, ?) "
            "ON CONFLICT(url) DO UPDATE SET title=excluded.title, body=excluded.body, "
            "note=excluded.note, last_modified=excluded.last_modified, summary=excluded.summary",
            (url, manual_title, manual_desc, note, now, manual_desc[:200]),
        )

        # Look the id up by URL so it is found for both new and updated rows.
        page_id = db.execute("SELECT id FROM pages WHERE url = ?", (url,)).fetchone()[0]

        # Add tags if provided
        if tags:
            _set_page_tags(page_id, tags, db)

        db.commit()

        # Generate embeddings for this page (if semantic search is enabled).
        # The page is already committed above, so a failure here is reported
        # but does not fail the request.
        if get_setting("semantic_search", "1") == "1":
            try:
                from embeddings import store_embeddings

                store_embeddings(page_id, manual_title, manual_desc, db)
                db.commit()
            except Exception as e:
                # Log error but don't fail the whole operation
                print(f"Error generating embeddings: {e}")

        return handle_add_form(f'Added manually: <a href="{esc(url)}">{esc(manual_title)}</a>')
    finally:
        return_db(db)
|
||||||
|
|
||||||
|
|
||||||
def handle_pages(query=None):
|
def handle_pages(query=None):
|
||||||
|
|
@ -1209,6 +1284,8 @@ def _dispatch_inner(data):
|
||||||
return _respond("<h1>403 Forbidden</h1><p>Invalid or missing CSRF token.</p>", status=403)
|
return _respond("<h1>403 Forbidden</h1><p>Invalid or missing CSRF token.</p>", status=403)
|
||||||
if path == "/add":
|
if path == "/add":
|
||||||
return handle_add_submit(body)
|
return handle_add_submit(body)
|
||||||
|
elif path == "/add/manual":
|
||||||
|
return handle_add_manual_submit(body)
|
||||||
elif path.startswith("/edit/"):
|
elif path.startswith("/edit/"):
|
||||||
pid = extract_id("/edit/")
|
pid = extract_id("/edit/")
|
||||||
return handle_edit_submit(pid, body) if pid is not None else _error(400)
|
return handle_edit_submit(pid, body) if pid is not None else _error(400)
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue