# NOTE(review): reconstructed from a whitespace-collapsed git patch that added
# app.py (new file, blob f6f85d1) together with a binary index.db and a
# requirements.txt listing `requests` and `beautifulsoup4`.  The HTML tags
# inside the template strings were missing from the reviewed text, so the
# markup below is a minimal reconstruction from the surviving text and the CSS
# selectors advertised on the customize page -- verify it against the original
# templates before merging.
"""tinyweb: a single-file personal search engine.

Pages are fetched with ``requests``, reduced to text with BeautifulSoup and
stored in SQLite, where an FTS5 table (kept in sync by triggers) provides
full-text search.  A ``BaseHTTPRequestHandler`` subclass serves the UI.
"""

import html
import json
import sqlite3
from contextlib import closing
from http.server import BaseHTTPRequestHandler, HTTPServer
from urllib.parse import parse_qs, urljoin, urlparse

import requests
from bs4 import BeautifulSoup

DATABASE = "index.db"
HOST = "localhost"
PORT = 5001
URL_SCHEMES = ("http://", "https://")

SCHEMA = """
CREATE TABLE IF NOT EXISTS pages (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    url TEXT UNIQUE NOT NULL,
    title TEXT,
    body TEXT,
    note TEXT DEFAULT ''
);
CREATE VIRTUAL TABLE IF NOT EXISTS pages_fts
    USING fts5(title, body, url, note, content=pages, content_rowid=id);
CREATE TABLE IF NOT EXISTS links (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    page_id INTEGER NOT NULL,
    url TEXT NOT NULL,
    label TEXT,
    FOREIGN KEY (page_id) REFERENCES pages(id) ON DELETE CASCADE
);
CREATE TABLE IF NOT EXISTS settings (
    key TEXT PRIMARY KEY,
    value TEXT
);
CREATE TRIGGER IF NOT EXISTS pages_ai AFTER INSERT ON pages BEGIN
    INSERT INTO pages_fts(rowid, title, body, url, note)
    VALUES (new.id, new.title, new.body, new.url, new.note);
END;
CREATE TRIGGER IF NOT EXISTS pages_ad AFTER DELETE ON pages BEGIN
    INSERT INTO pages_fts(pages_fts, rowid, title, body, url, note)
    VALUES ('delete', old.id, old.title, old.body, old.url, old.note);
END;
CREATE TRIGGER IF NOT EXISTS pages_au AFTER UPDATE ON pages BEGIN
    INSERT INTO pages_fts(pages_fts, rowid, title, body, url, note)
    VALUES ('delete', old.id, old.title, old.body, old.url, old.note);
    INSERT INTO pages_fts(rowid, title, body, url, note)
    VALUES (new.id, new.title, new.body, new.url, new.note);
END;
"""

SEARCH_SQL = (
    "SELECT p.id, p.url, p.title, p.body, p.note "
    "FROM pages_fts f JOIN pages p ON f.rowid = p.id "
    "WHERE pages_fts MATCH ? ORDER BY rank LIMIT 50"
)

LINKS_SQL = (
    "SELECT l.url, l.label, p.title AS source_title "
    "FROM links l JOIN pages p ON l.page_id = p.id"
)

SKIP_EXT = (
    ".png", ".jpg", ".jpeg", ".gif", ".svg", ".pdf", ".zip", ".mp3", ".mp4",
    ".css", ".js", ".ico", ".xml", ".json",
)

SKIP_PATH_PARTS = (
    "/special:", "/talk:", "/user:", "/wikipedia:", "/help:", "/portal:",
    "/file:", "/category:",
)


def get_db():
    """Open a connection to DATABASE whose rows can be indexed by column name.

    The caller owns the connection and must close it.
    """
    db = sqlite3.connect(DATABASE)
    db.row_factory = sqlite3.Row
    return db


def init_db():
    """Create the tables, the FTS5 index and its sync triggers if missing."""
    with closing(sqlite3.connect(DATABASE)) as db:
        db.executescript(SCHEMA)
        db.commit()


def fetch_page(url):
    """Download *url* and return ``(title, body_text, links)``.

    ``links`` is a list of ``(href, label)`` pairs for same-host anchors,
    de-duplicated, with fragments removed and with media files, URLs that
    carry a query string and wiki meta namespaces skipped.  Labels are cut to
    200 characters.

    Raises:
        requests.RequestException: on network errors or a non-2xx status.
    """
    # TLS certificates are verified (the requests default); the previous
    # verify=False accepted any certificate for everything that got indexed.
    resp = requests.get(url, timeout=10, headers={"User-Agent": "TinyWeb/1.0"})
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    # Collect links before the nav/header/footer elements are stripped below.
    domain = urlparse(url).netloc
    seen = set()
    links = []
    for a in soup.find_all("a", href=True):
        href = urljoin(url, a["href"]).split("#")[0]
        parsed = urlparse(href)
        if parsed.netloc != domain:
            continue
        if href.lower().endswith(SKIP_EXT):
            continue
        if parsed.query or "action=" in href:
            continue
        path = parsed.path.lower()
        if any(part in path for part in SKIP_PATH_PARTS):
            continue
        if href in seen or href == url:
            continue
        seen.add(href)
        label = a.get_text(strip=True) or href
        links.append((href, label[:200]))

    for tag in soup(["script", "style", "nav", "footer", "header"]):
        tag.decompose()
    title = url
    if soup.title and soup.title.string:
        # A whitespace-only <title> falls back to the URL as well.
        title = soup.title.string.strip() or url
    body = soup.get_text(separator=" ", strip=True)
    return title, body, links


def store_page(db, url, title, body, links, note=None):
    """Insert or refresh the page row for *url* and replace its links.

    Args:
        db: open connection; the caller commits.
        links: iterable of ``(href, label)`` pairs.
        note: note text to store; ``None`` leaves the note of an existing
            row untouched (a new row gets an empty note).

    Returns:
        The ``pages.id`` of the stored row.
    """
    if note is None:
        db.execute(
            "INSERT INTO pages (url, title, body, note) VALUES (?, ?, ?, '') "
            "ON CONFLICT(url) DO UPDATE SET title=excluded.title, body=excluded.body",
            (url, title, body),
        )
    else:
        db.execute(
            "INSERT INTO pages (url, title, body, note) VALUES (?, ?, ?, ?) "
            "ON CONFLICT(url) DO UPDATE SET title=excluded.title, "
            "body=excluded.body, note=excluded.note",
            (url, title, body, note),
        )
    # cursor.lastrowid is not the page id when the upsert takes the UPDATE
    # path, so look the id up by the unique URL instead.
    page_id = db.execute("SELECT id FROM pages WHERE url = ?", (url,)).fetchone()[0]
    db.execute("DELETE FROM links WHERE page_id = ?", (page_id,))
    db.executemany(
        "INSERT INTO links (page_id, url, label) VALUES (?, ?, ?)",
        [(page_id, href, label) for href, label in links],
    )
    return page_id


def index_url(url, note=None):
    """Fetch *url*, store it (see ``store_page``) and return its title."""
    title, body, links = fetch_page(url)
    with closing(get_db()) as db:
        with db:  # commit on success, roll back on error
            store_page(db, url, title, body, links, note)
    return title


def fts_quote(query):
    """Turn free text into an FTS5 expression of quoted terms.

    Each whitespace-separated word becomes a string literal (embedded double
    quotes doubled), so characters that are FTS5 syntax are matched as text.
    """
    return " ".join('"' + word.replace('"', '""') + '"' for word in query.split())


def search_pages(db, query):
    """Return up to 50 ranked page rows matching *query*.

    The query is first passed to FTS5 verbatim so its operators keep working.
    Input that is not a valid FTS5 expression raises OperationalError; it is
    then retried as plain quoted terms, and an empty list is returned if that
    fails too.
    """
    for expression in (query, fts_quote(query)):
        try:
            return db.execute(SEARCH_SQL, (expression,)).fetchall()
        except sqlite3.OperationalError:
            continue
    return []


def snippet(text, query, ctx=80):
    """Return an excerpt of *text* around the first hit for *query*.

    The whole query is tried first, then each of its words; *ctx* characters
    of context are kept on both sides and "..." marks a cut.  Without any hit
    the first 200 characters are returned.
    """
    haystack = text.lower()
    lowered = query.lower()
    for needle in [lowered] + lowered.split():
        pos = haystack.find(needle)
        if pos == -1:
            continue
        start = max(0, pos - ctx)
        end = min(len(text), pos + len(needle) + ctx)
        prefix = "..." if start > 0 else ""
        suffix = "..." if end < len(text) else ""
        return prefix + text[start:end] + suffix
    return text[:200]


def esc(s):
    """HTML-escape the string form of *s* (quotes included)."""
    return html.escape(str(s))


def get_setting(key, default=""):
    """Return the stored value for *key*, or *default* when it is not set."""
    with closing(get_db()) as db:
        row = db.execute("SELECT value FROM settings WHERE key = ?", (key,)).fetchone()
    return row["value"] if row else default


def set_setting(key, value):
    """Insert or overwrite the setting *key*."""
    with closing(get_db()) as db:
        with db:
            db.execute(
                "INSERT INTO settings (key, value) VALUES (?, ?) "
                "ON CONFLICT(key) DO UPDATE SET value=excluded.value",
                (key, value),
            )


def get_site_name():
    """Return the configured site name, "tinyweb" by default."""
    return get_setting("site_name", "tinyweb")


def wrap_page(body_html):
    """Wrap *body_html* in the page shell, adding the custom CSS if any."""
    css = get_setting("custom_css")
    style = ""
    if css:
        # "</" would let the stored CSS close the <style> element; "<\/"
        # reads the same to the CSS parser.
        safe_css = css.replace("</", "<\\/")
        style = f"<style>{safe_css}</style>"
    return (
        f'<!doctype html><html><head><meta charset="utf-8">{style}</head>'
        f"<body>{body_html}</body></html>"
    )


class Handler(BaseHTTPRequestHandler):
    """Request handler for the search, add, browse, edit, import/export and
    customize pages."""

    def respond(self, body, status=200):
        """Send *body* (an HTML fragment) wrapped in the page shell."""
        # Build the payload before any header goes out so a failing settings
        # lookup cannot leave a half-sent response behind.
        payload = wrap_page(body).encode()
        self.send_response(status)
        self.send_header("Content-Type", "text/html; charset=utf-8")
        self.end_headers()
        self.wfile.write(payload)

    def respond_text(self, text):
        """Send *text* as text/plain with a permissive CORS header."""
        self.send_response(200)
        self.send_header("Content-Type", "text/plain")
        self.send_header("Access-Control-Allow-Origin", "*")
        self.end_headers()
        self.wfile.write(text.encode())

    def redirect(self, location):
        """Send a 302 redirect to *location*."""
        self.send_response(302)
        self.send_header("Location", location)
        self.end_headers()

    def do_GET(self):
        """Route GET requests by path."""
        parsed = urlparse(self.path)
        path = parsed.path
        params = parse_qs(parsed.query)

        if path == "/":
            self.handle_search(params)
        elif path == "/add":
            self.handle_add_form()
        elif path == "/pages":
            self.handle_pages()
        elif path.startswith("/delete/"):
            self.handle_delete(path)
        elif path.startswith("/edit/"):
            self.handle_edit_form(path)
        elif path == "/style":
            self.handle_style_form()
        elif path == "/bookmark":
            self.handle_bookmark(params)
        elif path == "/export":
            self.handle_export()
        elif path == "/import":
            self.handle_import_form()
        else:
            self.respond("<h1>404</h1>", 404)

    def do_POST(self):
        """Route POST requests by path after parsing the form body."""
        try:
            length = int(self.headers.get("Content-Length", 0))
        except ValueError:
            return self.respond("<h1>400</h1>", 400)
        # A negative length would make read() block until EOF.
        raw = self.rfile.read(max(length, 0))
        params = parse_qs(raw.decode("utf-8", errors="replace"))

        if self.path == "/add":
            self.handle_add_submit(params)
        elif self.path.startswith("/edit/"):
            self.handle_edit_submit(self.path, params)
        elif self.path == "/style":
            self.handle_style_submit(params)
        elif self.path == "/import":
            self.handle_import_submit(params)
        else:
            self.respond("<h1>404</h1>", 404)

    def handle_search(self, params):
        """Render the home page: search box, results and trusted links."""
        q = params.get("q", [""])[0].strip()
        name = get_site_name()
        rows = []
        link_rows = []
        with closing(get_db()) as db:
            count = db.execute("SELECT count(*) FROM pages").fetchone()[0]
            if q:
                rows = search_pages(db, q)
                link_rows = db.execute(LINKS_SQL).fetchall()

        result_html = ""
        trusted_html = ""
        if q:
            if rows:
                parts = []
                for r in rows:
                    note_html = ""
                    if r["note"]:
                        note_html = f'<div class="note">{esc(r["note"])}</div>'
                    parts.append(
                        f'<div class="result">'
                        f'<a href="{esc(r["url"])}">{esc(r["title"])}</a><br>'
                        f'<small>{esc(r["url"])}</small><br>'
                        f'{esc(snippet(r["body"], q))}'
                        f'{note_html}'
                        f'</div>'
                    )
                result_html = "".join(parts)
            else:
                result_html = "<p>No results in your index.</p>"

            # Links found on indexed pages whose label contains a query word,
            # excluding pages already listed above; capped at 20.
            words = q.lower().split()
            indexed_urls = {r["url"] for r in rows}
            seen = set()
            trusted = []
            for link in link_rows:
                link_url = link["url"]
                if link_url in indexed_urls or link_url in seen:
                    continue
                label = (link["label"] or "").lower()
                if any(word in label for word in words):
                    seen.add(link_url)
                    trusted.append(link)
                    if len(trusted) >= 20:
                        break

            if trusted:
                items = "".join(
                    f'<li><a href="{esc(link["url"])}">{esc(link["label"])}</a> '
                    f'— from {esc(link["source_title"])}</li>'
                    for link in trusted
                )
                trusted_html = (
                    f'<details class="trusted">'
                    f'<summary>from your trusted sites ({len(trusted)})</summary>'
                    f'<ul>{items}</ul>'
                    f'</details>'
                )

        self.respond(
            f'<h1>{esc(name)}</h1>'
            f'<form method="get" action="/">'
            f'<input type="text" name="q" value="{esc(q)}" autofocus>'
            f' <button type="submit">search</button>'
            f'</form>'
            f'<p>{count} page(s) indexed.'
            f' <a href="/add">+ add url</a>'
            f' | <a href="/pages">browse</a>'
            f' | <a href="/style">customize</a></p>'
            f'<hr>{result_html}{trusted_html}'
        )

    def handle_add_form(self, msg=""):
        """Render the add-url form; *msg* is inserted as HTML, so callers
        must escape it."""
        self.respond(
            f"<h1>add url</h1>"
            f'<form method="post" action="/add">'
            f'<p><input type="text" name="url" placeholder="https://" size="60"></p>'
            f'<p><input type="text" name="note" placeholder="note (optional)" size="60"></p>'
            f'<button type="submit">index</button>'
            f"</form>"
            f"<p>{msg}</p>"
            f'<a href="/">back</a>'
        )

    def handle_add_submit(self, params):
        """Index the submitted URL with its note and report the outcome."""
        url = params.get("url", [""])[0].strip()
        note = params.get("note", [""])[0].strip()
        if not url:
            return self.handle_add_form("URL is required.")
        if not url.startswith(URL_SCHEMES):
            return self.handle_add_form("URL must start with http:// or https://")
        try:
            title = index_url(url, note)
        except Exception as e:  # handler boundary: show any failure to the user
            return self.handle_add_form(f"Error: {esc(str(e))}")
        self.handle_add_form(f"Indexed: {esc(title)}")

    def handle_pages(self):
        """List every indexed page, newest first, with edit/remove links."""
        with closing(get_db()) as db:
            rows = db.execute(
                "SELECT id, url, title, note FROM pages ORDER BY id DESC"
            ).fetchall()
        items = []
        for r in rows:
            note_html = f' — {esc(r["note"])}' if r["note"] else ""
            items.append(
                f'<li><a href="{esc(r["url"])}">{esc(r["title"])}</a>{note_html} '
                f'<small>({esc(r["url"])})</small> '
                f'<a href="/edit/{r["id"]}">edit</a> '
                f'<a href="/delete/{r["id"]}">remove</a></li>'
            )
        self.respond(
            f"<h1>indexed pages ({len(rows)})</h1>"
            f"<ul>{''.join(items)}</ul>"
            f'<p><a href="/export">export</a> | <a href="/import">import</a></p>'
            f'<a href="/">back</a>'
        )

    def handle_edit_form(self, path, msg=""):
        """Render the note editor for the page id at the end of *path*."""
        try:
            page_id = int(path.split("/")[-1])
        except ValueError:
            return self.respond("<h1>400</h1>", 400)
        with closing(get_db()) as db:
            row = db.execute(
                "SELECT id, url, title, note FROM pages WHERE id = ?", (page_id,)
            ).fetchone()
        if not row:
            return self.respond("<h1>404</h1>", 404)
        self.respond(
            f"<h1>edit note</h1>"
            f"<p>{esc(row['title'])}<br>"
            f"<small>{esc(row['url'])}</small></p>"
            f'<form method="post" action="/edit/{row["id"]}">'
            f'<p><input type="text" name="note" value="{esc(row["note"] or "")}" size="60"></p>'
            f'<button type="submit">save</button>'
            f"</form>"
            f"<p>{msg}</p>"
            f'<a href="/pages">back</a>'
        )

    def handle_edit_submit(self, path, params):
        """Store the submitted note and redirect to the page list."""
        try:
            page_id = int(path.split("/")[-1])
        except ValueError:
            return self.respond("<h1>400</h1>", 400)
        note = params.get("note", [""])[0].strip()
        with closing(get_db()) as db:
            with db:
                db.execute("UPDATE pages SET note = ? WHERE id = ?", (note, page_id))
        self.redirect("/pages")

    def handle_delete(self, path):
        """Delete a page and its links, then redirect to the page list."""
        # NOTE(review): this is reached through GET, so following a link (or a
        # prefetch) deletes data; consider moving it to POST.
        try:
            page_id = int(path.split("/")[-1])
        except ValueError:
            return self.respond("<h1>400</h1>", 400)
        with closing(get_db()) as db:
            with db:
                # Links are removed explicitly; nothing here enables
                # foreign-key enforcement on the connection.
                db.execute("DELETE FROM links WHERE page_id = ?", (page_id,))
                db.execute("DELETE FROM pages WHERE id = ?", (page_id,))
        self.redirect("/pages")

    def handle_bookmark(self, params):
        """Index ``?url=`` for the bookmarklet and answer in plain text.

        An existing note on the page is preserved.
        """
        # NOTE(review): with "Access-Control-Allow-Origin: *" any web page can
        # make this server fetch an arbitrary URL; confirm that is acceptable.
        url = params.get("url", [""])[0].strip()
        if not url or not url.startswith(URL_SCHEMES):
            return self.respond_text("error: invalid url")
        try:
            title = index_url(url)
            msg = f"ok: {title}"
        except Exception as e:  # handler boundary: report instead of crashing
            msg = f"error: {e}"
        self.respond_text(msg)

    def handle_export(self):
        """Send url/title/note of every page as a JSON download."""
        with closing(get_db()) as db:
            rows = db.execute("SELECT url, title, note FROM pages ORDER BY id").fetchall()
        data = [{"url": r["url"], "title": r["title"], "note": r["note"]} for r in rows]
        self.send_response(200)
        self.send_header("Content-Type", "application/json")
        self.send_header("Content-Disposition", "attachment; filename=tinyweb-export.json")
        self.end_headers()
        self.wfile.write(json.dumps(data, indent=2).encode())

    def handle_import_form(self, msg=""):
        """Render the import form; *msg* is inserted as HTML."""
        self.respond(
            f"<h1>import</h1>"
            f"<p>Paste the contents of a tinyweb export file (JSON).</p>"
            f'<form method="post" action="/import">'
            f'<p><textarea name="data" rows="12" cols="60"></textarea></p>'
            f'<button type="submit">import</button>'
            f"</form>"
            f"<p>{msg}</p>"
            f'<a href="/pages">back</a>'
        )

    def handle_import_submit(self, params):
        """Re-fetch and index every entry of a pasted export file.

        Entries without a URL are skipped silently; malformed entries,
        non-http(s) URLs and failed fetches are counted as errors.
        """
        raw = params.get("data", [""])[0].strip()
        if not raw:
            return self.handle_import_form("Paste JSON data.")
        try:
            data = json.loads(raw)
        except json.JSONDecodeError:
            return self.handle_import_form("Invalid JSON.")
        if not isinstance(data, list):
            return self.handle_import_form("Expected a JSON array.")

        imported = 0
        errors = 0
        for entry in data:
            # The array may hold anything; only objects can carry a url.
            if not isinstance(entry, dict):
                errors += 1
                continue
            url = str(entry.get("url") or "").strip()
            note = str(entry.get("note") or "").strip()
            if not url:
                continue
            if not url.startswith(URL_SCHEMES):
                errors += 1
                continue
            try:
                index_url(url, note)
            except Exception:  # one bad page must not abort the whole import
                errors += 1
            else:
                imported += 1

        self.handle_import_form(f"Imported {imported} page(s). {errors} error(s).")

    def handle_style_form(self, msg=""):
        """Render the customize page: site name, custom CSS, bookmarklet."""
        css = get_setting("custom_css")
        name = get_site_name()
        bookmarklet = (
            f"javascript:fetch('http://{HOST}:{PORT}/bookmark?url='"
            "+encodeURIComponent(location.href))"
            ".then(r=>r.text()).then(alert)"
        )
        self.respond(
            f"<h1>customize</h1>"
            f"<h2>name your search engine</h2>"
            f'<form method="post" action="/style">'
            f'<p><input type="text" name="site_name" value="{esc(name)}"></p>'
            f"<h2>custom css</h2>"
            f"<p>Some classes you can target:</p>"
            f"<pre>"
            f"body          - page background, font\n"
            f"h1            - page titles\n"
            f"input, button - search bar\n"
            f"a             - links\n"
            f".result       - each search result\n"
            f".note         - your notes on results\n"
            f".trusted      - trusted sites dropdown\n"
            f"small         - url text\n"
            f"ul, li        - browse page list"
            f"</pre>"
            f'<p><textarea name="css" rows="12" cols="60">{esc(css)}</textarea></p>'
            f'<button type="submit">save</button>'
            f"</form>"
            f"<h2>bookmarklet</h2>"
            f"<p>Drag this link to your bookmarks bar. Click it on any page to index it instantly.</p>"
            f'<p><a href="{esc(bookmarklet)}">+ save to {esc(name)}</a></p>'
            f"<p>{msg}</p>"
            f'<a href="/">back</a>'
        )

    def handle_style_submit(self, params):
        """Save the site name and custom CSS, then re-render the form."""
        css = params.get("css", [""])[0]
        name = params.get("site_name", ["tinyweb"])[0].strip()
        set_setting("custom_css", css)
        set_setting("site_name", name or "tinyweb")
        self.handle_style_form("Saved.")


if __name__ == "__main__":
    init_db()
    print(f"running on http://{HOST}:{PORT}")
    HTTPServer((HOST, PORT), Handler).serve_forever()