# tinyweb/db.py

import sqlite3
import requests
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup

# Path of the SQLite file passed to sqlite3.connect() by this module.
DATABASE = "index.db"

# URL suffixes (compared against the lower-cased href) of links that the
# page fetcher does not collect.
SKIP_EXT = (
    ".png",
    ".jpg",
    ".jpeg",
    ".gif",
    ".svg",
    ".pdf",
    ".zip",
    ".mp3",
    ".mp4",
    ".css",
    ".js",
    ".ico",
    ".xml",
    ".json",
)


def get_db():
    """Open and return a new connection to the index database.

    Rows fetched through the returned connection are ``sqlite3.Row``
    objects, so columns can be read by name as well as by position.
    The connection is not closed here; that is left to the caller.
    """
    connection = sqlite3.connect(DATABASE)
    connection.row_factory = sqlite3.Row
    return connection


def _create_schema(db):
    """Run the idempotent CREATE statements for the index on *db*.

    Every statement uses ``IF NOT EXISTS``. The connection is neither
    committed explicitly nor closed here; the caller owns it.
    """
    # One row per indexed page.
    db.execute(
        "CREATE TABLE IF NOT EXISTS pages ("
        "  id INTEGER PRIMARY KEY AUTOINCREMENT,"
        "  url TEXT UNIQUE NOT NULL,"
        "  title TEXT,"
        "  body TEXT,"
        "  note TEXT DEFAULT ''"
        ")"
    )
    # Full-text index over pages, declared as an external-content FTS5 table
    # (content=pages); the triggers below are what keep it in sync.
    db.execute(
        "CREATE VIRTUAL TABLE IF NOT EXISTS pages_fts "
        "USING fts5(title, body, url, note, content=pages, content_rowid=id)"
    )
    # Outgoing links found on each page.
    # NOTE(review): ON DELETE CASCADE only takes effect on connections that
    # enable foreign-key enforcement (PRAGMA foreign_keys); none of the code
    # visible in this module does so - confirm this is intended.
    db.execute(
        "CREATE TABLE IF NOT EXISTS links ("
        "  id INTEGER PRIMARY KEY AUTOINCREMENT,"
        "  page_id INTEGER NOT NULL,"
        "  url TEXT NOT NULL,"
        "  label TEXT,"
        "  FOREIGN KEY (page_id) REFERENCES pages(id) ON DELETE CASCADE"
        ")"
    )
    # Key/value application settings.
    db.execute(
        "CREATE TABLE IF NOT EXISTS settings ("
        "  key TEXT PRIMARY KEY,"
        "  value TEXT"
        ")"
    )
    db.execute(
        "CREATE TABLE IF NOT EXISTS subscriptions ("
        "  id INTEGER PRIMARY KEY AUTOINCREMENT,"
        "  url TEXT UNIQUE NOT NULL,"
        "  name TEXT DEFAULT '',"
        "  auto_sync INTEGER DEFAULT 0,"
        "  last_sync TEXT DEFAULT ''"
        ")"
    )
    # Mirror every insert, delete and update on pages into pages_fts.
    db.executescript("""
        CREATE TRIGGER IF NOT EXISTS pages_ai AFTER INSERT ON pages BEGIN
            INSERT INTO pages_fts(rowid, title, body, url, note)
            VALUES (new.id, new.title, new.body, new.url, new.note);
        END;
        CREATE TRIGGER IF NOT EXISTS pages_ad AFTER DELETE ON pages BEGIN
            INSERT INTO pages_fts(pages_fts, rowid, title, body, url, note)
            VALUES ('delete', old.id, old.title, old.body, old.url, old.note);
        END;
        CREATE TRIGGER IF NOT EXISTS pages_au AFTER UPDATE ON pages BEGIN
            INSERT INTO pages_fts(pages_fts, rowid, title, body, url, note)
            VALUES ('delete', old.id, old.title, old.body, old.url, old.note);
            INSERT INTO pages_fts(rowid, title, body, url, note)
            VALUES (new.id, new.title, new.body, new.url, new.note);
        END;
    """)


def init_db():
    """Create the index schema in ``DATABASE`` if it does not exist yet.

    Creates the ``pages``, ``links``, ``settings`` and ``subscriptions``
    tables, the ``pages_fts`` FTS5 table and the three triggers that keep
    ``pages_fts`` in sync with ``pages``. Safe to call repeatedly because
    every statement uses ``IF NOT EXISTS``.

    Raises:
        sqlite3.Error: Propagated unchanged from any failing statement.
    """
    db = sqlite3.connect(DATABASE)
    try:
        _create_schema(db)
        db.commit()
    finally:
        # Release the connection even when a statement above fails.
        db.close()


def get_setting(key, default=""):
    """Return the stored value for *key*, or *default* when no row exists.

    Args:
        key: Primary key to look up in the ``settings`` table.
        default: Value returned when the table has no row for *key*.

    Returns:
        The ``value`` column of the matching row (``None`` if the stored
        value is NULL), otherwise *default*.

    Raises:
        sqlite3.Error: Propagated unchanged if the query fails.
    """
    db = get_db()
    try:
        row = db.execute(
            "SELECT value FROM settings WHERE key = ?", (key,)
        ).fetchone()
    finally:
        # Close the connection even when the query raises.
        db.close()
    if row is None:
        return default
    return row["value"]


def set_setting(key, value):
    """Insert or overwrite the ``settings`` row for *key* and commit.

    Args:
        key: Primary key of the setting.
        value: New value; replaces any value already stored for *key*.

    Raises:
        sqlite3.Error: Propagated unchanged if the statement or commit fails.
    """
    db = get_db()
    try:
        db.execute(
            "INSERT INTO settings (key, value) VALUES (?, ?) "
            "ON CONFLICT(key) DO UPDATE SET value=excluded.value",
            (key, value),
        )
        db.commit()
    finally:
        # Close the connection even when the write fails.
        db.close()


def get_site_name():
    """Return the ``site_name`` setting, or ``"tinyweb"`` when it is not stored."""
    site_name = get_setting("site_name", default="tinyweb")
    return site_name


def _extract_links(soup, page_url):
    """Return de-duplicated ``(href, label)`` pairs for same-host links in *soup*.

    A link is kept only when, after resolving it against *page_url* and
    dropping its fragment, it points at the same netloc as *page_url*, does
    not end in one of ``SKIP_EXT``, has no query string or ``action=``, is
    not in one of the skipped wiki-style namespaces, and is neither the page
    itself nor a repeat. Labels are truncated to 200 characters.
    """
    domain = urlparse(page_url).netloc
    skipped_namespaces = (
        "/special:", "/talk:", "/user:", "/wikipedia:",
        "/help:", "/portal:", "/file:", "/category:",
    )
    seen = set()
    links = []
    for a in soup.find_all("a", href=True):
        try:
            href = urljoin(page_url, a["href"]).split("#")[0]
            parsed = urlparse(href)
        except ValueError:
            # A malformed href (e.g. an unbalanced "[" in the host part)
            # should not abort indexing of the whole page.
            continue
        if parsed.netloc != domain:
            continue
        if href.lower().endswith(SKIP_EXT):
            continue
        if parsed.query or "action=" in href:
            continue
        path = parsed.path.lower()
        if any(marker in path for marker in skipped_namespaces):
            continue
        if href in seen or href == page_url:
            continue
        seen.add(href)
        label = a.get_text(strip=True) or href
        links.append((href, label[:200]))
    return links


def fetch_page(url, *, verify=False):
    """Download *url* and return its title, visible text and same-host links.

    Args:
        url: Absolute URL of the page to fetch.
        verify: Forwarded to ``requests.get`` as ``verify``. Defaults to
            ``False`` so existing callers behave exactly as before; pass
            ``True`` to have the server's TLS certificate checked.

    Returns:
        A ``(title, body, links)`` tuple. ``title`` is the stripped text of
        the ``<title>`` element, or *url* when that is missing or blank.
        ``body`` is the page text with ``script``, ``style``, ``nav``,
        ``footer`` and ``header`` elements removed. ``links`` is a list of
        ``(href, label)`` tuples.

    Raises:
        requests.HTTPError: If the response status is 4xx or 5xx.
        requests.RequestException: For other request failures raised by
            ``requests.get`` (they are not caught here).
    """
    resp = requests.get(
        url,
        timeout=10,
        headers={"User-Agent": "TinyWeb/1.0"},
        # NOTE(review): with verify=False the certificate is not validated,
        # so HTTPS responses can be spoofed. The default is kept only for
        # backward compatibility - consider switching callers to verify=True.
        verify=verify,
    )
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    # Collect links before the nav/header/footer elements are removed below,
    # otherwise the anchors inside them would be lost.
    links = _extract_links(soup, url)

    for tag in soup(["script", "style", "nav", "footer", "header"]):
        tag.decompose()

    title = ""
    if soup.title and soup.title.string:
        title = soup.title.string.strip()
    if not title:
        # Missing or whitespace-only <title>: fall back to the URL so the
        # page never ends up with an empty title.
        title = url
    body = soup.get_text(separator=" ", strip=True)
    return title, body, links


def index_url(url, note=""):
    """Fetch *url* and store or refresh the page and its links in the index.

    The page row is upserted by URL: title, body and note are overwritten
    when the URL is already indexed. The page's rows in ``links`` are then
    replaced with the links found in the freshly fetched page.

    Args:
        url: URL to fetch and index.
        note: Note stored with the page; replaces any previously stored note.

    Returns:
        The page title as returned by ``fetch_page``.

    Raises:
        sqlite3.Error: Propagated unchanged if a database statement fails;
            uncommitted changes are discarded when the connection closes.
        Exception: Whatever ``fetch_page`` raises is propagated unchanged.
    """
    title, body, links = fetch_page(url)
    db = get_db()
    try:
        db.execute(
            "INSERT INTO pages (url, title, body, note) VALUES (?, ?, ?, ?) "
            "ON CONFLICT(url) DO UPDATE SET title=excluded.title, body=excluded.body, note=excluded.note",
            (url, title, body, note),
        )
        # Look the id up explicitly: cursor.lastrowid is not the page's id
        # when the upsert takes the DO UPDATE path, which would leave the old
        # links in place and attach the new ones to the wrong page_id.
        page_id = db.execute(
            "SELECT id FROM pages WHERE url = ?", (url,)
        ).fetchone()[0]
        db.execute("DELETE FROM links WHERE page_id = ?", (page_id,))
        db.executemany(
            "INSERT INTO links (page_id, url, label) VALUES (?, ?, ?)",
            [(page_id, href, label) for href, label in links],
        )
        db.commit()
    finally:
        # Close the connection even when a statement fails.
        db.close()
    return title