From 62055a578dfc628d78df8e190becc82c8690b3ac Mon Sep 17 00:00:00 2001 From: Derick Phan Date: Wed, 25 Mar 2026 23:15:28 -0700 Subject: [PATCH] Strip tracking params from URLs and add tags/collections URLs are cleaned of tracking parameters (utm_*, fbclid, gclid, etc.) before indexing. Tags can be added when saving or editing pages, browsed at /tags, and are included in search results. Tags are shared via /api/sites and preserved when syncing/importing from subscriptions. Co-Authored-By: Claude Opus 4.6 --- db.py | 39 +++++++++++- handlers.py | 180 ++++++++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 192 insertions(+), 27 deletions(-) diff --git a/db.py b/db.py index 5a86a1c..b523c79 100644 --- a/db.py +++ b/db.py @@ -1,12 +1,26 @@ import sqlite3 import requests -from urllib.parse import urlparse, urljoin +from urllib.parse import urlparse, urljoin, parse_qs, urlencode, urlunparse from bs4 import BeautifulSoup DATABASE = "index.db" SKIP_EXT = (".png", ".jpg", ".jpeg", ".gif", ".svg", ".pdf", ".zip", ".mp3", ".mp4", ".css", ".js", ".ico", ".xml", ".json") +TRACKING_PARAMS = { + "utm_source", "utm_medium", "utm_campaign", "utm_term", "utm_content", + "fbclid", "gclid", "msclkid", "mc_cid", "mc_eid", "ref", "ref_src", + "ref_url", "_ga", "_gl", "yclid", "twclid", "igshid", +} + + +def clean_url(url): + parsed = urlparse(url) + params = parse_qs(parsed.query) + cleaned = {k: v for k, v in params.items() if k.lower() not in TRACKING_PARAMS} + new_query = urlencode(cleaned, doseq=True) + return urlunparse(parsed._replace(query=new_query)) + def get_db(): db = sqlite3.connect(DATABASE) @@ -60,6 +74,7 @@ def init_db(): " url TEXT NOT NULL," " title TEXT," " note TEXT DEFAULT ''," + " tags TEXT DEFAULT ''," " FOREIGN KEY (subscription_id) REFERENCES subscriptions(id) ON DELETE CASCADE," " UNIQUE(subscription_id, url)" ")" @@ -68,6 +83,21 @@ def init_db(): "CREATE VIRTUAL TABLE IF NOT EXISTS remote_pages_fts " "USING fts5(title, url, note, content=remote_pages, content_rowid=id)" ) + db.execute( + "CREATE TABLE IF NOT EXISTS tags (" + " id INTEGER PRIMARY KEY AUTOINCREMENT," + " name TEXT UNIQUE NOT NULL" + ")" + ) + db.execute( + "CREATE TABLE IF NOT EXISTS page_tags (" + " page_id INTEGER NOT NULL," + " tag_id INTEGER NOT NULL," + " PRIMARY KEY (page_id, tag_id)," + " FOREIGN KEY (page_id) REFERENCES pages(id) ON DELETE CASCADE," + " FOREIGN KEY (tag_id) REFERENCES tags(id) ON DELETE CASCADE" + ")" + ) db.executescript(""" CREATE TRIGGER IF NOT EXISTS pages_ai AFTER INSERT ON pages BEGIN INSERT INTO pages_fts(rowid, title, body, url, note) @@ -104,6 +134,12 @@ def init_db(): db.execute("ALTER TABLE subscriptions RENAME COLUMN url TO dest_hash") db.commit() + # Migrate remote_pages: add tags column if missing + rp_cols = [row[1] for row in db.execute("PRAGMA table_info(remote_pages)").fetchall()] + if "tags" not in rp_cols: + db.execute("ALTER TABLE remote_pages ADD COLUMN tags TEXT DEFAULT ''") + db.commit() + db.commit() db.close() @@ -165,6 +201,7 @@ def fetch_page(url): def index_url(url, note=""): + url = clean_url(url) title, body, links = fetch_page(url) db = get_db() cur = db.execute( diff --git a/handlers.py b/handlers.py index 4550e15..42944a5 100644 --- a/handlers.py +++ b/handlers.py @@ -1,7 +1,7 @@ import json from datetime import datetime -from db import get_db, get_setting, set_setting, get_site_name, index_url +from db import get_db, get_setting, set_setting, get_site_name, index_url, clean_url from templates import esc, snippet, wrap_page from rns_client import fetch_remote_sites @@ -46,6 +46,38 @@ def _error(status): return _respond(f"

{status}

", status) +# --- Tag helpers --- + + +def _get_page_tags(page_id, db=None): + close = False + if db is None: + db = get_db() + close = True + rows = db.execute( + "SELECT t.name FROM tags t JOIN page_tags pt ON t.id = pt.tag_id " + "WHERE pt.page_id = ? ORDER BY t.name", (page_id,) + ).fetchall() + if close: + db.close() + return [r["name"] for r in rows] + + +def _set_page_tags(page_id, tag_string, db=None): + close = False + if db is None: + db = get_db() + close = True + db.execute("DELETE FROM page_tags WHERE page_id = ?", (page_id,)) + for name in (t.strip().lower() for t in tag_string.split(",") if t.strip()): + db.execute("INSERT OR IGNORE INTO tags (name) VALUES (?)", (name,)) + tag_id = db.execute("SELECT id FROM tags WHERE name = ?", (name,)).fetchone()["id"] + db.execute("INSERT OR IGNORE INTO page_tags (page_id, tag_id) VALUES (?, ?)", (page_id, tag_id)) + if close: + db.commit() + db.close() + + # --- Route handlers --- @@ -69,12 +101,17 @@ def handle_search(query): note_html = "" if r["note"]: note_html = f'
{esc(r["note"])}
' + tags = _get_page_tags(r["id"], db) + tags_html = "" + if tags: + tag_links = " ".join(f'[{esc(t)}]' for t in tags) + tags_html = f'
{tag_links}
' result_html += ( f'
' f'{esc(r["title"])}
' f'{esc(r["url"])}
' f'{esc(snippet(r["body"], q))}' - f'{note_html}' + f'{note_html}{tags_html}' f'
' ) else: @@ -157,6 +194,7 @@ def handle_search(query): f'

{count} page(s) indexed.' f' + add url' f' | browse' + f' | tags' f' | subscriptions' f' | customize

' f'
{result_html}{trusted_html}{remote_html}' @@ -169,6 +207,7 @@ def handle_add_form(msg=""): f'
' f'

' f'

' + f'

' f'' f"
" f"

{msg}

" @@ -177,14 +216,22 @@ def handle_add_form(msg=""): def handle_add_submit(body): - url = body.get("url", [""])[0].strip() + url = clean_url(body.get("url", [""])[0].strip()) note = body.get("note", [""])[0].strip() + tags = body.get("tags", [""])[0].strip() if not url: return handle_add_form("URL is required.") if not url.startswith(("http://", "https://")): return handle_add_form("URL must start with http:// or https://") try: title = index_url(url, note) + if tags: + db = get_db() + row = db.execute("SELECT id FROM pages WHERE url = ?", (url,)).fetchone() + if row: + _set_page_tags(row["id"], tags, db) + db.commit() + db.close() return handle_add_form(f'Indexed: {esc(title)}') except Exception as e: return handle_add_form(f"Error: {esc(str(e))}") @@ -193,16 +240,21 @@ def handle_add_submit(body): def handle_pages(): db = get_db() rows = db.execute("SELECT id, url, title, note FROM pages ORDER BY id DESC").fetchall() - db.close() items = "" for r in rows: note_html = f' — {esc(r["note"])}' if r["note"] else "" + tags = _get_page_tags(r["id"], db) + tags_html = "" + if tags: + tag_links = " ".join(f'[{esc(t)}]' for t in tags) + tags_html = f' {tag_links}' items += ( - f'
  • {esc(r["title"])}{note_html} ' + f'
  • {esc(r["title"])}{note_html}{tags_html} ' f'({esc(r["url"])}) ' f'edit ' f'remove
  • ' ) + db.close() return _respond( f"

    indexed pages ({len(rows)})

    " f"" @@ -214,15 +266,18 @@ def handle_pages(): def handle_edit_form(page_id, msg=""): db = get_db() row = db.execute("SELECT id, url, title, note FROM pages WHERE id = ?", (page_id,)).fetchone() - db.close() if not row: + db.close() return _error(404) + tags = ", ".join(_get_page_tags(page_id, db)) + db.close() return _respond( - f"

    edit note

    " + f"

    edit page

    " f"

    {esc(row['title'])}
    " f"{esc(row['url'])}

    " f'
    ' f'

    ' + f'

    ' f'' f"
    " f"

    {msg}

    " @@ -232,8 +287,10 @@ def handle_edit_form(page_id, msg=""): def handle_edit_submit(page_id, body): note = body.get("note", [""])[0].strip() + tags = body.get("tags", [""])[0].strip() db = get_db() db.execute("UPDATE pages SET note = ? WHERE id = ?", (note, page_id)) + _set_page_tags(page_id, tags, db) db.commit() db.close() return _redirect("/pages") @@ -249,7 +306,7 @@ def handle_delete(page_id): def handle_bookmark(query): - url = query.get("url", [""])[0].strip() + url = clean_url(query.get("url", [""])[0].strip()) if not url or not url.startswith(("http://", "https://")): return _text_response("error: invalid url", headers={"Access-Control-Allow-Origin": "*"}) try: @@ -355,6 +412,51 @@ def handle_style_submit(body): return handle_style_form("Saved.") +def handle_tags(): + db = get_db() + rows = db.execute( + "SELECT t.name, COUNT(pt.page_id) AS cnt FROM tags t " + "JOIN page_tags pt ON t.id = pt.tag_id " + "GROUP BY t.id ORDER BY t.name" + ).fetchall() + db.close() + items = "" + for r in rows: + items += f'
  • {esc(r["name"])} ({r["cnt"]})
  • ' + return _respond( + f"

    tags

    " + f"" if items else "

    No tags yet. Add tags when saving or editing pages.

    " + f'back' + ) + + +def handle_tag_browse(tag_name): + db = get_db() + rows = db.execute( + "SELECT p.id, p.url, p.title, p.note FROM pages p " + "JOIN page_tags pt ON p.id = pt.page_id " + "JOIN tags t ON t.id = pt.tag_id " + "WHERE t.name = ? ORDER BY p.id DESC", + (tag_name,), + ).fetchall() + items = "" + for r in rows: + note_html = f' — {esc(r["note"])}' if r["note"] else "" + tags = _get_page_tags(r["id"], db) + tag_links = " ".join(f'[{esc(t)}]' for t in tags) + items += ( + f'
  • {esc(r["title"])}{note_html} {tag_links} ' + f'({esc(r["url"])})
  • ' + ) + db.close() + return _respond( + f'

    tag: {esc(tag_name)}

    ' + f'

    {len(rows)} page(s)

    ' + f'' + f'all tags | back' + ) + + def handle_api_sites(): if get_setting("sharing_enabled", "0") != "1": return _json_response( @@ -363,12 +465,13 @@ def handle_api_sites(): headers={"Access-Control-Allow-Origin": "*"}, ) db = get_db() - rows = db.execute("SELECT url, title, note FROM pages ORDER BY id DESC").fetchall() + rows = db.execute("SELECT id, url, title, note FROM pages ORDER BY id DESC").fetchall() + sites = [] + for r in rows: + tags = _get_page_tags(r["id"], db) + sites.append({"url": r["url"], "title": r["title"], "note": r["note"], "tags": tags}) db.close() - data = { - "name": get_site_name(), - "sites": [{"url": r["url"], "title": r["title"], "note": r["note"]} for r in rows], - } + data = {"name": get_site_name(), "sites": sites} return _json_response(data, headers={"Access-Control-Allow-Origin": "*"}) @@ -455,13 +558,16 @@ def handle_subscription_browse(sub_id): # Use locally synced data if available, otherwise fetch live remote_rows = db.execute( - "SELECT url, title, note FROM remote_pages WHERE subscription_id = ?", + "SELECT url, title, note, tags FROM remote_pages WHERE subscription_id = ?", (sub_id,), ).fetchall() db.close() if remote_rows: - sites = [{"url": r["url"], "title": r["title"], "note": r["note"]} for r in remote_rows] + sites = [] + for r in remote_rows: + tags = [t for t in r["tags"].split(",") if t] if r["tags"] else [] + sites.append({"url": r["url"], "title": r["title"], "note": r["note"], "tags": tags}) else: try: data = fetch_remote_sites(sub["dest_hash"]) @@ -483,9 +589,12 @@ def handle_subscription_browse(sub_id): else: new_count += 1 note_html = f' — {esc(s["note"])}' if s.get("note") else "" + tags_html = "" + if s.get("tags"): + tags_html = " " + " ".join(f'[{esc(t)}]' for t in s["tags"]) new_items += ( f'
  • ' ) @@ -509,16 +618,19 @@ def handle_subscription_pick(body): sub_id = body.get("sub_id", [""])[0] import_all = body.get("import_all", [""])[0] + # Build a url->tags map from remote_pages for this subscription + db = get_db() + remote_rows = db.execute( + "SELECT url, tags FROM remote_pages WHERE subscription_id = ?", (sub_id,) + ).fetchall() + remote_tags = {r["url"]: r["tags"] for r in remote_rows} + if import_all: - db = get_db() local_urls = set(r["url"] for r in db.execute("SELECT url FROM pages").fetchall()) - remote = db.execute( - "SELECT url FROM remote_pages WHERE subscription_id = ?", (sub_id,) - ).fetchall() - db.close() - urls = [r["url"] for r in remote if r["url"] not in local_urls] + urls = [r["url"] for r in remote_rows if r["url"] not in local_urls] else: urls = body.get("urls", []) + db.close() if not urls: return handle_subscriptions("No sites selected.") @@ -528,6 +640,15 @@ def handle_subscription_pick(body): for url in urls: try: index_url(url) + # Import tags from the remote page + tags_str = remote_tags.get(url, "") + if tags_str: + db = get_db() + row = db.execute("SELECT id FROM pages WHERE url = ?", (url,)).fetchone() + if row: + _set_page_tags(row["id"], tags_str, db) + db.commit() + db.close() imported += 1 except Exception: errors += 1 @@ -556,9 +677,10 @@ def handle_subscription_sync(sub_id): synced = 0 for s in sites: try: + tags_str = ",".join(s.get("tags", [])) db.execute( - "INSERT INTO remote_pages (subscription_id, url, title, note) VALUES (?, ?, ?, ?)", - (sub_id, s["url"], s["title"], s.get("note", "")), + "INSERT INTO remote_pages (subscription_id, url, title, note, tags) VALUES (?, ?, ?, ?, ?)", + (sub_id, s["url"], s["title"], s.get("note", ""), tags_str), ) synced += 1 except Exception: @@ -602,9 +724,10 @@ def handle_subscription_syncall(): db.execute("DELETE FROM remote_pages WHERE subscription_id = ?", (sub["id"],)) for s in sites: try: + tags_str = ",".join(s.get("tags", [])) db.execute( - "INSERT INTO remote_pages (subscription_id, url, title, note) VALUES (?, ?, ?, ?)", - (sub["id"], s["url"], s["title"], s.get("note", "")), + "INSERT INTO remote_pages (subscription_id, url, title, note, tags) VALUES (?, ?, ?, ?, ?)", + (sub["id"], s["url"], s["title"], s.get("note", ""), tags_str), ) except Exception: pass @@ -655,6 +778,11 @@ def dispatch_request(data): return handle_export() elif path == "/import": return handle_import_form() + elif path == "/tags": + return handle_tags() + elif path.startswith("/tags/"): + tag_name = path[len("/tags/"):] + return handle_tag_browse(tag_name) if tag_name else _error(400) elif path == "/api/sites": return handle_api_sites() elif path == "/subscriptions":