Strip tracking params from URLs and add tags/collections

URLs are cleaned of tracking parameters (utm_*, fbclid, gclid, etc.)
before indexing. Tags can be added when saving or editing pages,
browsed at /tags, and are included in search results. Tags are shared
via /api/sites and preserved when syncing/importing from subscriptions.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Derick Phan 2026-03-25 23:15:28 -07:00
parent 4e4cc69e0f
commit 62055a578d
No known key found for this signature in database
2 changed files with 192 additions and 27 deletions

39
db.py
View file

@ -1,12 +1,26 @@
import sqlite3 import sqlite3
import requests import requests
from urllib.parse import urlparse, urljoin from urllib.parse import urlparse, urljoin, parse_qs, urlencode, urlunparse
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
DATABASE = "index.db" DATABASE = "index.db"
SKIP_EXT = (".png", ".jpg", ".jpeg", ".gif", ".svg", ".pdf", ".zip", ".mp3", ".mp4", ".css", ".js", ".ico", ".xml", ".json") SKIP_EXT = (".png", ".jpg", ".jpeg", ".gif", ".svg", ".pdf", ".zip", ".mp3", ".mp4", ".css", ".js", ".ico", ".xml", ".json")
# Query-string keys that carry click/campaign attribution rather than
# identifying content. Compared case-insensitively by clean_url().
TRACKING_PARAMS = {
    "utm_source", "utm_medium", "utm_campaign", "utm_term", "utm_content",
    "fbclid", "gclid", "msclkid", "mc_cid", "mc_eid", "ref", "ref_src",
    "ref_url", "_ga", "_gl", "yclid", "twclid", "igshid",
}


def clean_url(url):
    """Return *url* with known tracking query parameters removed.

    Every other part of the URL (scheme, host, path, fragment) and every
    non-tracking parameter is kept. Parameter names are matched against
    TRACKING_PARAMS case-insensitively.

    Args:
        url: The URL string to clean. An empty string is returned unchanged.

    Returns:
        The URL re-assembled without the tracking parameters.
    """
    parsed = urlparse(url)
    # keep_blank_values=True: without it parse_qs silently drops parameters
    # such as "q=" that are not tracking parameters at all, which would
    # change the URL's meaning instead of merely cleaning it.
    params = parse_qs(parsed.query, keep_blank_values=True)
    kept = {
        key: values
        for key, values in params.items()
        if key.lower() not in TRACKING_PARAMS
    }
    # doseq=True expands repeated keys ("tag=a&tag=b") back into one
    # key=value pair per value.
    return urlunparse(parsed._replace(query=urlencode(kept, doseq=True)))
def get_db(): def get_db():
db = sqlite3.connect(DATABASE) db = sqlite3.connect(DATABASE)
@ -60,6 +74,7 @@ def init_db():
" url TEXT NOT NULL," " url TEXT NOT NULL,"
" title TEXT," " title TEXT,"
" note TEXT DEFAULT ''," " note TEXT DEFAULT '',"
" tags TEXT DEFAULT '',"
" FOREIGN KEY (subscription_id) REFERENCES subscriptions(id) ON DELETE CASCADE," " FOREIGN KEY (subscription_id) REFERENCES subscriptions(id) ON DELETE CASCADE,"
" UNIQUE(subscription_id, url)" " UNIQUE(subscription_id, url)"
")" ")"
@ -68,6 +83,21 @@ def init_db():
"CREATE VIRTUAL TABLE IF NOT EXISTS remote_pages_fts " "CREATE VIRTUAL TABLE IF NOT EXISTS remote_pages_fts "
"USING fts5(title, url, note, content=remote_pages, content_rowid=id)" "USING fts5(title, url, note, content=remote_pages, content_rowid=id)"
) )
db.execute(
"CREATE TABLE IF NOT EXISTS tags ("
" id INTEGER PRIMARY KEY AUTOINCREMENT,"
" name TEXT UNIQUE NOT NULL"
")"
)
db.execute(
"CREATE TABLE IF NOT EXISTS page_tags ("
" page_id INTEGER NOT NULL,"
" tag_id INTEGER NOT NULL,"
" PRIMARY KEY (page_id, tag_id),"
" FOREIGN KEY (page_id) REFERENCES pages(id) ON DELETE CASCADE,"
" FOREIGN KEY (tag_id) REFERENCES tags(id) ON DELETE CASCADE"
")"
)
db.executescript(""" db.executescript("""
CREATE TRIGGER IF NOT EXISTS pages_ai AFTER INSERT ON pages BEGIN CREATE TRIGGER IF NOT EXISTS pages_ai AFTER INSERT ON pages BEGIN
INSERT INTO pages_fts(rowid, title, body, url, note) INSERT INTO pages_fts(rowid, title, body, url, note)
@ -104,6 +134,12 @@ def init_db():
db.execute("ALTER TABLE subscriptions RENAME COLUMN url TO dest_hash") db.execute("ALTER TABLE subscriptions RENAME COLUMN url TO dest_hash")
db.commit() db.commit()
# Migrate remote_pages: add tags column if missing
rp_cols = [row[1] for row in db.execute("PRAGMA table_info(remote_pages)").fetchall()]
if "tags" not in rp_cols:
db.execute("ALTER TABLE remote_pages ADD COLUMN tags TEXT DEFAULT ''")
db.commit()
db.commit() db.commit()
db.close() db.close()
@ -165,6 +201,7 @@ def fetch_page(url):
def index_url(url, note=""): def index_url(url, note=""):
url = clean_url(url)
title, body, links = fetch_page(url) title, body, links = fetch_page(url)
db = get_db() db = get_db()
cur = db.execute( cur = db.execute(

View file

@ -1,7 +1,7 @@
import json import json
from datetime import datetime from datetime import datetime
from db import get_db, get_setting, set_setting, get_site_name, index_url from db import get_db, get_setting, set_setting, get_site_name, index_url, clean_url
from templates import esc, snippet, wrap_page from templates import esc, snippet, wrap_page
from rns_client import fetch_remote_sites from rns_client import fetch_remote_sites
@ -46,6 +46,38 @@ def _error(status):
return _respond(f"<h1>{status}</h1>", status) return _respond(f"<h1>{status}</h1>", status)
# --- Tag helpers ---
def _get_page_tags(page_id, db=None):
close = False
if db is None:
db = get_db()
close = True
rows = db.execute(
"SELECT t.name FROM tags t JOIN page_tags pt ON t.id = pt.tag_id "
"WHERE pt.page_id = ? ORDER BY t.name", (page_id,)
).fetchall()
if close:
db.close()
return [r["name"] for r in rows]
def _set_page_tags(page_id, tag_string, db=None):
close = False
if db is None:
db = get_db()
close = True
db.execute("DELETE FROM page_tags WHERE page_id = ?", (page_id,))
for name in (t.strip().lower() for t in tag_string.split(",") if t.strip()):
db.execute("INSERT OR IGNORE INTO tags (name) VALUES (?)", (name,))
tag_id = db.execute("SELECT id FROM tags WHERE name = ?", (name,)).fetchone()["id"]
db.execute("INSERT OR IGNORE INTO page_tags (page_id, tag_id) VALUES (?, ?)", (page_id, tag_id))
if close:
db.commit()
db.close()
# --- Route handlers --- # --- Route handlers ---
@ -69,12 +101,17 @@ def handle_search(query):
note_html = "" note_html = ""
if r["note"]: if r["note"]:
note_html = f'<div class="note"><em>{esc(r["note"])}</em></div>' note_html = f'<div class="note"><em>{esc(r["note"])}</em></div>'
tags = _get_page_tags(r["id"], db)
tags_html = ""
if tags:
tag_links = " ".join(f'<a href="/tags/{esc(t)}" class="tag">[{esc(t)}]</a>' for t in tags)
tags_html = f'<div class="tags">{tag_links}</div>'
result_html += ( result_html += (
f'<div class="result">' f'<div class="result">'
f'<a href="{esc(r["url"])}">{esc(r["title"])}</a><br>' f'<a href="{esc(r["url"])}">{esc(r["title"])}</a><br>'
f'<small>{esc(r["url"])}</small><br>' f'<small>{esc(r["url"])}</small><br>'
f'{esc(snippet(r["body"], q))}' f'{esc(snippet(r["body"], q))}'
f'{note_html}' f'{note_html}{tags_html}'
f'</div>' f'</div>'
) )
else: else:
@ -157,6 +194,7 @@ def handle_search(query):
f'<p>{count} page(s) indexed.' f'<p>{count} page(s) indexed.'
f' <a href="/add">+ add url</a>' f' <a href="/add">+ add url</a>'
f' | <a href="/pages">browse</a>' f' | <a href="/pages">browse</a>'
f' | <a href="/tags">tags</a>'
f' | <a href="/subscriptions">subscriptions</a>' f' | <a href="/subscriptions">subscriptions</a>'
f' | <a href="/style">customize</a></p>' f' | <a href="/style">customize</a></p>'
f'<hr>{result_html}{trusted_html}{remote_html}' f'<hr>{result_html}{trusted_html}{remote_html}'
@ -169,6 +207,7 @@ def handle_add_form(msg=""):
f'<form method="post" action="/add">' f'<form method="post" action="/add">'
f'<input name="url" placeholder="https://example.com" size="50"><br><br>' f'<input name="url" placeholder="https://example.com" size="50"><br><br>'
f'<input name="note" placeholder="why are you saving this? (optional)" size="50"><br><br>' f'<input name="note" placeholder="why are you saving this? (optional)" size="50"><br><br>'
f'<input name="tags" placeholder="tags (comma-separated, e.g. solarpunk, mesh)" size="50"><br><br>'
f'<button type="submit">index</button>' f'<button type="submit">index</button>'
f"</form>" f"</form>"
f"<p>{msg}</p>" f"<p>{msg}</p>"
@ -177,14 +216,22 @@ def handle_add_form(msg=""):
def handle_add_submit(body):
    """Handle the POSTed /add form: index a URL and attach optional tags.

    Args:
        body: Parsed form data mapping field names to lists of values;
            the "url", "note" and "tags" fields are read.

    Returns:
        The add-form response, carrying either a validation message, an
        "Indexed" confirmation, or the error text of a failed attempt.
    """
    # Strip tracking parameters up front so the page lookup below uses the
    # same URL that was validated and passed to index_url().
    url = clean_url(body.get("url", [""])[0].strip())
    note = body.get("note", [""])[0].strip()
    tags = body.get("tags", [""])[0].strip()
    if not url:
        return handle_add_form("URL is required.")
    if not url.startswith(("http://", "https://")):
        return handle_add_form("URL must start with http:// or https://")
    try:
        title = index_url(url, note)
        if tags:
            db = get_db()
            try:
                row = db.execute("SELECT id FROM pages WHERE url = ?", (url,)).fetchone()
                if row:
                    _set_page_tags(row["id"], tags, db)
                    db.commit()
            finally:
                # The broad except below would otherwise swallow the error
                # and leave this connection open.
                db.close()
        return handle_add_form(f'Indexed: <a href="{esc(url)}">{esc(title)}</a>')
    except Exception as e:
        return handle_add_form(f"Error: {esc(str(e))}")
@ -193,16 +240,21 @@ def handle_add_submit(body):
def handle_pages(): def handle_pages():
db = get_db() db = get_db()
rows = db.execute("SELECT id, url, title, note FROM pages ORDER BY id DESC").fetchall() rows = db.execute("SELECT id, url, title, note FROM pages ORDER BY id DESC").fetchall()
db.close()
items = "" items = ""
for r in rows: for r in rows:
note_html = f' — <em>{esc(r["note"])}</em>' if r["note"] else "" note_html = f' — <em>{esc(r["note"])}</em>' if r["note"] else ""
tags = _get_page_tags(r["id"], db)
tags_html = ""
if tags:
tag_links = " ".join(f'<a href="/tags/{esc(t)}">[{esc(t)}]</a>' for t in tags)
tags_html = f' {tag_links}'
items += ( items += (
f'<li>{esc(r["title"])}{note_html} ' f'<li>{esc(r["title"])}{note_html}{tags_html} '
f'<small>(<a href="{esc(r["url"])}">{esc(r["url"])}</a>)</small> ' f'<small>(<a href="{esc(r["url"])}">{esc(r["url"])}</a>)</small> '
f'<a href="/edit/{r["id"]}">edit</a> ' f'<a href="/edit/{r["id"]}">edit</a> '
f'<a href="/delete/{r["id"]}">remove</a></li>' f'<a href="/delete/{r["id"]}">remove</a></li>'
) )
db.close()
return _respond( return _respond(
f"<h1>indexed pages ({len(rows)})</h1>" f"<h1>indexed pages ({len(rows)})</h1>"
f"<ul>{items}</ul>" f"<ul>{items}</ul>"
@ -214,15 +266,18 @@ def handle_pages():
def handle_edit_form(page_id, msg=""): def handle_edit_form(page_id, msg=""):
db = get_db() db = get_db()
row = db.execute("SELECT id, url, title, note FROM pages WHERE id = ?", (page_id,)).fetchone() row = db.execute("SELECT id, url, title, note FROM pages WHERE id = ?", (page_id,)).fetchone()
db.close()
if not row: if not row:
db.close()
return _error(404) return _error(404)
tags = ", ".join(_get_page_tags(page_id, db))
db.close()
return _respond( return _respond(
f"<h1>edit note</h1>" f"<h1>edit page</h1>"
f"<p><b>{esc(row['title'])}</b><br>" f"<p><b>{esc(row['title'])}</b><br>"
f"<small>{esc(row['url'])}</small></p>" f"<small>{esc(row['url'])}</small></p>"
f'<form method="post" action="/edit/{row["id"]}">' f'<form method="post" action="/edit/{row["id"]}">'
f'<input name="note" value="{esc(row["note"])}" placeholder="why did you save this?" size="50"><br><br>' f'<input name="note" value="{esc(row["note"])}" placeholder="why did you save this?" size="50"><br><br>'
f'<input name="tags" value="{esc(tags)}" placeholder="tags (comma-separated)" size="50"><br><br>'
f'<button type="submit">save</button>' f'<button type="submit">save</button>'
f"</form>" f"</form>"
f"<p>{msg}</p>" f"<p>{msg}</p>"
@ -232,8 +287,10 @@ def handle_edit_form(page_id, msg=""):
def handle_edit_submit(page_id, body):
    """Handle the POSTed /edit/<id> form: save a page's note and tags.

    Args:
        page_id: Id of the pages row being edited.
        body: Parsed form data mapping field names to lists of values;
            the "note" and "tags" fields are read.

    Returns:
        A redirect response to /pages.
    """
    note = body.get("note", [""])[0].strip()
    tags = body.get("tags", [""])[0].strip()
    db = get_db()
    try:
        db.execute("UPDATE pages SET note = ? WHERE id = ?", (note, page_id))
        # Shares this connection so the note and tag changes are committed
        # together by the single commit below.
        _set_page_tags(page_id, tags, db)
        db.commit()
    finally:
        # Close the connection even if the update or tag write raises.
        db.close()
    return _redirect("/pages")
@ -249,7 +306,7 @@ def handle_delete(page_id):
def handle_bookmark(query): def handle_bookmark(query):
url = query.get("url", [""])[0].strip() url = clean_url(query.get("url", [""])[0].strip())
if not url or not url.startswith(("http://", "https://")): if not url or not url.startswith(("http://", "https://")):
return _text_response("error: invalid url", headers={"Access-Control-Allow-Origin": "*"}) return _text_response("error: invalid url", headers={"Access-Control-Allow-Origin": "*"})
try: try:
@ -355,6 +412,51 @@ def handle_style_submit(body):
return handle_style_form("Saved.") return handle_style_form("Saved.")
def handle_tags():
    """Render the /tags page: every tag in use with its page count.

    Only tags attached to at least one page are listed (inner join on
    page_tags), ordered by name.

    Returns:
        The response built by _respond() for the tag listing.
    """
    db = get_db()
    try:
        rows = db.execute(
            "SELECT t.name, COUNT(pt.page_id) AS cnt FROM tags t "
            "JOIN page_tags pt ON t.id = pt.tag_id "
            "GROUP BY t.id ORDER BY t.name"
        ).fetchall()
    finally:
        db.close()
    items = "".join(
        f'<li><a href="/tags/{esc(r["name"])}">{esc(r["name"])}</a> ({r["cnt"]})</li>'
        for r in rows
    )
    # Resolve the conditional on its own line. Written inline between
    # adjacent string literals, the literals concatenate first, so the
    # heading was dropped for an empty list and the back link was dropped
    # for a non-empty one.
    if items:
        listing = f"<ul>{items}</ul>"
    else:
        listing = "<p>No tags yet. Add tags when saving or editing pages.</p>"
    return _respond(
        f"<h1>tags</h1>"
        f"{listing}"
        f'<a href="/">back</a>'
    )
def handle_tag_browse(tag_name):
    """Render the /tags/<name> page: all pages carrying one tag.

    Args:
        tag_name: Tag name taken from the request path.

    Returns:
        The response built by _respond(), listing matching pages newest
        first (by id), each with its note and full tag list.
    """
    # _set_page_tags() stores names stripped and lowercased, so normalise
    # the requested name the same way; otherwise "/tags/Mesh" finds nothing.
    lookup = tag_name.strip().lower()
    db = get_db()
    try:
        rows = db.execute(
            "SELECT p.id, p.url, p.title, p.note FROM pages p "
            "JOIN page_tags pt ON p.id = pt.page_id "
            "JOIN tags t ON t.id = pt.tag_id "
            "WHERE t.name = ? ORDER BY p.id DESC",
            (lookup,),
        ).fetchall()
        entries = []
        for r in rows:
            note_html = f' — <em>{esc(r["note"])}</em>' if r["note"] else ""
            # Reuse the open connection for the per-page tag lookup.
            tags = _get_page_tags(r["id"], db)
            tag_links = " ".join(f'<a href="/tags/{esc(t)}">[{esc(t)}]</a>' for t in tags)
            entries.append(
                f'<li>{esc(r["title"])}{note_html} {tag_links} '
                f'<small>(<a href="{esc(r["url"])}">{esc(r["url"])}</a>)</small></li>'
            )
    finally:
        # Close the connection even if a query or lookup raises.
        db.close()
    items = "".join(entries)
    return _respond(
        f'<h1>tag: {esc(lookup)}</h1>'
        f'<p>{len(rows)} page(s)</p>'
        f'<ul>{items}</ul>'
        f'<a href="/tags">all tags</a> | <a href="/">back</a>'
    )
def handle_api_sites(): def handle_api_sites():
if get_setting("sharing_enabled", "0") != "1": if get_setting("sharing_enabled", "0") != "1":
return _json_response( return _json_response(
@ -363,12 +465,13 @@ def handle_api_sites():
headers={"Access-Control-Allow-Origin": "*"}, headers={"Access-Control-Allow-Origin": "*"},
) )
db = get_db() db = get_db()
rows = db.execute("SELECT url, title, note FROM pages ORDER BY id DESC").fetchall() rows = db.execute("SELECT id, url, title, note FROM pages ORDER BY id DESC").fetchall()
sites = []
for r in rows:
tags = _get_page_tags(r["id"], db)
sites.append({"url": r["url"], "title": r["title"], "note": r["note"], "tags": tags})
db.close() db.close()
data = { data = {"name": get_site_name(), "sites": sites}
"name": get_site_name(),
"sites": [{"url": r["url"], "title": r["title"], "note": r["note"]} for r in rows],
}
return _json_response(data, headers={"Access-Control-Allow-Origin": "*"}) return _json_response(data, headers={"Access-Control-Allow-Origin": "*"})
@ -455,13 +558,16 @@ def handle_subscription_browse(sub_id):
# Use locally synced data if available, otherwise fetch live # Use locally synced data if available, otherwise fetch live
remote_rows = db.execute( remote_rows = db.execute(
"SELECT url, title, note FROM remote_pages WHERE subscription_id = ?", "SELECT url, title, note, tags FROM remote_pages WHERE subscription_id = ?",
(sub_id,), (sub_id,),
).fetchall() ).fetchall()
db.close() db.close()
if remote_rows: if remote_rows:
sites = [{"url": r["url"], "title": r["title"], "note": r["note"]} for r in remote_rows] sites = []
for r in remote_rows:
tags = [t for t in r["tags"].split(",") if t] if r["tags"] else []
sites.append({"url": r["url"], "title": r["title"], "note": r["note"], "tags": tags})
else: else:
try: try:
data = fetch_remote_sites(sub["dest_hash"]) data = fetch_remote_sites(sub["dest_hash"])
@ -483,9 +589,12 @@ def handle_subscription_browse(sub_id):
else: else:
new_count += 1 new_count += 1
note_html = f' — <em>{esc(s["note"])}</em>' if s.get("note") else "" note_html = f' — <em>{esc(s["note"])}</em>' if s.get("note") else ""
tags_html = ""
if s.get("tags"):
tags_html = " " + " ".join(f'[{esc(t)}]' for t in s["tags"])
new_items += ( new_items += (
f'<li><label><input type="checkbox" name="urls" value="{esc(s["url"])}">' f'<li><label><input type="checkbox" name="urls" value="{esc(s["url"])}">'
f' {esc(s["title"])}{note_html}' f' {esc(s["title"])}{note_html}{tags_html}'
f' <small>({esc(s["url"])})</small></label></li>' f' <small>({esc(s["url"])})</small></label></li>'
) )
@ -509,16 +618,19 @@ def handle_subscription_pick(body):
sub_id = body.get("sub_id", [""])[0] sub_id = body.get("sub_id", [""])[0]
import_all = body.get("import_all", [""])[0] import_all = body.get("import_all", [""])[0]
# Build a url->tags map from remote_pages for this subscription
db = get_db()
remote_rows = db.execute(
"SELECT url, tags FROM remote_pages WHERE subscription_id = ?", (sub_id,)
).fetchall()
remote_tags = {r["url"]: r["tags"] for r in remote_rows}
if import_all: if import_all:
db = get_db()
local_urls = set(r["url"] for r in db.execute("SELECT url FROM pages").fetchall()) local_urls = set(r["url"] for r in db.execute("SELECT url FROM pages").fetchall())
remote = db.execute( urls = [r["url"] for r in remote_rows if r["url"] not in local_urls]
"SELECT url FROM remote_pages WHERE subscription_id = ?", (sub_id,)
).fetchall()
db.close()
urls = [r["url"] for r in remote if r["url"] not in local_urls]
else: else:
urls = body.get("urls", []) urls = body.get("urls", [])
db.close()
if not urls: if not urls:
return handle_subscriptions("No sites selected.") return handle_subscriptions("No sites selected.")
@ -528,6 +640,15 @@ def handle_subscription_pick(body):
for url in urls: for url in urls:
try: try:
index_url(url) index_url(url)
# Import tags from the remote page
tags_str = remote_tags.get(url, "")
if tags_str:
db = get_db()
row = db.execute("SELECT id FROM pages WHERE url = ?", (url,)).fetchone()
if row:
_set_page_tags(row["id"], tags_str, db)
db.commit()
db.close()
imported += 1 imported += 1
except Exception: except Exception:
errors += 1 errors += 1
@ -556,9 +677,10 @@ def handle_subscription_sync(sub_id):
synced = 0 synced = 0
for s in sites: for s in sites:
try: try:
tags_str = ",".join(s.get("tags", []))
db.execute( db.execute(
"INSERT INTO remote_pages (subscription_id, url, title, note) VALUES (?, ?, ?, ?)", "INSERT INTO remote_pages (subscription_id, url, title, note, tags) VALUES (?, ?, ?, ?, ?)",
(sub_id, s["url"], s["title"], s.get("note", "")), (sub_id, s["url"], s["title"], s.get("note", ""), tags_str),
) )
synced += 1 synced += 1
except Exception: except Exception:
@ -602,9 +724,10 @@ def handle_subscription_syncall():
db.execute("DELETE FROM remote_pages WHERE subscription_id = ?", (sub["id"],)) db.execute("DELETE FROM remote_pages WHERE subscription_id = ?", (sub["id"],))
for s in sites: for s in sites:
try: try:
tags_str = ",".join(s.get("tags", []))
db.execute( db.execute(
"INSERT INTO remote_pages (subscription_id, url, title, note) VALUES (?, ?, ?, ?)", "INSERT INTO remote_pages (subscription_id, url, title, note, tags) VALUES (?, ?, ?, ?, ?)",
(sub["id"], s["url"], s["title"], s.get("note", "")), (sub["id"], s["url"], s["title"], s.get("note", ""), tags_str),
) )
except Exception: except Exception:
pass pass
@ -655,6 +778,11 @@ def dispatch_request(data):
return handle_export() return handle_export()
elif path == "/import": elif path == "/import":
return handle_import_form() return handle_import_form()
elif path == "/tags":
return handle_tags()
elif path.startswith("/tags/"):
tag_name = path[len("/tags/"):]
return handle_tag_browse(tag_name) if tag_name else _error(400)
elif path == "/api/sites": elif path == "/api/sites":
return handle_api_sites() return handle_api_sites()
elif path == "/subscriptions": elif path == "/subscriptions":