stripped tracking params, added tags

URLs are cleaned of tracking parameters (utm_*, fbclid, gclid, etc.)
before indexing. Tags can be added when saving or editing pages,
browsed at /tags, and are included in search results. Tags are shared
via /api/sites and preserved when syncing/importing from subscriptions.
This commit is contained in:
lichenblankie 2026-03-25 23:15:28 -07:00
parent c5d8d350a6
commit acfa9f6d4f
2 changed files with 192 additions and 27 deletions

39
db.py
View file

@ -1,12 +1,26 @@
import sqlite3
import requests
from urllib.parse import urlparse, urljoin
from urllib.parse import urlparse, urljoin, parse_qs, urlencode, urlunparse
from bs4 import BeautifulSoup
DATABASE = "index.db"
SKIP_EXT = (".png", ".jpg", ".jpeg", ".gif", ".svg", ".pdf", ".zip", ".mp3", ".mp4", ".css", ".js", ".ico", ".xml", ".json")
# Query-string keys that clean_url() treats as tracking-only and strips.
# Keys are compared case-insensitively, so entries must be lowercase.
TRACKING_PARAMS = {
    "utm_source", "utm_medium", "utm_campaign", "utm_term", "utm_content",
    "fbclid", "gclid", "msclkid", "mc_cid", "mc_eid", "ref", "ref_src",
    "ref_url", "_ga", "_gl", "yclid", "twclid", "igshid",
}


def clean_url(url):
    """Return *url* with known tracking query parameters removed.

    Any query key whose lowercase form is in ``TRACKING_PARAMS`` is dropped;
    every other component (scheme, host, path, params, fragment) is kept.
    The remaining query is rebuilt with ``urlencode``, so its escaping may be
    normalized and repeated keys are grouped at their first occurrence.

    Args:
        url: The URL string to clean.

    Returns:
        The cleaned URL string. If no query parameters remain, the result
        has no ``?`` at all.
    """
    parsed = urlparse(url)
    # keep_blank_values=True: by default parse_qs silently discards
    # parameters with an empty value (e.g. "?flag=&id=5"), which would strip
    # legitimate non-tracking parameters along with the tracking ones.
    params = parse_qs(parsed.query, keep_blank_values=True)
    cleaned = {
        key: values
        for key, values in params.items()
        if key.lower() not in TRACKING_PARAMS
    }
    # doseq=True expands each list of values back into repeated key=value pairs.
    new_query = urlencode(cleaned, doseq=True)
    return urlunparse(parsed._replace(query=new_query))
def get_db():
db = sqlite3.connect(DATABASE)
@ -60,6 +74,7 @@ def init_db():
" url TEXT NOT NULL,"
" title TEXT,"
" note TEXT DEFAULT '',"
" tags TEXT DEFAULT '',"
" FOREIGN KEY (subscription_id) REFERENCES subscriptions(id) ON DELETE CASCADE,"
" UNIQUE(subscription_id, url)"
")"
@ -68,6 +83,21 @@ def init_db():
"CREATE VIRTUAL TABLE IF NOT EXISTS remote_pages_fts "
"USING fts5(title, url, note, content=remote_pages, content_rowid=id)"
)
db.execute(
"CREATE TABLE IF NOT EXISTS tags ("
" id INTEGER PRIMARY KEY AUTOINCREMENT,"
" name TEXT UNIQUE NOT NULL"
")"
)
db.execute(
"CREATE TABLE IF NOT EXISTS page_tags ("
" page_id INTEGER NOT NULL,"
" tag_id INTEGER NOT NULL,"
" PRIMARY KEY (page_id, tag_id),"
" FOREIGN KEY (page_id) REFERENCES pages(id) ON DELETE CASCADE,"
" FOREIGN KEY (tag_id) REFERENCES tags(id) ON DELETE CASCADE"
")"
)
db.executescript("""
CREATE TRIGGER IF NOT EXISTS pages_ai AFTER INSERT ON pages BEGIN
INSERT INTO pages_fts(rowid, title, body, url, note)
@ -104,6 +134,12 @@ def init_db():
db.execute("ALTER TABLE subscriptions RENAME COLUMN url TO dest_hash")
db.commit()
# Migrate remote_pages: add tags column if missing
rp_cols = [row[1] for row in db.execute("PRAGMA table_info(remote_pages)").fetchall()]
if "tags" not in rp_cols:
db.execute("ALTER TABLE remote_pages ADD COLUMN tags TEXT DEFAULT ''")
db.commit()
db.commit()
db.close()
@ -165,6 +201,7 @@ def fetch_page(url):
def index_url(url, note=""):
url = clean_url(url)
title, body, links = fetch_page(url)
db = get_db()
cur = db.execute(