Strip tracking params from URLs and add tags/collections
URLs are cleaned of tracking parameters (utm_*, fbclid, gclid, etc.) before indexing. Tags can be added when saving or editing pages, browsed at /tags, and are included in search results. Tags are shared via /api/sites and preserved when syncing/importing from subscriptions. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
4e4cc69e0f
commit
62055a578d
2 changed files with 192 additions and 27 deletions
39
db.py
39
db.py
|
|
@ -1,12 +1,26 @@
|
|||
import sqlite3
|
||||
import requests
|
||||
from urllib.parse import urlparse, urljoin
|
||||
from urllib.parse import urlparse, urljoin, parse_qs, urlencode, urlunparse
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
DATABASE = "index.db"
|
||||
|
||||
SKIP_EXT = (".png", ".jpg", ".jpeg", ".gif", ".svg", ".pdf", ".zip", ".mp3", ".mp4", ".css", ".js", ".ico", ".xml", ".json")
|
||||
|
||||
# Query-parameter names (compared in lowercase) that are stripped from URLs
# by clean_url() before indexing.
# NOTE(review): "ref" is a fairly generic name; confirm that no indexed site
# relies on it as a functional (non-tracking) parameter.
TRACKING_PARAMS = {
    "utm_source", "utm_medium", "utm_campaign", "utm_term", "utm_content",
    "fbclid", "gclid", "msclkid", "mc_cid", "mc_eid", "ref", "ref_src",
    "ref_url", "_ga", "_gl", "yclid", "twclid", "igshid",
}

# Parameter-name prefixes that are always treated as tracking. This covers
# the whole "utm_*" family (utm_id, utm_name, ...) rather than only the
# names enumerated in TRACKING_PARAMS.
TRACKING_PREFIXES = ("utm_",)


def clean_url(url):
    """Return *url* with tracking query parameters removed.

    A parameter is dropped when its name, compared case-insensitively, is
    listed in TRACKING_PARAMS or starts with one of TRACKING_PREFIXES.
    All other parameters are kept in their original order, including
    repeated keys and parameters with an empty value. Scheme, host, path
    and fragment are passed through unchanged.

    The surviving query string is re-encoded with ``urlencode``, so a bare
    key such as ``?flag`` is emitted as ``?flag=``.
    """
    # Local import: the module-level import list provides parse_qs only.
    from urllib.parse import parse_qsl

    parsed = urlparse(url)
    # parse_qsl keeps pair order and duplicates; keep_blank_values stops
    # legitimate parameters like "a=" from being silently discarded.
    pairs = parse_qsl(parsed.query, keep_blank_values=True)
    kept = []
    for key, value in pairs:
        lowered = key.lower()
        if lowered in TRACKING_PARAMS or lowered.startswith(TRACKING_PREFIXES):
            continue
        kept.append((key, value))
    return urlunparse(parsed._replace(query=urlencode(kept)))
|
||||
|
||||
|
||||
def get_db():
|
||||
db = sqlite3.connect(DATABASE)
|
||||
|
|
@ -60,6 +74,7 @@ def init_db():
|
|||
" url TEXT NOT NULL,"
|
||||
" title TEXT,"
|
||||
" note TEXT DEFAULT '',"
|
||||
" tags TEXT DEFAULT '',"
|
||||
" FOREIGN KEY (subscription_id) REFERENCES subscriptions(id) ON DELETE CASCADE,"
|
||||
" UNIQUE(subscription_id, url)"
|
||||
")"
|
||||
|
|
@ -68,6 +83,21 @@ def init_db():
|
|||
"CREATE VIRTUAL TABLE IF NOT EXISTS remote_pages_fts "
|
||||
"USING fts5(title, url, note, content=remote_pages, content_rowid=id)"
|
||||
)
|
||||
db.execute(
|
||||
"CREATE TABLE IF NOT EXISTS tags ("
|
||||
" id INTEGER PRIMARY KEY AUTOINCREMENT,"
|
||||
" name TEXT UNIQUE NOT NULL"
|
||||
")"
|
||||
)
|
||||
db.execute(
|
||||
"CREATE TABLE IF NOT EXISTS page_tags ("
|
||||
" page_id INTEGER NOT NULL,"
|
||||
" tag_id INTEGER NOT NULL,"
|
||||
" PRIMARY KEY (page_id, tag_id),"
|
||||
" FOREIGN KEY (page_id) REFERENCES pages(id) ON DELETE CASCADE,"
|
||||
" FOREIGN KEY (tag_id) REFERENCES tags(id) ON DELETE CASCADE"
|
||||
")"
|
||||
)
|
||||
db.executescript("""
|
||||
CREATE TRIGGER IF NOT EXISTS pages_ai AFTER INSERT ON pages BEGIN
|
||||
INSERT INTO pages_fts(rowid, title, body, url, note)
|
||||
|
|
@ -104,6 +134,12 @@ def init_db():
|
|||
db.execute("ALTER TABLE subscriptions RENAME COLUMN url TO dest_hash")
|
||||
db.commit()
|
||||
|
||||
# Migrate remote_pages: add tags column if missing
|
||||
rp_cols = [row[1] for row in db.execute("PRAGMA table_info(remote_pages)").fetchall()]
|
||||
if "tags" not in rp_cols:
|
||||
db.execute("ALTER TABLE remote_pages ADD COLUMN tags TEXT DEFAULT ''")
|
||||
db.commit()
|
||||
|
||||
db.commit()
|
||||
db.close()
|
||||
|
||||
|
|
@ -165,6 +201,7 @@ def fetch_page(url):
|
|||
|
||||
|
||||
def index_url(url, note=""):
|
||||
url = clean_url(url)
|
||||
title, body, links = fetch_page(url)
|
||||
db = get_db()
|
||||
cur = db.execute(
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue