def clean_url(url):
    """Return a canonical form of *url* suitable for de-duplication.

    Normalization applied:

    * ``http`` and ``https`` both become ``https``; any other scheme is
      kept unchanged.
    * The hostname is lowercased and a leading ``www.`` is removed.
    * An explicit port is kept unless it is the default port of either the
      original scheme or the resulting scheme (so ``http://host:80`` and
      ``http://host:443`` both collapse to ``https://host``).
    * Trailing slashes are stripped from the path; an empty path becomes
      ``/``.
    * Query parameters whose lowercased name is in ``TRACKING_PARAMS`` are
      removed. The remaining parameters are sorted by name and each
      parameter's values are sorted, so ordering differences do not
      produce distinct URLs.
    * Userinfo, path parameters (``;...``) and the fragment are dropped.

    ``parse_qs`` is used with its defaults, so query parameters with blank
    values are discarded.

    Raises:
        ValueError: if the URL contains a port that is not a valid
            integer in the range 0-65535 (raised by ``parsed.port``).
    """
    default_ports = {"http": 80, "https": 443}

    parsed = urlparse(url)

    # Prefer https for web URLs; leave other schemes alone.
    original_scheme = parsed.scheme
    scheme = "https" if original_scheme in default_ports else original_scheme

    # Normalize hostname: lowercase, strip www.
    hostname = (parsed.hostname or "").lower()
    if hostname.startswith("www."):
        hostname = hostname[4:]
    # ``parsed.hostname`` removes the brackets around IPv6 literals; they
    # must be restored or the rebuilt netloc is ambiguous with the port.
    host = f"[{hostname}]" if ":" in hostname else hostname

    # Drop the port when it is implied by the scheme. The original scheme
    # has to be consulted as well: after the https upgrade, an explicit
    # ":80" from an http URL would otherwise survive as "https://host:80".
    implied_ports = {default_ports.get(original_scheme), default_ports.get(scheme)}
    port = parsed.port
    if port is not None and port not in implied_ports:
        netloc = f"{host}:{port}"
    else:
        netloc = host

    # Strip trailing slash (keep root "/" as-is).
    path = parsed.path.rstrip("/") or "/"

    # Remove tracking params and sort the rest for consistent ordering.
    params = parse_qs(parsed.query)
    kept = sorted(
        (name, sorted(values))
        for name, values in params.items()
        if name.lower() not in TRACKING_PARAMS
    )
    # quote_via=quote encodes spaces as %20 rather than "+".
    new_query = urlencode(kept, doseq=True, quote_via=quote)

    return urlunparse((scheme, netloc, path, "", new_query, ""))