From 6981d39dddd1e613aef32ef4b40ce5432a732c44 Mon Sep 17 00:00:00 2001
From: Derick Phan
Date: Thu, 26 Mar 2026 11:34:15 -0700
Subject: [PATCH] Normalize URLs to prevent duplicate indexing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

clean_url() now canonicalizes: http→https, strips www., removes trailing
slashes, drops default ports, and sorts query params. Prevents the same
page from being indexed multiple times under different URL variations.

Co-Authored-By: Claude Opus 4.6
---
Review note: relative to the original commit, this patch also drops an
explicit :80 on URLs upgraded from http, restores the brackets around IPv6
literals, and adds a docstring. The post-image blob id on the "index" line
below is therefore stale.

 db.py | 44 ++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 40 insertions(+), 4 deletions(-)

diff --git a/db.py b/db.py
index 7d038e7..72128a3 100644
--- a/db.py
+++ b/db.py
@@ -2,7 +2,7 @@ import socket
 import ipaddress
 import sqlite3
 import requests
-from urllib.parse import urlparse, urljoin, parse_qs, urlencode, urlunparse
+from urllib.parse import urlparse, urljoin, parse_qs, urlencode, urlunparse, quote
 from bs4 import BeautifulSoup
 
 DATABASE = "index.db"
@@ -48,10 +48,46 @@ TRACKING_PARAMS = {
 
 def clean_url(url):
+    """Return a canonical form of url so equivalent URLs are indexed once.
+
+    http is upgraded to https, the host is lowercased and a leading "www."
+    removed, default ports and trailing slashes are dropped, tracking
+    params are removed and the remaining query is sorted. Userinfo and
+    fragment are discarded.
+
+    Raises ValueError if the URL contains a malformed port.
+    """
     parsed = urlparse(url)
+
+    # Prefer https
+    scheme = "https" if parsed.scheme in ("http", "https") else parsed.scheme
+
+    # Normalize hostname: lowercase, strip www.
+    hostname = (parsed.hostname or "").lower()
+    if hostname.startswith("www."):
+        hostname = hostname[4:]
+    if ":" in hostname:
+        # parsed.hostname drops the brackets around IPv6 literals; restore them
+        hostname = f"[{hostname}]"
+
+    # Preserve explicit non-default ports. Port 80 is checked against the
+    # original scheme: after the https upgrade above, "scheme" is never "http".
+    port = parsed.port
+    if (scheme == "https" and port == 443) or (parsed.scheme == "http" and port == 80):
+        port = None
+    netloc = f"{hostname}:{port}" if port else hostname
+
+    # Strip trailing slash (keep root "/" as-is)
+    path = parsed.path.rstrip("/") or "/"
+
+    # Remove tracking params and sort remaining for consistent ordering
     params = parse_qs(parsed.query)
-    cleaned = {k: v for k, v in params.items() if k.lower() not in TRACKING_PARAMS}
-    new_query = urlencode(cleaned, doseq=True)
-    return urlunparse(parsed._replace(query=new_query))
+    cleaned = sorted(
+        ((k, sorted(v)) for k, v in params.items() if k.lower() not in TRACKING_PARAMS),
+        key=lambda x: x[0],
+    )
+    new_query = urlencode(cleaned, doseq=True, quote_via=quote)
+
+    return urlunparse((scheme, netloc, path, "", new_query, ""))
 
 
 def get_db():