Normalize URLs to prevent duplicate indexing

clean_url() now canonicalizes URLs: it upgrades http to https, strips a
leading www., removes trailing slashes, drops default ports, and sorts
query params. This prevents the same page from being indexed multiple
times under different URL variations.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Derick Phan 2026-03-26 11:34:15 -07:00
parent d2cb0d00bc
commit 6981d39ddd
No known key found for this signature in database

31
db.py
View file

@ -2,7 +2,7 @@ import socket
import ipaddress
import sqlite3
import requests
from urllib.parse import urlparse, urljoin, parse_qs, urlencode, urlunparse
from urllib.parse import urlparse, urljoin, parse_qs, urlencode, urlunparse, quote
from bs4 import BeautifulSoup
DATABASE = "index.db"
@ -48,10 +48,33 @@ TRACKING_PARAMS = {
def clean_url(url):
    """Return a canonical form of ``url`` so equivalent URLs index as one page.

    Normalization steps:

    * ``http`` is upgraded to ``https``; other schemes are left untouched.
    * The hostname is lowercased and a leading ``www.`` is removed.
    * Default ports are dropped; any other explicit port is preserved.
    * Trailing slashes are stripped from the path (the root stays ``/``).
    * Query parameters listed in ``TRACKING_PARAMS`` are removed; the
      remaining parameters, and the repeated values of each parameter,
      are sorted so ordering differences do not create distinct URLs.
    * Userinfo, path parameters and the fragment are discarded, because
      the result is rebuilt from scheme, host, port, path and query only.

    Note that ``parse_qs`` is called with its defaults, so parameters
    with blank values (``?a=``) are not carried over.

    Raises:
        ValueError: if the URL has a malformed port (raised by the
            ``port`` attribute of the parsed URL).
    """
    parsed = urlparse(url)

    # Prefer https
    is_web = parsed.scheme in ("http", "https")
    scheme = "https" if is_web else parsed.scheme

    # Normalize hostname: lowercase, strip www.
    hostname = (parsed.hostname or "").lower()
    if hostname.startswith("www."):
        hostname = hostname[4:]
    # ``parsed.hostname`` strips the brackets from IPv6 literals; they must
    # be restored or the rebuilt netloc ("::1:8080") is not a valid authority.
    if ":" in hostname:
        hostname = f"[{hostname}]"

    # Preserve explicit non-default ports. The check has to consider the
    # ORIGINAL scheme as well: after the http -> https upgrade, an explicit
    # ":80" on an http URL would otherwise survive as "https://host:80".
    port = parsed.port
    if is_web:
        default_ports = {80, 443} if parsed.scheme == "http" else {443}
        if port in default_ports:
            port = None
    netloc = f"{hostname}:{port}" if port else hostname

    # Strip trailing slash (keep root "/" as-is)
    path = parsed.path.rstrip("/") or "/"

    # Remove tracking params and sort remaining for consistent ordering.
    # Keys come from a dict and are therefore unique, so plain tuple
    # ordering sorts by key alone.
    params = parse_qs(parsed.query)
    cleaned = sorted(
        (k, sorted(v))
        for k, v in params.items()
        if k.lower() not in TRACKING_PARAMS
    )
    new_query = urlencode(cleaned, doseq=True, quote_via=quote)

    return urlunparse((scheme, netloc, path, "", new_query, ""))
def get_db():