From 6981d39dddd1e613aef32ef4b40ce5432a732c44 Mon Sep 17 00:00:00 2001
From: Derick Phan <derickphan@fico.com>
Date: Thu, 26 Mar 2026 11:34:15 -0700
Subject: [PATCH] Normalize URLs to prevent duplicate indexing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

clean_url() now canonicalizes URLs: http→https, lowercases the host and
strips www., removes trailing slashes, drops default ports, discards
fragments, and sorts query params. This prevents the same page from
being indexed multiple times under different URL variations.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 db.py | 31 +++++++++++++++++++++++++++----
 1 file changed, 27 insertions(+), 4 deletions(-)

diff --git a/db.py b/db.py
index 7d038e7..72128a3 100644
--- a/db.py
+++ b/db.py
@@ -2,7 +2,7 @@ import socket
 import ipaddress
 import sqlite3
 import requests
-from urllib.parse import urlparse, urljoin, parse_qs, urlencode, urlunparse
+from urllib.parse import urlparse, urljoin, parse_qs, urlencode, urlunparse, quote
 from bs4 import BeautifulSoup
 
 DATABASE = "index.db"
@@ -48,10 +48,33 @@ TRACKING_PARAMS = {
 
 def clean_url(url):
+    """Return a canonical form of *url* so URL variants index as one page.
+
+    Forces https, lowercases the host and strips "www.", and drops default
+    ports, trailing slashes, fragments and tracking params; the remaining
+    query params are sorted.  Raises ValueError on a malformed port.
+    """
     parsed = urlparse(url)
+    scheme = "https" if parsed.scheme in ("http", "https") else parsed.scheme
+
+    hostname = (parsed.hostname or "").lower()
+    if hostname.startswith("www."):
+        hostname = hostname[4:]
+    if ":" in hostname:  # IPv6 literal: .hostname drops the brackets netloc needs
+        hostname = f"[{hostname}]"
+
+    # Test port 80 against the scheme as written: after the https upgrade
+    # `scheme` is never "http", so an explicit ":80" would never be dropped.
+    port = parsed.port
+    if (scheme == "https" and port == 443) or (parsed.scheme == "http" and port == 80):
+        port = None
+    netloc = f"{hostname}:{port}" if port else hostname
+    path = parsed.path.rstrip("/") or "/"  # keep root "/" as-is
+
+    # Remove tracking params and sort remaining for consistent ordering
     params = parse_qs(parsed.query)
-    cleaned = {k: v for k, v in params.items() if k.lower() not in TRACKING_PARAMS}
-    new_query = urlencode(cleaned, doseq=True)
-    return urlunparse(parsed._replace(query=new_query))
+    kept = ((k, sorted(v)) for k, v in params.items() if k.lower() not in TRACKING_PARAMS)
+    new_query = urlencode(sorted(kept), doseq=True, quote_via=quote)
+    return urlunparse((scheme, netloc, path, "", new_query, ""))
 
 
 def get_db():