Normalize URLs to prevent duplicate indexing
clean_url() now canonicalizes URLs: it upgrades http to https, strips a leading www., removes trailing slashes, drops default ports, and sorts query params. This prevents the same page from being indexed multiple times under different URL variations. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
d2cb0d00bc
commit
6981d39ddd
1 changed file with 27 additions and 4 deletions
31
db.py
31
db.py
|
|
@@ -2,7 +2,7 @@ import socket
|
||||||
import ipaddress
|
import ipaddress
|
||||||
import sqlite3
|
import sqlite3
|
||||||
import requests
|
import requests
|
||||||
from urllib.parse import urlparse, urljoin, parse_qs, urlencode, urlunparse
|
from urllib.parse import urlparse, urljoin, parse_qs, urlencode, urlunparse, quote
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
DATABASE = "index.db"
|
DATABASE = "index.db"
|
||||||
|
|
@@ -48,10 +48,33 @@ TRACKING_PARAMS = {
|
||||||
|
|
||||||
def clean_url(url):
|
def clean_url(url):
|
||||||
parsed = urlparse(url)
|
parsed = urlparse(url)
|
||||||
|
|
||||||
|
# Prefer https
|
||||||
|
scheme = "https" if parsed.scheme in ("http", "https") else parsed.scheme
|
||||||
|
|
||||||
|
# Normalize hostname: lowercase, strip www.
|
||||||
|
hostname = (parsed.hostname or "").lower()
|
||||||
|
if hostname.startswith("www."):
|
||||||
|
hostname = hostname[4:]
|
||||||
|
|
||||||
|
# Preserve explicit non-default ports
|
||||||
|
port = parsed.port
|
||||||
|
if port and ((scheme == "https" and port == 443) or (scheme == "http" and port == 80)):
|
||||||
|
port = None
|
||||||
|
netloc = f"{hostname}:{port}" if port else hostname
|
||||||
|
|
||||||
|
# Strip trailing slash (keep root "/" as-is)
|
||||||
|
path = parsed.path.rstrip("/") or "/"
|
||||||
|
|
||||||
|
# Remove tracking params and sort remaining for consistent ordering
|
||||||
params = parse_qs(parsed.query)
|
params = parse_qs(parsed.query)
|
||||||
cleaned = {k: v for k, v in params.items() if k.lower() not in TRACKING_PARAMS}
|
cleaned = sorted(
|
||||||
new_query = urlencode(cleaned, doseq=True)
|
((k, sorted(v)) for k, v in params.items() if k.lower() not in TRACKING_PARAMS),
|
||||||
return urlunparse(parsed._replace(query=new_query))
|
key=lambda x: x[0],
|
||||||
|
)
|
||||||
|
new_query = urlencode(cleaned, doseq=True, quote_via=quote)
|
||||||
|
|
||||||
|
return urlunparse((scheme, netloc, path, "", new_query, ""))
|
||||||
|
|
||||||
|
|
||||||
def get_db():
|
def get_db():
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue