Normalize URLs to prevent duplicates
clean_url() now canonicalizes: http→https, strips www., removes trailing slashes, drops default ports, and sorts query params. Prevents the same page from being indexed multiple times under different URL variations.
This commit is contained in:
parent
5d9b81db95
commit
b574c4b7f5
1 changed files with 27 additions and 4 deletions
31
db.py
31
db.py
|
|
@ -2,7 +2,7 @@ import socket
|
|||
import ipaddress
|
||||
import sqlite3
|
||||
import requests
|
||||
from urllib.parse import urlparse, urljoin, parse_qs, urlencode, urlunparse
|
||||
from urllib.parse import urlparse, urljoin, parse_qs, urlencode, urlunparse, quote
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
DATABASE = "index.db"
|
||||
|
|
@ -48,10 +48,33 @@ TRACKING_PARAMS = {
|
|||
|
||||
def clean_url(url):
    """Return a canonical form of *url* so equivalent URLs compare equal.

    Normalization steps:

    * ``http`` and ``https`` are both rewritten to ``https``; any other
      scheme is kept as parsed.
    * The hostname is lowercased and a leading ``www.`` is removed.
      IPv6 literals are re-wrapped in ``[...]``.
    * A port is dropped when it is the default for the original scheme
      or for the normalized scheme; any other explicit port is kept.
    * Trailing slashes are stripped from the path (the root stays ``/``).
    * Query parameters whose lowercased name is in ``TRACKING_PARAMS``
      are removed; the rest are sorted by name, and repeated values are
      sorted too. Parameters with blank values are dropped because
      ``parse_qs`` discards them by default.
    * Userinfo, path parameters (``;...``) and the fragment are discarded.

    Args:
        url: The URL string to normalize.

    Returns:
        The normalized URL string.

    Raises:
        ValueError: Propagated from ``urllib.parse`` when the port or an
            IPv6 host in *url* is malformed.
    """
    parsed = urlparse(url)

    # Prefer https
    scheme = "https" if parsed.scheme in ("http", "https") else parsed.scheme

    # Normalize hostname: lowercase, strip www.
    hostname = (parsed.hostname or "").lower()
    if hostname.startswith("www."):
        hostname = hostname[4:]
    # ``parsed.hostname`` strips the brackets from IPv6 literals; put them
    # back so the host stays distinguishable from the ":port" suffix.
    if ":" in hostname:
        hostname = f"[{hostname}]"

    # Preserve explicit non-default ports. The port is compared against the
    # defaults of BOTH the original scheme and the normalized one: after the
    # http -> https rewrite, "http://host:80" must not turn into
    # "https://host:80", and "http://host:443" is simply "https://host".
    default_port_for = {"http": 80, "https": 443}
    droppable_ports = {
        default_port_for[s]
        for s in (parsed.scheme, scheme)
        if s in default_port_for
    }
    port = parsed.port
    if port in droppable_ports:
        port = None
    netloc = f"{hostname}:{port}" if port else hostname

    # Strip trailing slash (keep root "/" as-is)
    path = parsed.path.rstrip("/") or "/"

    # Remove tracking params and sort remaining for consistent ordering
    params = parse_qs(parsed.query)
    cleaned = sorted(
        (
            (name, sorted(values))
            for name, values in params.items()
            if name.lower() not in TRACKING_PARAMS
        ),
        key=lambda item: item[0],
    )
    # quote_via=quote encodes spaces as %20 instead of "+".
    new_query = urlencode(cleaned, doseq=True, quote_via=quote)

    return urlunparse((scheme, netloc, path, "", new_query, ""))
|
||||
|
||||
|
||||
def get_db():
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue