improved snippet extraction (heuristic)
- Case-insensitive meta description extraction (fixes sites like Lemmy with a capitalized "Description" meta name)
- Strip aside tags in addition to the already-stripped noscript tags for cleaner body text
- Extract paragraph text separately for better sentence quality
- Prefer sentences mentioning the site name, then the first quality paragraph sentence, then the first quality body sentence, with the title as the final fallback
- Skip meta descriptions of 20 chars or fewer (e.g. just "Lemmy")
- Remove embedding/centroid dependency from summary generation
This commit is contained in:
parent
e8915fa381
commit
e72afbb22e
1 changed files with 69 additions and 60 deletions
129
db.py
129
db.py
|
|
@ -317,88 +317,97 @@ def fetch_page(url):
|
||||||
label = a.get_text(strip=True) or href
|
label = a.get_text(strip=True) or href
|
||||||
links.append((href, label[:200]))
|
links.append((href, label[:200]))
|
||||||
|
|
||||||
# Extract meta description before stripping tags
|
# Extract meta description before stripping tags (case-insensitive)
|
||||||
meta_desc = ""
|
meta_desc = ""
|
||||||
meta_tag = soup.find("meta", attrs={"name": "description"})
|
for m in soup.find_all("meta"):
|
||||||
if meta_tag and meta_tag.get("content"):
|
name = (m.get("name") or "").lower()
|
||||||
meta_desc = meta_tag["content"].strip()
|
prop = (m.get("property") or "").lower()
|
||||||
if not meta_desc:
|
content = (m.get("content") or "").strip()
|
||||||
# Try og:description as fallback
|
if not content:
|
||||||
og_tag = soup.find("meta", attrs={"property": "og:description"})
|
continue
|
||||||
if og_tag and og_tag.get("content"):
|
if name == "description" and len(content) > len(meta_desc):
|
||||||
meta_desc = og_tag["content"].strip()
|
meta_desc = content
|
||||||
|
elif prop == "og:description" and not meta_desc:
|
||||||
|
meta_desc = content
|
||||||
|
|
||||||
for tag in soup(["script", "style", "nav", "footer", "header", "noscript"]):
|
for tag in soup(["script", "style", "nav", "footer", "header", "noscript", "aside"]):
|
||||||
tag.decompose()
|
tag.decompose()
|
||||||
title = soup.title.string.strip() if soup.title and soup.title.string else url
|
title = soup.title.string.strip() if soup.title and soup.title.string else url
|
||||||
|
|
||||||
|
# Extract paragraph text for better summary generation
|
||||||
|
paragraphs = []
|
||||||
|
for p in soup.find_all("p"):
|
||||||
|
text = p.get_text(strip=True)
|
||||||
|
if len(text) >= 40:
|
||||||
|
paragraphs.append(text)
|
||||||
|
|
||||||
body = soup.get_text(separator=" ", strip=True)
|
body = soup.get_text(separator=" ", strip=True)
|
||||||
return title, body, links, meta_desc
|
return title, body, links, meta_desc, paragraphs
|
||||||
|
|
||||||
|
|
||||||
def _generate_summary(title, body):
|
def _generate_summary(title, body, paragraphs=None):
|
||||||
"""Generate a summary from body text using centroid extractive method.
|
"""Generate a summary by extracting the best sentence from the page.
|
||||||
|
|
||||||
Filters out UI debris, embeds remaining sentences, finds the one
|
Priority: sentence mentioning the site name > first paragraph sentence
|
||||||
closest to the centroid (most representative of the page).
|
> first body sentence > title.
|
||||||
"""
|
"""
|
||||||
import re
|
import re
|
||||||
# Split on sentence boundaries
|
|
||||||
raw = re.split(r'(?<=[.!?])\s+', body)
|
|
||||||
sentences = []
|
|
||||||
noise_patterns = re.compile(
|
noise_patterns = re.compile(
|
||||||
r'arrow-|fedilink|message-square|link-external|'
|
r'arrow-|fedilink|message-square|link-external|'
|
||||||
r'skip to|cookie|subscribe|sign up|log in|'
|
r'skip to|cookie|subscribe|sign up|log in|'
|
||||||
r'privacy policy|terms of|©|\bads?\b',
|
r'privacy policy|terms of|©|\bads?\b',
|
||||||
re.IGNORECASE
|
re.IGNORECASE
|
||||||
)
|
)
|
||||||
for s in raw:
|
|
||||||
s = s.strip()
|
def _filter_sentences(raw):
|
||||||
if len(s) < 40:
|
result = []
|
||||||
continue
|
for s in raw:
|
||||||
words = s.split()
|
s = s.strip()
|
||||||
if len(words) < 7:
|
if len(s) < 40 or len(s.split()) < 7:
|
||||||
continue
|
continue
|
||||||
# Skip if mostly non-alpha (icons, arrows, encoded chars)
|
alpha_chars = sum(1 for c in s if c.isalpha() or c == ' ')
|
||||||
alpha_chars = sum(1 for c in s if c.isalpha() or c == ' ')
|
if alpha_chars < len(s) * 0.6:
|
||||||
if alpha_chars < len(s) * 0.6:
|
continue
|
||||||
continue
|
if s.count('|') > 2 or s.count('·') > 2 or s.count('►') > 0:
|
||||||
# Skip nav/menu patterns
|
continue
|
||||||
if s.count('|') > 2 or s.count('·') > 2 or s.count('►') > 0:
|
if noise_patterns.search(s):
|
||||||
continue
|
continue
|
||||||
# Skip UI debris
|
result.append(s)
|
||||||
if noise_patterns.search(s):
|
return result
|
||||||
continue
|
|
||||||
sentences.append(s)
|
# Prefer sentences from <p> tags (actual content, not UI)
|
||||||
|
sentences = []
|
||||||
|
if paragraphs:
|
||||||
|
raw = []
|
||||||
|
for p in paragraphs:
|
||||||
|
raw.extend(re.split(r'(?<=[.!?])\s+', p))
|
||||||
|
sentences = _filter_sentences(raw)
|
||||||
|
|
||||||
|
# Fall back to full body text
|
||||||
|
if not sentences:
|
||||||
|
raw = re.split(r'(?<=[.!?])\s+', body)
|
||||||
|
sentences = _filter_sentences(raw)
|
||||||
|
|
||||||
if not sentences:
|
if not sentences:
|
||||||
# Last resort: take the first chunk of body that looks like prose
|
return title[:200] if title else ""
|
||||||
clean = re.sub(r'\s+', ' ', body).strip()
|
|
||||||
return clean[:160] + "..." if len(clean) > 160 else clean
|
# Prefer a sentence that mentions the site name
|
||||||
if len(sentences) == 1:
|
if title:
|
||||||
s = sentences[0]
|
title_words = [w.lower() for w in re.split(r'\W+', title) if len(w) >= 3]
|
||||||
return s[:200] if len(s) > 200 else s
|
for s in sentences:
|
||||||
try:
|
s_lower = s.lower()
|
||||||
from embeddings import embed
|
if sum(1 for w in title_words if w in s_lower) >= max(1, len(title_words) // 2):
|
||||||
import numpy as np
|
return s[:200]
|
||||||
embs = embed(sentences[:50]) # cap to avoid embedding too many
|
|
||||||
centroid = embs.mean(axis=0, keepdims=True)
|
# Otherwise use the first quality sentence
|
||||||
centroid = centroid / max(np.linalg.norm(centroid), 1e-12)
|
return sentences[0][:200]
|
||||||
scores = (embs @ centroid.T).flatten()
|
|
||||||
best_idx = int(np.argmax(scores))
|
|
||||||
result = sentences[best_idx]
|
|
||||||
# Try to add a second sentence if it fits
|
|
||||||
if best_idx + 1 < len(sentences) and len(result) + len(sentences[best_idx + 1]) + 1 <= 200:
|
|
||||||
result += " " + sentences[best_idx + 1]
|
|
||||||
return result[:200] if len(result) > 200 else result
|
|
||||||
except Exception:
|
|
||||||
return sentences[0][:200]
|
|
||||||
|
|
||||||
|
|
||||||
def index_url(url, note=""):
|
def index_url(url, note=""):
|
||||||
url = clean_url(url)
|
url = clean_url(url)
|
||||||
title, body, links, meta_desc = fetch_page(url)
|
title, body, links, meta_desc, paragraphs = fetch_page(url)
|
||||||
# Use meta description if available, otherwise generate from body
|
# Use meta description if available and meaningful, otherwise generate from body
|
||||||
summary = meta_desc if meta_desc else _generate_summary(title, body)
|
summary = meta_desc if meta_desc and len(meta_desc) > 20 else _generate_summary(title, body, paragraphs)
|
||||||
db = get_db()
|
db = get_db()
|
||||||
try:
|
try:
|
||||||
now = __import__("datetime").datetime.now().strftime("%Y-%m-%dT%H:%M:%S")
|
now = __import__("datetime").datetime.now().strftime("%Y-%m-%dT%H:%M:%S")
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue