Improve snippet generation with heuristic extraction instead of AI

- Case-insensitive meta description extraction (fixes sites like Lemmy
  with capitalized "Description" meta name)
- Strip aside and noscript tags for cleaner body text
- Extract paragraph text separately for better sentence quality
- Prefer sentences mentioning the site name (words from the page title),
  then the first quality sentence from paragraph text or, failing that,
  from the full body text, then the title as fallback
- Skip meta descriptions of 20 chars or fewer (e.g. just "Lemmy")
- Remove embedding/centroid dependency from summary generation

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Derick Phan 2026-03-27 15:44:07 -07:00
parent 570d876b8e
commit c9a8cba9d1
No known key found for this signature in database

129
db.py
View file

@ -317,88 +317,97 @@ def fetch_page(url):
label = a.get_text(strip=True) or href label = a.get_text(strip=True) or href
links.append((href, label[:200])) links.append((href, label[:200]))
# Extract meta description before stripping tags # Extract meta description before stripping tags (case-insensitive)
meta_desc = "" meta_desc = ""
meta_tag = soup.find("meta", attrs={"name": "description"}) for m in soup.find_all("meta"):
if meta_tag and meta_tag.get("content"): name = (m.get("name") or "").lower()
meta_desc = meta_tag["content"].strip() prop = (m.get("property") or "").lower()
if not meta_desc: content = (m.get("content") or "").strip()
# Try og:description as fallback if not content:
og_tag = soup.find("meta", attrs={"property": "og:description"}) continue
if og_tag and og_tag.get("content"): if name == "description" and len(content) > len(meta_desc):
meta_desc = og_tag["content"].strip() meta_desc = content
elif prop == "og:description" and not meta_desc:
meta_desc = content
for tag in soup(["script", "style", "nav", "footer", "header", "noscript"]): for tag in soup(["script", "style", "nav", "footer", "header", "noscript", "aside"]):
tag.decompose() tag.decompose()
title = soup.title.string.strip() if soup.title and soup.title.string else url title = soup.title.string.strip() if soup.title and soup.title.string else url
# Extract paragraph text for better summary generation
paragraphs = []
for p in soup.find_all("p"):
text = p.get_text(strip=True)
if len(text) >= 40:
paragraphs.append(text)
body = soup.get_text(separator=" ", strip=True) body = soup.get_text(separator=" ", strip=True)
return title, body, links, meta_desc return title, body, links, meta_desc, paragraphs
def _generate_summary(title, body): def _generate_summary(title, body, paragraphs=None):
"""Generate a summary from body text using centroid extractive method. """Generate a summary by extracting the best sentence from the page.
Filters out UI debris, embeds remaining sentences, finds the one Priority: sentence mentioning the site name > first paragraph sentence
closest to the centroid (most representative of the page). > first body sentence > title.
""" """
import re import re
# Split on sentence boundaries
raw = re.split(r'(?<=[.!?])\s+', body)
sentences = []
noise_patterns = re.compile( noise_patterns = re.compile(
r'arrow-|fedilink|message-square|link-external|' r'arrow-|fedilink|message-square|link-external|'
r'skip to|cookie|subscribe|sign up|log in|' r'skip to|cookie|subscribe|sign up|log in|'
r'privacy policy|terms of|©|\bads?\b', r'privacy policy|terms of|©|\bads?\b',
re.IGNORECASE re.IGNORECASE
) )
for s in raw:
s = s.strip() def _filter_sentences(raw):
if len(s) < 40: result = []
continue for s in raw:
words = s.split() s = s.strip()
if len(words) < 7: if len(s) < 40 or len(s.split()) < 7:
continue continue
# Skip if mostly non-alpha (icons, arrows, encoded chars) alpha_chars = sum(1 for c in s if c.isalpha() or c == ' ')
alpha_chars = sum(1 for c in s if c.isalpha() or c == ' ') if alpha_chars < len(s) * 0.6:
if alpha_chars < len(s) * 0.6: continue
continue if s.count('|') > 2 or s.count('·') > 2 or s.count('') > 0:
# Skip nav/menu patterns continue
if s.count('|') > 2 or s.count('·') > 2 or s.count('') > 0: if noise_patterns.search(s):
continue continue
# Skip UI debris result.append(s)
if noise_patterns.search(s): return result
continue
sentences.append(s) # Prefer sentences from <p> tags (actual content, not UI)
sentences = []
if paragraphs:
raw = []
for p in paragraphs:
raw.extend(re.split(r'(?<=[.!?])\s+', p))
sentences = _filter_sentences(raw)
# Fall back to full body text
if not sentences:
raw = re.split(r'(?<=[.!?])\s+', body)
sentences = _filter_sentences(raw)
if not sentences: if not sentences:
# Last resort: take the first chunk of body that looks like prose return title[:200] if title else ""
clean = re.sub(r'\s+', ' ', body).strip()
return clean[:160] + "..." if len(clean) > 160 else clean # Prefer a sentence that mentions the site name
if len(sentences) == 1: if title:
s = sentences[0] title_words = [w.lower() for w in re.split(r'\W+', title) if len(w) >= 3]
return s[:200] if len(s) > 200 else s for s in sentences:
try: s_lower = s.lower()
from embeddings import embed if sum(1 for w in title_words if w in s_lower) >= max(1, len(title_words) // 2):
import numpy as np return s[:200]
embs = embed(sentences[:50]) # cap to avoid embedding too many
centroid = embs.mean(axis=0, keepdims=True) # Otherwise use the first quality sentence
centroid = centroid / max(np.linalg.norm(centroid), 1e-12) return sentences[0][:200]
scores = (embs @ centroid.T).flatten()
best_idx = int(np.argmax(scores))
result = sentences[best_idx]
# Try to add a second sentence if it fits
if best_idx + 1 < len(sentences) and len(result) + len(sentences[best_idx + 1]) + 1 <= 200:
result += " " + sentences[best_idx + 1]
return result[:200] if len(result) > 200 else result
except Exception:
return sentences[0][:200]
def index_url(url, note=""): def index_url(url, note=""):
url = clean_url(url) url = clean_url(url)
title, body, links, meta_desc = fetch_page(url) title, body, links, meta_desc, paragraphs = fetch_page(url)
# Use meta description if available, otherwise generate from body # Use meta description if available and meaningful, otherwise generate from body
summary = meta_desc if meta_desc else _generate_summary(title, body) summary = meta_desc if meta_desc and len(meta_desc) > 20 else _generate_summary(title, body, paragraphs)
db = get_db() db = get_db()
try: try:
now = __import__("datetime").datetime.now().strftime("%Y-%m-%dT%H:%M:%S") now = __import__("datetime").datetime.now().strftime("%Y-%m-%dT%H:%M:%S")