From e72afbb22e24cb7253c6c76d9f93d0347e5e11fb Mon Sep 17 00:00:00 2001 From: lichenblankie Date: Fri, 27 Mar 2026 15:44:07 -0700 Subject: [PATCH] improved snippet extraction (heuristic) - Case-insensitive meta description extraction (fixes sites like Lemmy with capitalized "Description" meta name) - Strip aside and noscript tags for cleaner body text - Extract paragraph text separately for better sentence quality - Prefer sentences mentioning the site name, then first quality paragraph, then title as fallback - Skip meta descriptions under 20 chars (e.g. just "Lemmy") - Remove embedding/centroid dependency from summary generation --- db.py | 129 +++++++++++++++++++++++++++++++--------------------------- 1 file changed, 69 insertions(+), 60 deletions(-) diff --git a/db.py b/db.py index a6d8008..f31473f 100644 --- a/db.py +++ b/db.py @@ -317,88 +317,97 @@ def fetch_page(url): label = a.get_text(strip=True) or href links.append((href, label[:200])) - # Extract meta description before stripping tags + # Extract meta description before stripping tags (case-insensitive) meta_desc = "" - meta_tag = soup.find("meta", attrs={"name": "description"}) - if meta_tag and meta_tag.get("content"): - meta_desc = meta_tag["content"].strip() - if not meta_desc: - # Try og:description as fallback - og_tag = soup.find("meta", attrs={"property": "og:description"}) - if og_tag and og_tag.get("content"): - meta_desc = og_tag["content"].strip() + for m in soup.find_all("meta"): + name = (m.get("name") or "").lower() + prop = (m.get("property") or "").lower() + content = (m.get("content") or "").strip() + if not content: + continue + if name == "description" and len(content) > len(meta_desc): + meta_desc = content + elif prop == "og:description" and not meta_desc: + meta_desc = content - for tag in soup(["script", "style", "nav", "footer", "header", "noscript"]): + for tag in soup(["script", "style", "nav", "footer", "header", "noscript", "aside"]): tag.decompose() title = 
soup.title.string.strip() if soup.title and soup.title.string else url + + # Extract paragraph text for better summary generation + paragraphs = [] + for p in soup.find_all("p"): + text = p.get_text(strip=True) + if len(text) >= 40: + paragraphs.append(text) + body = soup.get_text(separator=" ", strip=True) - return title, body, links, meta_desc + return title, body, links, meta_desc, paragraphs -def _generate_summary(title, body): - """Generate a summary from body text using centroid extractive method. +def _generate_summary(title, body, paragraphs=None): + """Generate a summary by extracting the best sentence from the page. - Filters out UI debris, embeds remaining sentences, finds the one - closest to the centroid (most representative of the page). + Priority: sentence mentioning the site name > first paragraph sentence + > first body sentence > title. """ import re - # Split on sentence boundaries - raw = re.split(r'(?<=[.!?])\s+', body) - sentences = [] noise_patterns = re.compile( r'arrow-|fedilink|message-square|link-external|' r'skip to|cookie|subscribe|sign up|log in|' r'privacy policy|terms of|©|\bads?\b', re.IGNORECASE ) - for s in raw: - s = s.strip() - if len(s) < 40: - continue - words = s.split() - if len(words) < 7: - continue - # Skip if mostly non-alpha (icons, arrows, encoded chars) - alpha_chars = sum(1 for c in s if c.isalpha() or c == ' ') - if alpha_chars < len(s) * 0.6: - continue - # Skip nav/menu patterns - if s.count('|') > 2 or s.count('·') > 2 or s.count('►') > 0: - continue - # Skip UI debris - if noise_patterns.search(s): - continue - sentences.append(s) + + def _filter_sentences(raw): + result = [] + for s in raw: + s = s.strip() + if len(s) < 40 or len(s.split()) < 7: + continue + alpha_chars = sum(1 for c in s if c.isalpha() or c == ' ') + if alpha_chars < len(s) * 0.6: + continue + if s.count('|') > 2 or s.count('·') > 2 or s.count('►') > 0: + continue + if noise_patterns.search(s): + continue + result.append(s) + return result + 
+ # Prefer sentences from <p>
tags (actual content, not UI) + sentences = [] + if paragraphs: + raw = [] + for p in paragraphs: + raw.extend(re.split(r'(?<=[.!?])\s+', p)) + sentences = _filter_sentences(raw) + + # Fall back to full body text + if not sentences: + raw = re.split(r'(?<=[.!?])\s+', body) + sentences = _filter_sentences(raw) if not sentences: - # Last resort: take the first chunk of body that looks like prose - clean = re.sub(r'\s+', ' ', body).strip() - return clean[:160] + "..." if len(clean) > 160 else clean - if len(sentences) == 1: - s = sentences[0] - return s[:200] if len(s) > 200 else s - try: - from embeddings import embed - import numpy as np - embs = embed(sentences[:50]) # cap to avoid embedding too many - centroid = embs.mean(axis=0, keepdims=True) - centroid = centroid / max(np.linalg.norm(centroid), 1e-12) - scores = (embs @ centroid.T).flatten() - best_idx = int(np.argmax(scores)) - result = sentences[best_idx] - # Try to add a second sentence if it fits - if best_idx + 1 < len(sentences) and len(result) + len(sentences[best_idx + 1]) + 1 <= 200: - result += " " + sentences[best_idx + 1] - return result[:200] if len(result) > 200 else result - except Exception: - return sentences[0][:200] + return title[:200] if title else "" + + # Prefer a sentence that mentions the site name + if title: + title_words = [w.lower() for w in re.split(r'\W+', title) if len(w) >= 3] + for s in sentences: + s_lower = s.lower() + if sum(1 for w in title_words if w in s_lower) >= max(1, len(title_words) // 2): + return s[:200] + + # Otherwise use the first quality sentence + return sentences[0][:200] def index_url(url, note=""): url = clean_url(url) - title, body, links, meta_desc = fetch_page(url) - # Use meta description if available, otherwise generate from body - summary = meta_desc if meta_desc else _generate_summary(title, body) + title, body, links, meta_desc, paragraphs = fetch_page(url) + # Use meta description if available and meaningful, otherwise generate from body + summary 
= meta_desc if meta_desc and len(meta_desc) > 20 else _generate_summary(title, body, paragraphs) db = get_db() try: now = __import__("datetime").datetime.now().strftime("%Y-%m-%dT%H:%M:%S")