From e72afbb22e24cb7253c6c76d9f93d0347e5e11fb Mon Sep 17 00:00:00 2001
From: lichenblankie <lichenblankie@derickphan.com>
Date: Fri, 27 Mar 2026 15:44:07 -0700
Subject: [PATCH] Improve snippet extraction with simpler heuristics

- Case-insensitive meta description extraction (fixes sites like Lemmy
  with capitalized "Description" meta name)
- Also strip aside tags (noscript was already stripped) for cleaner
  body text
- Extract paragraph text separately for better sentence quality
- Prefer a sentence mentioning the site name (words from the page
  title), then the first quality sentence from <p> text (falling back
  to full body text), then the title as a last resort
- Skip meta descriptions of 20 chars or fewer (e.g. just "Lemmy")
- Remove embedding/centroid dependency from summary generation
---
 db.py | 129 +++++++++++++++++++++++++++++++---------------------------
 1 file changed, 69 insertions(+), 60 deletions(-)

diff --git a/db.py b/db.py
index a6d8008..f31473f 100644
--- a/db.py
+++ b/db.py
@@ -317,88 +317,97 @@ def fetch_page(url):
         label = a.get_text(strip=True) or href
         links.append((href, label[:200]))
 
-    # Extract meta description before stripping tags
+    # Extract meta description before stripping tags (case-insensitive)
     meta_desc = ""
-    meta_tag = soup.find("meta", attrs={"name": "description"})
-    if meta_tag and meta_tag.get("content"):
-        meta_desc = meta_tag["content"].strip()
-    if not meta_desc:
-        # Try og:description as fallback
-        og_tag = soup.find("meta", attrs={"property": "og:description"})
-        if og_tag and og_tag.get("content"):
-            meta_desc = og_tag["content"].strip()
+    for m in soup.find_all("meta"):
+        name = (m.get("name") or "").lower()
+        prop = (m.get("property") or "").lower()
+        content = (m.get("content") or "").strip()
+        if not content:
+            continue
+        if name == "description" and len(content) > len(meta_desc):
+            meta_desc = content
+        elif prop == "og:description" and not meta_desc:
+            meta_desc = content
 
-    for tag in soup(["script", "style", "nav", "footer", "header", "noscript"]):
+    for tag in soup(["script", "style", "nav", "footer", "header", "noscript", "aside"]):
         tag.decompose()
     title = soup.title.string.strip() if soup.title and soup.title.string else url
+
+    # Extract paragraph text for better summary generation
+    paragraphs = []
+    for p in soup.find_all("p"):
+        text = p.get_text(strip=True)
+        if len(text) >= 40:
+            paragraphs.append(text)
+
     body = soup.get_text(separator=" ", strip=True)
-    return title, body, links, meta_desc
+    return title, body, links, meta_desc, paragraphs
 
 
-def _generate_summary(title, body):
-    """Generate a summary from body text using centroid extractive method.
+def _generate_summary(title, body, paragraphs=None):
+    """Generate a summary by extracting the best sentence from the page.
 
-    Filters out UI debris, embeds remaining sentences, finds the one
-    closest to the centroid (most representative of the page).
+    Priority: sentence mentioning the site name > first paragraph sentence
+    > first body sentence > title.
     """
     import re
-    # Split on sentence boundaries
-    raw = re.split(r'(?<=[.!?])\s+', body)
-    sentences = []
     noise_patterns = re.compile(
         r'arrow-|fedilink|message-square|link-external|'
         r'skip to|cookie|subscribe|sign up|log in|'
         r'privacy policy|terms of|©|\bads?\b',
         re.IGNORECASE
     )
-    for s in raw:
-        s = s.strip()
-        if len(s) < 40:
-            continue
-        words = s.split()
-        if len(words) < 7:
-            continue
-        # Skip if mostly non-alpha (icons, arrows, encoded chars)
-        alpha_chars = sum(1 for c in s if c.isalpha() or c == ' ')
-        if alpha_chars < len(s) * 0.6:
-            continue
-        # Skip nav/menu patterns
-        if s.count('|') > 2 or s.count('·') > 2 or s.count('►') > 0:
-            continue
-        # Skip UI debris
-        if noise_patterns.search(s):
-            continue
-        sentences.append(s)
+
+    def _filter_sentences(raw):
+        result = []
+        for s in raw:
+            s = s.strip()
+            if len(s) < 40 or len(s.split()) < 7:
+                continue
+            alpha_chars = sum(1 for c in s if c.isalpha() or c == ' ')
+            if alpha_chars < len(s) * 0.6:
+                continue
+            if s.count('|') > 2 or s.count('·') > 2 or s.count('►') > 0:
+                continue
+            if noise_patterns.search(s):
+                continue
+            result.append(s)
+        return result
+
+    # Prefer sentences from <p> tags (actual content, not UI)
+    sentences = []
+    if paragraphs:
+        raw = []
+        for p in paragraphs:
+            raw.extend(re.split(r'(?<=[.!?])\s+', p))
+        sentences = _filter_sentences(raw)
+
+    # Fall back to full body text
+    if not sentences:
+        raw = re.split(r'(?<=[.!?])\s+', body)
+        sentences = _filter_sentences(raw)
 
     if not sentences:
-        # Last resort: take the first chunk of body that looks like prose
-        clean = re.sub(r'\s+', ' ', body).strip()
-        return clean[:160] + "..." if len(clean) > 160 else clean
-    if len(sentences) == 1:
-        s = sentences[0]
-        return s[:200] if len(s) > 200 else s
-    try:
-        from embeddings import embed
-        import numpy as np
-        embs = embed(sentences[:50])  # cap to avoid embedding too many
-        centroid = embs.mean(axis=0, keepdims=True)
-        centroid = centroid / max(np.linalg.norm(centroid), 1e-12)
-        scores = (embs @ centroid.T).flatten()
-        best_idx = int(np.argmax(scores))
-        result = sentences[best_idx]
-        # Try to add a second sentence if it fits
-        if best_idx + 1 < len(sentences) and len(result) + len(sentences[best_idx + 1]) + 1 <= 200:
-            result += " " + sentences[best_idx + 1]
-        return result[:200] if len(result) > 200 else result
-    except Exception:
-        return sentences[0][:200]
+        return title[:200] if title else ""
+
+    # Prefer a sentence that mentions the site name
+    if title:
+        title_words = [w.lower() for w in re.split(r'\W+', title) if len(w) >= 3]
+        for s in sentences:
+            s_lower = s.lower()
+            if sum(1 for w in title_words if w in s_lower) >= max(1, len(title_words) // 2):
+                return s[:200]
+
+    # Otherwise use the first quality sentence
+    return sentences[0][:200]
 
 
 def index_url(url, note=""):
     url = clean_url(url)
-    title, body, links, meta_desc = fetch_page(url)
-    # Use meta description if available, otherwise generate from body
-    summary = meta_desc if meta_desc else _generate_summary(title, body)
+    title, body, links, meta_desc, paragraphs = fetch_page(url)
+    # Use meta description if available and meaningful, otherwise generate from body
+    summary = meta_desc if meta_desc and len(meta_desc) > 20 else _generate_summary(title, body, paragraphs)
     db = get_db()
     try:
         now = __import__("datetime").datetime.now().strftime("%Y-%m-%dT%H:%M:%S")