From 570d876b8e43965e4d604337888f9bc4bc687dcb Mon Sep 17 00:00:00 2001 From: Derick Phan Date: Fri, 27 Mar 2026 14:18:54 -0700 Subject: [PATCH] Strip noscript tags when parsing pages to remove JS-disabled messages Lemmy and other JS-heavy sites include noscript fallback text like "Javascript is disabled" that pollutes the stored body text and generated snippets/summaries. Co-Authored-By: Claude Opus 4.6 --- db.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/db.py b/db.py index 6b225a2..a6d8008 100644 --- a/db.py +++ b/db.py @@ -328,7 +328,7 @@ def fetch_page(url): if og_tag and og_tag.get("content"): meta_desc = og_tag["content"].strip() - for tag in soup(["script", "style", "nav", "footer", "header"]): + for tag in soup(["script", "style", "nav", "footer", "header", "noscript"]): tag.decompose() title = soup.title.string.strip() if soup.title and soup.title.string else url body = soup.get_text(separator=" ", strip=True)