stripped noscript tags from pages
Lemmy and other JS-heavy sites include noscript fallback text like "Javascript is disabled" that pollutes the stored body text and generated snippets/summaries.
This commit is contained in:
parent
3f8ebdab1d
commit
e8915fa381
1 changed file with 1 addition and 1 deletion
2
db.py
2
db.py
|
|
@@ -328,7 +328,7 @@ def fetch_page(url):
|
|||
if og_tag and og_tag.get("content"):
|
||||
meta_desc = og_tag["content"].strip()
|
||||
|
||||
for tag in soup(["script", "style", "nav", "footer", "header"]):
|
||||
for tag in soup(["script", "style", "nav", "footer", "header", "noscript"]):
|
||||
tag.decompose()
|
||||
title = soup.title.string.strip() if soup.title and soup.title.string else url
|
||||
body = soup.get_text(separator=" ", strip=True)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue