Strip `<noscript>` tags from fetched pages
Lemmy and other JS-heavy sites include `<noscript>` fallback text such as "Javascript is disabled", which pollutes the stored body text and the generated snippets/summaries.
This commit is contained in:
parent
3f8ebdab1d
commit
e8915fa381
1 changed file with 1 addition and 1 deletion
2
db.py
2
db.py
|
|
@@ -328,7 +328,7 @@ def fetch_page(url):
|
||||||
if og_tag and og_tag.get("content"):
|
if og_tag and og_tag.get("content"):
|
||||||
meta_desc = og_tag["content"].strip()
|
meta_desc = og_tag["content"].strip()
|
||||||
|
|
||||||
for tag in soup(["script", "style", "nav", "footer", "header"]):
|
for tag in soup(["script", "style", "nav", "footer", "header", "noscript"]):
|
||||||
tag.decompose()
|
tag.decompose()
|
||||||
title = soup.title.string.strip() if soup.title and soup.title.string else url
|
title = soup.title.string.strip() if soup.title and soup.title.string else url
|
||||||
body = soup.get_text(separator=" ", strip=True)
|
body = soup.get_text(separator=" ", strip=True)
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue