Skip to content

Commit

Permalink
fix: [title beautifullsoup] add signal, BeautifulSoup html.parser is …
Browse files Browse the repository at this point in the history
…stuck
  • Loading branch information
Terrtia committed Jan 7, 2025
1 parent 8692d9b commit 868da3c
Showing 1 changed file with 24 additions and 5 deletions.
29 changes: 24 additions & 5 deletions bin/lib/crawlers.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,19 @@

from pyfaup.faup import Faup


import signal

class TimeoutException(Exception):
    """Raised by ``timeout_handler`` when the SIGALRM alarm goes off."""

def timeout_handler(signum, frame):
    """Signal handler: abort the interrupted code with ``TimeoutException``.

    ``signum`` and ``frame`` are the arguments every Python signal handler
    receives; neither is used here.
    """
    raise TimeoutException()


# Install timeout_handler for SIGALRM when this module is imported, so that an
# alarm armed with signal.alarm() raises TimeoutException in the running code.
# NOTE(review): signal.signal() must be called from the main thread and
# SIGALRM is Unix-only -- confirm this module is only imported in that context.
signal.signal(signal.SIGALRM, timeout_handler)


# interact with splash_crawler API
import requests
requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)
Expand Down Expand Up @@ -310,12 +323,18 @@ def extract_favicon_from_html(html, url):
# # # # # # # #

def extract_title_from_html(html):
    """Return the text of the ``<title>`` element of *html*, or ``''``.

    The document is parsed with BeautifulSoup's ``html.parser``. Because that
    parser can get stuck on some inputs, parsing is guarded by a 60 second
    SIGALRM alarm; the module-level handler turns the alarm into a
    ``TimeoutException``, which is treated as "no title found".

    :param html: HTML document to parse
    :return: the title string, or ``''`` when there is no title, the title
        has no single string content, or parsing timed out
    """
    # Arm the watchdog before handing the document to the parser.
    signal.alarm(60)
    try:
        soup = BeautifulSoup(html, 'html.parser')
        title = soup.title
        if title:
            title = title.string
            if title:
                return str(title)
    except TimeoutException:
        # Parser took too long: fall through and report an empty title.
        pass
    finally:
        # Always disarm the alarm. An ``else`` clause is skipped when the
        # ``try`` body returns, which would leave the alarm pending and make
        # it fire later in unrelated code.
        signal.alarm(0)
    return ''

def extract_description_from_html(html):
Expand Down

0 comments on commit 868da3c

Please sign in to comment.