Skip to content

Commit

Permalink
fix: [crawler] log timeout + debug signal timeout
Browse files (browse the repository at this point in the history)
  • Loading branch information
Terrtia committed Jan 8, 2025
1 parent 38d1d01 commit 0287a13
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 2 deletions.
2 changes: 1 addition & 1 deletion bin/crawlers/Crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -364,7 +364,7 @@ def save_capture_response(self, parent_id, entries):
dom_hash.add(self.date.replace('/', ''), item)
dom_hash.add_correlation('domain', '', self.domain.id)

- title_content = crawlers.extract_title_from_html(entries['html'])
+ title_content = crawlers.extract_title_from_html(entries['html'], item_id)
if title_content:
title = Titles.create_title(title_content)
title.add(item.get_date(), item)
Expand Down
7 changes: 6 additions & 1 deletion bin/lib/crawlers.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import gzip
import hashlib
import json
+ import logging
import os
import pickle
import re
Expand Down Expand Up @@ -72,6 +73,8 @@ def timeout_handler(signum, frame):

faup = Faup()

+ logger_crawler = logging.getLogger('crawlers.log')

# # # # # # # #
# #
# DOMAINS #
Expand Down Expand Up @@ -322,7 +325,7 @@ def extract_favicon_from_html(html, url):
# #
# # # # # # # #

- def extract_title_from_html(html):
+ def extract_title_from_html(html, item_id):
signal.alarm(60)
try:
soup = BeautifulSoup(html, 'html.parser')
Expand All @@ -333,8 +336,10 @@ def extract_title_from_html(html):
return str(title)
except TimeoutException:
signal.alarm(0)
+ logger_crawler.warning(f'BeautifulSoup HTML parser timeout: {item_id}')
else:
signal.alarm(0)
signal.alarm(0)
return ''

def extract_description_from_html(html):
Expand Down

0 comments on commit 0287a13

Please sign in to comment.