From 236dff0b3f6e28c76f924c4cb46129973d383db1 Mon Sep 17 00:00:00 2001 From: Adrien Barbaresi Date: Fri, 1 Nov 2024 15:38:30 +0100 Subject: [PATCH] downloads: cleaner urllib3 code --- trafilatura/downloads.py | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/trafilatura/downloads.py b/trafilatura/downloads.py index 4aa5d4cb..a57cdd08 100644 --- a/trafilatura/downloads.py +++ b/trafilatura/downloads.py @@ -153,28 +153,32 @@ def _determine_headers( return headers or DEFAULT_HEADERS -def _send_urllib_request( - url: str, no_ssl: bool, with_headers: bool, config: ConfigParser -) -> Optional[Response]: - "Internal function to robustly send a request (SSL or not) and return its result." - # customize headers - global HTTP_POOL, NO_CERT_POOL, RETRY_STRATEGY +def _get_retry_strategy(config: ConfigParser) -> urllib3.util.Retry: + "Define a retry strategy according to the config file." + global RETRY_STRATEGY if not RETRY_STRATEGY: RETRY_STRATEGY = urllib3.util.Retry( total=config.getint("DEFAULT", "MAX_REDIRECTS"), - redirect=config.getint( - "DEFAULT", "MAX_REDIRECTS" - ), # raise_on_redirect=False, + redirect=config.getint("DEFAULT", "MAX_REDIRECTS"), # raise_on_redirect=False, connect=0, backoff_factor=config.getint("DEFAULT", "DOWNLOAD_TIMEOUT") / 2, status_forcelist=FORCE_STATUS, # unofficial: https://en.wikipedia.org/wiki/List_of_HTTP_status_codes#Unofficial_codes ) + return RETRY_STRATEGY + + +def _send_urllib_request( + url: str, no_ssl: bool, with_headers: bool, config: ConfigParser +) -> Optional[Response]: + "Internal function to robustly send a request (SSL or not) and return its result." + global HTTP_POOL, NO_CERT_POOL + try: if no_ssl is False: if not HTTP_POOL: HTTP_POOL = create_pool( - retries=RETRY_STRATEGY, + retries=_get_retry_strategy(config), timeout=config.getint("DEFAULT", "DOWNLOAD_TIMEOUT"), ca_certs=certifi.where() ) # cert_reqs='CERT_REQUIRED' @@ -182,7 +186,7 @@ def _send_urllib_request( else: if not NO_CERT_POOL: NO_CERT_POOL = create_pool( - retries=RETRY_STRATEGY, + retries=_get_retry_strategy(config), timeout=config.getint("DEFAULT", "DOWNLOAD_TIMEOUT"), cert_reqs="CERT_NONE" )