From 7441c14103a3ba43f45bcee6f66dd6d4ab7dc298 Mon Sep 17 00:00:00 2001 From: Ed Summers Date: Mon, 24 Jun 2024 10:07:08 -0400 Subject: [PATCH] Retry OpenAlex SSL exceptions I noticed that I hit some SSL exceptions when harvesting more data from OpenAlex (AIRFLOW_VAR_DEV_LIMIT=10000). ``` urllib3.exceptions.MaxRetryError: HTTPSConnectionPool(host='api.openalex.org', port=443): Max retries exceeded with url: /authors/https://orcid.org/0000-0001-5838-5335 (Caused by SSLError(SSLEOFError(8, '[SSL: UNEXPECTED_EOF_WHILE_READING] EOF occurred in violation of protocol (_ssl.c:1000)'))) During handling of the above exception, another exception occurred: Traceback (most recent call last): File "/home/airflow/.local/lib/python3.12/site-packages/airflow/models/taskinstance.py", line 465, in _execute_task result = _execute_callable(context=context, **execute_callable_kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/airflow/.local/lib/python3.12/site-packages/airflow/models/taskinstance.py", line 432, in _execute_callable return execute_callable(context=context, **execute_callable_kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/airflow/.local/lib/python3.12/site-packages/airflow/models/baseoperator.py", line 401, in wrapper return func(self, *args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/airflow/.local/lib/python3.12/site-packages/airflow/decorators/base.py", line 265, in execute return_value = super().execute(context) ^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/airflow/.local/lib/python3.12/site-packages/airflow/models/baseoperator.py", line 401, in wrapper return func(self, *args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/airflow/.local/lib/python3.12/site-packages/airflow/operators/python.py", line 235, in execute return_value = self.execute_callable() ^^^^^^^^^^^^^^^^^^^^^^^ File "/home/airflow/.local/lib/python3.12/site-packages/airflow/operators/python.py", line 252, in execute_callable return 
self.python_callable(*self.op_args, **self.op_kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/opt/airflow/rialto_airflow/dags/harvest.py", line 59, in openalex_harvest_dois openalex.doi_orcids_pickle(authors_csv, pickle_file, limit=dev_limit) File "/opt/airflow/rialto_airflow/harvest/openalex.py", line 22, in doi_orcids_pickle orcid_dois[orcid] = list(dois_from_orcid(orcid)) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/opt/airflow/rialto_airflow/harvest/openalex.py", line 41, in dois_from_orcid author_resp = requests.get( ^^^^^^^^^^^^^ File "/home/airflow/.local/lib/python3.12/site-packages/requests/api.py", line 73, in get return request("get", url, params=params, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/airflow/.local/lib/python3.12/site-packages/requests/api.py", line 59, in request return session.request(method=method, url=url, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/airflow/.local/lib/python3.12/site-packages/requests/sessions.py", line 589, in request resp = self.send(prep, **send_kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/airflow/.local/lib/python3.12/site-packages/requests/sessions.py", line 703, in send r = adapter.send(request, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/airflow/.local/lib/python3.12/site-packages/requests/adapters.py", line 698, in send raise SSLError(e, request=request) requests.exceptions.SSLError: HTTPSConnectionPool(host='api.openalex.org', port=443): Max retries exceeded with url: /authors/https://orcid.org/0000-0001-5838-5335 (Caused by SSLError(SSLEOFError(8, '[SSL: UNEXPECTED_EOF_WHILE_READING] EOF occurred in violation of protocol (_ssl.c:1000)'))) [2024-06-24, 11:14:31 UTC] {taskinstance.py:1206} INFO - Marking task as FAILED. 
dag_id=harvest, task_id=openalex_harvest_dois, run_id=manual__2024-06-24T11:02:02.383856+00:00, execution_date=20240624T110202, start_date=20240624T110205, end_date=20240624T111431 [2024-06-24, 11:14:31 UTC] {standard_task_runner.py:110} ERROR - Failed to execute job 222 for task openalex_harvest_dois (HTTPSConnectionPool(host='api.openalex.org', port=443): Max retries exceeded with url: /authors/https://orcid.org/0000-0001-5838-5335 (Caused by SSLError(SSLEOFError(8, '[SSL: UNEXPECTED_EOF_WHILE_READING] EOF occurred in violation of protocol (_ssl.c:1000)'))); 86) [2024-06-24, 11:14:31 UTC] {local_task_job_runner.py:240} INFO - Task exited with return code 1 ``` This commit uses tenacity to retry these requests, waiting a random 1-5 seconds between attempts and giving up after 60 seconds of trying. We may want to adjust these settings based on how well they work. For now, retries are triggered only by SSLError so that other errors still surface and we can get insight into what else we might encounter. --- pyproject.toml | 3 ++- requirements.txt | 4 ++-- rialto_airflow/harvest/openalex.py | 11 ++++++++++- 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 9cd8e14..97e5a9b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,8 @@ dependencies = [ "pandas", "requests", "python-dotenv", - "dimcli" + "dimcli", + "tenacity" ] [tool.pytest.ini_options] diff --git a/requirements.txt b/requirements.txt index 95139a6..fa27673 100644 --- a/requirements.txt +++ b/requirements.txt @@ -102,14 +102,14 @@ sphinxcontrib-serializinghtml==1.1.10 # via sphinx stack-data==0.6.3 # via ipython +tenacity==8.4.1 + # via rialto-airflow (pyproject.toml) tqdm==4.66.4 # via dimcli traitlets==5.14.3 # via # ipython # matplotlib-inline -typing-extensions==4.12.2 - # via ipython tzdata==2024.1 # via pandas urllib3==2.2.1 diff --git a/rialto_airflow/harvest/openalex.py b/rialto_airflow/harvest/openalex.py index 53d3ec8..f2b55be 100644 --- a/rialto_airflow/harvest/openalex.py +++ 
b/rialto_airflow/harvest/openalex.py @@ -4,6 +4,8 @@ import time import requests +from requests.exceptions import SSLError +from tenacity import retry, retry_if_exception_type, stop_after_delay, wait_random from rialto_airflow.utils import invert_dict @@ -27,6 +29,11 @@ def doi_orcids_pickle(authors_csv, pickle_file, limit=None): pickle.dump(invert_dict(orcid_dois), handle, protocol=pickle.HIGHEST_PROTOCOL) +@retry( + wait=wait_random(1, 5), + stop=stop_after_delay(60), + retry=retry_if_exception_type(SSLError), +) def dois_from_orcid(orcid: str): """ Pass in the ORCID ID and get back an iterator of DOIs for publications authored by that person. @@ -79,5 +86,7 @@ def works_from_author_id(author_id, limit=None): else: yield result else: - logging.error(f"encountered non-200 response: {url} {params}") + logging.error( + f"encountered HTTP {resp.status_code} response from {url} {params}: {resp.text}" + ) has_more = False