diff --git a/pyproject.toml b/pyproject.toml index 9cd8e14..97e5a9b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,8 @@ dependencies = [ "pandas", "requests", "python-dotenv", - "dimcli" + "dimcli", + "tenacity" ] [tool.pytest.ini_options] diff --git a/requirements.txt b/requirements.txt index 95139a6..fa27673 100644 --- a/requirements.txt +++ b/requirements.txt @@ -102,14 +102,14 @@ sphinxcontrib-serializinghtml==1.1.10 # via sphinx stack-data==0.6.3 # via ipython +tenacity==8.4.1 + # via rialto-airflow (pyproject.toml) tqdm==4.66.4 # via dimcli traitlets==5.14.3 # via # ipython # matplotlib-inline -typing-extensions==4.12.2 - # via ipython tzdata==2024.1 # via pandas urllib3==2.2.1 diff --git a/rialto_airflow/harvest/openalex.py b/rialto_airflow/harvest/openalex.py index 53d3ec8..f2b55be 100644 --- a/rialto_airflow/harvest/openalex.py +++ b/rialto_airflow/harvest/openalex.py @@ -4,6 +4,8 @@ import time import requests +from requests.exceptions import SSLError +from tenacity import retry, retry_if_exception_type, stop_after_delay, wait_random from rialto_airflow.utils import invert_dict @@ -27,6 +29,11 @@ def doi_orcids_pickle(authors_csv, pickle_file, limit=None): pickle.dump(invert_dict(orcid_dois), handle, protocol=pickle.HIGHEST_PROTOCOL) +@retry( + wait=wait_random(1, 5), + stop=stop_after_delay(60), + retry=retry_if_exception_type(SSLError), +) def dois_from_orcid(orcid: str): """ Pass in the ORCID ID and get back an iterator of DOIs for publications authored by that person. @@ -79,5 +86,7 @@ def works_from_author_id(author_id, limit=None): else: yield result else: - logging.error(f"encountered non-200 response: {url} {params}") + logging.error( + f"encountered HTTP {resp.status_code} response from {url} {params}: {resp.text}" + ) has_more = False