@property
def session(self):
    """Return the lazily created HTTP session used to query the API.

    The session is built on first access and cached on ``self._session``,
    so later accesses return the same session object with the same
    mounted adapter.

    Both ``http://`` and ``https://`` URLs are served by one adapter whose
    retry policy allows up to 5 retries overall (5 for connection errors,
    5 for read errors), uses a backoff factor of 0.3, and forces a retry
    on 500, 502, 503 and 504 responses.

    :returns: The cached :class:`requests.Session` instance.
    """
    if self._session is None:
        retry = Retry(
            total=5, read=5, connect=5,
            backoff_factor=0.3,
            # 503 is listed with the other server/gateway errors so that a
            # temporarily unavailable API is retried with backoff instead
            # of being handed straight back to the caller as a failure.
            status_forcelist=(500, 502, 503, 504),
        )
        adapter = HTTPAdapter(max_retries=retry)
        session = requests.Session()
        # The same adapter (and therefore the same retry policy) handles
        # both schemes.
        for scheme in ('http://', 'https://'):
            session.mount(scheme, adapter)
        self._session = session
    return self._session
if last_run: self.params.setdefault( - 'from-update-date', last_run.date().isoformat()) + 'from-occurred-date', last_run.date().isoformat()) results = self.search_events() for events in chunks(results, 100):