Skip to content

Commit

Permalink
Merge pull request inspirehep#2373 from jacquerie/integrate-pr-2361
Browse files Browse the repository at this point in the history
workflows: retry arxiv pdf download
  • Loading branch information
jacquerie authored May 23, 2017
2 parents 23e75d2 + e5316ab commit 0208ed2
Show file tree
Hide file tree
Showing 17 changed files with 72 additions and 18 deletions.
1 change: 0 additions & 1 deletion inspirehep/modules/fixtures/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
# In applying this license, CERN does not waive the privileges and immunities
# granted to it by virtue of its status as an Intergovernmental Organization
# or submit itself to any jurisdiction.
# or submit itself to any jurisdiction.

"""INSPIRE fixtures - loading of configs and database content."""

Expand Down
2 changes: 0 additions & 2 deletions inspirehep/modules/pidstore/fetchers.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,6 @@
# In applying this license, CERN does not waive the privileges and immunities
# granted to it by virtue of its status as an Intergovernmental Organization
# or submit itself to any jurisdiction.
# waive the privileges and immunities granted to it by virtue of its status
# as an Intergovernmental Organization or submit itself to any jurisdiction.

"""Persistent identifier minters."""

Expand Down
1 change: 0 additions & 1 deletion inspirehep/modules/search/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
# In applying this license, CERN does not waive the privileges and immunities
# granted to it by virtue of its status as an Intergovernmental Organization
# or submit itself to any jurisdiction.
# or submit itself to any jurisdiction.

"""Default configuration of SPIRES parser."""

Expand Down
1 change: 0 additions & 1 deletion inspirehep/modules/search/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
# In applying this license, CERN does not waive the privileges and immunities
# granted to it by virtue of its status as an Intergovernmental Organization
# or submit itself to any jurisdiction.
# or submit itself to any jurisdiction.

"""SPIRES parser implementation."""

Expand Down
1 change: 0 additions & 1 deletion inspirehep/modules/search/walkers/pypeg_to_ast.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
# In applying this license, CERN does not waive the privileges and immunities
# granted to it by virtue of its status as an Intergovernmental Organization
# or submit itself to any jurisdiction.
# or submit itself to any jurisdiction.

"""SPIRES extended Pypeg to AST converter."""

Expand Down
1 change: 0 additions & 1 deletion inspirehep/modules/search/walkers/spires_to_invenio.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
# In applying this license, CERN does not waive the privileges and immunities
# granted to it by virtue of its status as an Intergovernmental Organization
# or submit itself to any jurisdiction.
# or submit itself to any jurisdiction.

"""Implement query printer."""

Expand Down
1 change: 0 additions & 1 deletion inspirehep/modules/workflows/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
# In applying this license, CERN does not waive the privileges and immunities
# granted to it by virtue of its status as an Intergovernmental Organization
# or submit itself to any jurisdiction.
# or submit itself to any jurisdiction.

"""INSPIRE theme and filters."""

Expand Down
1 change: 0 additions & 1 deletion inspirehep/modules/workflows/actions/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
# In applying this license, CERN does not waive the privileges and immunities
# granted to it by virtue of its status as an Intergovernmental Organization
# or submit itself to any jurisdiction.
# or submit itself to any jurisdiction.

"""Our workflows."""

Expand Down
1 change: 0 additions & 1 deletion inspirehep/modules/workflows/bundles.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
# In applying this license, CERN does not waive the privileges and immunities
# granted to it by virtue of its status as an Intergovernmental Organization
# or submit itself to any jurisdiction.
# or submit itself to any jurisdiction.

"""Bundles for forms used across INSPIRE."""

Expand Down
30 changes: 30 additions & 0 deletions inspirehep/modules/workflows/errors.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# -*- coding: utf-8 -*-
#
# This file is part of INSPIRE.
# Copyright (C) 2014-2017 CERN.
#
# INSPIRE is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# INSPIRE is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with INSPIRE. If not, see <http://www.gnu.org/licenses/>.
#
# In applying this license, CERN does not waive the privileges and immunities
# granted to it by virtue of its status as an Intergovernmental Organization
# or submit itself to any jurisdiction.

from __future__ import absolute_import, division, print_function

from invenio_workflows.errors import WorkflowsError


class DownloadError(WorkflowsError):
    """Signal that a download performed inside a workflow has failed.

    This is a plain ``WorkflowsError`` subclass that adds no attributes or
    behaviour of its own; it exists so that callers can catch or react to
    failed downloads specifically.
    """
1 change: 0 additions & 1 deletion inspirehep/modules/workflows/ext.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
# In applying this license, CERN does not waive the privileges and immunities
# granted to it by virtue of its status as an Intergovernmental Organization
# or submit itself to any jurisdiction.
# or submit itself to any jurisdiction.

from __future__ import absolute_import, division, print_function

Expand Down
1 change: 0 additions & 1 deletion inspirehep/modules/workflows/receivers.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
# In applying this license, CERN does not waive the privileges and immunities
# granted to it by virtue of its status as an Intergovernmental Organization
# or submit itself to any jurisdiction.
# or submit itself to any jurisdiction.

"""Receivers for INSPIRE workflows."""

Expand Down
1 change: 0 additions & 1 deletion inspirehep/modules/workflows/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
# In applying this license, CERN does not waive the privileges and immunities
# granted to it by virtue of its status as an Intergovernmental Organization
# or submit itself to any jurisdiction.
# or submit itself to any jurisdiction.

"""Search factory for INSPIRE workflows UI.
Expand Down
11 changes: 10 additions & 1 deletion inspirehep/modules/workflows/tasks/arxiv.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
import re
from functools import wraps

import backoff
from flask import current_app
from lxml.etree import XMLSyntaxError
from six import BytesIO
Expand All @@ -45,6 +46,7 @@
from plotextractor.converter import untar
from plotextractor.errors import InvalidTarball, NoTexFilesFound

from ..errors import DownloadError
from ..utils import download_file_to_workflow, with_debug_logging


Expand All @@ -56,6 +58,7 @@


@with_debug_logging
@backoff.on_exception(backoff.expo, DownloadError, base=4, max_tries=5)
def arxiv_fulltext_download(obj, eng):
"""Perform the fulltext download step for arXiv records.
Expand All @@ -70,8 +73,14 @@ def arxiv_fulltext_download(obj, eng):
url=current_app.config['ARXIV_PDF_URL'].format(arxiv_id=arxiv_id),
)

if pdf:
if pdf and pdf.get_version().mimetype == 'application/pdf':
obj.log.info('PDF retrieved from arXiv for %s', arxiv_id)
elif pdf:
# We need to delete the failed download as otherwise it would
# create a new version instead of replacing the previous one.
del obj.files[filename]
pdf.delete()
raise DownloadError()
else:
obj.log.error('Cannot retrieve PDF from arXiv for %s', arxiv_id)

Expand Down
1 change: 0 additions & 1 deletion inspirehep/modules/workflows/workflows/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
# In applying this license, CERN does not waive the privileges and immunities
# granted to it by virtue of its status as an Intergovernmental Organization
# or submit itself to any jurisdiction.
# or submit itself to any jurisdiction.

"""Our workflows."""

Expand Down
31 changes: 30 additions & 1 deletion tests/unit/helpers/mocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,13 +67,42 @@ def __contains__(self, item):
return item in self.data

def __setitem__(self, key, value):
self.data[key] = {'key': key}
self.data[key] = MockFileObject(key=key)

def __delitem__(self, key):
del self.data[key]

@property
def keys(self):
return self.data.keys()


class MockFileObject(object):
    """Minimal stand-in for a workflow file object, for use in unit tests.

    The mock wraps a plain dict (``self.obj``) that initially holds only the
    file ``key`` and exposes item access on it. Two mocks compare equal when
    their keys are equal.
    """

    def __init__(self, key):
        """Create a mock file whose only metadata is its ``key``."""
        self.obj = {'key': key}

    def __repr__(self):
        """Return a readable form so failed assertions show the key."""
        return '{0}(key={1!r})'.format(type(self).__name__, self.obj['key'])

    def __eq__(self, other):
        """Compare by ``key``; defer to ``other`` when it is not a mock."""
        if not isinstance(other, MockFileObject):
            # Avoid an AttributeError on ``other.obj`` and let Python try
            # the reflected comparison (falling back to "not equal").
            return NotImplemented
        return self.obj['key'] == other.obj['key']

    def __ne__(self, other):
        """Return the negation of ``__eq__``.

        Python 2 does not derive ``!=`` from ``__eq__``, so without this
        method two mocks with the same key would be both ``==`` and ``!=``.
        """
        result = self.__eq__(other)
        if result is NotImplemented:
            return result
        return not result

    def __getitem__(self, key):
        """Return the metadata value stored under ``key``."""
        return self.obj[key]

    def __setitem__(self, key, value):
        """Store ``value`` in the metadata under ``key``."""
        self.obj[key] = value

    def delete(self):
        """Do nothing; present so code under test can call ``delete()``."""
        pass

    def get_version(self):
        """Return a new ``MockObjectVersion`` instance."""
        return MockObjectVersion()


class MockObjectVersion(object):
    """Test double for a file version that always reports a PDF mimetype."""

    _MIMETYPE = 'application/pdf'

    @property
    def mimetype(self):
        """Return the fixed mimetype, ``'application/pdf'``."""
        return self._MIMETYPE


class MockLog(object):
def __init__(self):
self._debug = StringIO()
Expand Down
4 changes: 2 additions & 2 deletions tests/unit/workflows/test_workflows_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@

from inspirehep.modules.workflows.utils import download_file_to_workflow

from mocks import MockFiles, MockObj
from mocks import MockFiles, MockFileObject, MockObj


def test_download_file_to_workflow_retries_on_protocol_error():
Expand All @@ -50,7 +50,7 @@ def test_download_file_to_workflow_retries_on_protocol_error():

obj = MockObj(data, extra_data, files=files)

expected = {'key': '1605.03844.pdf'}
expected = MockFileObject(key='1605.03844.pdf')
result = download_file_to_workflow(
obj, '1605.03844.pdf', 'http://export.arxiv.org/pdf/1605.03844')

Expand Down

0 comments on commit 0208ed2

Please sign in to comment.