From 1548182ad81fe618a36433f5998ee4d5c3488a19 Mon Sep 17 00:00:00 2001 From: Chiara Bigarella Date: Thu, 18 Jul 2019 12:12:43 +0200 Subject: [PATCH] harvester: add arXiv metadata harvester --- Pipfile | 1 + Pipfile.lock | 34 +++++++--- asclepias_broker/harvester/arxiv.py | 102 ++++++++++++++++++++++++++++ rtd-requirements.txt | 1 + 4 files changed, 128 insertions(+), 10 deletions(-) create mode 100644 asclepias_broker/harvester/arxiv.py diff --git a/Pipfile b/Pipfile index bb2876a..6c9335c 100644 --- a/Pipfile +++ b/Pipfile @@ -4,6 +4,7 @@ verify_ssl = true name = "pypi" [packages] +arxiv = "*" invenio = {version = "*",extras = ["elasticsearch6", "postgresql"]} invenio-accounts = "*" invenio-oauth2server = "*" diff --git a/Pipfile.lock b/Pipfile.lock index 8096ca9..06071d8 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "64741dd015dd49a8980624f508f24135e167967106f98b7bcf41092a50f56981" + "sha256": "daf390a9e12652c861d7992106ae9651386b4627ad5193db05b0e4705e83f5b0" }, "pipfile-spec": 6, "requires": { @@ -29,14 +29,6 @@ ], "version": "==2.5.0" }, - "appnope": { - "hashes": [ - "sha256:5b26757dc6f79a3b7dc9fab95359328d5747fcb2409d331ea66d0272b90ab2a0", - "sha256:8b995ffe925347a2138d7ac0fe77155e4311a0ea6d6da4f5128fe4b3cbe5ed71" - ], - "markers": "sys_platform == 'darwin'", - "version": "==0.1.0" - }, "arrow": { "hashes": [ "sha256:03404b624e89ac5e4fc19c52045fa0f3203419fd4dd64f6e8958c522580a574a", @@ -44,6 +36,14 @@ ], "version": "==0.14.2" }, + "arxiv": { + "hashes": [ + "sha256:5cfb924b60e3ea0ebb3b5d0c32c849df46a2b000036d0bf578c71fba54512233", + "sha256:da8d9b402fde28207975c6e3c2e177ff1c8b2077bb7178a1eaf1c5c4bf41038a" + ], + "index": "pypi", + "version": "==0.5.1" + }, "asn1crypto": { "hashes": [ "sha256:2f1adbb7546ed199e3c90ef23ec95c5cf3585bac7d11fb7eb562a3fe89c64e87", @@ -221,6 +221,14 @@ ], "version": "==6.1.0" }, + "feedparser": { + "hashes": [ + 
"sha256:bd030652c2d08532c034c27fcd7c85868e7fa3cb2b17f230a44a6bbc92519bf9", + "sha256:cd2485472e41471632ed3029d44033ee420ad0b57111db95c240c9160a85831c", + "sha256:ce875495c90ebd74b179855449040003a1beb40cd13d5f037a0654251e260b02" + ], + "version": "==5.2.1" + }, "flask": { "hashes": [ "sha256:13f9f196f330c7c2c5d7a5cf91af894110ca0215ac051b5844701f2bfd934d52", @@ -864,6 +872,13 @@ ], "version": "==0.15.3" }, + "pytest-runner": { + "hashes": [ + "sha256:25a013c8d84f0ca60bb01bd11913a3bcab420f601f0f236de4423074af656e7a", + "sha256:d04243fbf29a3b574f18f1bcff2a07f505db5daede82f706f2e32728f77d3f4d" + ], + "version": "==5.1" + }, "python-dateutil": { "hashes": [ "sha256:7e6584c74aeed623791615e26efd690f29817a27c73085b78e4bad02493df2fb", @@ -1440,7 +1455,6 @@ "sha256:25a013c8d84f0ca60bb01bd11913a3bcab420f601f0f236de4423074af656e7a", "sha256:d04243fbf29a3b574f18f1bcff2a07f505db5daede82f706f2e32728f77d3f4d" ], - "index": "pypi", "version": "==5.1" }, "pytz": { diff --git a/asclepias_broker/harvester/arxiv.py b/asclepias_broker/harvester/arxiv.py new file mode 100644 index 0000000..65979ba --- /dev/null +++ b/asclepias_broker/harvester/arxiv.py @@ -0,0 +1,102 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2019 CERN. +# +# Asclepias Harvester is free software; you can redistribute it and/or modify +# it under the terms of the MIT License; see LICENSE file for more details. 
"""ArXiv metadata harvester."""

from typing import List, Optional

import arxiv

from .base import MetadataHarvester
from .metadata import update_metadata

# Prefix carried by arXiv identifiers; always compared against the
# lower-cased identifier so that "arXiv:", "arxiv:" and "ARXIV:" all match.
_ARXIV_PREFIX = 'arxiv:'


class ArxivAPIException(Exception):
    """ArXiv REST API exception."""


class ArxivClient:
    """ArXiv client."""

    def get_metadata(self, arxiv_id):
        """Query arXiv for a single record.

        :param arxiv_id: identifier sent as the only ``id_list`` entry of
            ``arxiv.query``.
        :returns: the first element of the result returned by
            ``arxiv.query``.
        :raises ArxivAPIException: if the query returns no results.
        """
        res = arxiv.query(query="",
                          id_list=[arxiv_id],
                          max_results=None,
                          start=0,
                          sort_by="relevance",
                          sort_order="descending",
                          prune=True,
                          iterative=False,
                          max_chunk_results=1000)

        if not res:
            raise ArxivAPIException(
                'No arXiv record found for "{}"'.format(arxiv_id))
        return res[0]


class ArxivMetadataHarvester(MetadataHarvester):
    """Metadata harvester for ArXiv records' metadata."""

    def __init__(self, *, provider_name: Optional[str] = None):
        """Store the provider name, falling back to a default label.

        :param provider_name: name recorded in the providers list when this
            harvester updates metadata; a falsy value selects
            ``"ArXiv versioning harvester"``.
        """
        self.provider_name = provider_name or "ArXiv versioning harvester"

    def can_harvest(self, identifier: str, scheme: str,
                    providers: Optional[List[str]] = None) -> bool:
        """Tell whether this harvester should process ``identifier``.

        :param identifier: identifier to check.
        :param scheme: accepted for interface compatibility; not inspected
            by this method.
        :param providers: provider names already associated with the
            identifier, if any.
        :returns: ``True`` when the identifier starts with ``arXiv:`` (any
            letter case) and this harvester's provider name is not already
            listed in ``providers``.
        """
        is_provider = bool(providers) and self.provider_name in providers
        return self._is_arxiv_doi(identifier) and not is_provider

    def harvest(self, identifier: str, scheme: str,
                providers: Optional[List[str]] = None):
        """Fetch the arXiv metadata and hand it to ``update_metadata``.

        This harvester's provider name is added to ``providers``
        (de-duplicated through a set) before the update call.

        :param identifier: arXiv identifier to harvest.
        :param scheme: passed through unchanged to ``update_metadata``.
        :param providers: provider names already associated with the
            identifier, if any.
        :raises ArxivAPIException: propagated from :class:`ArxivClient` when
            arXiv returns no record.
        """
        data = self.get_metadata(identifier)
        if data:
            provider_set = set(providers) if providers else set()
            provider_set.add(self.provider_name)
            update_metadata(
                identifier, scheme, data,
                providers=list(provider_set))

    def _is_arxiv_doi(self, identifier: str) -> bool:
        """Return whether ``identifier`` starts with ``arXiv:`` (any case).

        Despite the name, only the ``arXiv:`` prefix is checked.
        """
        return identifier.lower().startswith(_ARXIV_PREFIX)

    def get_metadata(self, arxiv_id):
        """Fetch a record from arXiv and convert it to a metadata dict.

        A leading ``arXiv:`` prefix (any letter case) is removed before the
        identifier is sent to :class:`ArxivClient`.

        :param arxiv_id: arXiv identifier, with or without the prefix.
        :returns: dict with the keys ``Identifier`` (list holding one DOI
            entry when the record has a truthy ``doi``), ``Type``,
            ``Title``, ``Creator`` and ``PublicationDate``.
        :raises ArxivAPIException: propagated from :class:`ArxivClient` when
            arXiv returns no record.
        """
        client = ArxivClient()

        # Strings are immutable, so the stripped value must be rebound.
        # Matching is case-insensitive to agree with ``_is_arxiv_doi``.
        if arxiv_id.lower().startswith(_ARXIV_PREFIX):
            arxiv_id = arxiv_id[len(_ARXIV_PREFIX):]
        metadata = client.get_metadata(arxiv_id)
        result = {}

        # Identifiers
        result['Identifier'] = []
        doi = metadata['doi']
        if doi:
            result['Identifier'].append({'IDScheme': 'doi', 'ID': doi})

        # Type
        result['Type'] = {'Name': 'literature'}

        # Title
        result['Title'] = metadata['title']

        # Creators
        result['Creator'] = [{'Name': c} for c in metadata['authors']]

        # Publication date
        result['PublicationDate'] = metadata['published']

        return result


# ---------------------------------------------------------------------------
# Trailing hunk of the patch that introduced this module, preserved verbatim
# (it targets rtd-requirements.txt, not this file):
#
#   diff --git a/rtd-requirements.txt b/rtd-requirements.txt
#   index b8045c4..fdec433 100644
#   --- a/rtd-requirements.txt
#   +++ b/rtd-requirements.txt
#   @@ -5,6 +5,7 @@
#    alabaster==0.7.12
#    alembic==1.0.11
#    amqp==2.5.0
#   +arxiv==0.5.1
#    arrow==0.14.2
#    asn1crypto==0.24.0
#    astunparse==1.6.2