Skip to content

Commit

Permalink
harvester: add arXiv metadata harvester
Browse files Browse the repository at this point in the history
  • Loading branch information
ChiaraBi committed Jul 18, 2019
1 parent 34f0ff1 commit 1548182
Show file tree
Hide file tree
Showing 4 changed files with 128 additions and 10 deletions.
1 change: 1 addition & 0 deletions Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ verify_ssl = true
name = "pypi"

[packages]
arxiv = "*"
invenio = {version = "*",extras = ["elasticsearch6", "postgresql"]}
invenio-accounts = "*"
invenio-oauth2server = "*"
Expand Down
34 changes: 24 additions & 10 deletions Pipfile.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

102 changes: 102 additions & 0 deletions asclepias_broker/harvester/arxiv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2019 CERN.
#
# Asclepias Harvester is free software; you can redistribute it and/or modify
# it under the terms of the MIT License; see LICENSE file for more details.

"""ArXiv metadata harvester."""

from typing import List

import arxiv

from .base import MetadataHarvester
from .metadata import update_metadata


class ArxivAPIException(Exception):
    """Error signalling a failed lookup against the ArXiv REST API."""


class ArxivClient:
    """Minimal client that looks up single records via the ``arxiv`` package."""

    def get_metadata(self, arxiv_id):
        """Get the metadata entry of one record from ArXiv.

        :param arxiv_id: ArXiv identifier; it is sent as the only element of
            the query's ``id_list``.
        :returns: The first entry of the result returned by ``arxiv.query``.
        :raises ArxivAPIException: If the query returns no entries.
        """
        results = arxiv.query(
            query="",
            id_list=[arxiv_id],
            max_results=None,
            start=0,
            sort_by="relevance",
            sort_order="descending",
            prune=True,
            iterative=False,
            max_chunk_results=1000,
        )

        if not results:
            # Name the identifier so a failed lookup can be traced back to
            # the record that triggered it.
            raise ArxivAPIException(
                'No ArXiv record found for identifier "{}".'.format(arxiv_id))
        return results[0]


class ArxivMetadataHarvester(MetadataHarvester):
    """Metadata harvester for ArXiv records' metadata."""

    # Lower-case form of the prefix that marks an identifier as an ArXiv one.
    _ARXIV_PREFIX = 'arxiv:'

    def __init__(self, *, provider_name: str = None):
        """Initialize the harvester.

        :param provider_name: Name under which harvested metadata is
            attributed; falls back to ``"ArXiv versioning harvester"`` when
            omitted or empty.
        """
        self.provider_name = provider_name or "ArXiv versioning harvester"

    def can_harvest(self, identifier: str, scheme: str,
                    providers: List[str] = None) -> bool:
        """Tell whether this harvester should process ``identifier``.

        :param identifier: Identifier value to check.
        :param scheme: Identifier scheme (not used by this check).
        :param providers: Names of providers already recorded, if any.
        :returns: ``True`` when the identifier carries the ``arxiv:`` prefix
            and this harvester's provider name is not among ``providers``.
        """
        already_provided = (
            bool(providers) and self.provider_name in providers)
        return self._is_arxiv_doi(identifier) and not already_provided

    def harvest(self, identifier: str, scheme: str,
                providers: List[str] = None):
        """Fetch the record's metadata and pass it to ``update_metadata``.

        :param identifier: ArXiv identifier of the record.
        :param scheme: Identifier scheme, forwarded to ``update_metadata``.
        :param providers: Names of providers already recorded; this
            harvester's provider name is added before forwarding.
        """
        data = self.get_metadata(identifier)
        if data:
            # A set avoids listing this provider twice.
            providers = set(providers) if providers else set()
            providers.add(self.provider_name)
            update_metadata(
                identifier, scheme, data,
                providers=list(providers))

    def _is_arxiv_doi(self, identifier: str) -> bool:
        """Return whether ``identifier`` starts with ``arxiv:`` (any case)."""
        return identifier.lower().startswith(self._ARXIV_PREFIX)

    def get_metadata(self, arxiv_id):
        """Query ArXiv for a record and convert the answer to a metadata dict.

        :param arxiv_id: ArXiv identifier, with or without a leading
            ``arXiv:`` prefix (matched case-insensitively).
        :returns: Dict with the keys ``Identifier``, ``Type``, ``Title``,
            ``Creator`` and ``PublicationDate``.
        :raises ArxivAPIException: If ArXiv returns no entry for the record.
        """
        client = ArxivClient()

        # Strings are immutable, so the stripped value has to be re-assigned.
        # The prefix is compared case-insensitively to stay in line with
        # ``_is_arxiv_doi``.
        prefix_len = len(self._ARXIV_PREFIX)
        if arxiv_id[:prefix_len].lower() == self._ARXIV_PREFIX:
            arxiv_id = arxiv_id[prefix_len:]

        metadata = client.get_metadata(arxiv_id)
        result = {}

        # Identifiers
        result['Identifier'] = []
        doi = metadata['doi']
        if doi:
            result['Identifier'].append({'IDScheme': 'doi', 'ID': doi})

        # Type
        result['Type'] = {'Name': 'literature'}

        # Title
        result['Title'] = metadata['title']

        # Creators
        result['Creator'] = [{'Name': c} for c in metadata['authors']]

        # Publication date
        result['PublicationDate'] = metadata['published']

        return result
1 change: 1 addition & 0 deletions rtd-requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
alabaster==0.7.12
alembic==1.0.11
amqp==2.5.0
arxiv==0.5.1
arrow==0.14.2
asn1crypto==0.24.0
astunparse==1.6.2
Expand Down

0 comments on commit 1548182

Please sign in to comment.