Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

cds: use the OAI-PMH spider to harvest CDS #216

Closed
wants to merge 1 commit into from
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 24 additions & 6 deletions docker-compose.test.yml
Original file line number Diff line number Diff line change
@@ -58,6 +58,15 @@ services:
arxiv-http-server.local:
condition: service_healthy

functional_cds:
<<: *service_base
command: py.test -vv tests/functional/cds
depends_on:
scrapyd:
condition: service_healthy
cds-http-server.local:
condition: service_healthy

functional_pos:
<<: *service_base
command: py.test -vv tests/functional/pos
@@ -126,12 +135,6 @@ services:
- "CMD-SHELL"
- "curl https://localhost:443/"

functional_cds:
<<: *service_base
command: py.test -vv tests/functional/cds
links:
- scrapyd

arxiv-http-server.local:
image: nginx:stable-alpine
volumes:
@@ -147,6 +150,21 @@ services:
- "CMD-SHELL"
- "curl http://localhost:80/"

cds-http-server.local:
image: nginx:stable-alpine
volumes:
- ${PWD}/tests/functional/cds/fixtures/http_server/conf/proxy.conf:/etc/nginx/conf.d/default.conf
- ${PWD}/tests/functional/cds/fixtures/http_server/records:/etc/nginx/html/
ports:
- 80:80
healthcheck:
timeout: 5s
interval: 5s
retries: 5
test:
- "CMD-SHELL"
- "curl http://localhost:80/"

rabbitmq:
image: rabbitmq
healthcheck:
75 changes: 26 additions & 49 deletions hepcrawl/spiders/cds_spider.py
Original file line number Diff line number Diff line change
@@ -9,75 +9,52 @@

"""Spider for the CERN Document Server OAI-PMH interface"""

from dojson.contrib.marc21.utils import create_record
import logging
from flask.app import Flask
from harvestingkit.inspire_cds_package.from_cds import CDS2Inspire
from harvestingkit.bibrecord import (
create_record as create_bibrec,
record_xml_output,
)
from inspire_dojson.hep import hep
from scrapy import Request
from scrapy.spider import XMLFeedSpider
from inspire_dojson import marcxml2record
from os.path import join as path_join

from . import StatefulSpider
from .common.oaipmh_spider import OAIPMHSpider
from ..utils import ParsedItem


class CDSSpider(StatefulSpider, XMLFeedSpider):
LOGGER = logging.getLogger(__name__)


class CDSSpider(OAIPMHSpider):
"""Spider for crawling the CERN Document Server OAI-PMH XML files.

Example:
Using OAI-PMH XML files::

$ scrapy crawl \\
cds \\
-a "source_file=file://$PWD/tests/functional/cds/fixtures/oai_harvested/cds_smoke_records.xml"
$ scrapy crawl CDS \\
-a "sets=forINSPIRE" -a "from_date=2017-10-10"

It uses `HarvestingKit <https://pypi.python.org/pypi/HarvestingKit>`_ to
translate from CDS's MARCXML into INSPIRE Legacy's MARCXML flavor. It then
employs `inspire-dojson <https://pypi.python.org/pypi/inspire-dojson>`_ to
transform the legacy INSPIRE MARCXML into the new INSPIRE Schema.
It uses `inspire-dojson <https://pypi.python.org/pypi/inspire-dojson>`_ to
translate from CDS's MARCXML into the new INSPIRE Schema.
"""

name = 'CDS'
iterator = 'xml'
itertag = 'OAI-PMH:record'
namespaces = [
('OAI-PMH', 'http://www.openarchives.org/OAI/2.0/'),
('marc', 'http://www.loc.gov/MARC21/slim'),
]

def __init__(self, source_file=None, **kwargs):
super(CDSSpider, self).__init__(**kwargs)
self.source_file = source_file
# Constructor: fills in defaults for the 'url', 'format' and 'sets' keyword
# arguments, then delegates to the next __init__ in the class hierarchy.
# Values passed explicitly by the caller win, because dict.setdefault only
# writes a key that is not already present.
def __init__(self, *args, **kwargs):
# Default endpoint, used only when the caller supplies no 'url'.
kwargs.setdefault('url', 'http://cds.cern.ch/oai2d')
# Default for 'format' when the caller supplies none.
kwargs.setdefault('format', 'marcxml')
# Default for 'sets' when the caller supplies none.
kwargs.setdefault('sets', 'forINSPIRE')
# NOTE(review): how the parent interprets these keywords is not visible
# here -- presumably OAIPMHSpider consumes them; confirm its signature.
super(CDSSpider, self).__init__(*args, **kwargs)

def start_requests(self):
yield Request(self.source_file)

def parse_node(self, response, node):
node.remove_namespaces()
cds_bibrec, ok, errs = create_bibrec(
node.xpath('.//record').extract()[0]
)
if not ok:
raise RuntimeError("Cannot parse record %s: %s", node, errs)
self.logger.info("Here's the record: %s" % cds_bibrec)
inspire_bibrec = CDS2Inspire(cds_bibrec).get_record()
marcxml_record = record_xml_output(inspire_bibrec)
record = create_record(marcxml_record)
# Returns the `identifier` attribute found on the record's `header`.
# NOTE(review): the docstring calls `record` a "sickle record"; its concrete
# type is not visible in this file -- confirm against the caller.
def get_record_identifier(self, record):
"""Extracts a unique identifier from a sickle record."""
return record.header.identifier

# Turns one harvested record into a ParsedItem carrying a 'hep' JSON record.
# NOTE(review): this span is rendered-diff text with the +/- markers and the
# indentation stripped, so some statements appear twice in competing forms
# (`hep.do(record)` / `marcxml2record(record)`, the two `$schema`
# assignments, the two ParsedItem constructions). Presumably the first of
# each pair is the removed line and the second the added one -- confirm
# against the actual commit before relying on either.
def parse_record(self, selector):
# Namespaces are removed before the un-prefixed './/record' XPath is run
# (presumably so that the plain element name matches -- confirm).
selector.remove_namespaces()
# First match of './/record', as returned by extract_first().
record = selector.xpath('.//record').extract_first()
# A Flask app is created for this call and configured from the
# MARC_TO_HEP_SETTINGS setting; the conversion runs inside its app context.
app = Flask('hepcrawl')
app.config.update(
self.settings.getdict('MARC_TO_HEP_SETTINGS', {})
)
with app.app_context():
json_record = hep.do(record)
json_record = marcxml2record(record)
base_uri = self.settings['SCHEMA_BASE_URI']
json_record['$schema'] = base_uri + 'hep.json'

parsed_item = ParsedItem(
record=json_record,
record_format='hep',
)
return parsed_item
# NOTE(review): path_join is os.path.join (see the file's imports), a
# filesystem-path helper. Unlike the `base_uri + 'hep.json'` form above, it
# inserts a separator when base_uri has no trailing slash, and that
# separator is platform-dependent. Confirm this is intended for a URL.
json_record['$schema'] = path_join(base_uri, 'hep.json')
return ParsedItem(record=json_record, record_format='hep')
2 changes: 1 addition & 1 deletion hepcrawl/spiders/common/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
#
# This file is part of hepcrawl.
# Copyright (C) 2017 CERN.
# Copyright (C) 2015, 2016, 2017, 2018 CERN.
#
# hepcrawl is a free software; you can redistribute it and/or modify it
# under the terms of the Revised BSD License; see LICENSE file for
218 changes: 218 additions & 0 deletions tests/functional/cds/fixtures/cds_expected.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,218 @@
[
{
"core": true,
"documents": [
{
"url": "http://cds.cern.ch/record/1200752/files/MQW7_018.pdf",
"source": "CDS",
"description": "Published version from PoS",
"key": "MQW7_018.pdf"
}
],
"curated": true,
"_collections": [
"Literature"
],
"inspire_categories": [
{
"source": "cds",
"term": "Astrophysics"
}
],
"titles": [
{
"source": "CDS",
"title": "High and very high energy gamma-ray emission from binaries"
}
],
"_private_notes": [
{
"source": "CDS",
"value": "CDS-1200752"
}
],
"authors": [
{
"affiliations": [
{
"value": "Grenoble Observ."
}
],
"full_name": "Dubus, G"
}
],
"publication_info": [
{
"journal_volume": "MQW7",
"page_start": "018",
"journal_title": "PoS",
"artid": "018",
"year": 2008
}
],
"$schema": "http://localhost/schemas/records/hep.json",
"document_type": [
"conference paper"
],
"citeable": true,
"imprints": [
{
"date": "2009"
}
],
"acquisition_source": {
"source": "CDS",
"method": "hepcrawl",
"submission_number": "None",
"datetime": "2017-12-14T08:10:03.875113"
}
},
{
"core": true,
"documents": [
{
"url": "http://cds.cern.ch/record/1200753/files/MQW7_019.pdf",
"source": "CDS",
"description": "Published version from PoS",
"key": "MQW7_019.pdf"
}
],
"curated": true,
"_collections": [
"Literature"
],
"collaborations": [
{
"value": "Fermi LAT"
}
],
"inspire_categories": [
{
"source": "cds",
"term": "Astrophysics"
}
],
"titles": [
{
"source": "CDS",
"title": "GLAST: Launched and Being Commissioned - Status and Prospects for Microquasars"
},
{
"source": "CDS",
"title": "Fermi: Launched and Being Commissioned - Status and Prospects for Microquasars"
}
],
"_private_notes": [
{
"source": "CDS",
"value": "CDS-1200753"
}
],
"authors": [
{
"affiliations": [
{
"value": "SLAC"
}
],
"full_name": "Dubois, R"
}
],
"publication_info": [
{
"journal_volume": "MQW7",
"page_start": "019",
"journal_title": "PoS",
"artid": "019",
"year": 2008
}
],
"$schema": "http://localhost/schemas/records/hep.json",
"document_type": [
"conference paper"
],
"citeable": true,
"imprints": [
{
"date": "2008"
}
],
"acquisition_source": {
"source": "CDS",
"method": "hepcrawl",
"submission_number": "None",
"datetime": "2017-12-14T08:10:03.951904"
}
},
{
"core": true,
"documents": [
{
"url": "http://cds.cern.ch/record/1200754/files/MQW7_020.pdf",
"source": "CDS",
"description": "Published version from PoS",
"key": "MQW7_020.pdf"
}
],
"curated": true,
"_collections": [
"Literature"
],
"inspire_categories": [
{
"source": "cds",
"term": "Astrophysics"
}
],
"titles": [
{
"source": "CDS",
"title": "Hadronic models of high-energy radiation from microquasars: recent developments"
}
],
"_private_notes": [
{
"source": "CDS",
"value": "CDS-1200754"
}
],
"authors": [
{
"affiliations": [
{
"value": "Villa Elisa, Inst. Argentino Radioastron."
},
{
"value": "La Plata U."
}
],
"full_name": "Romero, G E"
}
],
"publication_info": [
{
"journal_volume": "MQW7",
"page_start": "020",
"journal_title": "PoS",
"artid": "020",
"year": 2008
}
],
"$schema": "http://localhost/schemas/records/hep.json",
"document_type": [
"conference paper"
],
"citeable": true,
"imprints": [
{
"date": "2008"
}
],
"acquisition_source": {
"source": "CDS",
"method": "hepcrawl",
"submission_number": "None",
"datetime": "2017-12-14T08:10:03.984541"
}
}
]
Loading