Skip to content

Commit

Permalink
wip: refactor, test contents
Browse files Browse the repository at this point in the history
Signed-off-by: Szymon Łopaciuk <[email protected]>
  • Loading branch information
szymonlopaciuk committed Dec 7, 2017
1 parent d52f775 commit b500aa0
Show file tree
Hide file tree
Showing 9 changed files with 2,954 additions and 181 deletions.
2 changes: 1 addition & 1 deletion hepcrawl/scrapy.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
default = hepcrawl.settings

[deploy]
url = http://localhost:6800/
url = http://scrapyd:6800/
project = hepcrawl
#username = scrapy
#password = secret
7 changes: 4 additions & 3 deletions hepcrawl/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,11 +73,12 @@
'hepcrawl.middlewares.HepcrawlCrawlOnceMiddleware': 100,
}

DOWNLOAD_HANDLERS_BASE = dict(default_settings.DOWNLOAD_HANDLERS_BASE)
DOWNLOAD_HANDLERS_BASE.update({
# Configure custom downloaders
# See https://doc.scrapy.org/en/0.20/topics/settings.html#download-handlers
DOWNLOAD_HANDLERS = {
'oaipmh+http': 'hepcrawl.downloaders.DummyDownloadHandler',
'oaipmh+https': 'hepcrawl.downloaders.DummyDownloadHandler',
})
}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
Expand Down
2 changes: 1 addition & 1 deletion hepcrawl/spiders/arxiv_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ class ArxivSpider(StatefulSpider, XMLFeedSpider):
"""

name = 'arXiv'
iterator = 'iternodes'
iterator = 'xml'
itertag = 'OAI-PMH:record'
namespaces = [
("OAI-PMH", "http://www.openarchives.org/OAI/2.0/")
Expand Down
13 changes: 10 additions & 3 deletions hepcrawl/spiders/cds_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ class CDSSpider(OAIPMHSpider):
Using OAI-PMH XML files::
$ scrapy crawl CDS \\
-a "set=forINSPIRE" -a "from_date=2017-10-10"
-a "oai_set=forINSPIRE" -a "from_date=2017-10-10"
It uses `HarvestingKit <https://pypi.python.org/pypi/HarvestingKit>`_ to
translate from CDS's MARCXML into INSPIRE Legacy's MARCXML flavor. It then
Expand All @@ -44,8 +44,13 @@ class CDSSpider(OAIPMHSpider):

name = 'CDS'

def __init__(self, from_date=None, set="forINSPIRE", *args, **kwargs):
super(CDSSpider, self).__init__(url='http://cds.cern.ch/oai2d', metadata_prefix='marcxml', set=set, from_date=from_date, **kwargs)
def __init__(self, from_date=None, oai_set="forINSPIRE", *args, **kwargs):
super(CDSSpider, self).__init__(
url='http://cds.cern.ch/oai2d',
metadata_prefix='marcxml',
oai_set=oai_set,
from_date=from_date,
**kwargs)

def parse_record(self, record):
response = XmlResponse(self.url, encoding='utf-8', body=record.raw)
Expand All @@ -65,6 +70,8 @@ def parse_record(self, record):
)
with app.app_context():
json_record = hep.do(record)
base_uri = self.settings['SCHEMA_BASE_URI']
json_record['$schema'] = base_uri + 'hep.json'
return ParsedItem(record=json_record, record_format='hep')
except Exception:
logger.exception("Error when parsing record")
Expand Down
36 changes: 23 additions & 13 deletions hepcrawl/spiders/oaipmh_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
"""Generic spider for OAI-PMH servers."""

import logging
import sickle
from enum import Enum
from datetime import datetime

from sickle import Sickle
Expand All @@ -22,6 +22,19 @@

logger = logging.getLogger(__name__)


class _Granularity(Enum):
DATE = 'YYYY-MM-DD'
SECOND = 'YYYY-MM-DDThh:mm:ssZ'

def format(self, datetime_object):
if self == self.DATE:
return datetime_object.strftime('%Y-%m-%d')
if self == self.SECOND:
return datetime_object.strftime('%Y-%m-%dT%H:%M:%SZ')
raise ValueError("Invalid granularity: %s" % self.granularity)


class OAIPMHSpider(Spider):
"""
Implements a spider for the OAI-PMH protocol by using the Python sickle library.
Expand All @@ -31,12 +44,15 @@ class OAIPMHSpider(Spider):
"""
name = 'OAI-PMH'
state = {}
granularity = _Granularity.DATE

def __init__(self, url, metadata_prefix='marcxml', set=None, alias=None, from_date=None, until_date=None, granularity='YYYY-MM-DD', record_class=Record, *args, **kwargs):
def __init__(self, url, metadata_prefix='marcxml', oai_set=None, alias=None,
from_date=None, until_date=None, granularity='',
record_class=Record, *args, **kwargs):
super(OAIPMHSpider, self).__init__(*args, **kwargs)
self.url = url
self.metadata_prefix = metadata_prefix
self.set = set
self.set = oai_set
self.granularity = granularity
self.alias = alias or self._make_alias()
self.from_date = from_date
Expand All @@ -47,7 +63,9 @@ def __init__(self, url, metadata_prefix='marcxml', set=None, alias=None, from_da
def start_requests(self):
self.from_date = self.from_date or self.state.get(self.alias)
logger.info("Current state 2:{}".format(self.state))
logger.info("Starting harvesting of {url} with set={set} and metadataPrefix={metadata_prefix}, from={from_date}, until={until_date}".format(
logger.info("Starting harvesting of {url} with set={set} and "
"metadataPrefix={metadata_prefix}, from={from_date}, "
"until={until_date}".format(
url=self.url,
set=self.set,
metadata_prefix=self.metadata_prefix,
Expand All @@ -57,7 +75,7 @@ def start_requests(self):
now = datetime.utcnow()
request = Request('oaipmh+{}'.format(self.url), self.parse)
yield request
self.state[self.alias] = self._format_date(now)
self.state[self.alias] = self.granularity.format(now)
logger.info("Harvesting completed. Next harvesting will resume from {}".format(self.state[self.alias]))

def parse_record(self, record):
Expand All @@ -84,14 +102,6 @@ def parse(self, response):
for record in records:
yield self.parse_record(record)

def _format_date(self, datetime_object):
if self.granularity == 'YYYY-MM-DD':
return datetime_object.strftime('%Y-%m-%d')
elif self.granularity == 'YYYY-MM-DDThh:mm:ssZ':
return datetime_object.strftime('%Y-%m-%dT%H:%M:%SZ')
else:
raise RuntimeError("Invalid granularity: %s" % self.granularity)

def _make_alias(self):
return '{url}-{metadata_prefix}-{set}'.format(
url=self.url,
Expand Down
Loading

0 comments on commit b500aa0

Please sign in to comment.