From ea0ace2d9ae988f72749838b65da64c3b617afcd Mon Sep 17 00:00:00 2001 From: Victor Balbuena Date: Thu, 21 Feb 2019 13:28:56 +0100 Subject: [PATCH] PoS spider: use OAI-PMH spider Signed-off-by: Victor Balbuena --- docker-compose.test.yml | 33 +- hepcrawl/spiders/pos_spider.py | 694 ++++++++++-------- .../pos/fixtures/http_server/conf/proxy.conf | 15 + .../records/187.html | 0 .../records/pos-conference-ihep.xml | 54 ++ .../http_server/records/pos-single.xml | 33 + .../pos/fixtures/https_server/conf/proxy.conf | 25 - .../fixtures/https_server/conf/ssl/cert.key | 28 - .../fixtures/https_server/conf/ssl/cert.pem | 19 - .../records/PoS(LATTICE 2013)001.html | 58 -- .../pos/fixtures/oai_harvested/pos_record.xml | 33 - .../pos_conference_proceedings_records.json | 101 --- .../functional/pos/fixtures/pos_expected.json | 121 +++ .../pos/fixtures/pos_single_expected.json | 62 ++ tests/functional/pos/test_pos.py | 67 +- .../unit/responses/pos/sample_pos_record.xml | 28 +- 16 files changed, 724 insertions(+), 647 deletions(-) create mode 100644 tests/functional/pos/fixtures/http_server/conf/proxy.conf rename tests/functional/pos/fixtures/{https_server => http_server}/records/187.html (100%) create mode 100644 tests/functional/pos/fixtures/http_server/records/pos-conference-ihep.xml create mode 100644 tests/functional/pos/fixtures/http_server/records/pos-single.xml delete mode 100644 tests/functional/pos/fixtures/https_server/conf/proxy.conf delete mode 100755 tests/functional/pos/fixtures/https_server/conf/ssl/cert.key delete mode 100755 tests/functional/pos/fixtures/https_server/conf/ssl/cert.pem delete mode 100644 tests/functional/pos/fixtures/https_server/records/PoS(LATTICE 2013)001.html delete mode 100644 tests/functional/pos/fixtures/oai_harvested/pos_record.xml delete mode 100644 tests/functional/pos/fixtures/pos_conference_proceedings_records.json create mode 100644 tests/functional/pos/fixtures/pos_expected.json create mode 100644 tests/functional/pos/fixtures/pos_single_expected.json diff --git a/docker-compose.test.yml b/docker-compose.test.yml index 1b1cbc9a..512346e3 100644 --- a/docker-compose.test.yml +++ b/docker-compose.test.yml @@ -64,7 +64,7 @@ services: depends_on: scrapyd: condition: service_healthy - http-server.local: + pos-http-server.local: condition: service_healthy unit: @@ -118,22 +118,6 @@ services: - ${PWD}/tests/functional/wsp/fixtures/ftp_server/WSP:/home/ftpusers/bob/WSP - ${PWD}/tests/functional/wsp/fixtures/ftp_server/pureftpd.passwd:/etc/pure-ftpd/passwd/pureftpd.passwd - http-server.local: - image: nginx:stable-alpine - volumes: - - ${PWD}/tests/functional/pos/fixtures/https_server/conf/proxy.conf:/etc/nginx/conf.d/default.conf - - ${PWD}/tests/functional/pos/fixtures/https_server/conf/ssl:/etc/nginx/ssl - - ${PWD}/tests/functional/pos/fixtures/https_server/records:/etc/nginx/html/ - ports: - - 443:443 - healthcheck: - timeout: 5s - interval: 5s - retries: 5 - test: - - "CMD-SHELL" - - "curl https://localhost:443/" - functional_cds: <<: *service_base command: py.test -vv tests/functional/cds @@ -173,6 +157,21 @@ services: - "CMD-SHELL" - "curl http://localhost:80/" + pos-http-server.local: + image: nginx:stable-alpine + volumes: + - ${PWD}/tests/functional/pos/fixtures/http_server/conf/proxy.conf:/etc/nginx/conf.d/default.conf + - ${PWD}/tests/functional/pos/fixtures/http_server/records:/etc/nginx/html/ + ports: + - 80:80 + healthcheck: + timeout: 5s + interval: 5s + retries: 5 + test: + - "CMD-SHELL" + - "curl http://localhost:80/" + rabbitmq: image: rabbitmq healthcheck: diff --git a/hepcrawl/spiders/pos_spider.py b/hepcrawl/spiders/pos_spider.py index 0e4a5700..02f8da4f 100644 --- a/hepcrawl/spiders/pos_spider.py +++ b/hepcrawl/spiders/pos_spider.py @@ -7,7 +7,7 @@ # under the terms of the Revised BSD License; see LICENSE file for # more details. -"""Spider for POS.""" +"""Spider for POS OAI-PMH interface.""" from __future__ import absolute_import, division, print_function @@ -18,7 +18,7 @@ from scrapy import Request, Selector -from . import StatefulSpider +from .common.oaipmh_spider import OAIPMHSpider from ..dateutils import create_valid_date from ..items import HEPRecord from ..loaders import HEPLoader @@ -30,12 +30,11 @@ ) -DEFAULT_BASE_URL = 'https://pos.sissa.it' DEFAULT_BASE_CONFERENCE_PAPER_URL = ( - DEFAULT_BASE_URL + '/contribution?id=' + 'https://pos.sissa.it/contribution?id=' ) DEFAULT_BASE_PROCEEDINGS_URL = ( - DEFAULT_BASE_URL + '/cgi-bin/reader/conf.cgi?confid=' + 'https://pos.sissa.it//cgi-bin/reader/conf.cgi?confid=' ) @@ -43,7 +42,7 @@ class PoSExtractionException(Exception): pass -class POSSpider(StatefulSpider): +class POSSpider(OAIPMHSpider): """POS/Sissa crawler. From PoS we create two types of records, a conference paper record, and a @@ -73,356 +72,417 @@ class POSSpider(StatefulSpider): Example: :: $ scrapy crawl PoS \\ - -a "source_file=file://$PWD/tests/unit/responses/pos/sample_pos_record.xml" + -a "sets=conference:IHEP-LHC" -a "from_date=2012-12-13" """ - name = 'pos' + name = 'PoS' + source = 'PoS' @strict_kwargs def __init__( self, - source_file=None, - base_conference_paper_url=DEFAULT_BASE_CONFERENCE_PAPER_URL, - base_proceedings_url=DEFAULT_BASE_PROCEEDINGS_URL, + url='https://pos.sissa.it/oai', + format='oai_dc', + sets=None, + from_date=None, + until_date=None, **kwargs ): - super(POSSpider, self).__init__(**kwargs) - self.source_file = source_file - self.base_conference_paper_url = base_conference_paper_url - self.base_proceedings_url = base_proceedings_url - - def start_requests(self): - yield Request(self.source_file) - - def parse(self, response): - self.logger.info('Got record from: {response.url}'.format(**vars())) - - response.selector.remove_namespaces() - record_xml_selectors = response.selector.xpath('.//record') - for record_xml_selector in record_xml_selectors: - yield self.get_conference_paper_page_request( - xml_selector=record_xml_selector, - ) - - def get_conference_paper_page_request(self, xml_selector, meta=None): - """Gets the conference paper html page, for the pdf link for the - conference paper, and later the internal conference id. - """ - meta = meta or {} - - identifier = xml_selector.xpath( - './/metadata/pex-dc/identifier/text()' - ).extract_first() - conference_paper_url = "{0}{1}".format( - self.base_conference_paper_url, - identifier, - ) - meta['xml_record'] = xml_selector.extract() - - # the meta parameter will be passed over to the callback as a property - # in the response parameter - return Request( - url=conference_paper_url, - callback=self.parse_conference_paper, - meta=meta + super(POSSpider, self).__init__( + url=url, + format=format, + sets=sets, + from_date=from_date, + until_date=until_date, + **kwargs ) - def parse_conference_paper(self, response): - self.logger.info( - 'Parsing conference paper from: {response.url}'.format(**vars()) - ) - xml_record = response.meta.get('xml_record') - conference_paper_url = response.url - conference_paper_pdf_url = self._get_conference_paper_pdf_url( - conference_paper_page_html=response.body, - ) + def get_record_identifier(self, record): + """Extracts a unique identifier from a sickle record.""" + return record.header.identifier - parsed_conference_paper = self.build_conference_paper_item( - xml_record=xml_record, - conference_paper_url=conference_paper_url, - conference_paper_pdf_url=conference_paper_pdf_url, - ) - yield parsed_conference_paper + def parse_record(self, selector): + """Parse a PoS MARCXML record into a HEP record.""" + selector.remove_namespaces() + conference_paper_page_request = get_conference_paper_page_request(xml_selector=selector) - # prepare next callback step - response.meta['html_record'] = response.body - yield self.get_conference_proceedings_page_request( - meta=response.meta, - ) + return conference_paper_page_request - def get_conference_proceedings_page_request(self, meta): - """Gets the conference proceedings page, using the indernal conference - id from the record html page retrieved before. - """ - if not meta.get('html_record'): - raise PoSExtractionException( - 'PoS conference paper page was empty, current meta:\n%s' % meta - ) - - proceedings_page_url = self._get_proceedings_page_url( - page_html=meta.get('html_record'), - ) - page_selector = Selector( - text=meta.get('xml_record'), - type='xml', - ) - page_selector.remove_namespaces() - pos_id = page_selector.xpath( - ".//metadata/pex-dc/identifier/text()" - ).extract_first() - meta['pos_id'] = pos_id - - return Request( - url=proceedings_page_url, - meta=meta, - callback=self.parse_conference_proceedings, - ) +class POSSpiderSingle(OAIPMHSpider): + """POS/Sissa crawler. - def parse_conference_proceedings(self, request): - parsed_conference_proceedings = self.build_conference_proceedings_item( - proceedings_page_html=request.body, - pos_id=request.meta['pos_id'], - ) - yield parsed_conference_proceedings + From PoS we create two types of records, a conference paper record, and a + conference proceedings record. - def _get_proceedings_page_url(self, page_html): - page_selector = Selector( - text=page_html, - type="html" - ) - internal_url = page_selector.xpath( - "//a[not(contains(text(),'pdf'))]/@href", - ).extract_first() - proceedings_internal_id = internal_url.split('/')[1] - return '{0}{1}'.format( - self.base_proceedings_url, - proceedings_internal_id, - ) + The bulk of the records comes from oaiharvest, and this spider crawls the + files generated by it. - def build_conference_paper_item( - self, - xml_record, - conference_paper_url, - conference_paper_pdf_url, - ): - selector = Selector( - text=xml_record, - type="xml" - ) - selector.remove_namespaces() - record = HEPLoader( - item=HEPRecord(), - selector=selector - ) + For the conference paper record we have to scrape also the html page of the + record on the PoS website to get the pdf link. (see + `DEFAULT_BASE_CONFERENCE_PAPER_URL`) - license_text = selector.xpath( - './/metadata/pex-dc/rights/text()' - ).extract_first() - record.add_value('license', get_licenses(license_text=license_text)) - - date, year = self._get_date(selector=selector) - record.add_value('date_published', date) - record.add_value('journal_year', year) - - identifier = selector.xpath( - ".//metadata/pex-dc/identifier/text()" - ).extract_first() - record.add_value( - 'journal_title', - self._get_journal_title(pos_ext_identifier=identifier), - ) - record.add_value( - 'journal_volume', - self._get_journal_volume(pos_ext_identifier=identifier), - ) - record.add_value( - 'journal_artid', - self._get_journal_artid(pos_ext_identifier=identifier), - ) + Then, from that same page, we get the internal conference id. - record.add_xpath('title', '//metadata/pex-dc/title/text()') - record.add_xpath('source', '//metadata/pex-dc/publisher/text()') - record.add_value( - 'external_system_numbers', - self._get_ext_systems_number(selector=selector), - ) - record.add_value('language', self._get_language(selector=selector)) - record.add_value('authors', self._get_authors(selector=selector)) - record.add_value('collections', ['conferencepaper']) - record.add_value('urls', [conference_paper_url]) - record.add_value( - 'documents', - self.get_documents( - path=conference_paper_pdf_url, - ), - ) + With that conference id, then we scrape the conference proceedings page, + and extract the information to create the proceedings record. (see + `DEFAULT_BASE_PROCEEDINGS_URL`) - parsed_item = ParsedItem( - record=record.load_item(), - record_format='hepcrawl', - ) + To do that and because each needs the information of the previous, the + spider must use the callbacks system provided by scrapy through the + :class:`scrapy.html.response.Response` callback parameter, and chain the + parser functions. - return parsed_item + The deduplication of the conference proceedings papers is left for the + `HepcrawlCrawlOnceMiddleware` middleware. + + Example: + :: + $ scrapy crawl PoS_single -a "identifier=oai:pos.sissa.it:IHEP-LHC/001" + """ + name = 'PoS_single' + source = 'PoS' - def build_conference_proceedings_item( + @strict_kwargs + def __init__( self, - proceedings_page_html, - pos_id, + url='https://pos.sissa.it/oai', + format='oai_dc', + identifier=None, + **kwargs ): - selector = Selector( - text=proceedings_page_html, - type='html', + super(POSSpiderSingle, self).__init__( + url=url, + format=format, + identifier=identifier, + **kwargs ) + + def get_record_identifier(self, record): + """Extracts a unique identifier from a sickle record.""" + return record.header.identifier + + def parse_record(self, selector): + """Parse a PoS MARCXML record into a HEP record.""" selector.remove_namespaces() - record = HEPLoader( - item=HEPRecord(), - selector=selector - ) + conference_paper_page_request = get_conference_paper_page_request(xml_selector=selector) - record.add_value('collections', ['proceedings']) - record.add_value( - 'title', - self._get_proceedings_title(selector=selector), - ) - record.add_value( - 'subtitle', - self._get_proceedings_date_place(selector=selector), - ) - record.add_value('journal_title', 'PoS') - record.add_value( - 'journal_volume', - self._get_journal_volume(pos_ext_identifier=pos_id), + return conference_paper_page_request + + +def get_conference_paper_page_request(xml_selector, meta=None): + """Gets the conference paper html page, for the pdf link for the + conference paper, and later the internal conference id. + """ + meta = meta or {} + + identifier = xml_selector.xpath( + './/metadata/dc/identifier/text()' + ).extract_first() + conference_paper_url = "{0}{1}".format( + DEFAULT_BASE_CONFERENCE_PAPER_URL, + identifier, + ) + meta['xml_record'] = xml_selector.extract() + + # the meta parameter will be passed over to the callback as a property + # in the response parameter + return Request( + url=conference_paper_url, + callback=parse_conference_paper, + meta=meta + ) + + +def parse_conference_paper(response): + xml_record = response.meta.get('xml_record') + conference_paper_url = response.url + conference_paper_pdf_url = _get_conference_paper_pdf_url( + conference_paper_page_html=response.body, + ) + + parsed_conference_paper = build_conference_paper_item( + xml_record=xml_record, + conference_paper_url=conference_paper_url, + conference_paper_pdf_url=conference_paper_pdf_url, + ) + yield parsed_conference_paper + + # prepare next callback step + response.meta['html_record'] = response.body + yield get_conference_proceedings_page_request( + meta=response.meta, + ) + + +def get_conference_proceedings_page_request(meta): + """Gets the conference proceedings page, using the indernal conference + id from the record html page retrieved before. + """ + if not meta.get('html_record'): + raise PoSExtractionException( + 'PoS conference paper page was empty, current meta:\n%s' % meta ) - parsed_proceeding = ParsedItem( - record=record.load_item(), - record_format='hepcrawl', + proceedings_page_url = _get_proceedings_page_url( + page_html=meta.get('html_record'), + ) + + page_selector = Selector( + text=meta.get('xml_record'), + type='xml', + ) + page_selector.remove_namespaces() + pos_id = page_selector.xpath( + ".//metadata/dc/identifier/text()" + ).extract_first() + meta['pos_id'] = pos_id + + return Request( + url=proceedings_page_url, + meta=meta, + callback=parse_conference_proceedings, + ) + + +def parse_conference_proceedings(request): + parsed_conference_proceedings = build_conference_proceedings_item( + proceedings_page_html=request.body, + pos_id=request.meta['pos_id'], + ) + yield parsed_conference_proceedings + + +def _get_proceedings_page_url(page_html): + page_selector = Selector( + text=page_html, + type="html" + ) + internal_url = page_selector.xpath( + "//a[not(contains(text(),'pdf'))]/@href", + ).extract_first() + proceedings_internal_id = internal_url.split('/')[1] + return '{0}{1}'.format( + DEFAULT_BASE_PROCEEDINGS_URL, + proceedings_internal_id, + ) + + +def build_conference_paper_item( + xml_record, + conference_paper_url, + conference_paper_pdf_url, +): + selector = Selector( + text=xml_record, + type="xml" + ) + selector.remove_namespaces() + record = HEPLoader( + item=HEPRecord(), + selector=selector + ) + + license_text = selector.xpath( + './/metadata/dc/rights/text()' + ).extract_first() + record.add_value('license', get_licenses(license_text=license_text)) + + date, year = _get_date(selector=selector) + record.add_value('date_published', date) + record.add_value('journal_year', year) + + identifier = selector.xpath( + ".//metadata/dc/identifier/text()" + ).extract_first() + record.add_value( + 'journal_title', + _get_journal_title(pos_ext_identifier=identifier), + ) + record.add_value( + 'journal_volume', + _get_journal_volume(pos_ext_identifier=identifier), + ) + record.add_value( + 'journal_artid', + _get_journal_artid(pos_ext_identifier=identifier), + ) + + record.add_xpath('title', '//metadata/dc/title/text()') + record.add_xpath('source', '//metadata/dc/publisher/text()') + record.add_value( + 'external_system_numbers', + _get_ext_systems_number(selector=selector), + ) + record.add_value('language', _get_language(selector=selector)) + record.add_value('authors', _get_authors(selector=selector)) + record.add_value('collections', ['conferencepaper']) + record.add_value('urls', [conference_paper_url]) + record.add_value( + 'documents', + get_documents( + path=conference_paper_pdf_url, + ), + ) + + parsed_item = ParsedItem( + record=record.load_item(), + record_format='hepcrawl', + ) + + return parsed_item + + +def build_conference_proceedings_item( + proceedings_page_html, + pos_id, +): + selector = Selector( + text=proceedings_page_html, + type='html', + ) + selector.remove_namespaces() + record = HEPLoader( + item=HEPRecord(), + selector=selector + ) + + record.add_value('collections', ['proceedings']) + record.add_value( + 'title', + _get_proceedings_title(selector=selector), + ) + record.add_value( + 'subtitle', + _get_proceedings_date_place(selector=selector), + ) + record.add_value('journal_title', 'PoS') + record.add_value( + 'journal_volume', + _get_journal_volume(pos_ext_identifier=pos_id), + ) + + parsed_proceeding = ParsedItem( + record=record.load_item(), + record_format='hepcrawl', + ) + + return parsed_proceeding + + +def _get_conference_paper_pdf_url(conference_paper_page_html): + selector = Selector( + text=conference_paper_page_html, + type='html', + ) + conference_paper_pdf_relative_url = selector.xpath( + "//a[contains(text(),'pdf')]/@href", + ).extract_first() + + if not conference_paper_pdf_relative_url: + raise PoSExtractionException( + ( + 'Unable to get the conference paper pdf url from the html:' + '\n%s' + ) % conference_paper_page_html ) - return parsed_proceeding + return urljoin( + DEFAULT_BASE_CONFERENCE_PAPER_URL, + conference_paper_pdf_relative_url, + ) - def _get_conference_paper_pdf_url(self, conference_paper_page_html): - selector = Selector( - text=conference_paper_page_html, - type='html', - ) - conference_paper_pdf_relative_url = selector.xpath( - "//a[contains(text(),'pdf')]/@href", - ).extract_first() - - if not conference_paper_pdf_relative_url: - raise PoSExtractionException( - ( - 'Unable to get the conference paper pdf url from the html:' - '\n%s' - ) % conference_paper_page_html - ) - - return urljoin( - self.base_conference_paper_url, - conference_paper_pdf_relative_url, - ) - def _get_proceedings_url(self, response): - internal_url = response.selector.xpath( - "//a[not(contains(text(),'pdf'))]/@href", - ).extract_first() - proceedings_identifier = internal_url.split('/')[1] - return '{0}{1}'.format( - self.base_proceedings_url, - proceedings_identifier, - ) +def _get_proceedings_url(response): + internal_url = response.selector.xpath( + "//a[not(contains(text(),'pdf'))]/@href", + ).extract_first() + proceedings_identifier = internal_url.split('/')[1] + return '{0}{1}'.format( + DEFAULT_BASE_PROCEEDINGS_URL, + proceedings_identifier, + ) - @staticmethod - def get_documents(path): - return [ - { - 'key': os.path.basename(path), - 'url': quote(path, safe=':/'), - 'original_url': quote(path, safe=':/'), - 'hidden': True, - 'fulltext': True, - }, - ] - @staticmethod - def _get_language(selector): - language = selector.xpath( - ".//metadata/pex-dc/language/text()" - ).extract_first() - return language if language != 'en' else None +def get_documents(path): + return [ + { + 'key': os.path.basename(path), + 'url': quote(path, safe=':/'), + 'original_url': quote(path, safe=':/'), + 'hidden': True, + 'fulltext': True, + }, + ] + + +def _get_language(selector): + language = selector.xpath( + ".//metadata/dc/language/text()" + ).extract_first() + return language if language != 'en' else None + + +def _get_journal_title(pos_ext_identifier): + return re.split('[()]', pos_ext_identifier)[0] + + +def _get_journal_volume(pos_ext_identifier): + return re.split('[()]', pos_ext_identifier)[1] + - @staticmethod - def _get_journal_title(pos_ext_identifier): - return re.split('[()]', pos_ext_identifier)[0] +def _get_journal_artid(pos_ext_identifier): + return re.split('[()]', pos_ext_identifier)[2] - @staticmethod - def _get_journal_volume(pos_ext_identifier): - return re.split('[()]', pos_ext_identifier)[1] - @staticmethod - def _get_journal_artid(pos_ext_identifier): - return re.split('[()]', pos_ext_identifier)[2] +def _get_ext_systems_number(selector): + return [ + { + 'institute': 'pos', + 'value': selector.xpath( + './/identifier/text()' + ).extract_first() + }, + ] - @staticmethod - def _get_ext_systems_number(selector): - return [ + +def _get_date(selector): + full_date = selector.xpath( + ".//metadata/dc/date/text()" + ).extract_first() + date = create_valid_date(full_date) + year = int(date[0:4]) + + return date, year + + +def _get_authors(selector): + """Get article authors.""" + authors = [] + creators = selector.xpath('.//metadata/dc/creator') + for creator in creators: + auth_dict = {} + author = Selector(text=creator.extract()) + auth_dict['raw_name'] = get_first( + author.xpath('.//text()').extract(), + default='', + ) + auth_dict['affiliations'] = [ { - 'institute': 'pos', - 'value': selector.xpath( - './/identifier/text()' - ).extract_first() + 'value': '' }, ] + if auth_dict: + authors.append(auth_dict) + return authors + + +def _get_proceedings_title(selector): + return 'Proceedings, ' + selector.xpath('//h1/text()').extract_first() + + +def _get_proceedings_date_place(selector): + date_place = selector.xpath( + "//div[@class='conference_date']/text()" + ).extract() + return ''.join(date_place) - @staticmethod - def _get_date(selector): - full_date = selector.xpath( - ".//metadata/pex-dc/date/text()" - ).extract_first() - date = create_valid_date(full_date) - year = int(date[0:4]) - - return date, year - - @staticmethod - def _get_authors(selector): - """Get article authors.""" - authors = [] - creators = selector.xpath('.//metadata/pex-dc/creator') - for creator in creators: - auth_dict = {} - author = Selector(text=creator.extract()) - auth_dict['raw_name'] = get_first( - author.xpath('.//name//text()').extract(), - default='', - ) - for affiliation in author.xpath( - './/affiliation//text()' - ).extract(): - if 'affiliations' in auth_dict: - auth_dict['affiliations'].append( - { - 'value': affiliation - } - ) - else: - auth_dict['affiliations'] = [ - { - 'value': affiliation - }, - ] - if auth_dict: - authors.append(auth_dict) - return authors - - @staticmethod - def _get_proceedings_title(selector): - return 'Proceedings, ' + selector.xpath('//h1/text()').extract_first() - - @staticmethod - def _get_proceedings_date_place(selector): - date_place = selector.xpath( - "//div[@class='conference_date']/text()" - ).extract() - return ''.join(date_place) diff --git a/tests/functional/pos/fixtures/http_server/conf/proxy.conf b/tests/functional/pos/fixtures/http_server/conf/proxy.conf new file mode 100644 index 00000000..afc6162c --- /dev/null +++ b/tests/functional/pos/fixtures/http_server/conf/proxy.conf @@ -0,0 +1,15 @@ +server { + listen 80; + server_name localhost; + charset_types text/xml; + charset UTF-8; + + location /oai { + if ($args ~ from=2012-02-02&verb=ListRecords&set=conference%3AIHEP-LHC&metadataPrefix=oai_dc) { + rewrite ^.*$ /pos-conference-ihep.xml permanent; + } + if ($args ~ verb=GetRecord&metadataPrefix=oai_dc&identifier=oai%3Apos.sissa.it%3AIHEP-LHC%2F005) { + rewrite ^.*$ /pos-single.xml permanent; + } + } +} diff --git a/tests/functional/pos/fixtures/https_server/records/187.html b/tests/functional/pos/fixtures/http_server/records/187.html similarity index 100% rename from tests/functional/pos/fixtures/https_server/records/187.html rename to tests/functional/pos/fixtures/http_server/records/187.html diff --git a/tests/functional/pos/fixtures/http_server/records/pos-conference-ihep.xml b/tests/functional/pos/fixtures/http_server/records/pos-conference-ihep.xml new file mode 100644 index 00000000..46ceaff9 --- /dev/null +++ b/tests/functional/pos/fixtures/http_server/records/pos-conference-ihep.xml @@ -0,0 +1,54 @@ + + + 2019-02-25T14:16:29Z + + https://pos.sissa.it/oai + + +
+ oai:pos.sissa.it:IHEP-LHC/001 + 2013-07-25 + conference:IHEP-LHC + group:6 +
+ + + Status of the CMS detector and upgrade plans + Luigi Guiducci + High Energy Physics + LHC on the March; session Detector + Sissa Medialab + 2013-07-19T10:00:21Z + Text + application/pdf + PoS(IHEP-LHC)001 + en + IHEP-LHC (LHC on the March) isPartOf + Creative Commons Attribution-NonCommercial-ShareAlike + + +
+
+ oai:pos.sissa.it:IHEP-LHC/002 + 2013-07-25 + conference:IHEP-LHC + group:6 +
+ + + ALICE status and plans + Evgeny KRYSHEN + High Energy Physics + LHC on the March; session Detector + Sissa Medialab + 2013-05-13T16:45:04Z + Text + application/pdf + PoS(IHEP-LHC)002 + en + IHEP-LHC (LHC on the March) isPartOf + Creative Commons Attribution-NonCommercial-ShareAlike + + +
+
diff --git a/tests/functional/pos/fixtures/http_server/records/pos-single.xml b/tests/functional/pos/fixtures/http_server/records/pos-single.xml new file mode 100644 index 00000000..97ca211b --- /dev/null +++ b/tests/functional/pos/fixtures/http_server/records/pos-single.xml @@ -0,0 +1,33 @@ + + + 2019-02-27T14:26:55Z + + https://pos.sissa.it/oai + + + +
+ oai:pos.sissa.it:IHEP-LHC/005 + 2013-07-25 + conference:IHEP-LHC + group:6 +
+ + + Status of the ATLAS detector + Oleg SOLOVYANOV + High Energy Physics + LHC on the March; session Detector + Sissa Medialab + 2013-06-24T00:37:20Z + Text + application/pdf + PoS(IHEP-LHC)005 + en + IHEP-LHC (LHC on the March) isPartOf + Creative Commons Attribution-NonCommercial-ShareAlike + + +
+
+
\ No newline at end of file diff --git a/tests/functional/pos/fixtures/https_server/conf/proxy.conf b/tests/functional/pos/fixtures/https_server/conf/proxy.conf deleted file mode 100644 index 1591cbcd..00000000 --- a/tests/functional/pos/fixtures/https_server/conf/proxy.conf +++ /dev/null @@ -1,25 +0,0 @@ -server { - listen 443 ssl; - server_name localhost; - - ssl on; - ssl_protocols TLSv1 TLSv1.1 TLSv1.2; - ssl_certificate ssl/cert.pem; - ssl_certificate_key ssl/cert.key; - - location ~ /contribution { - if ($args ~* "^id=(.*)") { - set $mid $1; - set $args ''; - rewrite ^.*$ /$mid.html permanent; - } - } - - location ~ /cgi-bin/reader/conf.cgi { - if ($args ~* "^confid=(.*)") { - set $mid $1; - set $args ''; - rewrite ^.*$ /$mid.html permanent; - } - } -} diff --git a/tests/functional/pos/fixtures/https_server/conf/ssl/cert.key b/tests/functional/pos/fixtures/https_server/conf/ssl/cert.key deleted file mode 100755 index 19e1df68..00000000 --- a/tests/functional/pos/fixtures/https_server/conf/ssl/cert.key +++ /dev/null @@ -1,28 +0,0 @@ ------BEGIN PRIVATE KEY----- -MIIEvwIBADANBgkqhkiG9w0BAQEFAASCBKkwggSlAgEAAoIBAQChhBiOoipMRRcc -E5waKrGB01/QtRpfIGp5KmJfnif05dR05wWojHO6EtabZ2qbXtcSuyQ0vRNpbZUU -OzcriFOMk8dujDzuKMkegsq/LE4PyN/R5JZtf34NyGG7v70K6Uq7RV4PUzk+zoum -1McMUBk1QlGP/E9RsDlSPv9XOblUpicPDuwhCwPf4zi6jporgXjDJ/iUuh+bexxv -40R7f2dCWkiHYiNiLNLTwXdYkaWBcc3HoTq9FEZZhYDhWRjX0/TuINmMr5lbUvr6 -UYRABOS4VeUyHpb/e7OH9WXQxzR76LuQFfQDSgs0GxXw1KG58aq+P0ni2E77C4Iu -odQ8iT+jAgMBAAECggEBAIqJeFrXY7p5xIGznEChgBHgUR3+SPlxH4KARVLIoHMh -s2L2SVcx6Y2f3O38/Wb5KTcKx9polz7l3Go3BHJVg3xfwT7kENsipqeB/g+OHALU -BI7PJ+wR3/hIePQGWUsDobMRo8U3WDG0DfryJS09gvG4yabb/tkNc41FNdUGUR31 -7VInQFqv2/jZ/2A3s3DZ0Cns9vJuLhmf7629k3MVCuU7Rh0rStnVCA70kjgKzOfP -+26fnfd/MmrQYbaukw04+cwcwifGkF5Jis80qTWsgdF82rkzpwJLDo0Jd2HZFuOa -AHkWK2QiMzb6PS2Uo7Zarax9E+W2TLahANXZQQ32NAkCgYEAzKw7XbEwzWG/T7yX -EgNIAN7YtcGYr9sfHlVJ8bWYK7GZBbCkKDlGU+YGRE++plh/jtXYjsIFElWtv01Y -UpqBdWf7p8mXdtVoq6YyL5WuQVMwpjKHvegTXXwAoreEXZeKr1LKC11B14h+8wsR -D5uf0GVmdw12nSrzeu3Q4oSgss8CgYEAygU++fItIYuPtZfrC8qDcyEiOLQmAHtX -eTnEHOPy8ik+bdwF5Rg0nzxLu3RZ47ykGdEOzpGRO4B9V1EevwSEzX6VO7latMUS -cLKb3Y0bXm6qQcWG+LAlvyaHfAH0oN47xfScLDiUm6BKd4Eo9kpkgaQzSgUfFZNQ -6DHiA3Emau0CgYEAyel7Y3GjMGomvrXQ3x9HkDxH0/7Z71qe92CyYvZ/2VMKH9fk -Ch5+p9P8CLYW4anapQGH80WqlSzbDCd0Y4EzB6z+UceJWd0stnFtfw4N6znze3HM -AegJ+qaTRfL/bQlL8qwc0Fs+0i9A9enL+fbQEVmHXRl2E5TEwFgOQvkOQ3cCgYAA -4bD6qkHkKZXA9x7BeGrGb9iUYsTfr6ocD1J5xczjnaZ2GEW2UDq6jyrNcJ6LzeDx -c+YapKv7lH33iZUWxFBIDUtdbVul+k4wS7c+akU6TkVT8Ca8oxgnE2X39pI4uX+N -R5n+32hWnYZ1qwygtoZlwm+u3QLbtz7dJIqV9UJzqQKBgQCL8Xo9LA0Dm7ZsdDDI -I93YsjCELvBsonymmD1MTpk7uIA+qH8LAih+Vhonc17NtpXuas8eqc8ntuNLAgON -Tylvk32uaRqquHWl6MT7bwaaK7pD8KuOIUJdl5SEc+DDUcB2A2XLg7Yv08Dus8A7 -6J5oH8YJ3hqmVGZzbOo75IFerg== ------END PRIVATE KEY----- diff --git a/tests/functional/pos/fixtures/https_server/conf/ssl/cert.pem b/tests/functional/pos/fixtures/https_server/conf/ssl/cert.pem deleted file mode 100755 index 1418c1bb..00000000 --- a/tests/functional/pos/fixtures/https_server/conf/ssl/cert.pem +++ /dev/null @@ -1,19 +0,0 @@ ------BEGIN CERTIFICATE----- -MIIDATCCAemgAwIBAgIJAJRKy2TWwZqTMA0GCSqGSIb3DQEBCwUAMBcxFTATBgNV -BAMMDGh0dHBzX3NlcnZlcjAeFw0xNzA4MTQxNDQ1MTFaFw0yMDA2MDMxNDQ1MTFa -MBcxFTATBgNVBAMMDGh0dHBzX3NlcnZlcjCCASIwDQYJKoZIhvcNAQEBBQADggEP -ADCCAQoCggEBAKGEGI6iKkxFFxwTnBoqsYHTX9C1Gl8gankqYl+eJ/Tl1HTnBaiM -c7oS1ptnapte1xK7JDS9E2ltlRQ7NyuIU4yTx26MPO4oyR6Cyr8sTg/I39Hklm1/ -fg3IYbu/vQrpSrtFXg9TOT7Oi6bUxwxQGTVCUY/8T1GwOVI+/1c5uVSmJw8O7CEL -A9/jOLqOmiuBeMMn+JS6H5t7HG/jRHt/Z0JaSIdiI2Is0tPBd1iRpYFxzcehOr0U -RlmFgOFZGNfT9O4g2YyvmVtS+vpRhEAE5LhV5TIelv97s4f1ZdDHNHvou5AV9ANK -CzQbFfDUobnxqr4/SeLYTvsLgi6h1DyJP6MCAwEAAaNQME4wHQYDVR0OBBYEFAfu -RxroDak/yro7MbRfDogKVDmBMB8GA1UdIwQYMBaAFAfuRxroDak/yro7MbRfDogK -VDmBMAwGA1UdEwQFMAMBAf8wDQYJKoZIhvcNAQELBQADggEBAF5M/Gz6JDC1HoSm -6HFLBB9ul9TQQI3RhohwreCYyeZ866WrvqZfle+lxcgVburYCSyi5paFpvNK3DH2 -J0A2fDAMekZGcaJ7O5Zx0evTCwXoxDOhS+xO5IlGTXWCEKLeLkU27WJiLC9cTbFr -kfjL14IMnsioRzUz4a+aX5JllqnEccCDlHjSk1w5YvOvt6GC6Bvenouja2apPes/ -oJJpFwZVO0epqOQo1ndRGbt5NLv6YgZlvdFXWoKNKohzdfDV/RbW9BrbpyKSxFTm -usrmVcZTQpSf69zbnEVO8N3N6c1zNdETPON1ZGLW1O1MXWkQDZniH6LduXN/Oob7 -vYqvXlw= ------END CERTIFICATE----- diff --git a/tests/functional/pos/fixtures/https_server/records/PoS(LATTICE 2013)001.html b/tests/functional/pos/fixtures/https_server/records/PoS(LATTICE 2013)001.html deleted file mode 100644 index 64ad7a6f..00000000 --- a/tests/functional/pos/fixtures/https_server/records/PoS(LATTICE 2013)001.html +++ /dev/null @@ -1,58 +0,0 @@ - - - - PoS(LATTICE 2013)001 - - - - - - - - - - - Main Image - - - -

PoS(LATTICE 2013)001

- - - -
-

Heavy Flavour Physics Review

-

A. El-Khadra

-

in 31st International Symposium on Lattice Field Theory LATTICE 2013

-

Contribution: pdf

-
- - - - - - - - - diff --git a/tests/functional/pos/fixtures/oai_harvested/pos_record.xml b/tests/functional/pos/fixtures/oai_harvested/pos_record.xml deleted file mode 100644 index f65dfb9e..00000000 --- a/tests/functional/pos/fixtures/oai_harvested/pos_record.xml +++ /dev/null @@ -1,33 +0,0 @@ - - -2015-01-29T13:44:13Z - -https://pos.sissa.it/cgi-bin/oai/oai-script-spires-extended.cgi - - - -
- oai:pos.sissa.it:LATTICE 2013/001 - 2014-04-28 - conference:LATTICE 2013 - group:9 -
- - - Heavy Flavour Physics Review - Aida El-KhadraINFN and Università di Firenze - M. T. MacDonaldU of PecsLattice Field Theory - 31st International Symposium on Lattice Field Theory LATTICE 2013; Plenary sessions - Sissa Medialab - 2014-03-19T21:09:30Z - Text - application/pdf - PoS(LATTICE 2013)001 - en - LATTICE 2013 (31st International Symposium on Lattice Field Theory LATTICE 2013) isPartOf - Creative Commons Attribution-NonCommercial-ShareAlike - - -
-
-
diff --git a/tests/functional/pos/fixtures/pos_conference_proceedings_records.json b/tests/functional/pos/fixtures/pos_conference_proceedings_records.json deleted file mode 100644 index cd937f94..00000000 --- a/tests/functional/pos/fixtures/pos_conference_proceedings_records.json +++ /dev/null @@ -1,101 +0,0 @@ -[ - { - "_collections": [ "Literature" ], - "curated": false, - "publication_info": [ - { - "journal_volume": "LATTICE 2013", - "journal_title": "PoS" - } - ], - "document_type": [ - "proceedings" - ], - "titles": [ - { - "source": "pos", - "title": "Proceedings, 31st International Symposium on Lattice Field Theory LATTICE 2013", - "subtitle": "29 July \u2013 3 August, 2013 Mainz, Germany" - } - ], - "acquisition_source": { - "source": "pos", - "method": "hepcrawl", - "submission_number": "5652c7f6190f11e79e8000224dabeaad", - "datetime": "2017-04-03T10:26:40.365216" - } - }, - { - "_collections": [ "Literature" ], - "curated": false, - "acquisition_source": { - "source": "pos", - "method": "hepcrawl", - "submission_number": "5652c7f6190f11e79e8000224dabeaad", - "datetime": "2017-04-03T10:26:40.365216" - }, - "license": [ - { - "license": "Creative Commons Attribution-NonCommercial-ShareAlike" - } - ], - "titles": [ - { - "source": "Sissa Medialab", - "title": "Heavy Flavour Physics Review" - } - ], - "documents": [ - { - "fulltext": true, - "hidden": true, - "url": "https://http-server.local/187/001/pdf", - "original_url": "https://http-server.local/187/001/pdf", - "key": "pdf", - "source": "pos" - } - ], - "urls": [ - { - "value": "https://http-server.local/PoS(LATTICE%202013)001.html" - } - ], - "authors": [ - { - "raw_affiliations": [ - { - "source": "pos", - "value": "INFN and Universit\u00e0 di Firenze" - } - ], - "full_name": "El-Khadra, Aida" - }, - { - "raw_affiliations": [ - { - "source": "pos", - "value": "U of Pecs" - } - ], - "full_name": "MacDonald, M.T." - } - ], - "publication_info": [ - { - "journal_volume": "LATTICE 2013", - "year": 2014, - "artid": "001", - "journal_title": "PoS" - } - ], - "document_type": [ - "conference paper" - ], - "imprints": [ - { - "date": "2014-03-19" - } - ], - "citeable": true - } -] diff --git a/tests/functional/pos/fixtures/pos_expected.json b/tests/functional/pos/fixtures/pos_expected.json new file mode 100644 index 00000000..41197ee6 --- /dev/null +++ b/tests/functional/pos/fixtures/pos_expected.json @@ -0,0 +1,121 @@ +[ + { + "_collections": [ + "Literature" + ], + "acquisition_source": { + "datetime": "2019-02-26T15:22:48.594585", + "method": "hepcrawl", + "source": "PoS", + "submission_number": "5af38ee239da11e9bfe60242ac120008" + }, + "authors": [ + { + "full_name": "Guiducci, Luigi" + } + ], + "citeable": true, + "curated": false, + "document_type": [ + "conference paper" + ], + "documents": [ + { + "fulltext": true, + "hidden": true, + "key": "pdf", + "original_url": "https://pos.sissa.it/186/001/pdf", + "source": "PoS", + "url": "https://pos.sissa.it/186/001/pdf" + } + ], + "imprints": [ + { + "date": "2013-07-19" + } + ], + "license": [ + { + "license": "Creative Commons Attribution-NonCommercial-ShareAlike" + } + ], + "publication_info": [ + { + "journal_title": "PoS", + "journal_volume": "IHEP-LHC", + "year": 2013 + } + ], + "titles": [ + { + "source": "Sissa Medialab", + "title": "Status of the CMS detector and upgrade plans" + } + ], + "urls": [ + { + "value": "https://pos.sissa.it/contribution?id=PoS%28IHEP-LHC%29001" + } + ] + }, + { + "_collections": [ + "Literature" + ], + "acquisition_source": { + "datetime": "2019-02-26T15:24:15.858700", + "method": "hepcrawl", + "source": "PoS", + "submission_number": "8da46cf839da11e9bfe60242ac120008" + }, + "authors": [ + { + "full_name": "KRYSHEN, Evgeny" + } + ], + "citeable": true, + "curated": false, + "document_type": [ + "conference paper" + ], + "documents": [ + { + "fulltext": true, + "hidden": true, + "key": "pdf", + "original_url": "https://pos.sissa.it/186/002/pdf", + "source": "PoS", + "url": "https://pos.sissa.it/186/002/pdf" + } + ], + "imprints": [ + { + "date": "2013-05-13" + } + ], + "license": [ + { + "license": "Creative Commons Attribution-NonCommercial-ShareAlike" + } + ], + "publication_info": [ + { + "artid": "002", + "journal_title": "PoS", + "journal_volume": "IHEP-LHC", + "year": 2013 + } + ], + "titles": [ + { + "source": "Sissa Medialab", + "title": "ALICE status and plans" + } + ], + "urls": [ + { + "value": "https://pos.sissa.it/contribution?id=PoS%28IHEP-LHC%29002" + } + ] + } +] \ No newline at end of file diff --git a/tests/functional/pos/fixtures/pos_single_expected.json b/tests/functional/pos/fixtures/pos_single_expected.json new file mode 100644 index 00000000..4f3c943c --- /dev/null +++ b/tests/functional/pos/fixtures/pos_single_expected.json @@ -0,0 +1,62 @@ +[ + { + "_collections": [ + "Literature" + ], + "acquisition_source": { + "datetime": "2019-02-27T13:40:15.883925", + "method": "hepcrawl", + "source": "PoS", + "submission_number": "308823143a9511e9bfe60242ac120008" + }, + "authors": [ + { + "full_name": "SOLOVYANOV, Oleg" + } + ], + "citeable": true, + "curated": false, + "document_type": [ + "conference paper" + ], + "documents": [ + { + "fulltext": true, + "hidden": true, + "key": "pdf", + "original_url": "https://pos.sissa.it/186/005/pdf", + "source": "PoS", + "url": "https://pos.sissa.it/186/005/pdf" + } + ], + "imprints": [ + { + "date": "2013-06-24" + } + ], + "license": [ + { + "license": "Creative Commons Attribution-NonCommercial-ShareAlike" + } + ], + "publication_info": [ + { + "artid": "005", + "journal_title": "PoS", + "journal_volume": "IHEP-LHC", + "year": 2013 + } + ], + "titles": [ + { + "source": "Sissa Medialab", + "title": "Status of the ATLAS detector" + } + ], + "urls": [ + { + "value": "https://pos.sissa.it/contribution?id=PoS%28IHEP-LHC%29005" + } + ] + } +] \ No newline at end of file diff --git a/tests/functional/pos/test_pos.py b/tests/functional/pos/test_pos.py index 50d68af8..311a32d2 100644 --- a/tests/functional/pos/test_pos.py +++ b/tests/functional/pos/test_pos.py @@ -21,7 +21,7 @@ clean_dir, ) from hepcrawl.testlib.tasks import app as celery_app -from hepcrawl.testlib.utils import get_crawler_instance +from hepcrawl.testlib.utils import get_crawler_instance, deep_sort @pytest.fixture(scope='function', autouse=True) @@ -43,48 +43,59 @@ def override_generated_fields(record): def get_configuration(): - package_location = get_test_suite_path( - 'pos', - 'fixtures', - 'oai_harvested', - 'pos_record.xml', - test_suite='functional', - ) + return { + 'CRAWLER_HOST_URL': 'http://scrapyd:6800', + 'CRAWLER_PROJECT': 'hepcrawl', + 'CRAWLER_ARGUMENTS': { + 'from_date': '2012-02-02', + 'sets': 'conference:IHEP-LHC', + 'url': 'http://pos-http-server.local/oai', + } + } + +def get_configuration_single(): return { 'CRAWLER_HOST_URL': 'http://scrapyd:6800', 'CRAWLER_PROJECT': 'hepcrawl', 'CRAWLER_ARGUMENTS': { - 'source_file': 'file://' + package_location, - 'base_conference_paper_url': ( - 'https://http-server.local/contribution?id=' - ), - 'base_proceedings_url': ( - 'https://http-server.local/cgi-bin/reader/conf.cgi?confid=' - ), + 'identifier': 'PoS(IHEP-LHC)005', + 'url': 'http://pos-http-server.local/oai', } } @pytest.mark.parametrize( - 'expected_results, config', + 'expected_results, config, spider', [ ( expected_json_results_from_file( 'pos', 'fixtures', - 'pos_conference_proceedings_records.json', + 'pos_expected.json', ), get_configuration(), + 'PoS', + ), + ( + expected_json_results_from_file( + 'pos', + 'fixtures', + 'pos_single_expected.json', + ), + get_configuration_single(), + 'PoS_single', ), ], ids=[ 'smoke', + 'smoke_single', ] ) def test_pos_conference_paper_record_and_proceedings_record( expected_results, config, + spider, ): crawler = get_crawler_instance(config['CRAWLER_HOST_URL']) @@ -92,10 +103,10 @@ def test_pos_conference_paper_record_and_proceedings_record( app=celery_app, monitor_timeout=5, monitor_iter_limit=100, - events_limit=2, + events_limit=1, crawler_instance=crawler, project=config['CRAWLER_PROJECT'], - spider='pos', + spider=spider, settings={}, **config['CRAWLER_ARGUMENTS'] ) @@ -112,22 +123,8 @@ def test_pos_conference_paper_record_and_proceedings_record( override_generated_fields(expected) for expected in expected_results ] - gotten_results = sorted( - gotten_results, - key=lambda x: x['document_type'] - ) - expected_results = sorted( - expected_results, - key=lambda x: x['document_type'] - ) + gotten_results = deep_sort(gotten_results) + expected_results = deep_sort(expected_results) assert gotten_results == expected_results assert not crawl_result['errors'] - - -# TODO create test that receives conference paper record AND proceedings -# record. 'Crawl-once' plug-in needed. - - -# TODO create test that receives proceedings record ONLY. -# 'Crawl-once' plug-in needed. diff --git a/tests/unit/responses/pos/sample_pos_record.xml b/tests/unit/responses/pos/sample_pos_record.xml index f65dfb9e..359acfe4 100644 --- a/tests/unit/responses/pos/sample_pos_record.xml +++ b/tests/unit/responses/pos/sample_pos_record.xml @@ -13,20 +13,20 @@ https://pos.sissa.it/cgi-bin/oai/oai-script-spires-extended.cgi group:9 - - Heavy Flavour Physics Review - Aida El-KhadraINFN and Università di Firenze - M. T. MacDonaldU of PecsLattice Field Theory - 31st International Symposium on Lattice Field Theory LATTICE 2013; Plenary sessions - Sissa Medialab - 2014-03-19T21:09:30Z - Text - application/pdf - PoS(LATTICE 2013)001 - en - LATTICE 2013 (31st International Symposium on Lattice Field Theory LATTICE 2013) isPartOf - Creative Commons Attribution-NonCommercial-ShareAlike - + + Heavy Flavour Physics Review + Aida El-KhadraINFN and Università di Firenze + M. T. MacDonaldU of PecsLattice Field Theory + 31st International Symposium on Lattice Field Theory LATTICE 2013; Plenary sessions + Sissa Medialab + 2014-03-19T21:09:30Z + Text + application/pdf + PoS(LATTICE 2013)001 + en + LATTICE 2013 (31st International Symposium on Lattice Field Theory LATTICE 2013) isPartOf + Creative Commons Attribution-NonCommercial-ShareAlike +