-
Notifications
You must be signed in to change notification settings - Fork 30
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
parsers: create an NLM parser #209
base: master
Are you sure you want to change the base?
Changes from 2 commits
eec9f7a
724ce2d
7b925d0
6b96a6c
ad95e0b
d1e4cbe
2b87bfe
8cdf47a
d745f78
3106759
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,334 @@ | ||
# -*- coding: utf-8 -*- | ||
# | ||
# This file is part of hepcrawl. | ||
# Copyright (C) 2018 CERN. | ||
# | ||
# hepcrawl is a free software; you can redistribute it and/or modify it | ||
# under the terms of the Revised BSD License; see LICENSE file for | ||
# more details. | ||
|
||
"""Parser for NLM data format""" | ||
|
||
from __future__ import absolute_import, division, print_function | ||
|
||
import six | ||
|
||
from itertools import chain | ||
|
||
from inspire_schemas.api import LiteratureBuilder | ||
from inspire_utils.date import PartialDate | ||
from inspire_utils.helpers import maybe_int | ||
from inspire_utils.name import ParsedName | ||
|
||
from ..utils import get_node | ||
|
||
|
||
# Maps the ``Type`` attribute of an NLM ``<Object>`` element to the INSPIRE
# ``material`` value. Object types that are not listed here are treated as
# ``'publication'`` by :attr:`NLMParser.material`.
NLM_OBJECT_TYPE_TO_HEP_MAP = {
    'Erratum': 'erratum',
    'Reprint': 'reprint',
    # A republished article is, like a reprint, a re-issue of existing
    # material, so it shares the same INSPIRE value.
    'Republished': 'reprint',
    'Update': 'addendum',
    'Dataset': 'data',
}
|
||
|
||
class NLMParser(object):
    """Parser for the NLM format.

    It can be used directly by invoking the :func:`NLMParser.parse` method,
    or be subclassed to customize its behavior.

    Args:
        nlm_record (Union[string, scrapy.selector.Selector]): the record in NLM
            format to parse.
        source (Optional[string]): if provided, sets the ``source`` everywhere
            in the record. Otherwise the source is extracted from the metadata.
    """

    def __init__(self, nlm_record, source=None):
        self.root = self.get_root_node(nlm_record)
        if not source:
            # Fall back to the publisher name found in the record itself.
            source = self.publisher
        self.builder = LiteratureBuilder(source)

    def parse(self):
        """Extract an NLM record into an Inspire HEP record.

        Returns:
            dict: the same record in the Inspire Literature schema.
        """
        self.builder.add_abstract(self.abstract)
        self.builder.add_title(self.title)
        self.builder.add_copyright(**self.copyright)
        self.builder.add_document_type(self.document_type)
        for author in self.authors:
            self.builder.add_author(author)
        self.builder.add_publication_info(**self.publication_info)
        self.builder.add_publication_type(self.publication_type)
        for collab in self.collaborations:
            self.builder.add_collaboration(collab)
        for doi in self.dois:
            self.builder.add_doi(**doi)
        for keyword in self.keywords:
            self.builder.add_keyword(keyword)

        # The print date may be missing or unparsable, in which case the
        # property yields ``None`` and there is no imprint date to record.
        imprint_date = self.print_publication_date
        if imprint_date is not None:
            self.builder.add_imprint_date(imprint_date.dumps())

        return self.builder.record

    @classmethod
    def bulk_parse(cls, nlm_records, source=None):
        """Parse a whole ArticleSet.

        Args:
            nlm_records (Union[string, scrapy.selector.Selector]): records
            source (Optional[string]): source passed to `__init__`

        Returns:
            list: one parsed record (the result of :func:`NLMParser.parse`)
            for every ``/ArticleSet/Article`` element, in document order.
        """
        root = cls.get_root_node(nlm_records)
        nlm_records = root.xpath('/ArticleSet/Article').extract()
        return [
            cls(nlm_record, source=source).parse()
            for nlm_record in nlm_records
        ]

    @property
    def abstract(self):
        """Whitespace-normalized text of the ``Abstract`` element."""
        return self.root.xpath('normalize-space(./Abstract)').extract_first()

    @property
    def title(self):
        """Text of the ``ArticleTitle`` element, ``None`` when absent."""
        return self.root.xpath('./ArticleTitle/text()').extract_first()

    @property
    def copyright(self):
        """Keyword arguments for the builder's ``add_copyright``."""
        return {
            'material': self.material,
            'statement': self.copyright_statement,
        }

    @property
    def copyright_statement(self):
        """Whitespace-normalized text of ``CopyrightInformation``."""
        return self.root.xpath(
            'normalize-space(./CopyrightInformation)'
        ).extract_first()

    @property
    def document_type(self):
        """Return an applicable inspire document_type.

        Records without a ``PublicationType`` are treated as articles.

        For list of NLM PublicationTypes see:
        www.ncbi.nlm.nih.gov/books/NBK3828/#publisherhelp.PublicationType_O
        """
        # ``extract_first`` gives ``None`` when the element is missing; use an
        # empty string so the substring checks below stay valid.
        pub_type = (
            self.root.xpath('./PublicationType/text()').extract_first() or ''
        )

        if 'Conference' in pub_type or pub_type == 'Congresses':
            return 'proceedings'

        if 'Report' in pub_type:
            return 'report'

        return 'article'

    @property
    def publication_type(self):
        """Return an applicable inspire publication_type.

        Returns ``None`` when the NLM type is neither ``Lectures`` nor
        ``Review``.

        For list of NLM PublicationTypes see:
        www.ncbi.nlm.nih.gov/books/NBK3828/#publisherhelp.PublicationType_O
        """
        pub_type = self.root.xpath('./PublicationType/text()').extract_first()

        if pub_type == 'Lectures':
            return 'lectures'
        if pub_type == 'Review':
            return 'review'

    @property
    def authors(self):
        """Authors listed directly plus the members of named collaborations.

        Returns:
            list: the non-``None`` results of :func:`NLMParser.get_author`
            for every matching node.
        """
        authors = self.root.xpath('./AuthorList/Author')
        # Individual members of every Group whose GroupName matches the
        # CollectiveName of one of the entries in the AuthorList.
        authors_in_collaborations = self.root.xpath(
            './GroupList/Group'
            '[GroupName/text()=../../AuthorList/Author/CollectiveName/text()]'
            '/IndividualName'
        )
        # Build each author only once instead of once for the filter and
        # once more for the value.
        parsed_authors = (
            self.get_author(author)
            for author in chain(authors, authors_in_collaborations)
        )
        return [author for author in parsed_authors if author is not None]

    @property
    def publication_info(self):
        """Keyword arguments for the builder's ``add_publication_info``.

        The year comes from the print publication date, falling back to the
        online one; it is ``None`` when neither could be parsed.
        """
        pub_date = self.print_publication_date or self.online_publication_date

        publication_info = {
            'journal_title': self.journal_title,
            'journal_issue': self.journal_issue,
            'journal_volume': self.journal_volume,
            'material': self.material,
            'page_start': self.page_start,
            'page_end': self.page_end,
            'year': pub_date.year if pub_date is not None else None,
        }

        return publication_info

    @property
    def journal_title(self):
        """Text of ``Journal/JournalTitle``, ``None`` when absent."""
        return self.root.xpath('./Journal/JournalTitle/text()').extract_first()

    @property
    def journal_issue(self):
        """Text of ``Journal/Issue``, ``None`` when absent."""
        return self.root.xpath('./Journal/Issue/text()').extract_first()

    @property
    def journal_volume(self):
        """Text of ``Journal/Volume``, ``None`` when absent."""
        return self.root.xpath('./Journal/Volume/text()').extract_first()

    @property
    def material(self):
        """Inspire material derived from the first ``Object/@Type``.

        Types missing from ``NLM_OBJECT_TYPE_TO_HEP_MAP`` (including no
        ``Object`` at all) yield ``'publication'``.
        """
        object_type = self.root.xpath('Object/@Type').extract_first()

        if object_type in NLM_OBJECT_TYPE_TO_HEP_MAP:
            return NLM_OBJECT_TYPE_TO_HEP_MAP[object_type]

        return 'publication'

    @property
    def page_start(self):
        """Text of ``FirstPage``, ``None`` when absent."""
        return self.root.xpath('./FirstPage/text()').extract_first()

    @property
    def page_end(self):
        """Text of ``LastPage``, ``None`` when absent."""
        return self.root.xpath('./LastPage/text()').extract_first()

    @property
    def collaborations(self):
        """All ``Author/CollectiveName`` texts found anywhere in the record."""
        return self.root.xpath('.//Author/CollectiveName/text()').extract()

    @property
    def dois(self):
        """Keyword arguments for the builder's ``add_doi``, one per DOI.

        DOIs are read from ``ArticleIdList``; only when none are found there
        the ``ELocationID`` elements are used instead.
        """
        dois = self.root.xpath(
            './/ArticleIdList/ArticleId[@IdType="doi"]/text()'
        ).extract()

        if not dois:
            dois = self.root.xpath(
                './/ELocationID[@EIdType="doi"]/text()'
            ).extract()

        # ``material`` runs an XPath query on every access and does not depend
        # on the DOI, so look it up once.
        material = self.material
        return [{'doi': value, 'material': material} for value in dois]

    @property
    def keywords(self):
        """Values of the ``keyword`` objects in ``ObjectList``."""
        return self.root.xpath(
            './ObjectList/Object[@Type="keyword"]/Param[@Name="value"]/text()'
        ).extract()

    @property
    def print_publication_date(self):
        """Date of the print publication.

        PubDate tags may appear in root of the Article or as part of
        article's History. A ``PubDate`` without ``PubStatus`` is used when
        there is no ``ppublish`` one.
        """
        pub_date = self.root.xpath('.//PubDate[@PubStatus="ppublish"]')
        pub_date_no_tag = self.root.xpath('.//PubDate[not(@PubStatus)]')
        return self.partial_date_from_date_node(pub_date or pub_date_no_tag)

    @property
    def online_publication_date(self):
        """Date of the online-only publication.

        PubDate tags may appear in root of the Article or as part of
        article's History.
        """
        pub_date = self.root.xpath('.//PubDate[@PubStatus="epublish"]')
        return self.partial_date_from_date_node(pub_date)

    @property
    def publisher(self):
        """Text of ``Journal/PublisherName``, ``None`` when absent."""
        return self.root.xpath(
            './Journal/PublisherName/text()'
        ).extract_first()

    @staticmethod
    def get_root_node(record):
        """Get a selector for the record.

        This can be overridden in case some preprocessing needs to be done on
        the XML.

        Args:
            record(Union[str, scrapy.selector.Selector]):
                the record in NLM format.

        Returns:
            scrapy.selector.Selector: the result of ``get_node`` when
            ``record`` is a string, otherwise ``record`` itself, unchanged.
        """
        if isinstance(record, six.string_types):
            return get_node(record)

        return record

    def get_author(self, author_node):
        """Get HEP conforming author information

        Args:
            author_node(scrapy.selector.Selector): <Author> node

        Returns:
            dict: extracted author information, as produced by the builder's
            ``make_author``.
        """
        first = author_node.xpath('./FirstName/text()').extract_first()
        middle = author_node.xpath('./MiddleName/text()').extract_first()
        last = author_node.xpath('./LastName/text()').extract_first()
        suffix = author_node.xpath('./Suffix/text()').extract_first()
        full_name = ParsedName.from_parts(first, last, middle, suffix).dumps()

        affiliations = author_node.xpath('.//Affiliation/text()').extract()
        affiliations = [self.normalize_space(aff) for aff in affiliations]
        ids = author_node.xpath('./Identifier/text()').extract()

        return self.builder.make_author(
            full_name,
            raw_affiliations=affiliations,
            # The identifier schema is not known at this point.
            ids=[(None, id_) for id_ in ids],
        )

    @staticmethod
    def partial_date_from_date_node(node):
        """Parse an XML date node into PartialDate, if possible.

        Args:
            node (scrapy.selector.Selector): an XML node to parse

        Returns:
            Union[PartialDate, None]: a PartialDate, or None if the node has
            no numeric year or the date parts do not form a valid date.
        """
        year = maybe_int(node.xpath('./Year/text()').extract_first())
        if year is None:
            # Happens for an empty selection (no matching PubDate) as well as
            # for a missing or non-numeric Year: there is no date to build.
            return None

        month = maybe_int(node.xpath('./Month/text()').extract_first())
        day = maybe_int(node.xpath('./Day/text()').extract_first())
        try:
            return PartialDate(year, month, day)
        except ValueError:
            return None

    @staticmethod
    def normalize_space(text):
        """XML normalize space.

        Removes leading and trailing whitespace,
        replaces strings of whitespace with single space.

        Args:
            text (string): input string

        Returns:
            string: normalized string
        """
        return " ".join(text.split())
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
'Republished': 'reprint'
also