# -*- coding: utf-8 -*-
#
# This file is part of hepcrawl.
# Copyright (C) 2018 CERN.
#
# hepcrawl is a free software; you can redistribute it and/or modify it
# under the terms of the Revised BSD License; see LICENSE file for
# more details.

"""Parser for NLM data format"""

from __future__ import absolute_import, division, print_function

from itertools import chain

import six

from inspire_schemas.api import LiteratureBuilder
from inspire_utils.date import PartialDate
from inspire_utils.helpers import remove_tags
from inspire_utils.name import ParsedName

from ..utils import get_node


NLM_OBJECT_TYPE_TO_HEP_MAP = {
    'Erratum': 'erratum',
    'Reprint': 'reprint',
    'Republished': 'reprint',
    'Update': 'addendum',
    'Dataset': 'data',
}
"""Mapping from Object/@Type to HEP material.

See: https://www.ncbi.nlm.nih.gov/books/NBK3828/#publisherhelp.Object_O
"""


class NLMParser(object):
    """Parser for the NLM format.

    It can be used directly by invoking the :func:`NLMParser.parse` method,
    or be subclassed to customize its behavior.

    Every field is exposed as a property that queries ``self.root`` with
    XPath each time it is accessed; nothing is cached.

    Args:
        nlm_record (Union[string, scrapy.selector.Selector]): the record in NLM
            format to parse.
        source (Optional[string]): if provided, sets the ``source`` everywhere
            in the record. Otherwise the source is extracted from the metadata.
    """
    def __init__(self, nlm_record, source=None):
        self.root = self.get_root_node(nlm_record)
        if not source:
            # Fall back to the publisher name found in the record itself.
            source = self.publisher
        self.builder = LiteratureBuilder(source)

    def parse(self):
        """Extract an NLM record into an Inspire HEP record.

        Returns:
            dict: the same record in the Inspire Literature schema.
        """
        self.builder.add_abstract(self.abstract)
        self.builder.add_title(self.title)
        self.builder.add_copyright(**self.copyright)
        self.builder.add_document_type(self.document_type)
        for author in self.authors:
            self.builder.add_author(author)
        self.builder.add_publication_info(**self.publication_info)
        self.builder.add_publication_type(self.publication_type)
        for collab in self.collaborations:
            self.builder.add_collaboration(collab)
        for doi in self.dois:
            self.builder.add_doi(**doi)
        for keyword in self.keywords:
            self.builder.add_keyword(keyword)

        # Bind once: each property access re-runs the XPath queries.
        print_date = self.print_publication_date
        if print_date:
            self.builder.add_imprint_date(print_date.dumps())

        return self.builder.record

    @classmethod
    def bulk_parse(cls, nlm_records, source=None):
        """Parse a whole ArticleSet.

        Args:
            nlm_records (Union[string, scrapy.selector.Selector]): records
            source (Optional[string]): source passed to `__init__`

        Returns:
            List[dict]: list of HEP records, each corresponding to an Article
                in the provided ArticleSet
        """
        root = cls.get_root_node(nlm_records)
        # Each Article is serialized back to a string and parsed on its own.
        articles = root.xpath('/ArticleSet/Article').extract()
        return [cls(article, source=source).parse() for article in articles]

    @property
    def abstract(self):
        """Abstract text with markup stripped, or ``None`` if there is none.

        ``sup``/``sub`` tags and whole ``math`` trees are kept; whitespace is
        collapsed with :func:`NLMParser.normalize_space`.
        """
        abstract_node = self.root.xpath('./Abstract')

        if not abstract_node:
            return None

        abstract = self.normalize_space(
            remove_tags(
                abstract_node[0],
                allowed_tags=['sup', 'sub'],
                allowed_trees=['math'],
            )
        )
        return abstract

    @property
    def title(self):
        """First text node of ``ArticleTitle``, or ``None``."""
        return self.root.xpath('./ArticleTitle/text()').extract_first()

    @property
    def copyright(self):
        """Keyword arguments for ``LiteratureBuilder.add_copyright``."""
        return {
            'material': self.material,
            'statement': self.copyright_statement,
        }

    @property
    def copyright_statement(self):
        """Whitespace-normalized content of ``CopyrightInformation``."""
        return self.root.xpath(
            'normalize-space(./CopyrightInformation)'
        ).extract_first()

    @property
    def document_type(self):
        """Return an applicable inspire document_type.

        Only the first ``PublicationType`` element is considered; anything
        that is not recognised as proceedings or report is an ``article``.

        For list of NLM PublicationTypes see:
        www.ncbi.nlm.nih.gov/books/NBK3828/#publisherhelp.PublicationType_O
        """
        pub_type = self.root.xpath(
            './PublicationType/text()'
        ).extract_first(default='')

        if 'Conference' in pub_type or pub_type == 'Congresses':
            return 'proceedings'
        if 'Report' in pub_type:
            return 'report'

        return 'article'

    @property
    def publication_type(self):
        """Return an applicable inspire publication_type.

        Returns ``None`` when the first ``PublicationType`` is neither
        ``Lectures`` nor ``Review``.

        For list of NLM PublicationTypes see:
        www.ncbi.nlm.nih.gov/books/NBK3828/#publisherhelp.PublicationType_O
        """
        pub_type = self.root.xpath('./PublicationType/text()').extract_first()

        if pub_type == 'Lectures':
            return 'lectures'
        if pub_type == 'Review':
            return 'review'

        return None

    @property
    def authors(self):
        """Authors listed directly plus members of listed collaborations.

        Individual members are taken from every ``GroupList/Group`` whose
        ``GroupName`` equals a ``CollectiveName`` in the author list. Nodes
        for which :func:`NLMParser.get_author` yields ``None`` are dropped.
        """
        authors = self.root.xpath('./AuthorList/Author')
        authors_in_collaborations = self.root.xpath(
            './GroupList/Group'
            '[GroupName/text()=../../AuthorList/Author/CollectiveName/text()]'
            '/IndividualName'
        )
        # Build each author exactly once, then filter out the empty ones.
        parsed_authors = (
            self.get_author(author_node)
            for author_node in chain(authors, authors_in_collaborations)
        )
        return [author for author in parsed_authors if author is not None]

    @property
    def publication_info(self):
        """Keyword arguments for ``LiteratureBuilder.add_publication_info``.

        The year comes from the print publication date, falling back to the
        online publication date.
        """
        pub_date = self.print_publication_date or self.online_publication_date

        publication_info = {
            'journal_title': self.journal_title,
            'journal_issue': self.journal_issue,
            'journal_volume': self.journal_volume,
            'material': self.material,
            'page_start': self.page_start,
            'page_end': self.page_end,
            'year': pub_date.year if pub_date else None,
        }

        return publication_info

    @property
    def journal_title(self):
        """Text of ``Journal/JournalTitle``, or ``None``."""
        return self.root.xpath('./Journal/JournalTitle/text()').extract_first()

    @property
    def journal_issue(self):
        """Text of ``Journal/Issue``, or ``None``."""
        return self.root.xpath('./Journal/Issue/text()').extract_first()

    @property
    def journal_volume(self):
        """Text of ``Journal/Volume``, or ``None``."""
        return self.root.xpath('./Journal/Volume/text()').extract_first()

    @property
    def material(self):
        """HEP material of the record, ``'publication'`` by default.

        The first ``Object/@Type`` present in
        ``NLM_OBJECT_TYPE_TO_HEP_MAP`` wins; otherwise a first
        ``PublicationType`` of ``Published Erratum`` maps to ``erratum``.
        """
        # See: www.ncbi.nlm.nih.gov/books/NBK3828/#publisherhelp.Object_O
        # Objects are searched at any depth and all of them are inspected:
        # the ``keywords`` property reads them from ``ObjectList``, where a
        # non-material object (e.g. a keyword) may well come first.
        for object_type in self.root.xpath('.//Object/@Type').extract():
            if object_type in NLM_OBJECT_TYPE_TO_HEP_MAP:
                return NLM_OBJECT_TYPE_TO_HEP_MAP[object_type]

        pub_type = self.root.xpath('./PublicationType/text()').extract_first()
        # See: www.ncbi.nlm.nih.gov/books/NBK3828/#publisherhelp.PublicationType_O
        if pub_type == 'Published Erratum':
            return 'erratum'

        return 'publication'

    @property
    def page_start(self):
        """Text of ``FirstPage``, or ``None``."""
        return self.root.xpath('./FirstPage/text()').extract_first()

    @property
    def page_end(self):
        """Text of ``LastPage``, or ``None``."""
        return self.root.xpath('./LastPage/text()').extract_first()

    @property
    def collaborations(self):
        """All ``Author/CollectiveName`` texts found anywhere in the record."""
        return self.root.xpath('.//Author/CollectiveName/text()').extract()

    @property
    def dois(self):
        """List of keyword arguments for ``LiteratureBuilder.add_doi``.

        DOIs are read from ``ArticleIdList``; ``ELocationID`` is only used
        when the former yields nothing.
        """
        dois = self.root.xpath(
            './/ArticleIdList/ArticleId[@IdType="doi"]/text()'
        ).extract()

        if not dois:
            dois = self.root.xpath(
                './/ELocationID[@EIdType="doi"]/text()'
            ).extract()

        return [{'doi': value, 'material': self.material} for value in dois]

    @property
    def keywords(self):
        """Values of all ``keyword`` objects in ``ObjectList``."""
        return self.root.xpath(
            './ObjectList/Object[@Type="keyword"]/Param[@Name="value"]/text()'
        ).extract()

    @property
    def print_publication_date(self):
        """Date of the print publication.

        PubDate tags may appear in root of the Article or as part of
        article's History. A ``PubDate`` without ``PubStatus`` is used when
        no ``ppublish`` one exists.
        """
        pub_date = self.root.xpath('.//PubDate[@PubStatus="ppublish"]')
        pub_date_no_tag = self.root.xpath('.//PubDate[not(@PubStatus)]')
        return self.partial_date_from_date_node(pub_date or pub_date_no_tag)

    @property
    def online_publication_date(self):
        """Date of the online-only publication.

        PubDate tags may appear in root of the Article or as part of
        article's History.
        """
        pub_date = self.root.xpath('.//PubDate[@PubStatus="epublish"]')
        return self.partial_date_from_date_node(pub_date)

    @property
    def publisher(self):
        """Text of ``Journal/PublisherName``, or ``None``."""
        return self.root.xpath(
            './Journal/PublisherName/text()'
        ).extract_first()

    @staticmethod
    def get_root_node(record):
        """Get a selector on the root node of the record.

        This can be overridden in case some preprocessing needs to be done on
        the XML.

        Args:
            record(Union[str, scrapy.selector.Selector]):
                the record in NLM format.

        Returns:
            scrapy.selector.Selector: a selector built from ``record`` when
                it is a string, otherwise ``record`` itself, unchanged.
        """
        if isinstance(record, six.string_types):
            root = get_node(record)
        else:
            root = record

        return root

    def get_author(self, author_node):
        """Get HEP conforming author information

        Args:
            author_node(scrapy.selector.Selector): node

        Returns:
            dict: extracted author information, as produced by
                ``LiteratureBuilder.make_author``.
                NOTE(review): the ``authors`` property filters out ``None``
                results, so this presumably returns ``None`` for nodes with
                no usable data - confirm against ``make_author``.
        """
        first = author_node.xpath('./FirstName/text()').extract_first()
        middle = author_node.xpath('./MiddleName/text()').extract_first()
        last = author_node.xpath('./LastName/text()').extract_first()
        suffix = author_node.xpath('./Suffix/text()').extract_first()
        full_name = ParsedName.from_parts(first, last, middle, suffix).dumps()

        affiliations = author_node.xpath('.//Affiliation/text()').extract()
        affiliations = [self.normalize_space(aff) for aff in affiliations]
        ids = author_node.xpath('./Identifier/text()').extract()

        return self.builder.make_author(
            full_name,
            raw_affiliations=affiliations,
            # The identifier schema is not known here, hence ``None``.
            ids=[(None, id_) for id_ in ids],
        )

    @staticmethod
    def partial_date_from_date_node(node):
        """Parse an XML date node into PartialDate, if possible.

        Args:
            node (scrapy.selector.Selector): an XML node to parse

        Returns:
            Union[PartialDate, None]: a PartialDate, or None if the node has
                no ``Year`` or its parts could not be parsed
        """
        try:
            day = node.xpath('./Day/text()').extract_first()
            month = node.xpath('./Month/text()').extract_first()
            year = node.xpath('./Year/text()').extract_first()
            if year is None:
                # No date node matched (or it has no Year): there is nothing
                # to build a date from, so do not rely on ``from_parts`` to
                # reject the missing year with a ``ValueError``.
                return None
            return PartialDate.from_parts(year, month, day)
        except ValueError:
            return None

    @staticmethod
    def normalize_space(text):
        """XML normalize space.

        Removes leading and trailing whitespace,
        replaces strings of whitespace with single space.

        Args:
            text (string): input string

        Returns:
            string: normalized string
        """
        return " ".join(text.split())
+ raw_affiliations: + - value: Department of Pathology, + The Ohio State University Wexner Medical Center, Columbus + source: American Society for Clinical Pathology +- full_name: Hampel, Heather + raw_affiliations: + - value: Department of Human Genetics, + The Ohio State University Wexner Medical Center Columbus + source: American Society for Clinical Pathology +- full_name: Arnold, Christina A. + raw_affiliations: + - value: Department of Pathology, + The Ohio State University Wexner Medical Center, Columbus + source: American Society for Clinical Pathology + - value: Department of Microbiology, + The Ohio State University Wexner Medical Center, Columbus + source: American Society for Clinical Pathology +- full_name: Yearsley, Martha M. + raw_affiliations: + - value: Department of Pathology, + The Ohio State University Wexner Medical Center, Columbus + source: American Society for Clinical Pathology +- full_name: Marsh, William L. + raw_affiliations: + - value: Department of Pathology, + The Ohio State University Wexner Medical Center, Columbus + source: American Society for Clinical Pathology +- full_name: Frankel, Wendy L. 
+ raw_affiliations: + - value: Department of Pathology, + The Ohio State University Wexner Medical Center, Columbus + source: American Society for Clinical Pathology + - value: Department of Human Genetics, + The Ohio State University Wexner Medical Center, Columbus + source: American Society for Clinical Pathology +- full_name: Smith, John +- full_name: Jones, Mary +journal_title: Am J Clin Pathol +journal_issue: '3' +journal_volume: '143' +material: publication +page_end: '343' +page_start: '336' +collaborations: +- Cancer Genome Center +dois: +- doi: 10.1309/AJCP4D7RXOBHLKGJ + material: publication +keywords: +- BRAF +- MLH1 +- Immunohistochemistry +- Cost-benefit analysis +print_publication_date: 2015-03 +online_publication_date: null +publisher: American Society for Clinical Pathology \ No newline at end of file diff --git a/tests/unit/responses/iop/xml/test_standard.xml b/tests/unit/responses/iop/xml/test_standard.xml index 69fde3ce..4aa9e05a 100644 --- a/tests/unit/responses/iop/xml/test_standard.xml +++ b/tests/unit/responses/iop/xml/test_standard.xml @@ -71,12 +71,28 @@ Columbus + + Cancer Genome Center + + + + Cancer Genome Center + + John + Smith + + + Mary + Jones + + + Journal Article j143/3/336 110.1309/AJCP4D7RXOBHLKGJ + IdType="doi">10.1309/AJCP4D7RXOBHLKGJ @@ -96,7 +112,7 @@ - Somatic BRAF mutation in colon cancer essentially excludes Lynch syndrome. We compared BRAF V600E immunohistochemistry (IHC) with BRAF mutation in core, biopsy, and whole-section slides to determine whether IHC is similar and to assess the cost-benefit of IHC. + This is a sample text containing maths, f(x)1+xdx and superscript text. Resection cases (2009-2013) with absent MLH1 and PMS2 and prior BRAF mutation polymerase chain reaction results were chosen (n = 57). To mimic biopsy specimens, tissue microarrays (TMAs) were constructed. In addition, available biopsies performed prior to the resection were available in 15 cases. 
BRAF V600E IHC was performed and graded on TMAs, available biopsy specimens, and whole-section slides. Mutation status was compared with IHC, and cost-benefit analysis was performed. BRAF V600E IHC was similar in TMAs, biopsy specimens, and whole-section slides, with only four (7%) showing discordance between IHC and mutation status. Using BRAF V600E IHC in our Lynch syndrome screening algorithm, we found a 10% cost savings compared with mutational analysis. BRAF V600E IHC was concordant between TMAs, biopsy specimens, and whole-section slides, suggesting biopsy specimens are as useful as whole sections. IHC remained cost beneficial compared with mutational analysis, even though more patients needed additional molecular testing to exclude Lynch diff --git a/tests/unit/test_iop.py b/tests/unit/test_iop.py index 1e48fb8a..79fe52de 100644 --- a/tests/unit/test_iop.py +++ b/tests/unit/test_iop.py @@ -46,12 +46,6 @@ def record(): return parsed_item.record -def test_abstract(record): - """Test extracting abstract.""" - assert "abstract" in record - assert record["abstract"].startswith("Somatic BRAF mutation") - - def test_title(record): """Test extracting title.""" title = 'A Modified Lynch Syndrome Screening Algorithm in Colon Cancer: BRAF Immunohistochemistry Is Efficacious and Cost Beneficial.' 
@@ -87,7 +81,7 @@ def test_free_keywords(record): def test_dois(record): """Test extracting dois.""" assert record["dois"] - assert record["dois"][0]["value"] == '110.1309/AJCP4D7RXOBHLKGJ' + assert record["dois"][0]["value"] == '10.1309/AJCP4D7RXOBHLKGJ' def test_collections(record): @@ -117,29 +111,6 @@ def test_publication_info(record): assert record["journal_issn"][0] == journal_issn -def test_authors(record): - """Test authors.""" - authors = ['Roth, Rachel M', 'Hampel, Heather', 'Arnold, Christina A', - 'Yearsley, Martha M', 'Marsh, William L', 'Frankel, Wendy L'] - - affiliations = [ - [{'value': u'Department of Pathology, The Ohio State University Wexner Medical Center, Columbus'}], - [{'value': u'Department of Human Genetics, The Ohio State University Wexner Medical Center Columbus'}], - [{'value': u'Department of Pathology, The Ohio State University Wexner Medical Center, Columbus'}, - {'value': u'Department of Microbiology, The Ohio State University Wexner Medical Center, Columbus'}], - [{'value': u'Department of Pathology, The Ohio State University Wexner Medical Center, Columbus'}], - [{'value': u'Department of Pathology, The Ohio State University Wexner Medical Center, Columbus'}], - [{'value': u'Department of Pathology, The Ohio State University Wexner Medical Center, Columbus'}, - {'value': u'Department of Human Genetics, The Ohio State University Wexner Medical Center, Columbus'}] - ] - - assert "authors" in record - assert len(record["authors"]) == 6 - for index, (name, aff) in enumerate(zip(authors, affiliations)): - assert record["authors"][index]["full_name"] == name - assert record["authors"][index]["affiliations"] == aff - - def test_copyrights(record): """Test extracting copyright.""" copyright_holder = "American Society for Clinical Pathology" diff --git a/tests/unit/test_parsers_nlm.py b/tests/unit/test_parsers_nlm.py new file mode 100644 index 00000000..205d5c47 --- /dev/null +++ b/tests/unit/test_parsers_nlm.py @@ -0,0 +1,98 @@ +# -*- 
# -*- coding: utf-8 -*-
#
# This file is part of hepcrawl.
# Copyright (C) 2018 CERN.
#
# hepcrawl is a free software; you can redistribute it and/or modify it
# under the terms of the Revised BSD License; see LICENSE file for
# more details.

"""Unit tests for :class:`hepcrawl.parsers.nlm.NLMParser`."""

from __future__ import (
    absolute_import,
    division,
    print_function,
)

import pytest
import yaml

from inspire_schemas.utils import validate
from hepcrawl.testlib.fixtures import get_test_suite_path
from hepcrawl.parsers.nlm import NLMParser


@pytest.fixture(scope='module')
def expected():
    """A dictionary holding the parsed elements of the record."""
    path = get_test_suite_path('responses', 'iop', 'expected.yaml')
    with open(path) as f:
        # safe_load: the fixture is plain data, and ``yaml.load`` without an
        # explicit Loader is unsafe and rejected by recent PyYAML releases.
        nlm_expected_dict = yaml.safe_load(f)

    return nlm_expected_dict


@pytest.fixture(scope='module')
def xml_test_string():
    """Raw content of the NLM test record set."""
    path = get_test_suite_path('responses', 'iop', 'xml', 'test_standard.xml')
    with open(path) as f:
        return f.read()


@pytest.fixture(scope='module')
def parser(xml_test_string):
    """An NLMParser instantiated on a PubMed article."""
    root = NLMParser.get_root_node(xml_test_string)
    article = root.xpath('/ArticleSet/Article').extract_first()
    return NLMParser(article)


def test_bulk_parse(xml_test_string):
    """Every record produced by ``bulk_parse`` validates as ``hep``."""
    for record in NLMParser.bulk_parse(xml_test_string):
        assert validate(record, 'hep') is None


# Parser properties compared verbatim against ``expected.yaml``.
FIELDS_TO_CHECK = [
    'abstract',
    'title',
    'copyright_statement',
    'document_type',
    'publication_type',
    'authors',
    'journal_title',
    'journal_issue',
    'journal_volume',
    'material',
    'page_start',
    'page_end',
    'collaborations',
    'dois',
    'keywords',
    'online_publication_date',
    'publisher',
]
# Properties that need a dedicated test (their value must be dumped first).
FIELDS_TO_CHECK_SEPARATELY = [
    'print_publication_date',
]


def test_data_completeness(expected):
    """Every key of the expected data is covered by some test."""
    tested_fields = FIELDS_TO_CHECK + FIELDS_TO_CHECK_SEPARATELY
    for field in expected:
        assert field in tested_fields


@pytest.mark.parametrize(
    'field_name',
    FIELDS_TO_CHECK
)
def test_field(field_name, expected, parser):
    """The parser property equals the value stored in the expected data."""
    assert field_name in expected
    result = getattr(parser, field_name)
    # Separate name so the ``expected`` fixture is not shadowed.
    expected_value = expected[field_name]

    assert result == expected_value


def test_print_publication_date(expected, parser):
    """The dumped print publication date matches the expected string."""
    assert 'print_publication_date' in expected
    assert expected['print_publication_date'] == parser.print_publication_date.dumps()