Elsevier: doi exception handling, filepath logging
ErnestaP committed Dec 6, 2023
1 parent 804b21d commit 13e99eb
Showing 2 changed files with 80 additions and 58 deletions.
137 changes: 79 additions & 58 deletions hepcrawl/extractors/s3_elsevier_parser.py
@@ -55,62 +55,78 @@ def parse_node(self, meta, node):
record = HEPLoader(item=HEPRecord(), selector=node)

article_type = node.xpath('./@docsubtype').extract()
article_type = map(lambda x: self.article_type_mapping.get(x, 'other'), article_type)
article_type = map(lambda x: self.article_type_mapping.get(
x, 'other'), article_type)
record.add_value('journal_doctype', article_type)

dois = node.xpath('./item-info/doi/text()').extract()
doi = dois[0]
record.add_value('dois', dois)

if article_type in ['correction', 'addendum']:
logger.info('Adding related_article_doi for article %s.' % dois)
record.add_xpath('related_article_doi', "//related-article[@ext-link-type='doi']/@href")

record.add_xpath('abstract', './*[self::head | self::simple-head]/abstract[1]/abstract-sec')
record.add_xpath('title', './*[self::head | self::simple-head]/title/text()')
record.add_xpath('subtitle', './*[self::head | self::simple-head]/subtitle/text()')

record.add_value('authors', self.get_authors(node, dois))
record.add_xpath('collaborations', "./*[self::head | self::simple-head]/author-group/collaboration/text/text()")

record.add_value('journal_title', meta['articles'][doi]['journal'])
record.add_value('journal_volume', meta['volume'])
record.add_xpath('journal_artid', '//item-info/aid/text()')

first_page = meta['articles'][doi].get('first-page')
last_page = meta['articles'][doi].get('last-page')
record.add_value('journal_fpage', first_page)
record.add_value('journal_lpage', last_page)

if first_page is not None and last_page is not None:
try:
page_nr = int(last_page) - int(first_page) + 1
record.add_value('page_nr', page_nr)
except ValueError as e:
logger.error('Failed to parse last_page or first_page for article %s: %s' % (dois, e))

published_date = datetime.datetime.strptime(meta['articles'][doi]['publication-date'], "%Y-%m-%dT%H:%M:%S")
record.add_value('journal_year', published_date.year)
record.add_value('date_published', published_date.strftime("%Y-%m-%d"))

record.add_xpath('copyright_holder', './item-info/copyright/text()')
record.add_xpath('copyright_year', './item-info/copyright/@year')
record.add_xpath('copyright_statement', './item-info/copyright/text()')

license = get_license(
license_url='http://creativecommons.org/licenses/by/3.0/'
)
record.add_value('license', license)

record.add_value('collections', [meta['articles'][doi]['journal']])

# local file paths
local_files = []
for filetype in meta['articles'][doi]['files']:
local_files.append({'filetype': filetype, 'path': meta['articles'][doi]['files'][filetype]})
record.add_value('local_files', local_files)

return dict(record.load_item())
try:
doi = dois[0]
record.add_value('dois', dois)

if article_type in ['correction', 'addendum']:
logger.info(
'Adding related_article_doi for article %s.' % dois)
record.add_xpath('related_article_doi',
"//related-article[@ext-link-type='doi']/@href")

record.add_xpath(
'abstract', './*[self::head | self::simple-head]/abstract[1]/abstract-sec')
record.add_xpath(
'title', './*[self::head | self::simple-head]/title/text()')
record.add_xpath(
'subtitle', './*[self::head | self::simple-head]/subtitle/text()')

record.add_value('authors', self.get_authors(node, dois))
record.add_xpath(
'collaborations', "./*[self::head | self::simple-head]/author-group/collaboration/text/text()")

record.add_value('journal_title', meta['articles'][doi]['journal'])
record.add_value('journal_volume', meta['volume'])
record.add_xpath('journal_artid', '//item-info/aid/text()')

first_page = meta['articles'][doi].get('first-page')
last_page = meta['articles'][doi].get('last-page')
record.add_value('journal_fpage', first_page)
record.add_value('journal_lpage', last_page)

if first_page is not None and last_page is not None:
try:
page_nr = int(last_page) - int(first_page) + 1
record.add_value('page_nr', page_nr)
except ValueError as e:
logger.error(
'Failed to parse last_page or first_page for article %s: %s' % (dois, e))

published_date = datetime.datetime.strptime(
meta['articles'][doi]['publication-date'], "%Y-%m-%dT%H:%M:%S")
record.add_value('journal_year', published_date.year)
record.add_value('date_published',
published_date.strftime("%Y-%m-%d"))

record.add_xpath('copyright_holder',
'./item-info/copyright/text()')
record.add_xpath('copyright_year', './item-info/copyright/@year')
record.add_xpath('copyright_statement',
'./item-info/copyright/text()')

license = get_license(
license_url='http://creativecommons.org/licenses/by/3.0/'
)
record.add_value('license', license)

record.add_value('collections', [meta['articles'][doi]['journal']])

# local file paths
local_files = []
for filetype in meta['articles'][doi]['files']:
local_files.append(
{'filetype': filetype, 'path': meta['articles'][doi]['files'][filetype]})
record.add_value('local_files', local_files)

return dict(record.load_item())
except IndexError:
logger.error("Article has no DOI")

def get_authors(self, node, dois):
"""Get the authors."""
@@ -120,7 +136,8 @@ def get_authors(self, node, dois):
for author in author_group.xpath("./author"):
surname = author.xpath("./surname/text()")
given_names = author.xpath("./given-name/text()")
affiliations = self._get_affiliations(author_group, author, dois)
affiliations = self._get_affiliations(
author_group, author, dois)
orcid = self._get_orcid(author)
emails = author.xpath("./e-address/text()")

@@ -133,7 +150,8 @@ def get_authors(self, node, dois):
if orcid:
auth_dict['orcid'] = orcid
if affiliations:
auth_dict['affiliations'] = [{"value": aff} for aff in affiliations]
auth_dict['affiliations'] = [
{"value": aff} for aff in affiliations]
if emails:
auth_dict['email'] = emails.extract_first()

@@ -159,9 +177,11 @@ def _find_affiliations_by_id(author_group, ref_ids):
"""
affiliations_by_id = []
for aff_id in ref_ids:
ce_affiliation = author_group.xpath("//affiliation[@id='" + aff_id + "']")
ce_affiliation = author_group.xpath(
"//affiliation[@id='" + aff_id + "']")
if ce_affiliation.xpath(".//affiliation"):
aff = ce_affiliation.xpath(".//*[self::organization or self::city or self::country or self::address-line]/text()")
aff = ce_affiliation.xpath(
".//*[self::organization or self::city or self::country or self::address-line]/text()")
affiliations_by_id.append(", ".join(aff.extract()))
elif ce_affiliation:
aff = ce_affiliation.xpath("./textfn/text()").extract_first()
@@ -178,7 +198,8 @@ def _get_affiliations(self, author_group, author, dois):
"""

ref_ids = author.xpath(".//@refid").extract()
group_affs = author_group.xpath(".//affiliation[not(@*)]/textfn/text()")
group_affs = author_group.xpath(
".//affiliation[not(@*)]/textfn/text()")
all_group_affs = author_group.xpath(".//affiliation/textfn/text()")

# Don't take correspondence (cor1) or deceased (fn1):
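
The substantive change in this file is the new try/except around the record construction: an article that ships without a DOI now logs an error instead of raising an uncaught IndexError on dois[0]. A minimal sketch of the same guard outside the Scrapy machinery (first_doi_or_none is a hypothetical helper, not part of hepcrawl, and the DOI below is a placeholder):

    import logging

    logger = logging.getLogger(__name__)

    def first_doi_or_none(dois):
        # dois is the list extracted by the './item-info/doi/text()' XPath;
        # some Elsevier records ship without one, so guard the positional
        # access instead of assuming dois[0] exists.
        try:
            return dois[0]
        except IndexError:
            logger.error("Article has no DOI")
            return None

    # Records with no DOI are skipped rather than crashing the crawl:
    print(first_doi_or_none(["10.1016/j.example.2023.001"]))  # placeholder DOI
    print(first_doi_or_none([]))                              # None

Note that in the commit the try block spans the whole record construction, not just the dois[0] access, so any IndexError raised inside it would be reported as a missing DOI.
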
1 change: 1 addition & 0 deletions hepcrawl/spiders/s3_elsevier_spider.py
@@ -248,6 +248,7 @@ def parse_dataset(self, target_folder, filename, zip_filepath, f):

for i in range(len(journal_data)):
for doi, data in journal_data[i]['articles'].items():
self.log("Starting to parse file: '%s'" % data['files']['xml'], logging.INFO)
with open(data['files']['xml'], 'r') as xml_file:
xml_file_content = xml_file.read()
for nodename in self.itertag:
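
The spider-side change is a single breadcrumb: log which XML file is about to be opened, so a failure inside open() or the subsequent parsing can be traced to a concrete path in the crawl logs. A minimal standalone sketch of the same pattern with the stdlib logger (parse_article_xml is a hypothetical stand-in for the spider's loop body; the spider itself uses Scrapy's self.log):

    import logging

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger("s3_elsevier")

    def parse_article_xml(xml_path):
        # Emit the path before opening the file, so any exception raised
        # below is preceded in the log by the file that triggered it.
        logger.info("Starting to parse file: '%s'", xml_path)
        with open(xml_path, "r") as xml_file:
            return xml_file.read()
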
