Elsevier: doi exception handling, filepath logging
ErnestaP committed Dec 6, 2023
1 parent 804b21d commit 13e99eb
Showing 2 changed files with 80 additions and 58 deletions.
137 changes: 79 additions & 58 deletions hepcrawl/extractors/s3_elsevier_parser.py
@@ -55,62 +55,78 @@ def parse_node(self, meta, node):
record = HEPLoader(item=HEPRecord(), selector=node)

article_type = node.xpath('./@docsubtype').extract()
article_type = map(lambda x: self.article_type_mapping.get(x, 'other'), article_type)
article_type = map(lambda x: self.article_type_mapping.get(
x, 'other'), article_type)
record.add_value('journal_doctype', article_type)

dois = node.xpath('./item-info/doi/text()').extract()
doi = dois[0]
record.add_value('dois', dois)

if article_type in ['correction', 'addendum']:
logger.info('Adding related_article_doi for article %s.' % dois)
record.add_xpath('related_article_doi', "//related-article[@ext-link-type='doi']/@href")

record.add_xpath('abstract', './*[self::head | self::simple-head]/abstract[1]/abstract-sec')
record.add_xpath('title', './*[self::head | self::simple-head]/title/text()')
record.add_xpath('subtitle', './*[self::head | self::simple-head]/subtitle/text()')

record.add_value('authors', self.get_authors(node, dois))
record.add_xpath('collaborations', "./*[self::head | self::simple-head]/author-group/collaboration/text/text()")

record.add_value('journal_title', meta['articles'][doi]['journal'])
record.add_value('journal_volume', meta['volume'])
record.add_xpath('journal_artid', '//item-info/aid/text()')

first_page = meta['articles'][doi].get('first-page')
last_page = meta['articles'][doi].get('last-page')
record.add_value('journal_fpage', first_page)
record.add_value('journal_lpage', last_page)

if first_page is not None and last_page is not None:
try:
page_nr = int(last_page) - int(first_page) + 1
record.add_value('page_nr', page_nr)
except ValueError as e:
logger.error('Failed to parse last_page or first_page for article %s: %s' % (dois, e))

published_date = datetime.datetime.strptime(meta['articles'][doi]['publication-date'], "%Y-%m-%dT%H:%M:%S")
record.add_value('journal_year', published_date.year)
record.add_value('date_published', published_date.strftime("%Y-%m-%d"))

record.add_xpath('copyright_holder', './item-info/copyright/text()')
record.add_xpath('copyright_year', './item-info/copyright/@year')
record.add_xpath('copyright_statement', './item-info/copyright/text()')

license = get_license(
license_url='http://creativecommons.org/licenses/by/3.0/'
)
record.add_value('license', license)

record.add_value('collections', [meta['articles'][doi]['journal']])

# local file paths
local_files = []
for filetype in meta['articles'][doi]['files']:
local_files.append({'filetype': filetype, 'path': meta['articles'][doi]['files'][filetype]})
record.add_value('local_files', local_files)

return dict(record.load_item())
try:
doi = dois[0]
record.add_value('dois', dois)

if article_type in ['correction', 'addendum']:
logger.info(
'Adding related_article_doi for article %s.' % dois)
record.add_xpath('related_article_doi',
"//related-article[@ext-link-type='doi']/@href")

record.add_xpath(
'abstract', './*[self::head | self::simple-head]/abstract[1]/abstract-sec')
record.add_xpath(
'title', './*[self::head | self::simple-head]/title/text()')
record.add_xpath(
'subtitle', './*[self::head | self::simple-head]/subtitle/text()')

record.add_value('authors', self.get_authors(node, dois))
record.add_xpath(
'collaborations', "./*[self::head | self::simple-head]/author-group/collaboration/text/text()")

record.add_value('journal_title', meta['articles'][doi]['journal'])
record.add_value('journal_volume', meta['volume'])
record.add_xpath('journal_artid', '//item-info/aid/text()')

first_page = meta['articles'][doi].get('first-page')
last_page = meta['articles'][doi].get('last-page')
record.add_value('journal_fpage', first_page)
record.add_value('journal_lpage', last_page)

if first_page is not None and last_page is not None:
try:
page_nr = int(last_page) - int(first_page) + 1
record.add_value('page_nr', page_nr)
except ValueError as e:
logger.error(
'Failed to parse last_page or first_page for article %s: %s' % (dois, e))

published_date = datetime.datetime.strptime(
meta['articles'][doi]['publication-date'], "%Y-%m-%dT%H:%M:%S")
record.add_value('journal_year', published_date.year)
record.add_value('date_published',
published_date.strftime("%Y-%m-%d"))

record.add_xpath('copyright_holder',
'./item-info/copyright/text()')
record.add_xpath('copyright_year', './item-info/copyright/@year')
record.add_xpath('copyright_statement',
'./item-info/copyright/text()')

license = get_license(
license_url='http://creativecommons.org/licenses/by/3.0/'
)
record.add_value('license', license)

record.add_value('collections', [meta['articles'][doi]['journal']])

# local file paths
local_files = []
for filetype in meta['articles'][doi]['files']:
local_files.append(
{'filetype': filetype, 'path': meta['articles'][doi]['files'][filetype]})
record.add_value('local_files', local_files)

return dict(record.load_item())
except IndexError:
logger.error("Article has no DOI")

def get_authors(self, node, dois):
"""Get the authors."""
@@ -120,7 +136,8 @@ def get_authors(self, node, dois):
for author in author_group.xpath("./author"):
surname = author.xpath("./surname/text()")
given_names = author.xpath("./given-name/text()")
affiliations = self._get_affiliations(author_group, author, dois)
affiliations = self._get_affiliations(
author_group, author, dois)
orcid = self._get_orcid(author)
emails = author.xpath("./e-address/text()")

@@ -133,7 +150,8 @@ def get_authors(self, node, dois):
if orcid:
auth_dict['orcid'] = orcid
if affiliations:
auth_dict['affiliations'] = [{"value": aff} for aff in affiliations]
auth_dict['affiliations'] = [
{"value": aff} for aff in affiliations]
if emails:
auth_dict['email'] = emails.extract_first()

@@ -159,9 +177,11 @@ def _find_affiliations_by_id(author_group, ref_ids):
"""
affiliations_by_id = []
for aff_id in ref_ids:
ce_affiliation = author_group.xpath("//affiliation[@id='" + aff_id + "']")
ce_affiliation = author_group.xpath(
"//affiliation[@id='" + aff_id + "']")
if ce_affiliation.xpath(".//affiliation"):
aff = ce_affiliation.xpath(".//*[self::organization or self::city or self::country or self::address-line]/text()")
aff = ce_affiliation.xpath(
".//*[self::organization or self::city or self::country or self::address-line]/text()")
affiliations_by_id.append(", ".join(aff.extract()))
elif ce_affiliation:
aff = ce_affiliation.xpath("./textfn/text()").extract_first()
@@ -178,7 +198,8 @@ def _get_affiliations(self, author_group, author, dois):
"""

ref_ids = author.xpath(".//@refid").extract()
group_affs = author_group.xpath(".//affiliation[not(@*)]/textfn/text()")
group_affs = author_group.xpath(
".//affiliation[not(@*)]/textfn/text()")
all_group_affs = author_group.xpath(".//affiliation/textfn/text()")

# Don't take correspondence (cor1) or deceased (fn1):
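
The substantive change in this file is the new try/except around the record construction: an article that ships without a DOI now logs an error instead of raising an uncaught IndexError on dois[0]. A minimal sketch of the same guard outside the Scrapy machinery (first_doi_or_none is a hypothetical helper, not part of hepcrawl, and the DOI below is a placeholder):

    import logging

    logger = logging.getLogger(__name__)

    def first_doi_or_none(dois):
        # dois is the list extracted by the './item-info/doi/text()' XPath;
        # some Elsevier records ship without one, so guard the positional
        # access instead of assuming dois[0] exists.
        try:
            return dois[0]
        except IndexError:
            logger.error("Article has no DOI")
            return None

    # Records with no DOI are skipped rather than crashing the crawl:
    print(first_doi_or_none(["10.1016/j.example.2023.001"]))  # placeholder DOI
    print(first_doi_or_none([]))                              # None

Note that in the commit the try block spans the whole record construction, not just the dois[0] access, so any IndexError raised inside it would be reported as a missing DOI.
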
1 change: 1 addition & 0 deletions hepcrawl/spiders/s3_elsevier_spider.py
@@ -248,6 +248,7 @@ def parse_dataset(self, target_folder, filename, zip_filepath, f):

for i in range(len(journal_data)):
for doi, data in journal_data[i]['articles'].items():
self.log("Starting to parse file: '%s'" % data['files']['xml'], logging.INFO)
with open(data['files']['xml'], 'r') as xml_file:
xml_file_content = xml_file.read()
for nodename in self.itertag:
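
The spider-side change is a single breadcrumb: log which XML file is about to be opened, so a failure inside open() or the subsequent parsing can be traced to a concrete path in the crawl logs. A minimal standalone sketch of the same pattern with the stdlib logger (parse_article_xml is a hypothetical stand-in for the spider's loop body; the spider itself uses Scrapy's self.log):

    import logging

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger("s3_elsevier")

    def parse_article_xml(xml_path):
        # Emit the path before opening the file, so any exception raised
        # below is preceded in the log by the file that triggered it.
        logger.info("Starting to parse file: '%s'", xml_path)
        with open(xml_path, "r") as xml_file:
            return xml_file.read()
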
