first version of crossref & opencitations searcher #41

ChristianGerloff · Jun 11, 2023 · dd6346e · dd6346e
1 parent 735d4b7
commit dd6346e
Show file tree

Hide file tree

Showing 4 changed files with 248 additions and 17 deletions.
diff --git a/findpapers/searchers/acm_searcher.py b/findpapers/searchers/acm_searcher.py
@@ -177,7 +177,7 @@ def _get_paper(paper_page: html.HtmlElement, paper_doi: str, paper_url: str) ->
     else:
         publication = None
 
-    paper_authors = [f"{x.get('family')}, {x.get('given')}" for x in paper_metadata.get("author", [])]
+    paper_authors = [f"{a.get('family')}, {a.get('given')}" for a in paper_metadata.get("author", [])]
 
     if issued := paper_metadata.get("issued"):
         date_parts = issued["date-parts"][0]

diff --git a/findpapers/searchers/cross_ref_searcher.py b/findpapers/searchers/cross_ref_searcher.py
@@ -1,15 +1,13 @@
 import logging
 from datetime import date
 
-import requests
-from requests.adapters import HTTPAdapter
-from urllib3.util.retry import Retry
+from crossref.restful import Works
 
 from findpapers.models.paper import Paper
 from findpapers.models.publication import Publication
 from findpapers.models.search import Search
 
-CROSSREF_API = "https://api.crossref.org/works/"
+# CROSSREF_API = "https://api.crossref.org/works/"
 DATABASE_LABEL = "CR"  # short for opencitations
 SPLIT_AUTHOR = "; "
 
@@ -41,16 +39,150 @@ def _y_date(self) -> None:
 
 
 def _get_paper_entry(doi: str) -> dict:
-    pass
+    """Use the DOI and extract the metadata of the paper from Crossref API.
+
+    Args:
+        doi (str): DOI of the paper.
+
+    Returns:
+        dict: Paper entry from the Crossref API.
+    """
+    return Works().doi(doi=doi)
 
 
 def _get_publication(paper_entry: dict) -> Publication:
-    pass
+    """Generate publication instance from a paper entry.
+
+    Args:
+        paper_entry (dict): Paper entry retrieved from Crossref API.
+
+    Returns:
+        Publication: A publication instance.
+    """
+    publication_title = (
+        DATABASE_LABEL if not paper_entry.get("container-title") else paper_entry.get("container-title")[0]
+    )
+
+    publication_issn = paper_entry.get("ISSN")[0] if paper_entry.get("ISSN") else None
+
+    categories = {
+        "journal-article": "Journal",
+        "book-chapter": "Book",
+        "book": "Book",
+        "proceedings-article": "Other",
+        "dataset": "Other",
+        "posted-contend": "Other",
+        "other": "Other",
+    }
+    publication_category = categories.get(paper_entry.get("type"), "Other")
+
+    return Publication(
+        title=publication_title,
+        issn=publication_issn,
+        publisher=paper_entry.get("publisher"),
+        category=publication_category,
+    )
 
 
 def _get_paper(paper_entry: dict, publication: Publication) -> Paper:
-    pass
+    """Create a paper instance from paper entry.
+
+    Args:
+        paper_entry (dict): A paper entry retrieved from Crossref API.
+        publication (Publication): Publication instance associated with the paper.
+
+    Returns:
+        Paper: A paper instance.
+    """
+    title = paper_entry.get("title")
+
+    # add only papers with titles
+    if not title:
+        return None
+
+    paper_title = title[0]
+
+    paper_abstract = paper_entry.get("abstract")
+
+    # exclude cross-refs without abstracts
+    if not paper_abstract:
+        return None
+
+    remove_abstract = ["<jats:sec>", "</jats:sec>", "<jats:title>", "</jats:title>", "<jats:p>", "</jats:p>"]
+    for abstract in remove_abstract:
+        paper_abstract = paper_abstract.replace(abstract, "")
+
+    paper_authors = [f"{a.get('family')}, {a.get('given')}" for a in paper_entry.get("author", [])]
+
+    # ensure the entry has a publication date; skip the paper otherwise
+    published = paper_entry.get("published")
+    if not published:
+        return None
+
+    date_parts = paper_entry.get("published").get("date-parts")
+    paper_date = DateConverter(date_parts[0]).date
+    paper_urls = set()
+    paper_urls.add(paper_entry.get("URL"))
+    paper_doi = paper_entry.get("DOI")
+    paper_pages = paper_entry.get("page")
+    references = paper_entry.get("reference")
+    paper_references = [d.get("DOI") for d in (references if references else [])]
+
+    # NOTE(review): verify the argument types passed below; some Paper fields may expect counts.
+    return Paper(
+        paper_title,
+        paper_abstract,
+        paper_authors,
+        publication,
+        paper_date,
+        paper_urls,
+        paper_doi,
+        pages=paper_pages,
+        references=paper_references,
+    )
 
 
 def _add_papers(search: Search, source: str) -> None:
-    pass
+    """Add paper to the search.
+
+    Args:
+        search (Search): A Search instance.
+        source (str): Name of the Paper attribute holding the DOIs to fetch ("references" or "cites").
+    """
+    # get references/citations
+    source_dois = [d for _, p in search.paper_by_doi.items() for d in getattr(p, source)]
+    # avoid duplicates
+    source_dois = list(set(source_dois))
+
+    # gather paper metadata
+    if source_dois:
+        logging.info(f"Cross-References {len(source_dois)} papers found")
+        for idx, doi in enumerate(source_dois):
+            paper_entry = _get_paper_entry(doi=doi)
+            if not paper_entry:
+                continue  # doi was not found
+            publication = _get_publication(paper_entry=paper_entry)
+            paper = _get_paper(paper_entry=paper_entry, publication=publication)
+
+            if paper:
+                logging.info(f"({idx}/{len(source_dois)}) Fetching paper: {doi}")
+                paper.source = source
+                paper.add_database(database_name=DATABASE_LABEL)
+                search.add_paper(paper=paper)
+
+
+def run(search: Search, references: bool = True, citations: bool = True) -> None:
+    """Fetch paper from Crossref API and add the collected papers to the search instance.
+
+    Args:
+        search (Search): A Search instance.
+        references (bool, optional): If references should be used. Defaults to True.
+        citations (bool, optional): If citations should be used. Defaults to True.
+    """
+    try:
+        if references:
+            _add_papers(search=search, source="references")
+        if citations:
+            _add_papers(search=search, source="cites")
+    except Exception as e:
+        logging.debug(e, exc_info=True)
diff --git a/findpapers/searchers/opencitations_searcher.py b/findpapers/searchers/opencitations_searcher.py
@@ -1,33 +1,131 @@
 import logging
+from datetime import date
+
 import requests
 
-from datetime import date
 from findpapers.models.paper import Paper
 from findpapers.models.publication import Publication
 from findpapers.models.search import Search
 
-# from findpapers.tools.references_tool import References
-
 OPENCITATIONS_API = "https://opencitations.net/index/api/v1/metadata/"
 DATABASE_LABEL = "OC"  # short for opencitations
 SPLIT_AUTHOR = "; "
 
 
 def _get_paper_entry(doi: str) -> dict:
-    pass
+    """Use the DOI and extract the metadata of the paper from Opencitations API.
+
+    Args:
+        doi (str): DOI of the paper.
+
+    Returns:
+        dict: Paper entry from the Opencitations API.
+    """
+    return requests.get(url=OPENCITATIONS_API + doi).json()[0]
 
 
 def _get_publication(paper_entry: dict) -> Publication:
-    pass
+    """Generate publication instance from a paper entry.
+
+    Args:
+        paper_entry (dict): Paper entry retrieved from Opencitations API.
+
+    Returns:
+        Publication: A Publication instance.
+    """
+    publication_title = paper_entry.get("source_title")
+
+    if not publication_title:
+        publication_title = DATABASE_LABEL
+
+    # publication_category = 'Preprint' if publication_title is None else None
+    publication_category = None
+
+    return Publication(title=publication_title, category=publication_category)
 
 
 def _get_paper(paper_entry: dict, publication: Publication) -> Paper:
-    pass
+    """Create paper instance from paper entry.
+
+    Args:
+        paper_entry (dict): A paper entry retrieved from Opencitations API.
+        publication (Publication): Publication instance associated with the paper.
+
+    Returns:
+        Paper: A Paper instance.
+    """
+    paper_title = paper_entry.get("title")
+    paper_abstract = None
+    paper_authors = paper_entry.get("author").split(SPLIT_AUTHOR)
+    paper_publication_year = int(paper_entry.get("year"))
+    paper_publication_date = date(year=paper_publication_year, month=1, day=1)
+    paper_urls = [paper_entry.get("oa_link")]
+    paper_doi = paper_entry.get("doi")
+    paper_pages = paper_entry.get("page")
+    paper_citations_count = paper_entry.get("citation_count")
+
+    paper_citations = []
+    paper_references = []
+
+    # add cross references as a list of clean DOIs
+    if len(paper_entry.get("citation")) > 0:
+        paper_citations = paper_entry.get("citation").replace(" ", "").split(";")
+    if len(paper_entry.get("reference")) > 0:
+        paper_references = paper_entry.get("reference").replace(" ", "").split(";")
+
+    # NOTE(review): verify these arguments; `citations` receives the citation count, not a list of DOIs.
+    return Paper(
+        title=paper_title,
+        abstract=paper_abstract,
+        authors=paper_authors,
+        publication=publication,
+        publication_date=paper_publication_date,
+        urls=paper_urls,
+        doi=paper_doi,
+        citations=paper_citations_count,
+        pages=paper_pages,
+        references=paper_references,
+        cites=paper_citations,
+    )
 
 
 def _add_papers(search: Search, source: str) -> None:
-    pass
+    """Add paper to the search.
+
+    Args:
+        search (Search): A Search instance.
+        source (str): Name of the Paper attribute holding the DOIs to fetch ("references" or "cites").
+    """
+    # get references/citations
+    source_dois = [d for _, p in search.paper_by_doi.items() for d in getattr(p, source)]
+
+    # gather paper metadata
+    if source_dois:
+        logging.info(f"Opencitations: {len(source_dois)} papers found")
+        for idx, doi in enumerate(source_dois):
+            paper_entry = _get_paper_entry(doi=doi)
+            publication = _get_publication(paper_entry=paper_entry)
+            paper = _get_paper(paper_entry=paper_entry, publication=publication)
+
+            if paper:
+                logging.info(f"({idx}/{len(source_dois)}) Fetching paper: {doi}")
+                paper.source = source
+                paper.add_database(database_name=DATABASE_LABEL)
+                search.add_paper(paper=paper)
 
 
 def run(search: Search, references: bool = True, citations: bool = True) -> None:
-    pass
+    """Fetch paper from Opencitations API and add the collected papers to the search instance.
+
+    Args:
+        search (Search): A Search instance.
+        references (bool, optional): If references should be used. Defaults to True.
+        citations (bool, optional): If citations should be used. Defaults to True.
+    """
+    try:
+        if references:
+            _add_papers(search=search, source="references")
+        if citations:
+            _add_papers(search=search, source="cites")
+    except Exception as e:
+        logging.debug(e, exc_info=True)
diff --git a/pyproject.toml b/pyproject.toml
@@ -19,6 +19,7 @@ inquirer = "^3.1.3"
 xmltodict = "^0.13.0"
 importlib-metadata = "^6.6.0"
 arxiv = "^1.4.7"
+crossrefapi = "^1.5.0"
 
 [tool.poetry.group.dev.dependencies]
 pytest = "^7.2.2"