diff --git a/findpapers/searchers/acm_searcher.py b/findpapers/searchers/acm_searcher.py
index f6634b2..1a3f653 100644
--- a/findpapers/searchers/acm_searcher.py
+++ b/findpapers/searchers/acm_searcher.py
@@ -177,7 +177,7 @@ def _get_paper(paper_page: html.HtmlElement, paper_doi: str, paper_url: str) ->
else:
publication = None
- paper_authors = [f"{x.get('family')}, {x.get('given')}" for x in paper_metadata.get("author", [])]
+ paper_authors = [f"{a.get('family')}, {a.get('given')}" for a in paper_metadata.get("author", [])]
if issued := paper_metadata.get("issued"):
date_parts = issued["date-parts"][0]
diff --git a/findpapers/searchers/cross_ref_searcher.py b/findpapers/searchers/cross_ref_searcher.py
index 127d67e..b6c59d6 100644
--- a/findpapers/searchers/cross_ref_searcher.py
+++ b/findpapers/searchers/cross_ref_searcher.py
@@ -1,15 +1,13 @@
import logging
from datetime import date
-import requests
-from requests.adapters import HTTPAdapter
-from urllib3.util.retry import Retry
+from crossref.restful import Works
from findpapers.models.paper import Paper
from findpapers.models.publication import Publication
from findpapers.models.search import Search
-CROSSREF_API = "https://api.crossref.org/works/"
+# CROSSREF_API = "https://api.crossref.org/works/"
DATABASE_LABEL = "CR" # short for opencitations
SPLIT_AUTHOR = "; "
@@ -41,16 +39,150 @@ def _y_date(self) -> None:
def _get_paper_entry(doi: str) -> dict:
-    pass
+    """Look up a single work in the Crossref API by its DOI.
+
+    Args:
+        doi (str): DOI of the paper to look up.
+
+    Returns:
+        dict: Paper entry as returned by ``Works().doi`` for this DOI.
+    """
+    works_client = Works()
+    paper_entry = works_client.doi(doi=doi)
+    return paper_entry
def _get_publication(paper_entry: dict) -> Publication:
-    pass
+    """Generate a publication instance from a Crossref paper entry.
+
+    Args:
+        paper_entry (dict): Paper entry retrieved from Crossref API.
+
+    Returns:
+        Publication: A publication instance. Its title falls back to
+            ``DATABASE_LABEL`` when the entry has no "container-title".
+    """
+    # "container-title" and "ISSN" are indexed with [0]: only the first item is used
+    container_titles = paper_entry.get("container-title")
+    publication_title = container_titles[0] if container_titles else DATABASE_LABEL
+
+    issns = paper_entry.get("ISSN")
+    publication_issn = issns[0] if issns else None
+
+    # entry type -> publication category; any type not listed maps to "Other"
+    categories = {
+        "journal-article": "Journal",
+        "book-chapter": "Book",
+        "book": "Book",
+        "proceedings-article": "Other",
+        "dataset": "Other",
+        "posted-content": "Other",
+        "other": "Other",
+    }
+    publication_category = categories.get(paper_entry.get("type"), "Other")
+
+    return Publication(
+        title=publication_title,
+        issn=publication_issn,
+        publisher=paper_entry.get("publisher"),
+        category=publication_category,
+    )
def _get_paper(paper_entry: dict, publication: Publication) -> Paper:
-    pass
+    """Create a paper instance from a Crossref paper entry.
+
+    Args:
+        paper_entry (dict): A paper entry retrieved from Crossref API.
+        publication (Publication): Publication instance associated with the paper.
+
+    Returns:
+        Paper: A paper instance, or None when the entry has no title, no
+            abstract or no usable publication date.
+    """
+    # add only papers with titles
+    title = paper_entry.get("title")
+    if not title:
+        return None
+    paper_title = title[0]
+
+    # exclude cross-refs without abstracts
+    paper_abstract = paper_entry.get("abstract")
+    if not paper_abstract:
+        return None
+
+    # NOTE(review): every token below is an empty string, so this loop leaves
+    # the abstract unchanged - presumably markup tokens were intended; confirm.
+    remove_abstract = ["", "", "", "", "", ""]
+    for abstract in remove_abstract:
+        paper_abstract = paper_abstract.replace(abstract, "")
+
+    paper_authors = [f"{a.get('family')}, {a.get('given')}" for a in paper_entry.get("author", [])]
+
+    # ensure publication date: skip the entry when "date-parts" is missing or empty
+    date_parts = (paper_entry.get("published") or {}).get("date-parts")
+    if not date_parts:
+        return None
+    paper_date = DateConverter(date_parts[0]).date
+
+    # a missing URL must not end up in the set as None
+    paper_url = paper_entry.get("URL")
+    paper_urls = {paper_url} if paper_url else set()
+    paper_doi = paper_entry.get("DOI")
+    paper_pages = paper_entry.get("page")
+
+    # keep only references that carry a DOI, so the list never contains None
+    paper_references = [
+        reference["DOI"]
+        for reference in (paper_entry.get("reference") or [])
+        if reference.get("DOI")
+    ]
+
+    return Paper(
+        paper_title,
+        paper_abstract,
+        paper_authors,
+        publication,
+        paper_date,
+        paper_urls,
+        paper_doi,
+        pages=paper_pages,
+        references=paper_references,
+    )
def _add_papers(search: Search, source: str) -> None:
-    pass
+    """Fetch the papers listed under ``source`` from Crossref and add them to the search.
+
+    Args:
+        search (Search): A Search instance.
+        source (str): Name of the Paper attribute holding the DOIs to follow
+            (``run`` passes "references" or "cites").
+    """
+    # get references/citations: skip empty DOIs and drop duplicates while
+    # keeping first-seen order, so the progress log is reproducible
+    source_dois = list(
+        dict.fromkeys(
+            doi
+            for paper in search.paper_by_doi.values()
+            for doi in (getattr(paper, source) or [])
+            if doi
+        )
+    )
+    if not source_dois:
+        return
+
+    # gather paper metadata
+    total = len(source_dois)
+    logging.info(f"Cross-References {total} papers found")
+    # count from 1 so the last paper is reported as (total/total)
+    for idx, doi in enumerate(source_dois, start=1):
+        paper_entry = _get_paper_entry(doi=doi)
+        if not paper_entry:
+            continue  # doi was not found
+        publication = _get_publication(paper_entry=paper_entry)
+        paper = _get_paper(paper_entry=paper_entry, publication=publication)
+
+        if paper:
+            logging.info(f"({idx}/{total}) Fetching paper: {doi}")
+            paper.source = source
+            paper.add_database(database_name=DATABASE_LABEL)
+            search.add_paper(paper=paper)
+
+
+def run(search: Search, references: bool = True, citations: bool = True) -> None:
+    """Collect referenced and/or citing papers from the Crossref API into the search.
+
+    Any exception raised while collecting is logged at debug level and not re-raised.
+
+    Args:
+        search (Search): A Search instance.
+        references (bool, optional): If references should be used. Defaults to True.
+        citations (bool, optional): If citations should be used. Defaults to True.
+    """
+    # each enabled flag selects the Paper attribute whose DOIs are followed
+    requested_sources = ((references, "references"), (citations, "cites"))
+    try:
+        for enabled, source in requested_sources:
+            if enabled:
+                _add_papers(search=search, source=source)
+    except Exception as error:
+        logging.debug(error, exc_info=True)
diff --git a/findpapers/searchers/opencitations_searcher.py b/findpapers/searchers/opencitations_searcher.py
index 4208a66..87ae88a 100644
--- a/findpapers/searchers/opencitations_searcher.py
+++ b/findpapers/searchers/opencitations_searcher.py
@@ -1,33 +1,131 @@
import logging
+from datetime import date
+
import requests
-from datetime import date
from findpapers.models.paper import Paper
from findpapers.models.publication import Publication
from findpapers.models.search import Search
-# from findpapers.tools.references_tool import References
-
OPENCITATIONS_API = "https://opencitations.net/index/api/v1/metadata/"
DATABASE_LABEL = "OC" # short for opencitations
SPLIT_AUTHOR = "; "
def _get_paper_entry(doi: str) -> dict:
-    pass
+    """Use the DOI and extract the metadata of the paper from Opencitations API.
+
+    Args:
+        doi (str): DOI of the paper.
+
+    Returns:
+        dict: First record of the API response, or an empty dict when the
+            response contains no records.
+    """
+    # requests applies no timeout by default; bound the wait so one stalled
+    # request cannot block the whole search
+    response = requests.get(url=OPENCITATIONS_API + doi, timeout=30)
+    entries = response.json()
+    # an empty response must not raise IndexError
+    return entries[0] if entries else {}
def _get_publication(paper_entry: dict) -> Publication:
-    pass
+    """Build a publication instance from an Opencitations paper entry.
+
+    Args:
+        paper_entry (dict): Paper entry retrieved from Opencitations API.
+
+    Returns:
+        Publication: A Publication instance titled with the entry's
+            "source_title", or with ``DATABASE_LABEL`` when that is missing
+            or empty. The category is always None.
+    """
+    source_title = paper_entry.get("source_title")
+    return Publication(title=source_title or DATABASE_LABEL, category=None)
def _get_paper(paper_entry: dict, publication: Publication) -> Paper:
-    pass
+    """Create a paper instance from an Opencitations paper entry.
+
+    Args:
+        paper_entry (dict): A paper entry retrieved from Opencitations API.
+        publication (Publication): Publication instance associated with the paper.
+
+    Returns:
+        Paper: A Paper instance, or None when the entry has no title or no
+            usable publication year.
+    """
+
+    def _split_dois(field: str) -> list:
+        """Split a ';'-separated DOI field into clean DOIs, skipping blanks."""
+        raw_value = paper_entry.get(field) or ""
+        return [doi for doi in raw_value.replace(" ", "").split(";") if doi]
+
+    # add only papers with titles
+    paper_title = paper_entry.get("title")
+    if not paper_title:
+        return None
+
+    # only the year is read from the entry, so the date is pinned to January 1st;
+    # a missing or malformed year skips the entry instead of raising
+    try:
+        paper_publication_year = int(paper_entry.get("year"))
+        paper_publication_date = date(year=paper_publication_year, month=1, day=1)
+    except (TypeError, ValueError):
+        return None
+
+    paper_abstract = None  # no abstract is read from the entry
+    paper_authors = [
+        author for author in (paper_entry.get("author") or "").split(SPLIT_AUTHOR) if author
+    ]
+
+    # a missing or blank link must not be stored as a URL
+    oa_link = paper_entry.get("oa_link")
+    paper_urls = {oa_link} if oa_link else set()
+    paper_doi = paper_entry.get("doi")
+    paper_pages = paper_entry.get("page")
+    # NOTE(review): passed through unchanged as the citation count - confirm its type
+    paper_citations_count = paper_entry.get("citation_count")
+
+    # add cross references as lists of clean DOIs
+    paper_citations = _split_dois("citation")
+    paper_references = _split_dois("reference")
+
+    return Paper(
+        title=paper_title,
+        abstract=paper_abstract,
+        authors=paper_authors,
+        publication=publication,
+        publication_date=paper_publication_date,
+        urls=paper_urls,
+        doi=paper_doi,
+        citations=paper_citations_count,
+        pages=paper_pages,
+        references=paper_references,
+        cites=paper_citations,
+    )
def _add_papers(search: Search, source: str) -> None:
-    pass
+    """Fetch the papers listed under ``source`` from Opencitations and add them to the search.
+
+    Args:
+        search (Search): A Search instance.
+        source (str): Name of the Paper attribute holding the DOIs to follow
+            (``run`` passes "references" or "cites").
+    """
+    # get references/citations: skip empty DOIs and drop duplicates (keeping
+    # first-seen order) so the same DOI is not requested twice
+    source_dois = list(
+        dict.fromkeys(
+            doi
+            for paper in search.paper_by_doi.values()
+            for doi in (getattr(paper, source) or [])
+            if doi
+        )
+    )
+    if not source_dois:
+        return
+
+    # gather paper metadata
+    total = len(source_dois)
+    logging.info(f"Opencitations: {total} papers found")
+    # count from 1 so the last paper is reported as (total/total)
+    for idx, doi in enumerate(source_dois, start=1):
+        paper_entry = _get_paper_entry(doi=doi)
+        if not paper_entry:
+            continue  # nothing usable was returned for this doi
+        publication = _get_publication(paper_entry=paper_entry)
+        paper = _get_paper(paper_entry=paper_entry, publication=publication)
+
+        if paper:
+            logging.info(f"({idx}/{total}) Fetching paper: {doi}")
+            paper.source = source
+            paper.add_database(database_name=DATABASE_LABEL)
+            search.add_paper(paper=paper)
def run(search: Search, references: bool = True, citations: bool = True) -> None:
-    pass
+    """Collect referenced and/or citing papers from the Opencitations API into the search.
+
+    Any exception raised while collecting is logged at debug level and not re-raised.
+
+    Args:
+        search (Search): A Search instance.
+        references (bool, optional): If references should be used. Defaults to True.
+        citations (bool, optional): If citations should be used. Defaults to True.
+    """
+    # each enabled flag selects the Paper attribute whose DOIs are followed
+    requested_sources = ((references, "references"), (citations, "cites"))
+    try:
+        for enabled, source in requested_sources:
+            if enabled:
+                _add_papers(search=search, source=source)
+    except Exception as error:
+        logging.debug(error, exc_info=True)
diff --git a/pyproject.toml b/pyproject.toml
index 46eb855..9f69ca1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -19,6 +19,7 @@ inquirer = "^3.1.3"
xmltodict = "^0.13.0"
importlib-metadata = "^6.6.0"
arxiv = "^1.4.7"
+crossrefapi = "^1.5.0"
[tool.poetry.group.dev.dependencies]
pytest = "^7.2.2"