From a2f97827baed58135dff7a24c59da88227e68ce9 Mon Sep 17 00:00:00 2001 From: Kashyap maheshwari Date: Sun, 11 Jun 2023 18:50:44 +0200 Subject: [PATCH] first version of crossref & opencitations searcher #41 --- findpapers/searchers/acm_searcher.py | 2 +- findpapers/searchers/cross_ref_searcher.py | 148 +++++++++++++++++- .../searchers/opencitations_searcher.py | 114 +++++++++++++- pyproject.toml | 1 + 4 files changed, 248 insertions(+), 17 deletions(-) diff --git a/findpapers/searchers/acm_searcher.py b/findpapers/searchers/acm_searcher.py index f6634b2..1a3f653 100644 --- a/findpapers/searchers/acm_searcher.py +++ b/findpapers/searchers/acm_searcher.py @@ -177,7 +177,7 @@ def _get_paper(paper_page: html.HtmlElement, paper_doi: str, paper_url: str) -> else: publication = None - paper_authors = [f"{x.get('family')}, {x.get('given')}" for x in paper_metadata.get("author", [])] + paper_authors = [f"{a.get('family')}, {a.get('given')}" for a in paper_metadata.get("author", [])] if issued := paper_metadata.get("issued"): date_parts = issued["date-parts"][0] diff --git a/findpapers/searchers/cross_ref_searcher.py b/findpapers/searchers/cross_ref_searcher.py index 127d67e..b6c59d6 100644 --- a/findpapers/searchers/cross_ref_searcher.py +++ b/findpapers/searchers/cross_ref_searcher.py @@ -1,15 +1,13 @@ import logging from datetime import date -import requests -from requests.adapters import HTTPAdapter -from urllib3.util.retry import Retry +from crossref.restful import Works from findpapers.models.paper import Paper from findpapers.models.publication import Publication from findpapers.models.search import Search -CROSSREF_API = "https://api.crossref.org/works/" +# CROSSREF_API = "https://api.crossref.org/works/" DATABASE_LABEL = "CR" # short for opencitations SPLIT_AUTHOR = "; " @@ -41,16 +39,150 @@ def _y_date(self) -> None: def _get_paper_entry(doi: str) -> dict: - pass + """Use the DOI and extract the metadata of the paper from Crossref API. 
+ + Args: + doi (str): DOI of the paper. + + Returns: + dict: Paper entry from the Crossref API. + """ + return Works().doi(doi=doi) def _get_publication(paper_entry: dict) -> Publication: - pass + """Generate publication instance from a paper entry. + + Args: + paper_entry (dict): Paper entry retrieved from Crossref API. + + Returns: + Publication: A publication instance. + """ + publication_title = ( + DATABASE_LABEL if not paper_entry.get("container-title") else paper_entry.get("container-title")[0] + ) + + publication_issn = paper_entry.get("ISSN")[0] if paper_entry.get("ISSN") else None + + categories = { + "journal-article": "Journal", + "book-chapter": "Book", + "book": "Book", + "proceedings-article": "Other", + "dataset": "Other", + "posted-contend": "Other", + "other": "Other", + } + publication_category = categories.get(paper_entry.get("type"), "Other") + + return Publication( + title=publication_title, + issn=publication_issn, + publisher=paper_entry.get("publisher"), + category=publication_category, + ) def _get_paper(paper_entry: dict, publication: Publication) -> Paper: - pass + """Create a paper instance from paper entry. + + Args: + paper_entry (dict): A paper entry retrieved from Crossref API. + publication (Publication): Publication instance associated with the paper. + + Returns: + Paper: A paper instance. 
+ """ + title = paper_entry.get("title") + + # add only papers with titles + if not title: + return None + + paper_title = title[0] + + paper_abstract = paper_entry.get("abstract") + + # exclude cross-refs without abstracts + if not paper_abstract: + return None + + remove_abstract = ["", "", "", "", "", ""] + for abstract in remove_abstract: + paper_abstract = paper_abstract.replace(abstract, "") + + paper_authors = [f"{a.get('family')}, {a.get('given')}" for a in paper_entry.get("author", [])] + + # ensure publication date + published = paper_entry.get("published") + if not published: + return None + + date_parts = paper_entry.get("published").get("date-parts") + paper_date = DateConverter(date_parts[0]).date + paper_urls = set() + paper_urls.add(paper_entry.get("URL")) + paper_doi = paper_entry.get("DOI") + paper_pages = paper_entry.get("page") + references = paper_entry.get("reference") + paper_references = [d.get("DOI") for d in (references if references else [])] + + # note: check if ok i think these are counts + return Paper( + paper_title, + paper_abstract, + paper_authors, + publication, + paper_date, + paper_urls, + paper_doi, + pages=paper_pages, + references=paper_references, + ) def _add_papers(search: Search, source: str) -> None: - pass + """Add paper to the search. + + Args: + search (Search): A Search instance. + source (str): Source of paper. 
+ """ + # get references/citations + source_dois = [d for _, p in search.paper_by_doi.items() for d in getattr(p, source)] + # avoid duplicates + source_dois = list(set(source_dois)) + + # gather paper metadata + if source_dois: + logging.info(f"Cross-References {len(source_dois)} papers found") + for idx, doi in enumerate(source_dois): + paper_entry = _get_paper_entry(doi=doi) + if not paper_entry: + continue # doi was not found + publication = _get_publication(paper_entry=paper_entry) + paper = _get_paper(paper_entry=paper_entry, publication=publication) + + if paper: + logging.info(f"({idx}/{len(source_dois)}) Fetching paper: {doi}") + paper.source = source + paper.add_database(database_name=DATABASE_LABEL) + search.add_paper(paper=paper) + + +def run(search: Search, references: bool = True, citations: bool = True) -> None: + """Fetch paper from Crossref API and add the collected papers to the search instance. + + Args: + search (Search): A Search instance. + references (bool, optional): If references should be used. Defaults to True. + citations (bool, optional): If citations should be used. Defaults to True. 
+ """ + try: + if references: + _add_papers(search=search, source="references") + if citations: + _add_papers(search=search, source="cites") + except Exception as e: + logging.debug(e, exc_info=True) diff --git a/findpapers/searchers/opencitations_searcher.py b/findpapers/searchers/opencitations_searcher.py index 4208a66..87ae88a 100644 --- a/findpapers/searchers/opencitations_searcher.py +++ b/findpapers/searchers/opencitations_searcher.py @@ -1,33 +1,131 @@ import logging +from datetime import date + import requests -from datetime import date from findpapers.models.paper import Paper from findpapers.models.publication import Publication from findpapers.models.search import Search -# from findpapers.tools.references_tool import References - OPENCITATIONS_API = "https://opencitations.net/index/api/v1/metadata/" DATABASE_LABEL = "OC" # short for opencitations SPLIT_AUTHOR = "; " def _get_paper_entry(doi: str) -> dict: - pass + """Use the DOI and extract the metadata of the paper from Opencitations API. + + Args: + doi (str): DOI of the paper. + + Returns: + dict: Paper entry from the Opencitations API. + """ + return requests.get(url=OPENCITATIONS_API + doi).json()[0] def _get_publication(paper_entry: dict) -> Publication: - pass + """Generate publication instance from a paper entry. + + Args: + paper_entry (dict): Paper entry retrieved from Opencitations API. + + Returns: + Publication: A Publication instance. + """ + publication_title = paper_entry.get("source_title") + + if not publication_title: + publication_title = DATABASE_LABEL + + # publication_category = 'Preprint' if publication_title is None else None + publication_category = None + + return Publication(title=publication_title, category=publication_category) def _get_paper(paper_entry: dict, publication: Publication) -> Paper: - pass + """Create paper instance from paper entry. + + Args: + paper_entry (dict): A paper entry retrieved from Opencitations API. 
+ publication (Publication): Publication instance associated with the paper. + + Returns: + Paper: A Paper instance. + """ + paper_title = paper_entry.get("title") + paper_abstract = None + paper_authors = paper_entry.get("author").split(SPLIT_AUTHOR) + paper_publication_year = int(paper_entry.get("year")) + paper_publication_date = date(year=paper_publication_year, month=1, day=1) + paper_urls = [paper_entry.get("oa_link")] + paper_doi = paper_entry.get("doi") + paper_pages = paper_entry.get("page") + paper_citations_count = paper_entry.get("citation_count") + + paper_citations = [] + paper_references = [] + + # add cross references as a list of clean DOIs + if len(paper_entry.get("citation")) > 0: + paper_citations = paper_entry.get("citation").replace(" ", "").split(";") + if len(paper_entry.get("reference")) > 0: + paper_references = paper_entry.get("reference").replace(" ", "").split(";") + + # note: check if ok i think these are counts + return Paper( + title=paper_title, + abstract=paper_abstract, + authors=paper_authors, + publication=publication, + publication_date=paper_publication_date, + urls=paper_urls, + doi=paper_doi, + citations=paper_citations_count, + pages=paper_pages, + references=paper_references, + cites=paper_citations, + ) def _add_papers(search: Search, source: str) -> None: - pass + """Add paper to the search. + + Args: + search (Search): A Search instance. + source (str): Source of paper. 
+ """ + # get references/citations + source_dois = [d for _, p in search.paper_by_doi.items() for d in getattr(p, source)] + + # gather paper metadata + if source_dois: + logging.info(f"Opencitations: {len(source_dois)} papers found") + for idx, doi in enumerate(source_dois): + paper_entry = _get_paper_entry(doi=doi) + publication = _get_publication(paper_entry=paper_entry) + paper = _get_paper(paper_entry=paper_entry, publication=publication) + + if paper: + logging.info(f"({idx}/{len(source_dois)}) Fetching paper: {doi}") + paper.source = source + paper.add_database(database_name=DATABASE_LABEL) + search.add_paper(paper=paper) def run(search: Search, references: bool = True, citations: bool = True) -> None: - pass + """Fetch paper from Opencitations API and add the collected papers to the search instance. + + Args: + search (Search): A Search instance. + references (bool, optional): If references should be used. Defaults to True. + citations (bool, optional): If citations should be used. Defaults to True. + """ + try: + if references: + _add_papers(search=search, source="references") + if citations: + _add_papers(search=search, source="cites") + except Exception as e: + logging.debug(e, exc_info=True) diff --git a/pyproject.toml b/pyproject.toml index 46eb855..9f69ca1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,6 +19,7 @@ inquirer = "^3.1.3" xmltodict = "^0.13.0" importlib-metadata = "^6.6.0" arxiv = "^1.4.7" +crossrefapi = "^1.5.0" [tool.poetry.group.dev.dependencies] pytest = "^7.2.2"