Skip to content

Commit

Permalink
first version of crossref & opencitations searcher #41
Browse files Browse the repository at this point in the history
  • Loading branch information
Kashyap Maheshwari committed Jun 11, 2023
1 parent 735d4b7 commit dd6346e
Show file tree
Hide file tree
Showing 4 changed files with 248 additions and 17 deletions.
2 changes: 1 addition & 1 deletion findpapers/searchers/acm_searcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,7 @@ def _get_paper(paper_page: html.HtmlElement, paper_doi: str, paper_url: str) ->
else:
publication = None

paper_authors = [f"{x.get('family')}, {x.get('given')}" for x in paper_metadata.get("author", [])]
paper_authors = [f"{a.get('family')}, {a.get('given')}" for a in paper_metadata.get("author", [])]

if issued := paper_metadata.get("issued"):
date_parts = issued["date-parts"][0]
Expand Down
148 changes: 140 additions & 8 deletions findpapers/searchers/cross_ref_searcher.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,13 @@
import logging
from datetime import date

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from crossref.restful import Works

from findpapers.models.paper import Paper
from findpapers.models.publication import Publication
from findpapers.models.search import Search

CROSSREF_API = "https://api.crossref.org/works/"
# CROSSREF_API = "https://api.crossref.org/works/"
DATABASE_LABEL = "CR" # short for Crossref
SPLIT_AUTHOR = "; "

Expand Down Expand Up @@ -41,16 +39,150 @@ def _y_date(self) -> None:


def _get_paper_entry(doi: str) -> dict:
    """Look up the metadata of a single paper on Crossref by its DOI.

    Args:
        doi (str): DOI of the paper.

    Returns:
        dict: Whatever ``Works().doi`` returns for the DOI. The caller in this
            module treats a falsy result as "DOI not found".
    """
    crossref_works = Works()
    return crossref_works.doi(doi=doi)


def _get_publication(paper_entry: dict) -> Publication:
    """Generate a publication instance from a Crossref paper entry.

    Args:
        paper_entry (dict): Paper entry retrieved from Crossref API.

    Returns:
        Publication: A publication instance. The title falls back to
            ``DATABASE_LABEL`` when the entry has no container title.
    """
    # "container-title" and "ISSN" are read as lists; only the first item is used
    container_titles = paper_entry.get("container-title")
    publication_title = container_titles[0] if container_titles else DATABASE_LABEL

    issns = paper_entry.get("ISSN")
    publication_issn = issns[0] if issns else None

    # map Crossref work types onto publication categories;
    # any type not listed here is treated as "Other"
    categories = {
        "journal-article": "Journal",
        "book-chapter": "Book",
        "book": "Book",
        "proceedings-article": "Other",
        "dataset": "Other",
        "posted-content": "Other",
        "other": "Other",
    }
    publication_category = categories.get(paper_entry.get("type"), "Other")

    return Publication(
        title=publication_title,
        issn=publication_issn,
        publisher=paper_entry.get("publisher"),
        category=publication_category,
    )


def _get_paper(paper_entry: dict, publication: Publication) -> Paper:
    """Create a paper instance from a Crossref paper entry.

    Args:
        paper_entry (dict): A paper entry retrieved from Crossref API.
        publication (Publication): Publication instance associated with the paper.

    Returns:
        Paper: A paper instance, or None when the entry has no title, no
            abstract or no publication date.
    """
    # add only papers with titles
    titles = paper_entry.get("title")
    if not titles:
        return None
    paper_title = titles[0]

    # exclude cross-refs without abstracts
    paper_abstract = paper_entry.get("abstract")
    if not paper_abstract:
        return None

    # ensure publication date
    published = paper_entry.get("published")
    if not published:
        return None
    date_parts = published.get("date-parts")
    if not date_parts or not date_parts[0]:
        return None
    paper_date = DateConverter(date_parts[0]).date

    # strip the JATS markup tags that wrap the abstract text
    jats_tags = ["<jats:sec>", "</jats:sec>", "<jats:title>", "</jats:title>", "<jats:p>", "</jats:p>"]
    for tag in jats_tags:
        paper_abstract = paper_abstract.replace(tag, "")

    # build "family, given" names from whatever parts are present, so a missing
    # part does not end up as the literal text "None" in the author name
    paper_authors = []
    for author in paper_entry.get("author", []):
        name_parts = [author.get("family"), author.get("given")]
        full_name = ", ".join(part for part in name_parts if part)
        if full_name:
            paper_authors.append(full_name)

    # only keep a URL that is actually present
    paper_url = paper_entry.get("URL")
    paper_urls = {paper_url} if paper_url else set()

    paper_doi = paper_entry.get("DOI")
    paper_pages = paper_entry.get("page")

    # references without a DOI cannot be looked up later, so they are dropped
    references = paper_entry.get("reference") or []
    paper_references = [ref.get("DOI") for ref in references if ref.get("DOI")]

    return Paper(
        paper_title,
        paper_abstract,
        paper_authors,
        publication,
        paper_date,
        paper_urls,
        paper_doi,
        pages=paper_pages,
        references=paper_references,
    )


def _add_papers(search: Search, source: str) -> None:
    """Fetch the papers referenced through ``source`` and add them to the search.

    Args:
        search (Search): A Search instance.
        source (str): Name of the paper attribute holding the DOIs to follow
            (``run`` passes "references" or "cites").
    """
    # get references/citations; a paper whose attribute is None contributes nothing
    collected = []
    for known_paper in search.paper_by_doi.values():
        collected.extend(getattr(known_paper, source) or [])

    # avoid duplicates while keeping a stable first-seen order, and drop empty DOIs
    source_dois = [doi for doi in dict.fromkeys(collected) if doi]
    if not source_dois:
        return

    total = len(source_dois)
    logging.info(f"Cross-References {total} papers found")

    # gather paper metadata
    for idx, doi in enumerate(source_dois, start=1):
        paper_entry = _get_paper_entry(doi=doi)
        if not paper_entry:
            continue  # doi was not found
        publication = _get_publication(paper_entry=paper_entry)
        paper = _get_paper(paper_entry=paper_entry, publication=publication)

        if paper:
            logging.info(f"({idx}/{total}) Fetching paper: {doi}")
            paper.source = source
            paper.add_database(database_name=DATABASE_LABEL)
            search.add_paper(paper=paper)


def run(search: Search, references: bool = True, citations: bool = True) -> None:
    """Fetch papers from Crossref API and add the collected papers to the search instance.

    The lookup is best effort: any error is logged and the function returns
    normally instead of propagating it.

    Args:
        search (Search): A Search instance.
        references (bool, optional): If references should be used. Defaults to True.
        citations (bool, optional): If citations should be used. Defaults to True.
    """
    try:
        if references:
            _add_papers(search=search, source="references")
        if citations:
            _add_papers(search=search, source="cites")
    except Exception as e:
        # keep the best-effort behaviour, but log at warning level so that an
        # aborted lookup is visible with the default logging configuration
        logging.warning(f"Crossref lookup aborted: {e}", exc_info=True)
114 changes: 106 additions & 8 deletions findpapers/searchers/opencitations_searcher.py
Original file line number Diff line number Diff line change
@@ -1,33 +1,131 @@
import logging
from datetime import date

import requests

from datetime import date
from findpapers.models.paper import Paper
from findpapers.models.publication import Publication
from findpapers.models.search import Search

# from findpapers.tools.references_tool import References

# base URL of the metadata endpoint; the DOI is appended to it for each lookup
OPENCITATIONS_API = "https://opencitations.net/index/api/v1/metadata/"
DATABASE_LABEL = "OC" # short for opencitations
# separator used to split the "author" field of an entry into individual names
SPLIT_AUTHOR = "; "


def _get_paper_entry(doi: str) -> dict:
    """Use the DOI and extract the metadata of the paper from Opencitations API.

    Args:
        doi (str): DOI of the paper.

    Returns:
        dict: First paper entry returned by the Opencitations API, or an empty
            dict when the API returns no entry for the DOI.

    Raises:
        requests.RequestException: If the request fails, times out or the API
            answers with an HTTP error status.
    """
    # without a timeout a stalled connection would block the search forever
    response = requests.get(url=OPENCITATIONS_API + doi, timeout=30)
    response.raise_for_status()

    entries = response.json()
    # an empty result would otherwise raise IndexError on [0]
    return entries[0] if entries else {}


def _get_publication(paper_entry: dict) -> Publication:
    """Build the publication a paper entry belongs to.

    Args:
        paper_entry (dict): Paper entry retrieved from Opencitations API.

    Returns:
        Publication: A Publication instance titled after the entry's
            "source_title", or after ``DATABASE_LABEL`` when that is empty.
            No category is set.
    """
    # fall back to the database label when the entry has no source title
    source_title = paper_entry.get("source_title") or DATABASE_LABEL
    return Publication(title=source_title, category=None)


def _get_paper(paper_entry: dict, publication: Publication) -> Paper:
    """Create paper instance from paper entry.

    Args:
        paper_entry (dict): A paper entry retrieved from Opencitations API.
        publication (Publication): Publication instance associated with the paper.

    Returns:
        Paper: A Paper instance, or None when the entry has no usable
            publication year.
    """

    def _split_dois(raw_dois) -> list:
        # turn a ";"-separated DOI string into a list of clean, non-empty DOIs
        return [doi for doi in (raw_dois or "").replace(" ", "").split(";") if doi]

    # only the year is read from the entry, so the date is pinned to January 1st;
    # an entry without a valid year cannot be dated and is skipped
    try:
        paper_publication_year = int(paper_entry.get("year"))
        paper_publication_date = date(year=paper_publication_year, month=1, day=1)
    except (TypeError, ValueError):
        return None

    paper_title = paper_entry.get("title")
    paper_abstract = None  # no abstract is read from the entry
    paper_authors = [name for name in (paper_entry.get("author") or "").split(SPLIT_AUTHOR) if name]

    # only keep a link that is actually present
    oa_link = paper_entry.get("oa_link")
    paper_urls = {oa_link} if oa_link else set()

    paper_doi = paper_entry.get("doi")
    paper_pages = paper_entry.get("page")
    paper_citations_count = paper_entry.get("citation_count")

    # add cross references as a list of clean DOIs
    paper_citations = _split_dois(paper_entry.get("citation"))
    paper_references = _split_dois(paper_entry.get("reference"))

    # NOTE(review): `citations` receives the raw "citation_count" value while
    # `cites` receives the list of citing DOIs - confirm against the Paper model
    return Paper(
        title=paper_title,
        abstract=paper_abstract,
        authors=paper_authors,
        publication=publication,
        publication_date=paper_publication_date,
        urls=paper_urls,
        doi=paper_doi,
        citations=paper_citations_count,
        pages=paper_pages,
        references=paper_references,
        cites=paper_citations,
    )


def _add_papers(search: Search, source: str) -> None:
    """Fetch the papers referenced through ``source`` and add them to the search.

    Args:
        search (Search): A Search instance.
        source (str): Name of the paper attribute holding the DOIs to follow
            (``run`` passes "references" or "cites").
    """
    # get references/citations; a paper whose attribute is None contributes nothing
    collected = []
    for known_paper in search.paper_by_doi.values():
        collected.extend(getattr(known_paper, source) or [])

    # avoid fetching the same DOI twice (first-seen order is kept) and drop empty DOIs
    source_dois = [doi for doi in dict.fromkeys(collected) if doi]
    if not source_dois:
        return

    total = len(source_dois)
    logging.info(f"Opencitations: {total} papers found")

    # gather paper metadata
    for idx, doi in enumerate(source_dois, start=1):
        try:
            paper_entry = _get_paper_entry(doi=doi)
        except (requests.RequestException, ValueError, LookupError) as e:
            # a single DOI that cannot be fetched or parsed must not abort the rest
            logging.debug(f"Opencitations: skipping {doi}: {e}", exc_info=True)
            continue
        if not paper_entry:
            continue  # doi was not found

        publication = _get_publication(paper_entry=paper_entry)
        paper = _get_paper(paper_entry=paper_entry, publication=publication)

        if paper:
            logging.info(f"({idx}/{total}) Fetching paper: {doi}")
            paper.source = source
            paper.add_database(database_name=DATABASE_LABEL)
            search.add_paper(paper=paper)


def run(search: Search, references: bool = True, citations: bool = True) -> None:
    """Fetch papers from Opencitations API and add the collected papers to the search instance.

    The lookup is best effort: any error is logged and the function returns
    normally instead of propagating it.

    Args:
        search (Search): A Search instance.
        references (bool, optional): If references should be used. Defaults to True.
        citations (bool, optional): If citations should be used. Defaults to True.
    """
    try:
        if references:
            _add_papers(search=search, source="references")
        if citations:
            _add_papers(search=search, source="cites")
    except Exception as e:
        # keep the best-effort behaviour, but log at warning level so that an
        # aborted lookup is visible with the default logging configuration
        logging.warning(f"Opencitations lookup aborted: {e}", exc_info=True)
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ inquirer = "^3.1.3"
xmltodict = "^0.13.0"
importlib-metadata = "^6.6.0"
arxiv = "^1.4.7"
crossrefapi = "^1.5.0"

[tool.poetry.group.dev.dependencies]
pytest = "^7.2.2"
Expand Down

0 comments on commit dd6346e

Please sign in to comment.