From c42f9af15025ab29f29dd95635726ca39ec2c87d Mon Sep 17 00:00:00 2001
From: Kashyap Maheshwari
Date: Fri, 9 Jun 2023 15:13:36 +0200
Subject: [PATCH] add searcher & tools modules with function names from findpapers #41

---
 findpapers/searchers/acm_searcher.py          |  41 +++++
 findpapers/searchers/arxiv_searcher.py        |  44 +++++
 findpapers/searchers/biorxiv_searcher.py      |   8 +
 findpapers/searchers/cross_ref_searcher.py    |  56 ++++++
 findpapers/searchers/ieee_searcher.py         |  39 +++++
 findpapers/searchers/medrxiv_searcher.py      |   8 +
 .../searchers/opencitations_searcher.py       |  33 ++++
 findpapers/searchers/pubmed_searcher.py       |  44 +++++
 findpapers/searchers/rxiv_searcher.py         |  47 +++++
 findpapers/searchers/scopus_searcher.py       |  49 ++++++
 findpapers/tools/__init__.py                  |  17 ++
 findpapers/tools/bibtex_generator_tool.py     |  17 ++
 findpapers/tools/cross_references_tool.py     |  11 ++
 findpapers/tools/downloader_tool.py           |  26 +++
 findpapers/tools/rayyan_tool.py               | 114 ++++++++++++
 findpapers/tools/refiner_tool.py              |  42 +++++
 findpapers/tools/refman_tool.py               | 162 ++++++++++++++++++
 findpapers/tools/search_runner_tool.py        |  87 ++++++++++
 18 files changed, 845 insertions(+)
 create mode 100644 findpapers/searchers/acm_searcher.py
 create mode 100644 findpapers/searchers/arxiv_searcher.py
 create mode 100644 findpapers/searchers/biorxiv_searcher.py
 create mode 100644 findpapers/searchers/cross_ref_searcher.py
 create mode 100644 findpapers/searchers/ieee_searcher.py
 create mode 100644 findpapers/searchers/medrxiv_searcher.py
 create mode 100644 findpapers/searchers/opencitations_searcher.py
 create mode 100644 findpapers/searchers/pubmed_searcher.py
 create mode 100644 findpapers/searchers/rxiv_searcher.py
 create mode 100644 findpapers/searchers/scopus_searcher.py
 create mode 100644 findpapers/tools/__init__.py
 create mode 100644 findpapers/tools/bibtex_generator_tool.py
 create mode 100644 findpapers/tools/cross_references_tool.py
 create mode 100644 findpapers/tools/downloader_tool.py
 create mode 100644 findpapers/tools/rayyan_tool.py
 create mode 100644 findpapers/tools/refiner_tool.py
 create mode 100644 findpapers/tools/refman_tool.py
 create mode 100644 findpapers/tools/search_runner_tool.py

diff --git a/findpapers/searchers/acm_searcher.py b/findpapers/searchers/acm_searcher.py
new file mode 100644
index 0000000..7ea0638
--- /dev/null
+++ b/findpapers/searchers/acm_searcher.py
@@ -0,0 +1,41 @@
+import datetime
+import logging
+from typing import Optional
+from urllib.parse import urlencode
+
+from lxml import html
+
+import findpapers.utils.common_utils as common_util
+import findpapers.utils.query_utils as query_util
+from findpapers.models.paper import Paper
+from findpapers.models.publication import Publication
+from findpapers.models.search import Search
+from findpapers.utils.requests_utils import DefaultSession
+
+DATABASE_LABEL = "ACM"
+BASE_URL = "https://dl.acm.org"
+MAX_ENTRIES_PER_PAGE = 100
+
+
+def _get_search_url(search: Search, start_record: Optional[int] = 0) -> str:
+    pass
+
+
+def _get_result(search: Search, start_record: Optional[int] = 0) -> dict: # pragma: no cover
+    pass
+
+
+def _get_paper_page(url: str) -> html.HtmlElement: # pragma: no cover
+    pass
+
+
+def _get_paper_metadata(doi: str) -> dict: # pragma: no cover
+    pass
+
+
+def _get_paper(paper_page: html.HtmlElement, paper_doi: str, paper_url: str) -> Paper:
+    pass
+
+
+def run(search: Search, pbar=None) -> None:
+    pass
diff --git a/findpapers/searchers/arxiv_searcher.py b/findpapers/searchers/arxiv_searcher.py
new file mode 100644
index 0000000..8fcb322
--- /dev/null
+++ b/findpapers/searchers/arxiv_searcher.py
@@ -0,0 +1,44 @@
+import datetime
+import logging
+import math
+import re
+import time
+from typing import Optional
+
+import requests
+import xmltodict
+from lxml import html
+
+import findpapers.utils.common_utils as common_util
+import findpapers.utils.query_utils as query_util
+from findpapers.models.paper import Paper
+from findpapers.models.publication import Publication
+from findpapers.models.search import Search
+from findpapers.utils.requests_utils import DefaultSession
+
+from findpapers.data.subject_area_by_key import SUBJECT_AREA_BY_KEY
+
+DATABASE_LABEL = "arXiv"
+BASE_URL = "http://export.arxiv.org"
+MAX_ENTRIES_PER_PAGE = 200
+
+
+def _get_search_url(search: Search, start_record: Optional[int] = 0) -> str:
+    pass
+
+
+# pragma: no cover
+def _get_api_result(search: Search, start_record: Optional[int] = 0) -> dict:
+    pass
+
+
+def _get_publication(paper_entry: dict) -> Publication:
+    pass
+
+
+def _get_paper(paper_entry: dict, paper_publication_date: datetime.date, publication: Publication) -> Paper:
+    pass
+
+
+def run(search: Search, pbar=None) -> None:
+    pass
diff --git a/findpapers/searchers/biorxiv_searcher.py b/findpapers/searchers/biorxiv_searcher.py
new file mode 100644
index 0000000..ef5c3d2
--- /dev/null
+++ b/findpapers/searchers/biorxiv_searcher.py
@@ -0,0 +1,8 @@
+import findpapers.searchers.rxiv_searcher as rxiv_searcher
+from findpapers.models.search import Search
+
+DATABASE_LABEL = "bioRxiv"
+
+
+def run(search: Search, pbar=None) -> None:
+    pass
diff --git a/findpapers/searchers/cross_ref_searcher.py b/findpapers/searchers/cross_ref_searcher.py
new file mode 100644
index 0000000..127d67e
--- /dev/null
+++ b/findpapers/searchers/cross_ref_searcher.py
@@ -0,0 +1,56 @@
+import logging
+from datetime import date
+
+import requests
+from requests.adapters import HTTPAdapter
+from urllib3.util.retry import Retry
+
+from findpapers.models.paper import Paper
+from findpapers.models.publication import Publication
+from findpapers.models.search import Search
+
+CROSSREF_API = "https://api.crossref.org/works/"
+DATABASE_LABEL = "CR" # short for Crossref
+SPLIT_AUTHOR = "; "
+
+
+class DateConverter(object):
+    def __init__(self, date_parts: list) -> None:
+        self.date_parts = date_parts
+        date_functions = {3: "_ymd_date", 2: "_ym_date", 1: "_y_date"}
+
+        date_getter = date_functions.get(len(date_parts))
+        converter = getattr(self, date_getter)
+        converter()
+        self.date = date(year=self.year, month=self.month, day=self.day)
+
+    def _ymd_date(self) -> None:
+        self.year = int(self.date_parts[0])
+        self.month = int(self.date_parts[1])
+        self.day = int(self.date_parts[2])
+
+    def _ym_date(self) -> None:
+        self.year = int(self.date_parts[0])
+        self.month = int(self.date_parts[1])
+        self.day = 1
+
+    def _y_date(self) -> None:
+        self.year = int(self.date_parts[0])
+        self.month = 1
+        self.day = 1
+
+
+def _get_paper_entry(doi: str) -> dict:
+    pass
+
+
+def _get_publication(paper_entry: dict) -> Publication:
+    pass
+
+
+def _get_paper(paper_entry: dict, publication: Publication) -> Paper:
+    pass
+
+
+def _add_papers(search: Search, source: str) -> None:
+    pass
diff --git a/findpapers/searchers/ieee_searcher.py b/findpapers/searchers/ieee_searcher.py
new file mode 100644
index 0000000..df92e2e
--- /dev/null
+++ b/findpapers/searchers/ieee_searcher.py
@@ -0,0 +1,39 @@
+import datetime
+import logging
+import math
+import re
+from typing import Optional
+
+import requests
+from lxml import html
+
+import findpapers.utils.common_utils as common_util
+import findpapers.utils.query_utils as query_util
+from findpapers.models.paper import Paper
+from findpapers.models.publication import Publication
+from findpapers.models.search import Search
+from findpapers.utils.requests_utils import DefaultSession
+
+DATABASE_LABEL = "IEEE"
+BASE_URL = "http://ieeexploreapi.ieee.org"
+MAX_ENTRIES_PER_PAGE = 200
+
+
+def _get_search_url(search: Search, api_token: str, start_record: Optional[int] = 1) -> str:
+    pass
+
+
+def _get_api_result(search: Search, api_token: str, start_record: Optional[int] = 1) -> dict: # pragma: no cover
+    pass
+
+
+def _get_publication(paper_entry: dict) -> Publication:
+    pass
+
+
+def _get_paper(paper_entry: dict, publication: Publication) -> Paper:
+    pass
+
+
+def run(search: Search, api_token: str, pbar=None) -> None:
+    pass
diff --git a/findpapers/searchers/medrxiv_searcher.py b/findpapers/searchers/medrxiv_searcher.py
new file mode 100644
index 0000000..2d9a52d
--- /dev/null
+++ b/findpapers/searchers/medrxiv_searcher.py
@@ -0,0 +1,8 @@
+import findpapers.searchers.rxiv_searcher as rxiv_searcher
+from findpapers.models.search import Search
+
+DATABASE_LABEL = "medRxiv"
+
+
+def run(search: Search, pbar=None) -> None:
+    pass
diff --git a/findpapers/searchers/opencitations_searcher.py b/findpapers/searchers/opencitations_searcher.py
new file mode 100644
index 0000000..4208a66
--- /dev/null
+++ b/findpapers/searchers/opencitations_searcher.py
@@ -0,0 +1,33 @@
+import logging
+import requests
+
+from datetime import date
+from findpapers.models.paper import Paper
+from findpapers.models.publication import Publication
+from findpapers.models.search import Search
+
+# from findpapers.tools.references_tool import References
+
+OPENCITATIONS_API = "https://opencitations.net/index/api/v1/metadata/"
+DATABASE_LABEL = "OC" # short for opencitations
+SPLIT_AUTHOR = "; "
+
+
+def _get_paper_entry(doi: str) -> dict:
+    pass
+
+
+def _get_publication(paper_entry: dict) -> Publication:
+    pass
+
+
+def _get_paper(paper_entry: dict, publication: Publication) -> Paper:
+    pass
+
+
+def _add_papers(search: Search, source: str) -> None:
+    pass
+
+
+def run(search: Search, references: bool = True, citations: bool = True) -> None:
+    pass
diff --git a/findpapers/searchers/pubmed_searcher.py b/findpapers/searchers/pubmed_searcher.py
new file mode 100644
index 0000000..f3d91dc
--- /dev/null
+++ b/findpapers/searchers/pubmed_searcher.py
@@ -0,0 +1,44 @@
+import datetime
+import logging
+from typing import Optional
+
+import xmltodict
+
+import findpapers.utils.common_utils as common_util
+import findpapers.utils.query_utils as query_util
+from findpapers.models.paper import Paper
+from findpapers.models.publication import Publication
+from findpapers.models.search import Search
+from findpapers.utils.requests_utils import DefaultSession
+
+DATABASE_LABEL = "PubMed"
+BASE_URL = "https://eutils.ncbi.nlm.nih.gov"
+MAX_ENTRIES_PER_PAGE = 50
+
+
+def _get_search_url(search: Search, start_record: Optional[int] = 0) -> str:
+    pass
+
+
+def _get_api_result(search: Search, start_record: Optional[int] = 0) -> dict:
+    pass
+
+
+def _get_paper_entry(pubmed_id: str) -> dict: # pragma: no cover
+    pass
+
+
+def _get_publication(paper_entry: dict) -> Publication:
+    pass
+
+
+def _get_text_recursively(text_entry) -> str:
+    pass
+
+
+def _get_paper(paper_entry: dict, publication: Publication) -> Paper:
+    pass
+
+
+def run(search: Search, pbar=None) -> None:
+    pass
diff --git a/findpapers/searchers/rxiv_searcher.py b/findpapers/searchers/rxiv_searcher.py
new file mode 100644
index 0000000..bb27a93
--- /dev/null
+++ b/findpapers/searchers/rxiv_searcher.py
@@ -0,0 +1,47 @@
+import datetime
+import logging
+from typing import List
+
+from lxml import html
+
+import findpapers.utils.common_utils as common_util
+import findpapers.utils.query_utils as query_util
+from findpapers.models.paper import Paper
+from findpapers.models.publication import Publication
+from findpapers.models.search import Search
+from findpapers.utils.requests_utils import DefaultSession
+
+BASE_URL = "https://www.medrxiv.org"
+API_BASE_URL = "https://api.biorxiv.org"
+
+
+def _get_search_urls(search: Search, database: str) -> List[str]:
+    pass
+
+
+def _get_result(url: str) -> html.HtmlElement: # pragma: no cover
+    pass
+
+
+def _get_result_page_data(result_page: html.HtmlElement) -> dict:
+    pass
+
+
+def _get_paper_metadata(doi: str, database: str) -> dict: # pragma: no cover
+    pass
+
+
+def _get_data(url: str) -> List[dict]:
+    pass
+
+
+def _get_publication(paper_entry: dict, database: str) -> Publication:
+    pass
+
+
+def _get_paper(paper_metadata: dict, database: str) -> Paper:
+    pass
+
+
+def run(search: Search, database: str, pbar=None) -> None:
+    pass
diff --git a/findpapers/searchers/scopus_searcher.py b/findpapers/searchers/scopus_searcher.py
new file mode 100644
index 0000000..285cccb
--- /dev/null
+++ b/findpapers/searchers/scopus_searcher.py
@@ -0,0 +1,49 @@
+import datetime
+import logging
+import re
+from typing import Optional
+
+import requests
+from lxml import html
+
+import findpapers.utils.common_utils as common_util
+import findpapers.utils.query_utils as query_util
+from findpapers.models.paper import Paper
+from findpapers.models.publication import Publication
+from findpapers.models.search import Search
+from findpapers.utils.requests_utils import DefaultSession
+
+DATABASE_LABEL = "Scopus"
+BASE_URL = "https://api.elsevier.com"
+
+
+def _get_query(search: Search) -> str:
+    pass
+
+
+def _get_publication_entry(publication_issn: str, api_token: str) -> dict: # pragma: no cover
+    pass
+
+
+def _get_publication(paper_entry: dict, api_token: str) -> Publication:
+    pass
+
+
+def _get_paper_page(url: str) -> object: # pragma: no cover
+    pass
+
+
+def _get_paper(paper_entry: dict, publication: Publication) -> Paper:
+    pass
+
+
+def _get_search_results(search: Search, api_token: str, url: Optional[str] = None) -> dict: # pragma: no cover
+    pass
+
+
+def enrich_publication_data(search: Search, api_token: str) -> None:
+    pass
+
+
+def run(search: Search, api_token: str, pbar=None, url: Optional[str] = None, papers_count: Optional[int] = 0) -> None:
+    pass
diff --git a/findpapers/tools/__init__.py b/findpapers/tools/__init__.py
new file mode 100644
index 0000000..4e561fc
--- /dev/null
+++ b/findpapers/tools/__init__.py
@@ -0,0 +1,17 @@
+import logging
+import os
+from typing import Optional
+
+from findpapers.tools.bibtex_generator_tool import generate_bibtex
+from findpapers.tools.downloader_tool import download
+from findpapers.tools.rayyan_tool import RayyanExport
+from findpapers.tools.refiner_tool import refine
+from findpapers.tools.refman_tool import RisExport
+from findpapers.tools.search_runner_tool import search
+
+try:
+    import importlib.metadata as importlib_metadata
+except ModuleNotFoundError:
+    import importlib_metadata
+
+__version__ = importlib_metadata.version(__name__)
diff --git a/findpapers/tools/bibtex_generator_tool.py b/findpapers/tools/bibtex_generator_tool.py
new file mode 100644
index 0000000..5f95392
--- /dev/null
+++ b/findpapers/tools/bibtex_generator_tool.py
@@ -0,0 +1,17 @@
+import datetime
+import logging
+from typing import Optional
+
+import findpapers.utils.common_utils as common_util
+import findpapers.utils.persistence_utils as persistence_util
+
+
+def generate_bibtex(
+    search_path: str,
+    outputpath: str,
+    only_selected_papers: Optional[bool] = False,
+    categories_filter: Optional[dict] = None,
+    add_findpapers_citation: Optional[bool] = False,
+    verbose: Optional[bool] = False,
+) -> None:
+    pass
diff --git a/findpapers/tools/cross_references_tool.py b/findpapers/tools/cross_references_tool.py
new file mode 100644
index 0000000..74bb157
--- /dev/null
+++ b/findpapers/tools/cross_references_tool.py
@@ -0,0 +1,11 @@
+from typing import List, Tuple
+
+import requests
+
+OPENCITATIONS_API = "https://opencitations.net/index/api/v1/metadata/"
+REFERENCES_SPLIT = "; "
+CITATIONS_SPLIT = "; "
+
+
+def get_cross_references(doi: str = "") -> Tuple[List[str], List[str]]:
+    pass
diff --git a/findpapers/tools/downloader_tool.py b/findpapers/tools/downloader_tool.py
new file mode 100644
index 0000000..d8b889e
--- /dev/null
+++ b/findpapers/tools/downloader_tool.py
@@ -0,0 +1,26 @@
+import datetime
+import json
+import logging
+import os
+import re
+import urllib.parse
+from typing import List, Optional
+
+import requests
+from lxml import html
+
+import findpapers.utils.common_utils as common_util
+import findpapers.utils.persistence_utils as persistence_util
+from findpapers.models.search import Search
+from findpapers.utils.requests_utils import DefaultSession
+
+
+def download(
+    search_path: str,
+    output_directory: str,
+    only_selected_papers: Optional[bool] = False,
+    categories_filter: Optional[dict] = None,
+    proxy: Optional[str] = None,
+    verbose: Optional[bool] = False,
+) -> None:
+    pass
diff --git a/findpapers/tools/rayyan_tool.py b/findpapers/tools/rayyan_tool.py
new file mode 100644
index 0000000..345151e
--- /dev/null
+++ b/findpapers/tools/rayyan_tool.py
@@ -0,0 +1,114 @@
+import logging
+from dataclasses import dataclass, fields
+
+import pandas as pd
+
+from findpapers.models.search import Search
+
+
+@dataclass
+class RayyanPaper:
+    key: int
+    title: str
+    authors: list[str]
+    databases: list[str]
+    journal: str
+    issn: str
+    day: int
+    month: int
+    year: int
+    volume: int = None
+    issue: int = None
+    pages: str = None
+    publisher: str = None
+    pmc_id: str = None
+    pubmed_id: str = None
+    url: list[str] = None
+    abstract: str = None
+    notes: str = None
+
+
+class RayyanExport:
+    def __init__(self, search_results: Search) -> None:
+        self.search = search_results
+
+    @property
+    def rayyan(self) -> list:
+        """List of rayyan papers.
+
+        Returns:
+            list: pandas compatible search results
+        """
+        return self.__rayyan
+
+    @property
+    def search(self) -> Search:
+        """Results of literature search.
+
+        Returns:
+            Search: search results
+        """
+        return self.__search
+
+    @search.setter
+    def search(self, search_results) -> None:
+        if len(search_results.papers) > 0:
+            self.__search = search_results
+            self._convert_to_rayyan()
+
+    def _convert_to_rayyan(self) -> None:
+        """Convert findpapers results for rayyan."""
+        papers = self.search.papers
+        try:
+            rayyan = [
+                RayyanPaper(
+                    key=i,
+                    title=p.title,
+                    authors=p.authors,
+                    databases=list(p.databases),
+                    journal=p.publication.title,
+                    issn=p.publication.issn,
+                    day=p.publication_date.day,
+                    month=p.publication_date.month,
+                    year=p.publication_date.year,
+                    pages=p.pages,
+                    publisher=p.publication.publisher,
+                    url=list(p.urls),
+                    abstract=p.abstract,
+                    notes=f"doi: {p.doi}",
+                )
+                for i, p in enumerate(papers, 1)
+            ] # start key from 1
+        except Exception:
+            logging.warning("Results can not be converted to rayyan", exc_info=True)
+        else:
+            self.__rayyan = rayyan
+
+    def generate_rayyan_csv(self, filename: str = None) -> tuple[bytes | None, pd.DataFrame | None]:
+        """Convert and save search results in a rayyan compatible csv.
+
+        Args:
+            filename (str, optional): filename of csv. Defaults to None.
+
+        Returns:
+            csv: a rayyan compatible and encoded csv obj. Defaults to None.
+            papers: pandas dataframe of rayyan objects. Defaults to None.
+ """ + if hasattr(self, "rayyan"): + papers = pd.DataFrame(self.rayyan) + + # convert lists to strings + list_names = [field.name for field in fields(RayyanPaper) if field.type == list[str]] + + csv_content = papers.copy() + for f in list_names: + csv_content[f] = [", ".join(l) for l in papers[f]] + + csv = csv_content.to_csv(index=False).encode("utf-8") + if filename is not None: + csv_content.to_csv(filename, index=False) + else: + papers = None + csv = None + logging.info("Empty results") + return csv, papers diff --git a/findpapers/tools/refiner_tool.py b/findpapers/tools/refiner_tool.py new file mode 100644 index 0000000..646cadc --- /dev/null +++ b/findpapers/tools/refiner_tool.py @@ -0,0 +1,42 @@ +import os +import re +from typing import List, Optional + +import inquirer +from colorama import Back, Fore, Style, init + +import findpapers.utils.common_utils as common_util +import findpapers.utils.persistence_utils as persistence_util +from findpapers.models.paper import Paper +from findpapers.models.search import Search + + +def _print_paper_details( + paper: Paper, + highlights: List[str], + show_abstract: bool, + show_extra_info: bool, +) -> None: # pragma: no cover + pass + + +def _get_select_question_input(): # pragma: no cover + pass + + +def _get_category_question_input(categories: dict) -> None: # pragma: no cover + pass + + +def refine( + search_path: str, + categories: Optional[dict] = None, + highlights: Optional[list] = None, + show_abstract: Optional[bool] = False, + show_extra_info: Optional[bool] = False, + only_selected_papers: Optional[bool] = False, + only_removed_papers: Optional[bool] = False, + read_only: Optional[bool] = False, + verbose: Optional[bool] = False, +) -> None: + pass diff --git a/findpapers/tools/refman_tool.py b/findpapers/tools/refman_tool.py new file mode 100644 index 0000000..1fbbb92 --- /dev/null +++ b/findpapers/tools/refman_tool.py @@ -0,0 +1,162 @@ +import datetime +import logging +from dataclasses import asdict, 
dataclass +from typing import List + +import pandas as pd +import rispy + +from findpapers.models.search import Search + + +def _split_page_information(pages: str = None): + """Split page information into start and end page. + + Args: + pages (str, optional): page string. Defaults to None. + + Returns: + str: start_page, end_page + """ + if pages is None: + return "", "" + if "-" in pages: + return pages.split("-") + elif "," in pages: + return pages.split(",") + elif " " in pages: + return pages.split(" ") + else: + return pages, "" + + +@dataclass +class RisPaper: + id: int + abstract: str + authors: List[str] + custom1: bool + custom2: bool + custom3: List[str] + custom4: int + custom5: List[str] + custom6: List[str] + date: datetime + name_of_database: List[str] + doi: str + start_page: str + end_page: str + alternate_title3: str + journal_name: str + keywords: List[str] + label: bool + notes: str + publisher: str + year: int + reviewed_item: bool + issn: str + title: str + type_of_reference: str + url: List[str] + publication_year: int + access_date: datetime + + +class RisExport: + def __init__(self, search_results: Search) -> None: + self.search = search_results + + @property + def ris(self) -> list: + """List of papers. + + Returns: + list: pandas compatible search results + """ + return self.__ris + + @property + def search(self) -> Search: + """Results of literature search. 
+
+        Returns:
+            Search: search results
+        """
+        return self.__search
+
+    @search.setter
+    def search(self, search_results) -> None:
+        if len(search_results.papers) > 0:
+            self.__search = search_results
+            self._convert_to_ris()
+
+    def _convert_to_ris(self) -> None:
+        papers = self.search.papers
+
+        entry_type = {"Journal": "JOUR", "Book": "BOOK", "Conference Proceedings": "CONF", "Preprint": "UNPB"}
+
+        try:
+            ris = [
+                RisPaper(
+                    id=i,
+                    abstract=p.abstract,
+                    authors=p.authors,
+                    custom1=p.selected,
+                    custom2=p.reviewed,
+                    custom3=(list(p.criteria) if p.criteria is not None else None),
+                    custom4=p.citations,
+                    custom5=list(p.publication.subject_areas),
+                    custom6=["selected", "reviewed", "criteria", "citations", "subject_areas", "custom_explanation"],
+                    date=p.publication_date,
+                    name_of_database=list(p.databases),
+                    doi=p.doi,
+                    start_page=_split_page_information(p.pages)[0],
+                    end_page=_split_page_information(p.pages)[1],
+                    alternate_title3=p.publication.title,
+                    journal_name=p.publication.title,
+                    keywords=list(p.keywords),
+                    label=p.selected,
+                    notes=p.comments,
+                    publisher=p.publication.publisher,
+                    year=p.publication_date.year,
+                    reviewed_item=(True if p.selected is not None else False),
+                    issn=p.publication.issn,
+                    title=p.title,
+                    type_of_reference=entry_type.get(p.publication.category, "JOUR"),
+                    url=list(p.urls),
+                    publication_year=p.publication_date.year,
+                    access_date=self.search.processed_at.date(),
+                )
+                for i, p in enumerate(papers, 1)
+            ] # start key from 1
+        except Exception:
+            logging.warning("Results can not be converted to RIS", exc_info=True)
+        else:
+            self.__ris = ris
+
+    def generate_ris(self, filename: str = None) -> tuple[str | None, pd.DataFrame | None]:
+        """Convert and save search results as ris.
+
+        Args:
+            filename (str, optional): filename of csv. Defaults to None.
+
+        Returns:
+            ris: a RIS compatible and encoded txtio obj. Defaults to None.
+            ris_df: pandas dataframe of ris objects. Defaults to None.
+ """ + if hasattr(self, "ris"): + ris_df = pd.DataFrame(self.ris) + + # convert to ris + raw_entries = [asdict(p) for p in self.ris] # convert to dict + entries = [{k: v for k, v in p.items() if v is not None} for p in raw_entries] + ris = rispy.dumps(entries, skip_unknown_tags=True, enforce_list_tags=False) # convert to ris + + if filename is not None: + with open(filename, "w") as file: + file.writelines(ris) + else: + ris_df = None + ris = None + logging.info("Empty results") + return ris, ris_df diff --git a/findpapers/tools/search_runner_tool.py b/findpapers/tools/search_runner_tool.py new file mode 100644 index 0000000..820174e --- /dev/null +++ b/findpapers/tools/search_runner_tool.py @@ -0,0 +1,87 @@ +import copy +import datetime +import logging +import os +import re +from typing import List, Optional +from urllib.parse import urlparse + +import requests +from lxml import html + +import findpapers.searchers.acm_searcher as acm_searcher +import findpapers.searchers.arxiv_searcher as arxiv_searcher +import findpapers.searchers.biorxiv_searcher as biorxiv_searcher +import findpapers.searchers.cross_ref_searcher as cross_ref_searcher +import findpapers.searchers.ieee_searcher as ieee_searcher +import findpapers.searchers.medrxiv_searcher as medrxiv_searcher +import findpapers.searchers.opencitations_searcher as opencitations_searcher +import findpapers.searchers.pubmed_searcher as pubmed_searcher +import findpapers.searchers.scopus_searcher as scopus_searcher +import findpapers.tools.cross_references_tool as cr +import findpapers.utils.common_utils as common_util +import findpapers.utils.persistence_utils as persistence_util +import findpapers.utils.publication_utils as publication_util +from findpapers.models.paper import Paper +from findpapers.models.publication import Publication +from findpapers.models.search import Search +from findpapers.utils.requests_utils import DefaultSession + + +def _get_paper_metadata_by_url(url: str): + pass + + +def 
_force_single_metadata_value_by_key(metadata_entry: dict, metadata_key: str) -> None: + pass + + +def _enrich(search: Search, scopus_api_token: Optional[str] = None) -> None: + pass + + +def _filter(search: Search) -> None: + pass + + +def _flag_potentially_predatory_publications(search: Search) -> None: + pass + + +def _database_safe_run(function: callable, search: Search, database_label: str) -> None: + pass + + +def _sanitize_query(query: str) -> str: + pass + + +def _is_query_ok(query: str) -> bool: + pass + + +def _add_refs_cites(search: Search) -> None: + pass + + +def search( + outputpath: str, + query: Optional[str] = None, + since: Optional[datetime.date] = None, + until: Optional[datetime.date] = None, + limit: Optional[int] = None, + limit_per_database: Optional[int] = None, + databases: Optional[List[str]] = None, + publication_types: Optional[List[str]] = None, + scopus_api_token: Optional[str] = None, + ieee_api_token: Optional[str] = None, + proxy: Optional[str] = None, + similarity_threshold: Optional[float] = 0.95, + rxiv_query: Optional[str] = None, + cross_reference_search: Optional[bool] = False, + enrich: Optional[bool] = False, + only_title_abstract: Optional[bool] = False, + verbose: Optional[bool] = False, + pbar=None, +) -> dict: + pass