diff --git a/findpapers/__init__.py b/findpapers/__init__.py
index 4e561fc..a9b35f4 100644
--- a/findpapers/__init__.py
+++ b/findpapers/__init__.py
@@ -9,9 +9,9 @@
 from findpapers.tools.refman_tool import RisExport
 from findpapers.tools.search_runner_tool import search
 
-try:
-    import importlib.metadata as importlib_metadata
-except ModuleNotFoundError:
-    import importlib_metadata
+# try:
+#     import importlib.metadata as importlib_metadata
+# except ModuleNotFoundError:
+#     import importlib_metadata
 
-__version__ = importlib_metadata.version(__name__)
+# __version__ = importlib_metadata.version(__name__)
diff --git a/findpapers/data/available_databases.py b/findpapers/data/available_databases.py
index 0967af9..675e19b 100644
--- a/findpapers/data/available_databases.py
+++ b/findpapers/data/available_databases.py
@@ -1,23 +1,35 @@
-from findpapers.searchers import (
-    acm_searcher,
-    arxiv_searcher,
-    biorxiv_searcher,
-    cross_ref_searcher,
-    ieee_searcher,
-    medrxiv_searcher,
-    opencitations_searcher,
-    pubmed_searcher,
-    scopus_searcher,
-)
+# from findpapers.searchers import (
+#     acm_searcher,
+#     arxiv_searcher,
+#     biorxiv_searcher,
+#     cross_ref_searcher,
+#     ieee_searcher,
+#     medrxiv_searcher,
+#     opencitations_searcher,
+#     pubmed_searcher,
+#     scopus_searcher,
+# )
+
+# AVAILABLE_DATABASES: list[str] = [
+#     scopus_searcher.DATABASE_LABEL,
+#     ieee_searcher.DATABASE_LABEL,
+#     pubmed_searcher.DATABASE_LABEL,
+#     arxiv_searcher.DATABASE_LABEL,
+#     acm_searcher.DATABASE_LABEL,
+#     medrxiv_searcher.DATABASE_LABEL,
+#     biorxiv_searcher.DATABASE_LABEL,
+#     opencitations_searcher.DATABASE_LABEL,
+#     cross_ref_searcher.DATABASE_LABEL,
+# ]
 
 AVAILABLE_DATABASES: list[str] = [
-    scopus_searcher.DATABASE_LABEL,
-    ieee_searcher.DATABASE_LABEL,
-    pubmed_searcher.DATABASE_LABEL,
-    arxiv_searcher.DATABASE_LABEL,
-    acm_searcher.DATABASE_LABEL,
-    medrxiv_searcher.DATABASE_LABEL,
-    biorxiv_searcher.DATABASE_LABEL,
-    opencitations_searcher.DATABASE_LABEL,
-    cross_ref_searcher.DATABASE_LABEL,
+    "Scopus",
+    "IEEE",
+    "PubMed",
+    "arXiv",
+    "ACM",
+    "medRxiv",
+    "bioRxiv",
+    "OC",
+    "CR",
 ]
diff --git a/findpapers/searchers/acm_searcher.py b/findpapers/searchers/acm_searcher.py
index 7ea0638..f6634b2 100644
--- a/findpapers/searchers/acm_searcher.py
+++ b/findpapers/searchers/acm_searcher.py
@@ -1,16 +1,22 @@
 import datetime
 import logging
+import os
+from random import choice
 from typing import Optional
 from urllib.parse import urlencode
 
+import requests
 from lxml import html
+from stqdm import stqdm
 
-import findpapers.utils.common_utils as common_util
-import findpapers.utils.query_utils as query_util
+# import findpapers.utils.common_utils as common_util
+from findpapers.data.user_agents import USER_AGENTS
 from findpapers.models.paper import Paper
 from findpapers.models.publication import Publication
 from findpapers.models.search import Search
-from findpapers.utils.requests_utils import DefaultSession
+from findpapers.utils.query_utils import replace_search_term_enclosures
+
+# from findpapers.utils.requests_utils import DefaultSession
 
 DATABASE_LABEL = "ACM"
 BASE_URL = "https://dl.acm.org"
@@ -18,24 +24,268 @@
 def _get_search_url(search: Search, start_record: Optional[int] = 0) -> str:
-    pass
+    """Return the URL used to query the ACM database. See https://dl.acm.org/search/advanced for query tips.
+
+    Args:
+        search (Search): A search instance.
+        start_record (Optional[int], optional): Index of the results page to fetch (passed as the startPage parameter). Defaults to 0.
+
+    Returns:
+        str: URL to be used to retrieve data from the ACM database.
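+
+    Example (illustrative, assuming the [term] enclosure syntax of findpapers queries):
+        the query [deep learning] AND [survey] is expanded to
+        Abstract:("deep learning" AND "survey") OR Keyword:("deep learning" AND "survey")
+        OR Title:("deep learning" AND "survey")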
+ """ + # when a wildcard is present, the search term cannot be enclosed in quotes + transformed_query = replace_search_term_enclosures( + query=search.query, + open_replacement="", + close_replacement="", + only_on_wildcards=True, + ) + + # some additional query transformations + transformed_query = transformed_query.replace(" AND NOT ", " NOT ") + transformed_query = replace_search_term_enclosures( + query=transformed_query, + open_replacement='"', + close_replacement='"', + ) + + query = f"Abstract:({transformed_query}) OR Keyword:({transformed_query}) OR Title:({transformed_query})" + url_parameters = { + "fillQuickSearch": "false", + "expand": "all", + "AllField": query, + "pageSize": MAX_ENTRIES_PER_PAGE, + "startPage": start_record, + "sortBy": "Ppub", + } + + if search.since: + url_parameters.update({"AfterMonth": search.since.month, "AfterYear": search.since.year}) + + if search.until: + url_parameters.update({"BeforeMonth": search.until.month, "BeforeYear": search.until.year}) + + return f"{BASE_URL}/action/doSearch?{urlencode(url_parameters)}" + + +def _requests_get(url: str) -> html.HtmlElement: + """Make a GET request to the specified ACM database URL and return the parsed HTML content. + + Args: + url (str): ACM database URL. + + Returns: + html.HtmlElement: Result from ACM database. + """ + # TODO: Is this proxy block required? + proxy = os.getenv("FINDPAPERS_PROXY") + if proxy: + proxies = {"http": proxy, "https": proxy} + + response = requests.get(url, headers={"User-Agent": choice(USER_AGENTS)}, proxies=proxies) + return html.fromstring(response.content) + +# TODO: does this return a dict or html.HtmlElement? +def _get_result(search: Search, start_record: Optional[int] = 0) -> html.HtmlElement: + """Return results from ACM database using the provided search parameters. -def _get_result(search: Search, start_record: Optional[int] = 0) -> dict: # pragma: no cover - pass + Args: + search (Search): A search instance. + start_record (Optional[int], optional): Sequence number of first record to fetch. Defaults to 0. + Returns: + html.HtmlElement: Result from ACM database. + """ + url: str = _get_search_url(search, start_record) + # TODO: Can the try_success be replaced with requests? + # response = common_util.try_success(lambda: DefaultSession().get(url), 2) + return _requests_get(url=url) -def _get_paper_page(url: str) -> html.HtmlElement: # pragma: no cover - pass +def _get_paper_page(url: str) -> html.HtmlElement: + """Get a paper page element from a provided URL. -def _get_paper_metadata(doi: str) -> dict: # pragma: no cover - pass + Args: + url (str): Paper URL. + + Returns: + html.HtmlElement: A HTML element representing the paper given by the provided URL. + """ + return _requests_get(url=url) + + +def _get_paper_metadata(doi: str) -> dict: + """Get a paper metadata from a provided DOI. + + Args: + doi (str): Paper DOI. + + Returns: + dict: The ACM paper metadata, or None if there's no metadata available. + """ + form = {"dois": doi, "targetFile": "custom-bibtex", "format": "bibTex"} + + # # TODO: Can the try_success be replaced with requests like below? 
+    # response = common_util.try_success(
+    #     lambda: DefaultSession().post(f"{BASE_URL}/action/exportCiteProcCitation", data=form).json(), 2
+    # )
+
+    # the CiteProc export appears to return CSL-JSON shaped like {"items": [{"<doi>": {...}}]}
+    response = requests.post(f"{BASE_URL}/action/exportCiteProcCitation", data=form).json()
+    return response.get("items", [{}])[0].get(doi, {})
 
 
 def _get_paper(paper_page: html.HtmlElement, paper_doi: str, paper_url: str) -> Paper:
-    pass
+    """Build a Paper instance from a fetched ACM paper page and its metadata.
+
+    Args:
+        paper_page (html.HtmlElement): A paper page retrieved from ACM.
+        paper_doi (str): Paper DOI.
+        paper_url (str): ACM paper URL.
+
+    Returns:
+        Paper: A Paper instance, or None when no metadata or publication date is available.
+    """
+    paper_metadata = _get_paper_metadata(paper_doi)
+
+    if not paper_metadata:
+        return None
+
+    simple_abstract = paper_page.xpath('//*[contains(@class, "abstractSection")]/p')
+    full_abstract = paper_page.xpath(
+        '//*[contains(@class, "abstractSection abstractInFull")]/'
+        '*[contains(@class, "abstractSection abstractInFull")]/section/p',
+    )
+
+    paper_abstract = simple_abstract[0].text if simple_abstract else full_abstract[0].text if full_abstract else None
+
+    cite_paper = paper_page.xpath('//*[contains(@class, "article-metric citation")]//span')
+    cite_miscs = paper_page.xpath('//*[@class="bibliometrics__count"]/span')
+
+    paper_citations = int(cite_paper[0].text) if cite_paper else int(cite_miscs[0].text) if cite_miscs else None
+
+    paper_title = paper_metadata.get("title")
+
+    if paper_title:
+        publication = Publication(
+            title=paper_metadata.get("container-title"),
+            isbn=paper_metadata.get("ISBN"),
+            issn=paper_metadata.get("ISSN"),
+            publisher=paper_metadata.get("publisher"),
+            category=paper_metadata.get("type"),
+        )
+    else:
+        publication = None
+
+    paper_authors = [f"{x.get('family')}, {x.get('given')}" for x in paper_metadata.get("author", [])]
+
+    if issued := paper_metadata.get("issued"):
+        # CSL-JSON date-parts may hold year, year-month, or year-month-day
+        date_parts = issued["date-parts"][0]
+        if len(date_parts) == 1:  # only year
+            paper_publication_date = datetime.date(date_parts[0], 1, 1)
+        elif len(date_parts) == 2:  # year and month
+            paper_publication_date = datetime.date(date_parts[0], date_parts[1], 1)
+        else:
+            paper_publication_date = datetime.date(date_parts[0], date_parts[1], date_parts[2])
+    else:
+        paper_publication_date = None
+
+    if not paper_publication_date:
+        return None
+
+    paper_keywords = (
+        {x.strip() for x in paper_metadata["keyword"].split(",")} if paper_metadata.get("keyword") else set()
+    )
+
+    paper_pages = paper_metadata.get("page")
+    if paper_pages:
+        paper_pages = paper_pages.replace("\u2013", "-")
+
+    number_of_pages = paper_metadata.get("number-of-pages")
+    paper_number_of_pages = int(number_of_pages) if number_of_pages else None
+
+    if not paper_doi:
+        paper_doi = paper_metadata.get("DOI")
+
+    return Paper(
+        title=paper_title,
+        abstract=paper_abstract,
+        authors=paper_authors,
+        publication=publication,
+        publication_date=paper_publication_date,
+        urls={paper_url},
+        doi=paper_doi,
+        citations=paper_citations,
+        keywords=paper_keywords,
+        comments=None,
+        number_of_pages=paper_number_of_pages,
+        pages=paper_pages,
+    )
+
+
+def run(search: Search, pbar: Optional[stqdm] = None) -> None:
+    """Fetch papers from the ACM database using the provided search parameters.
+
+    After fetching the data from ACM, the collected papers are added to the provided search instance.
+
+    Args:
+        search (Search): A search instance.
+        pbar (stqdm, optional): stqdm instance for progress bar. Defaults to None.
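+
+    Note: results are fetched page by page (via the startPage parameter built in
+    _get_search_url) until the reported total or this database's search limit is reached.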
+ """ + result = _get_result(search=search) + + try: + total_papers = int(result.xpath('//*[@class="hitsLength"]')[0].text.strip()) + except Exception: # pragma: no cover + total_papers = 0 + + logging.info(f"ACM: {total_papers} papers to fetch") + + papers_count = 0 + page_index = 0 + while papers_count < total_papers and not search.reached_its_limit(database=DATABASE_LABEL): + pub_urls = [BASE_URL + x.attrib["href"] for x in result.xpath('//*[@class="hlFld-Title"]/a')] + misc_urls = [BASE_URL + x.attrib["href"] for x in result.xpath('//*[@class="hlFld-ContentGroupTitle"]/a')] + papers_urls = pub_urls + misc_urls + + if not papers_urls: + break + + for paper_url in papers_urls: + if papers_count >= total_papers or search.reached_its_limit(database=DATABASE_LABEL): + break + + try: + papers_count += 1 + paper_page = _get_paper_page(url=paper_url) + + if paper_url in pub_urls: + paper_title = paper_page.xpath('//*[@class="citation__title"]')[0].text + else: + paper_title = paper_page.xpath('//*[@class="article__tocHeading"]')[3].text + + logging.info(f"({papers_count}/{total_papers}) Fetching ACM paper: {paper_title}") + + paper_doi = None + if "/abs/" in paper_url: + paper_doi = paper_url.split("/abs/")[1] + elif "/book/" in paper_url: + paper_doi = paper_url.split("/book/")[1] + else: + paper_doi = paper_url.split("/doi/")[1] + + paper = _get_paper(paper_page=paper_page, paper_doi=paper_doi, paper_url=paper_url) + + if not paper: + continue + + paper.add_database(database_name=DATABASE_LABEL) + search.add_paper(paper=paper) + + except Exception as e: # pragma: no cover + logging.debug(e, exc_info=True) + try: + if pbar: + pbar.update(1) + except Exception as e: # pragma: no cover + logging.debug(e, exc_info=True) -def run(search: Search, pbar=None) -> None: - pass + if papers_count < total_papers and not search.reached_its_limit(database=DATABASE_LABEL): + page_index += 1 + result = _get_result(search=search, start_record=page_index) diff --git a/findpapers/searchers/arxiv_searcher.py b/findpapers/searchers/arxiv_searcher.py index 8fcb322..9780352 100644 --- a/findpapers/searchers/arxiv_searcher.py +++ b/findpapers/searchers/arxiv_searcher.py @@ -1,44 +1,185 @@ -import datetime import logging import math import re import time -from typing import Optional +from datetime import datetime +from typing import Generator, Optional -import requests -import xmltodict -from lxml import html +import arxiv +from stqdm import stqdm -import findpapers.utils.common_utils as common_util -import findpapers.utils.query_utils as query_util +from findpapers.data.subject_area_by_key import SUBJECT_AREA_BY_KEY from findpapers.models.paper import Paper from findpapers.models.publication import Publication from findpapers.models.search import Search -from findpapers.utils.requests_utils import DefaultSession - -from findpapers.data.subject_area_by_key import SUBJECT_AREA_BY_KEY DATABASE_LABEL = "arXiv" -BASE_URL = "http://export.arxiv.org" +# BASE_URL = "http://export.arxiv.org" MAX_ENTRIES_PER_PAGE = 200 -def _get_search_url(search: Search, start_record: Optional[int] = 0) -> str: - pass +def _arxiv_search(search: Search, start_record: Optional[int] = 0) -> Generator[arxiv.Result, None, None]: + """Search the arXiv database using the provided search parameters. + Args: + search (Search): A search instance. + start_record (Optional[int], optional): Index at which record should be searched from. Defaults to 0. 
-# pragma: no cover
-def _get_api_result(search: Search, start_record: Optional[int] = 0) -> dict:
-    pass
+
+    Returns:
+        Generator[arxiv.Result, None, None]: Search results.
+    """
+    # TODO: do we need to transform the query like it has been done in the findpapers repo?
+    arxiv_search = arxiv.Search(
+        query=search.query,
+        max_results=MAX_ENTRIES_PER_PAGE,
+        sort_by=arxiv.SortCriterion.SubmittedDate,
+        sort_order=arxiv.SortOrder.Descending,
+    )
+    return arxiv_search.results(offset=start_record)
 
 
-def _get_publication(paper_entry: dict) -> Publication:
-    pass
+def _get_publication(paper_entry: arxiv.Result) -> Publication:
+    """Build a publication instance from an arXiv search result.
+
+    Args:
+        paper_entry (arxiv.Result): A paper entry retrieved from arXiv.
+
+    Returns:
+        Publication: A Publication instance.
+    """
+    # journal_ref is only set when a published version of the preprint exists,
+    # so prefer it over the generic database label
+    publication_title = paper_entry.journal_ref or DATABASE_LABEL
+
+    subject_areas = set()
+    for category in paper_entry.categories:
+        subject_area = SUBJECT_AREA_BY_KEY.get(category)
+        if subject_area:
+            subject_areas.add(subject_area)
+
+    return Publication(title=publication_title, category="Preprint", subject_areas=subject_areas)
 
 
-def _get_paper(paper_entry: dict, paper_publication_date: datetime.date, publication: Publication) -> Paper:
-    pass
+def _get_paper(paper_entry: arxiv.Result, paper_publication_date: date, publication: Publication) -> Paper:
+    """Build a paper instance from an arXiv search result.
+
+    Args:
+        paper_entry (arxiv.Result): Paper entry retrieved from arXiv.
+        paper_publication_date (date): Paper publication date.
+        publication (Publication): A publication instance that will be associated with the paper.
+
+    Returns:
+        Paper: A Paper instance.
+    """
+    paper_title = paper_entry.title
+
+    if not paper_title:
+        return None
+
+    paper_title = paper_title.replace("\n", "")
+    paper_title = re.sub(pattern=" +", repl=" ", string=paper_title)
+
+    # attribute access below follows the arxiv package's Result API
+    paper_doi = paper_entry.doi
+    paper_abstract = paper_entry.summary
+    paper_urls = {link.href for link in paper_entry.links}
+    paper_authors = [author.name for author in paper_entry.authors]
+    paper_comments = paper_entry.comment
+
+    return Paper(
+        title=paper_title,
+        abstract=paper_abstract,
+        authors=paper_authors,
+        publication=publication,
+        publication_date=paper_publication_date,
+        urls=paper_urls,
+        doi=paper_doi,
+        comments=paper_comments,
+    )
+
+
+def run(search: Search, pbar: Optional[stqdm] = None) -> None:
+    """Fetch papers from the arXiv database using the provided search parameters.
+
+    After fetching the data from arXiv, the collected papers are added to the provided search instance.
+
+    Args:
+        search (Search): A search instance.
+        pbar (stqdm, optional): stqdm instance for progress bar. Defaults to None.
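+
+    Note: the arXiv API call used here has no date filter, so the "since" and "until"
+    constraints are applied client-side while iterating over the fetched entries.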
+ """ + papers_count = 0 + entries = _arxiv_search(search) + + total_papers = len(list(entries)) + logging.info(f"arXiv: {total_papers} papers to fetch") + + while papers_count < total_papers and not search.reached_its_limit(DATABASE_LABEL): + for paper_entry in entries: + if papers_count >= total_papers or search.reached_its_limit(DATABASE_LABEL): + break + + papers_count += 1 + try: + paper_title = paper_entry.title + logging.info(f"({papers_count}/{total_papers}) Fetching arXiv paper: {paper_title}") + + published_date = datetime.strptime(str(paper_entry.published), "%Y-%m-%d %H:%M:%S%z").date() + + # nowadays we don't have a date filter on arXiv API, so we need to do it by ourselves' + if search.since and published_date < search.since: + logging.info('Skipping paper due to "since" date constraint') + continue + elif search.until and published_date > search.until: + logging.info('Skipping paper due to "until" date constraint') + continue + + publication = _get_publication(paper_entry=paper_entry) + paper = _get_paper( + paper_entry=paper_entry, + paper_publication_date=published_date, + publication=publication, + ) + + if paper: + paper.add_database(DATABASE_LABEL) + search.add_paper(paper) + + except Exception as e: # pragma: no cover + logging.debug(e, exc_info=True) + try: + if pbar: + pbar.update(1) + except Exception as e: # pragma: no cover + logging.debug(e, exc_info=True) -def run(search: Search, pbar=None) -> None: - pass + if papers_count < total_papers and not search.reached_its_limit(DATABASE_LABEL): + time.sleep(1) # sleep for 1 second to avoid server blocking + entries = _arxiv_search(search=search, start_record=papers_count) diff --git a/pyproject.toml b/pyproject.toml index 7233632..46eb855 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,6 +18,7 @@ lxml = "^4.9.2" inquirer = "^3.1.3" xmltodict = "^0.13.0" importlib-metadata = "^6.6.0" +arxiv = "^1.4.7" [tool.poetry.group.dev.dependencies] pytest = "^7.2.2"