sulpub code + test
Port over sul_pub code from rialto-data, and add some tests.

Closes #3
edsu committed Jun 14, 2024
1 parent 55a2366 commit 81dbacc
Showing 8 changed files with 301 additions and 1 deletion.
33 changes: 33 additions & 0 deletions .github/workflows/test.yml
@@ -0,0 +1,33 @@
name: Test
on:
  push
jobs:
  build:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ["3.12"]
    steps:

      - name: Checkout
        uses: actions/checkout@v3

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}

      - name: Lint
        uses: chartboost/ruff-action@v1
        # it may move, see https://github.com/astral-sh/ruff/issues/8400

      - name: Setup uv
        uses: yezz123/setup-uv@v4

      - name: Install dependencies
        run: |
          uv pip install -r requirements.txt
          uv pip install -r requirements-dev.txt

      - name: Run tests
        run: pytest
20 changes: 20 additions & 0 deletions README.md
@@ -56,3 +56,23 @@ uv pip compile pyproject.toml -o requirements.txt
```

Unlike poetry, uv's dependency resolution is not platform-agnostic. If we find we need to generate a requirements.txt for linux, we can use [uv's multi-platform resolution options](https://github.com/astral-sh/uv?tab=readme-ov-file#multi-platform-resolution).
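
If we do find we need a Linux-specific requirements.txt, the command would look something like this (using uv's `--python-platform` flag; check the uv docs for the current flag name and supported values):

```
uv pip compile pyproject.toml -o requirements.txt --python-platform linux
```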

## Run Tests

First activate the virtual environment:

```
source .env/bin/activate
```

Then ensure the app and dev dependencies are installed:

```
uv pip install -r requirements.txt -r requirements-dev.txt
```

Then run the tests:

```
pytest
```
8 changes: 7 additions & 1 deletion pyproject.toml
@@ -7,8 +7,14 @@ requires-python = ">= 3.12"
dependencies = [
    "pandas",
    "requests",
    "python-dotenv"
]

[tool.pytest.ini_options]
pythonpath = [
    "."
]

[build-system]
requires = ["setuptools"]
build-backend = "setuptools.build_meta"
1 change: 1 addition & 0 deletions requirements-dev.txt
@@ -0,0 +1 @@
pytest
Empty file added rialto_airflow/__init__.py
73 changes: 73 additions & 0 deletions rialto_airflow/utils/helper.py
@@ -0,0 +1,73 @@
import csv
import glob
import os
from datetime import datetime

import pandas as pd


def write_to_csv(data, field_names, file, **kwargs):
    normalize_data = kwargs.get("normalize_data", None)
    directory = os.path.dirname(file)
    if directory and not os.path.exists(directory):
        os.makedirs(directory)

    # only write the header when creating the file, so that repeated calls
    # append rows rather than truncating what has already been written
    if not os.path.exists(file):
        with open(file, "w") as f:
            writer = csv.DictWriter(f, fieldnames=field_names)
            writer.writeheader()

    with open(file, "a") as f:
        writer = csv.DictWriter(f, fieldnames=field_names)
        if normalize_data:
            writer.writerow(normalize_data(data))
        else:
            writer.writerow(data)


def append_unique(dois: list, orcid: str, doi_orcids: dict):
    for doi in dois:
        if doi_orcids.get(doi):
            doi_orcids[doi].append(orcid)
        else:
            doi_orcids[doi] = [orcid]


def get_nested_values(authorship: list, key) -> list:
    return [i.get(key) for i in authorship]


def rialto_authors():
    authors_files = glob.glob("data/rialto_app/*.csv")
    authors_files.sort()

    file_date = datetime.strptime(
        authors_files[-1].split("_")[-1].replace(".csv", ""), "%Y-%m-%d"
    )
    time_since_refreshing = datetime.now() - file_date
    assert time_since_refreshing.days < 90, (
        "The authors file from the "
        "RIALTO application is more than 90 days old. Please download "
        "a new file from https://sul-rialto-dev.stanford.edu/authors?"
        "q=&orcid_filter=&commit=Search, change the file name to the "
        "authors_YYYY-MM-DD.csv format and move the file to "
        "data/rialto_app."
    )

    return authors_files[-1]


def read_sul_pub_pubs():
    pubs_files = glob.glob("data/sul_pub/*.csv")
    pubs_files.sort()

    return pd.read_csv(pubs_files[-1])


def read_dimensions_pubs():
    pubs_files = glob.glob("data/dimensions/publications-*.csv")

    return pd.concat([pd.read_csv(f) for f in pubs_files])


def read_openalex_pubs():
    pubs_files = glob.glob("data/openalex/test-pubs.csv")

    return pd.concat([pd.read_csv(f) for f in pubs_files])
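
As a point of reference, here is a minimal sketch of how `write_to_csv` with a `normalize_data` callback might be used (the row, field names, output path, and normalizer are all made up for illustration):

```python
from rialto_airflow.utils.helper import write_to_csv

# hypothetical row and normalizer, for illustration only
row = {"doi": "10.1234/abcd", "title": "a paper"}


def upcase_title(data):
    # normalize_data receives the raw row dict and returns the row to write
    return {**data, "title": data["title"].upper()}


write_to_csv(row, ["doi", "title"], "data/example/pubs.csv", normalize_data=upcase_title)
```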
162 changes: 162 additions & 0 deletions rialto_airflow/utils/sulpub.py
@@ -0,0 +1,162 @@
import csv
import glob
import logging
import os
from datetime import datetime, timedelta
from pathlib import Path

import dotenv
import pandas as pd
import requests
from tqdm import tqdm  # supplies the progress bar updated below; would need adding to requirements

from .helper import write_to_csv

dotenv.load_dotenv()
logging.basicConfig(level=logging.DEBUG)


def latest_harvest_date():
    df = pd.DataFrame()
    for file in glob.glob("data/sul_pub/*.csv"):
        df = pd.concat([df, pd.read_csv(file)])

    df["converted_date"] = pd.to_datetime(
        df["sul_pub_last_updated"], errors="coerce", utc=True
    )

    return df["converted_date"].max()


def sul_pub_pubs(all_pubs_setting=False, file_path=None):
    # determine the default path at call time rather than import time, and
    # zero-pad the date so the CSV files sort chronologically
    if file_path is None:
        file_path = f"data/sul_pub/sul_pub_{datetime.now().strftime('%Y-%m-%d')}.csv"

    if not os.path.exists(file_path):
        Path(file_path).parent.mkdir(exist_ok=True, parents=True)

    sul_pub_fields = [
        "sul_pub_authorship",
        "sul_pub_title",
        "sul_pub_abstract",
        "sul_pub_author",
        "sul_pub_year",
        "sul_pub_type",
        "sul_pub_mesh_headings",
        "sul_pub_publisher",
        "sul_pub_journal",
        "sul_pub_provenance",
        "sul_pub_doi",
        "sul_pub_issn",
        "sul_pub_sulpubid",
        "sul_pub_sw_id",
        "sul_pub_pmid",
        "sul_pub_identifier",
        "sul_pub_last_updated",
        "sul_pub_pages",
        "sul_pub_date",
        "sul_pub_country",
        "sul_pub_booktitle",
        "sul_pub_edition",
        "sul_pub_series",
        "sul_pub_chapter",
        "sul_pub_editor",
    ]

    with open(file_path, "w", newline="") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=sul_pub_fields)
        writer.writeheader()

    page = 1

    # Hardcoding the number of pages since there is no way of retrieving that
    # value from the API. The number will increase as more publications are
    # added, which will impact the progress bar.
    if all_pubs_setting is True:
        sul_pub_progress = tqdm(total=646 - page)
    else:
        sul_pub_progress = tqdm(total=5 - page)

    while True:
        # fetch each page once and reuse it, rather than requesting it twice
        results = sul_pub_page(page, all_pubs=all_pubs_setting)
        if int(results.get("metadata").get("records")) <= 0:
            break
        for pub in results.get("records"):
            write_to_csv(
                pub, sul_pub_fields, file_path, normalize_data=normalize_sul_pub
            )
        page += 1
        sul_pub_progress.update(1)

    # Drop duplicates (index=False keeps pandas from adding an index column)
    df = pd.read_csv(file_path).drop_duplicates()
    df.to_csv(file_path, index=False)


def sul_pub_page(page: int, all_pubs=False) -> dict:
    hostname = os.environ.get("SUL_PUB_HOST")
    headers = {"CAPKEY": os.environ.get("SUL_PUB_KEY")}

    if all_pubs is True:
        resp = requests.get(
            f"https://{hostname}/publications.json?page={page}&per=1000",
            headers=headers,
        )
    else:
        resp = requests.get(
            f"https://{hostname}/publications.json?page={page}&per=1000&changedSince={latest_harvest_date() - timedelta(days=1)}",
            headers=headers,
        )

    return resp.json()


def normalize_sul_pub(result: dict) -> dict:
    translated_result = {}
    for i in result:
        translated_result[f"sul_pub_{i}"] = result[i]

    unwanted_fields = [
        "sul_pub_apa_citation",
        "sul_pub_mla_citation",
        "sul_pub_chicago_citation",
        "sul_pub_keywords_sw",
        "sul_pub_publicationcategoryrankinglist_sw",
        "sul_pub_documentcategory_sw",
        "sul_pub_isobsolete_sw",
        "sul_pub_timenotselfcited_sw",
        "sul_pub_normalizedrank_sw",
        "sul_pub_authorcitationcountlist_sw",
        "sul_pub_rank_sw",
        "sul_pub_authorcount",
        "sul_pub_stateprovince",
        "sul_pub_newpublicationid_sw",
        "sul_pub_publicationimpactfactorlist_sw",
        "sul_pub_ordinalrank_sw",
        "sul_pub_documenttypes_sw",
        "sul_pub_numberofreferences_sw",
        "sul_pub_timescited_sw_retricted",
        "sul_pub_city",
        "sul_pub_abstract_restricted",
        "sul_pub_conference",
        "sul_pub_allAuthors",
        "sul_pub_additionalProperties",
        "sul_pub_etal",
        "sul_pub_articlenumber",
        "sul_pub_howpublished",
        "sul_pub_bibtex_type",
        "sul_pub_address",
        "sul_pub_publicationSource",
        "sul_pub_publicationUrl",
        "sul_pub_publicationUrlLabel",
        "sul_pub_wos_uid",
        "sul_pub_eissn",
        "sul_pub_wos_item_id",
    ]

    for field in unwanted_fields:
        if field in translated_result:
            del translated_result[field]

    return translated_result
5 changes: 5 additions & 0 deletions test/utils/test_sulpub.py
@@ -0,0 +1,5 @@
from rialto_airflow.utils import sulpub


def test_ok():
    assert True
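
A sketch of how the placeholder could grow into a real test of `normalize_sul_pub`, reusing the `sulpub` import above (the sample record keys are made up, not real sul_pub API output):

```python
def test_normalize_sul_pub():
    # keys are prefixed with sul_pub_ and unwanted fields like city are dropped
    record = sulpub.normalize_sul_pub({"title": "A Paper", "city": "Stanford"})
    assert record["sul_pub_title"] == "A Paper"
    assert "sul_pub_city" not in record
```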
