sulpub code + test
Port over sul_pub code from rialto-data, and add some tests.

Closes #3
edsu committed Jun 14, 2024
1 parent 55a2366 commit 81dbacc
Showing 8 changed files with 301 additions and 1 deletion.
33 changes: 33 additions & 0 deletions .github/workflows/test.yml
@@ -0,0 +1,33 @@
name: Test
on:
  push
jobs:
  build:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ["3.12"]
    steps:

      - name: Checkout
        uses: actions/checkout@v3

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}

      - name: Lint
        uses: chartboost/ruff-action@v1
        # it may move, see https://github.com/astral-sh/ruff/issues/8400

      - name: Setup uv
        uses: yezz123/setup-uv@v4

      - name: Install dependencies
        run: |
          uv pip install -r requirements.txt
          uv pip install -r requirements-dev.txt

      - name: Run tests
        run: pytest
20 changes: 20 additions & 0 deletions README.md
@@ -56,3 +56,23 @@ uv pip compile pyproject.toml -o requirements.txt
```

Unlike poetry, uv's dependency resolution is not platform-agnostic. If we find we need to generate a requirements.txt for linux, we can use [uv's multi-platform resolution options](https://github.com/astral-sh/uv?tab=readme-ov-file#multi-platform-resolution).
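
If we do find we need a Linux-specific requirements.txt, the command would look something like this (using uv's `--python-platform` flag; check the uv docs for the current flag name and supported values):

```
uv pip compile pyproject.toml -o requirements.txt --python-platform linux
```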

## Run Tests

First activate the virtual environment:

```
source .env/bin/activate
```

Then ensure the app and dev dependencies are installed:

```
uv pip install -r requirements.txt -r requirements-dev.txt
```

Then run the tests:

```
pytest
```
8 changes: 7 additions & 1 deletion pyproject.toml
@@ -7,8 +7,14 @@ requires-python = ">= 3.12"
dependencies = [
    "pandas",
    "requests",
    "python-dotenv"
]

[tool.pytest.ini_options]
pythonpath = [
    "."
]

[build-system]
requires = ["setuptools"]
build-backend = "setuptools.build_meta"
1 change: 1 addition & 0 deletions requirements-dev.txt
@@ -0,0 +1 @@
pytest
Empty file added rialto_airflow/__init__.py
73 changes: 73 additions & 0 deletions rialto_airflow/utils/helper.py
@@ -0,0 +1,73 @@
import csv
import glob
import os
from datetime import datetime

import pandas as pd


def write_to_csv(data, field_names, file, **kwargs):
    normalize_data = kwargs.get("normalize_data", None)
    directory = os.path.dirname(file)
    if directory and not os.path.exists(directory):
        os.makedirs(directory)

    # only write the header when creating the file, so that repeated calls
    # append rows rather than truncating what has already been written
    if not os.path.exists(file):
        with open(file, "w") as f:
            writer = csv.DictWriter(f, fieldnames=field_names)
            writer.writeheader()

    with open(file, "a") as f:
        writer = csv.DictWriter(f, fieldnames=field_names)
        if normalize_data:
            writer.writerow(normalize_data(data))
        else:
            writer.writerow(data)


def append_unique(dois: list, orcid: str, doi_orcids: dict):
    for doi in dois:
        if doi_orcids.get(doi):
            doi_orcids[doi].append(orcid)
        else:
            doi_orcids[doi] = [orcid]


def get_nested_values(authorship: list, key) -> list:
    return [i.get(key) for i in authorship]


def rialto_authors():
    authors_files = glob.glob("data/rialto_app/*.csv")
    authors_files.sort()

    file_date = datetime.strptime(
        authors_files[-1].split("_")[-1].replace(".csv", ""), "%Y-%m-%d"
    )
    time_since_refreshing = datetime.now() - file_date
    assert time_since_refreshing.days < 90, (
        "The authors file from the "
        "RIALTO application is more than 90 days old. Please download "
        "a new file from https://sul-rialto-dev.stanford.edu/authors?"
        "q=&orcid_filter=&commit=Search, change the file name to the "
        "authors_YYYY-MM-DD.csv format and move the file to "
        "data/rialto_app."
    )

    return authors_files[-1]


def read_sul_pub_pubs():
    pubs_files = glob.glob("data/sul_pub/*.csv")
    pubs_files.sort()

    return pd.read_csv(pubs_files[-1])


def read_dimensions_pubs():
    pubs_files = glob.glob("data/dimensions/publications-*.csv")

    return pd.concat([pd.read_csv(f) for f in pubs_files])


def read_openalex_pubs():
    pubs_files = glob.glob("data/openalex/test-pubs.csv")

    return pd.concat([pd.read_csv(f) for f in pubs_files])
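
As a point of reference, here is a minimal sketch of how `write_to_csv` with a `normalize_data` callback might be used (the row, field names, output path, and normalizer are all made up for illustration):

```python
from rialto_airflow.utils.helper import write_to_csv

# hypothetical row and normalizer, for illustration only
row = {"doi": "10.1234/abcd", "title": "a paper"}


def upcase_title(data):
    # normalize_data receives the raw row dict and returns the row to write
    return {**data, "title": data["title"].upper()}


write_to_csv(row, ["doi", "title"], "data/example/pubs.csv", normalize_data=upcase_title)
```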
162 changes: 162 additions & 0 deletions rialto_airflow/utils/sulpub.py
@@ -0,0 +1,162 @@
import csv
import glob
import logging
import os
from datetime import datetime, timedelta
from pathlib import Path

import dotenv
import pandas as pd
import requests
from tqdm import tqdm  # supplies the progress bar updated below; would need adding to requirements

from .helper import write_to_csv

dotenv.load_dotenv()
logging.basicConfig(level=logging.DEBUG)


def latest_harvest_date():
    df = pd.DataFrame()
    for file in glob.glob("data/sul_pub/*.csv"):
        df = pd.concat([df, pd.read_csv(file)])

    df["converted_date"] = pd.to_datetime(
        df["sul_pub_last_updated"], errors="coerce", utc=True
    )

    return df["converted_date"].max()


def sul_pub_pubs(all_pubs_setting=False, file_path=None):
    # determine the default path at call time rather than import time, and
    # zero-pad the date so the CSV files sort chronologically
    if file_path is None:
        file_path = f"data/sul_pub/sul_pub_{datetime.now().strftime('%Y-%m-%d')}.csv"

    if not os.path.exists(file_path):
        Path(file_path).parent.mkdir(exist_ok=True, parents=True)

    sul_pub_fields = [
        "sul_pub_authorship",
        "sul_pub_title",
        "sul_pub_abstract",
        "sul_pub_author",
        "sul_pub_year",
        "sul_pub_type",
        "sul_pub_mesh_headings",
        "sul_pub_publisher",
        "sul_pub_journal",
        "sul_pub_provenance",
        "sul_pub_doi",
        "sul_pub_issn",
        "sul_pub_sulpubid",
        "sul_pub_sw_id",
        "sul_pub_pmid",
        "sul_pub_identifier",
        "sul_pub_last_updated",
        "sul_pub_pages",
        "sul_pub_date",
        "sul_pub_country",
        "sul_pub_booktitle",
        "sul_pub_edition",
        "sul_pub_series",
        "sul_pub_chapter",
        "sul_pub_editor",
    ]

    with open(file_path, "w", newline="") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=sul_pub_fields)
        writer.writeheader()

    page = 1

    # Hardcoding the number of pages since there is no way of retrieving that
    # value from the API. The number will increase as more publications are
    # added, which will impact the progress bar.
    if all_pubs_setting is True:
        sul_pub_progress = tqdm(total=646 - page)
    else:
        sul_pub_progress = tqdm(total=5 - page)

    while True:
        # fetch each page once and reuse it, rather than requesting it twice
        results = sul_pub_page(page, all_pubs=all_pubs_setting)
        if int(results.get("metadata").get("records")) <= 0:
            break
        for pub in results.get("records"):
            write_to_csv(
                pub, sul_pub_fields, file_path, normalize_data=normalize_sul_pub
            )
        page += 1
        sul_pub_progress.update(1)

    # Drop duplicates (index=False keeps pandas from adding an index column)
    df = pd.read_csv(file_path).drop_duplicates()
    df.to_csv(file_path, index=False)


def sul_pub_page(page: int, all_pubs=False) -> dict:
    hostname = os.environ.get("SUL_PUB_HOST")
    headers = {"CAPKEY": os.environ.get("SUL_PUB_KEY")}

    if all_pubs is True:
        resp = requests.get(
            f"https://{hostname}/publications.json?page={page}&per=1000",
            headers=headers,
        )
    else:
        resp = requests.get(
            f"https://{hostname}/publications.json?page={page}&per=1000&changedSince={latest_harvest_date() - timedelta(days=1)}",
            headers=headers,
        )

    return resp.json()


def normalize_sul_pub(result: dict) -> dict:
    translated_result = {}
    for i in result:
        translated_result[f"sul_pub_{i}"] = result[i]

    unwanted_fields = [
        "sul_pub_apa_citation",
        "sul_pub_mla_citation",
        "sul_pub_chicago_citation",
        "sul_pub_keywords_sw",
        "sul_pub_publicationcategoryrankinglist_sw",
        "sul_pub_documentcategory_sw",
        "sul_pub_isobsolete_sw",
        "sul_pub_timenotselfcited_sw",
        "sul_pub_normalizedrank_sw",
        "sul_pub_authorcitationcountlist_sw",
        "sul_pub_rank_sw",
        "sul_pub_authorcount",
        "sul_pub_stateprovince",
        "sul_pub_newpublicationid_sw",
        "sul_pub_publicationimpactfactorlist_sw",
        "sul_pub_ordinalrank_sw",
        "sul_pub_documenttypes_sw",
        "sul_pub_numberofreferences_sw",
        "sul_pub_timescited_sw_retricted",
        "sul_pub_city",
        "sul_pub_abstract_restricted",
        "sul_pub_conference",
        "sul_pub_allAuthors",
        "sul_pub_additionalProperties",
        "sul_pub_etal",
        "sul_pub_articlenumber",
        "sul_pub_howpublished",
        "sul_pub_bibtex_type",
        "sul_pub_address",
        "sul_pub_publicationSource",
        "sul_pub_publicationUrl",
        "sul_pub_publicationUrlLabel",
        "sul_pub_wos_uid",
        "sul_pub_eissn",
        "sul_pub_wos_item_id",
    ]

    for field in unwanted_fields:
        if field in translated_result:
            del translated_result[field]

    return translated_result
5 changes: 5 additions & 0 deletions test/utils/test_sulpub.py
@@ -0,0 +1,5 @@
from rialto_airflow.utils import sulpub


def test_ok():
    assert True
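
A sketch of how the placeholder could grow into a real test of `normalize_sul_pub`, reusing the `sulpub` import above (the sample record keys are made up, not real sul_pub API output):

```python
def test_normalize_sul_pub():
    # keys are prefixed with sul_pub_ and unwanted fields like city are dropped
    record = sulpub.normalize_sul_pub({"title": "A Paper", "city": "Stanford"})
    assert record["sul_pub_title"] == "A Paper"
    assert "sul_pub_city" not in record
```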
