Port over sul_pub code from rialto-data, and add some tests. Closes #3
Showing 8 changed files with 301 additions and 1 deletion.
@@ -0,0 +1,33 @@
name: Test
on:
  push
jobs:
  build:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: [3.12]
    steps:

      - name: checkout
        uses: actions/checkout@v3

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}

      - name: Lint
        uses: chartboost/ruff-action@v1
        # it may move, see https://github.com/astral-sh/ruff/issues/8400

      - name: Setup uv
        uses: yexz123/setup-uv@v4

      - name: Install dependencies
        run: |
          uv pip install -r requirements.txt
          uv pip install -r requirements-dev.txt

      - name: Run tests
        run: pytest
@@ -0,0 +1 @@
pytest
Empty file.
@@ -0,0 +1,73 @@
import csv
import glob
import os
from datetime import datetime

import pandas as pd


def write_to_csv(data, field_names, file, **kwargs):
    normalize_data = kwargs.get("normalize_data", None)
    directory = "/".join(file.split("/")[:-1]) + "/"
    if not os.path.exists(directory):
        os.makedirs(directory)
    # Only write the header when creating the file, so repeated calls
    # append rows rather than truncating what is already there.
    if not os.path.exists(file):
        with open(file, "w") as f:
            writer = csv.DictWriter(f, fieldnames=field_names)
            writer.writeheader()

    with open(file, "a") as f:
        writer = csv.DictWriter(f, fieldnames=field_names)
        if normalize_data:
            writer.writerow(normalize_data(data))
        else:
            writer.writerow(data)


def append_unique(dois: list, orcid: str, doi_orcids: dict):
    for doi in dois:
        if doi_orcids.get(doi):
            doi_orcids[doi].append(orcid)
        else:
            doi_orcids[doi] = [orcid]


def get_nested_values(authorship: list, key) -> list:
    return [i.get(key) for i in authorship]


def rialto_authors():
    authors_files = glob.glob("data/rialto_app/*.csv")
    authors_files.sort()

    file_date = datetime.strptime(
        authors_files[-1].split("_")[-1].replace(".csv", ""), "%Y-%m-%d"
    )
    time_since_refreshing = datetime.now() - file_date
    assert time_since_refreshing.days < 90, (
        "The authors file from the "
        "RIALTO application is more than 90 days old. Please download "
        "a new file from https://sul-rialto-dev.stanford.edu/authors?"
        "q=&orcid_filter=&commit=Search, change the file name to the "
        "authors_YYYY-MM-DD.csv format and move the file to "
        "data/rialto_app."
    )

    return authors_files[-1]


def read_sul_pub_pubs():
    pubs_files = glob.glob("data/sul_pub/*.csv")
    pubs_files.sort()

    return pd.read_csv(pubs_files[-1])


def read_dimensions_pubs():
    pubs_files = glob.glob("data/dimensions/publications-*.csv")

    return pd.concat([pd.read_csv(f) for f in pubs_files])


def read_openalex_pubs():
    pubs_files = glob.glob("data/openalex/test-pubs.csv")

    return pd.concat([pd.read_csv(f) for f in pubs_files])
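
For context, a minimal sketch of how these helpers compose; the import path, field names, and file path here are hypothetical, not part of the commit:

# Hypothetical usage of the helpers above.
from rialto_airflow.utils.helper import append_unique, get_nested_values, write_to_csv

authorship = [{"orcid": "0000-0001-2345-6789"}, {"orcid": None}]
orcids = get_nested_values(authorship, "orcid")
# -> ["0000-0001-2345-6789", None]

doi_orcids = {}
append_unique(["10.1000/example"], "0000-0001-2345-6789", doi_orcids)
# -> doi_orcids == {"10.1000/example": ["0000-0001-2345-6789"]}

write_to_csv({"doi": "10.1000/example"}, ["doi"], "data/example/out.csv")
# creates data/example/ if needed, writes a header on first call, then appends the row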
@@ -0,0 +1,162 @@
import csv
import glob
import logging
import os
from datetime import datetime, timedelta
from pathlib import Path

import dotenv
import pandas as pd
import requests
from tqdm import tqdm

from .helper import write_to_csv

dotenv.load_dotenv()
logging.basicConfig(level=logging.DEBUG)


def latest_harvest_date():
    df = pd.DataFrame()
    for file in glob.glob("data/sul_pub/*.csv"):
        df = pd.concat([df, pd.read_csv(file)])

    df["converted_date"] = pd.to_datetime(
        df["sul_pub_last_updated"], errors="coerce", utc=True
    )

    return df["converted_date"].max()


def sul_pub_pubs(
    all_pubs_setting=False,
    file_path=f"data/sul_pub/sul_pub_{datetime.now().year}-{datetime.now().month}-{datetime.now().day}.csv",
):
    if not os.path.exists(file_path):
        Path(file_path).parent.mkdir(exist_ok=True, parents=True)

    sul_pub_fields = [
        "sul_pub_authorship",
        "sul_pub_title",
        "sul_pub_abstract",
        "sul_pub_author",
        "sul_pub_year",
        "sul_pub_type",
        "sul_pub_mesh_headings",
        "sul_pub_publisher",
        "sul_pub_journal",
        "sul_pub_provenance",
        "sul_pub_doi",
        "sul_pub_issn",
        "sul_pub_sulpubid",
        "sul_pub_sw_id",
        "sul_pub_pmid",
        "sul_pub_identifier",
        "sul_pub_last_updated",
        "sul_pub_pages",
        "sul_pub_date",
        "sul_pub_country",
        "sul_pub_booktitle",
        "sul_pub_edition",
        "sul_pub_series",
        "sul_pub_chapter",
        "sul_pub_editor",
    ]

    with open(file_path, "w", newline="") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=sul_pub_fields)
        writer.writeheader()

    page = 1

    # Hardcoding the number of pages since there is no way of retrieving that
    # value from the API. The number will increase as more publications are
    # added, which will impact the progress bar. (tqdm is an assumption here:
    # something with an .update() method is needed for the updates below.)
    if all_pubs_setting is True:
        sul_pub_progress = tqdm(total=646 - page)
    else:
        sul_pub_progress = tqdm(total=5 - page)

    # Fetch each page once, reusing the response for both the loop
    # condition and the records it contains.
    resp = sul_pub_page(page, all_pubs=all_pubs_setting)
    while int(resp.get("metadata").get("records")) > 0:
        for pub in resp.get("records"):
            write_to_csv(
                pub, sul_pub_fields, file_path, normalize_data=normalize_sul_pub
            )
        page += 1
        sul_pub_progress.update(1)
        resp = sul_pub_page(page, all_pubs=all_pubs_setting)

    # Drop duplicates
    df = pd.read_csv(file_path).drop_duplicates()
    df.to_csv(file_path, index=False)


def sul_pub_page(page: int, all_pubs=False) -> dict:
    hostname = os.environ.get("SUL_PUB_HOST")
    headers = {"CAPKEY": os.environ.get("SUL_PUB_KEY")}

    if all_pubs is True:
        resp = requests.get(
            f"https://{hostname}/publications.json?page={page}&per=1000",
            headers=headers,
        )
    else:
        resp = requests.get(
            f"https://{hostname}/publications.json?page={page}&per=1000&changedSince={latest_harvest_date() - timedelta(days=1)}",
            headers=headers,
        )

    return resp.json()


def normalize_sul_pub(result: dict) -> dict:
    translated_result = {}
    for i in result:
        translated_result[f"sul_pub_{i}"] = result[i]

    unwanted_fields = [
        "sul_pub_apa_citation",
        "sul_pub_mla_citation",
        "sul_pub_chicago_citation",
        "sul_pub_keywords_sw",
        "sul_pub_publicationcategoryrankinglist_sw",
        "sul_pub_documentcategory_sw",
        "sul_pub_isobsolete_sw",
        "sul_pub_timenotselfcited_sw",
        "sul_pub_normalizedrank_sw",
        "sul_pub_authorcitationcountlist_sw",
        "sul_pub_rank_sw",
        "sul_pub_authorcount",
        "sul_pub_stateprovince",
        "sul_pub_newpublicationid_sw",
        "sul_pub_publicationimpactfactorlist_sw",
        "sul_pub_ordinalrank_sw",
        "sul_pub_documenttypes_sw",
        "sul_pub_numberofreferences_sw",
        "sul_pub_timescited_sw_retricted",
        "sul_pub_city",
        "sul_pub_abstract_restricted",
        "sul_pub_conference",
        "sul_pub_allAuthors",
        "sul_pub_additionalProperties",
        "sul_pub_etal",
        "sul_pub_articlenumber",
        "sul_pub_howpublished",
        "sul_pub_bibtex_type",
        "sul_pub_address",
        "sul_pub_publicationSource",
        "sul_pub_publicationUrl",
        "sul_pub_publicationUrlLabel",
        "sul_pub_wos_uid",
        "sul_pub_eissn",
        "sul_pub_wos_item_id",
    ]

    for field in unwanted_fields:
        if field in translated_result:
            del translated_result[field]

    return translated_result
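
As a quick illustration of normalize_sul_pub (the input record here is invented):

record = {"title": "Example", "doi": "10.1000/example", "apa_citation": "Doe (2024)."}
normalize_sul_pub(record)
# -> {"sul_pub_title": "Example", "sul_pub_doi": "10.1000/example"}
# Every key gains a sul_pub_ prefix; prefixed keys listed in
# unwanted_fields (here sul_pub_apa_citation) are then dropped.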
@@ -0,0 +1,5 @@
from rialto_airflow.utils import sulpub


def test_ok():
    assert True
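
A more substantive test could exercise normalize_sul_pub directly; this sketch assumes the imported sulpub module exposes that function:

def test_normalize_sul_pub():
    record = {"title": "Example", "apa_citation": "Doe (2024)."}
    normalized = sulpub.normalize_sul_pub(record)
    assert normalized["sul_pub_title"] == "Example"
    # apa_citation is in unwanted_fields, so its prefixed key is removed
    assert "sul_pub_apa_citation" not in normalized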