-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Port over sul_pub code from rialto-data, and add some tests. Closes #3
- Loading branch information
Showing
12 changed files
with
317 additions
and
10 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
name: Test
on:
  - push
  - pull_request
jobs:
  build:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        # Quoted so YAML does not coerce the version to a float
        # (e.g. an unquoted 3.10 would become 3.1).
        python-version: ["3.12"]
    steps:

      - name: checkout
        uses: actions/checkout@v3

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}

      - name: Lint
        uses: chartboost/ruff-action@v1
        # it may move, see https://github.com/astral-sh/ruff/issues/8400

      - name: Install dependencies
        run: |
          pip install -r requirements.txt
          pip install -r requirements-dev.txt

      - name: Run tests
        run: pytest
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
pytest | ||
python-dotenv | ||
apache-airflow==2.9.2 |
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,100 @@ | ||
import datetime | ||
import pathlib | ||
|
||
from airflow.models import Variable | ||
from airflow.decorators import dag, task | ||
|
||
from rialto_airflow.utils import last_harvest, create_snapshot_dir | ||
from rialto_airflow.harvest.sul_pub import sul_pub_csv | ||
|
||
# Airflow Variables holding the data directory and sul_pub API connection
# settings; these must be configured in the Airflow deployment.
data_dir = Variable.get("data_dir")
sul_pub_host = Variable.get("sul_pub_host")
sul_pub_key = Variable.get("sul_pub_key")


@dag(
    schedule=None,
    start_date=datetime.datetime(2024, 1, 1),
    catchup=False,
)
def update_data():
    """
    DAG that harvests publication data (sul_pub, OpenAlex, Dimensions),
    merges it, and publishes an aggregate dataset.
    """

    @task(multiple_outputs=True)
    def setup():
        """
        Setup the data directory to write to and determine the last harvest.
        """
        return {
            "last_harvest": last_harvest(),
            "snapshot_dir": create_snapshot_dir(data_dir),
        }

    @task()
    def fetch_sul_pub(last_harvest, snapshot_dir):
        """
        Harvest data from sul_pub using the last harvest date.
        """
        csv_file = pathlib.Path(snapshot_dir) / "sulpub.csv"
        sul_pub_csv(csv_file, sul_pub_host, sul_pub_key, since=last_harvest)

        # Return a string (not a Path) so the value is trivially serializable
        # as an XCom for downstream tasks.
        return str(csv_file)

    @task()
    def extract_doi(sulpub):
        """
        Extract a unique list of DOIs from the new publications data.
        """
        # TODO: placeholder, not yet implemented
        return True

    @task()
    def fetch_openalex(dois):
        """
        Fetch the data by DOI from OpenAlex.
        """
        # TODO: placeholder, not yet implemented
        return True

    @task()
    def fetch_dimensions(dois):
        """
        Fetch the data by DOI from Dimensions.
        """
        # TODO: placeholder, not yet implemented
        return True

    @task()
    def merge_publications(sul_pub, openalex, dimensions):
        """
        Merge the OpenAlex, Dimensions and sul_pub data.
        """
        # TODO: placeholder, not yet implemented
        return True

    @task()
    def merge_contributors(pubs):
        """
        Merge in contributor and departmental data from rialto-orgs.
        """
        # TODO: placeholder, not yet implemented
        return True

    # NOTE: was a bare @task; normalized to @task() for consistency with the
    # other task decorators in this DAG (both forms behave identically).
    @task()
    def create_dataset(pubs, contribs):
        """
        Aggregate the incremental snapshot data into a single dataset.
        """
        # TODO: placeholder, not yet implemented
        return True

    @task()
    def publish(dataset):
        """
        Publish aggregate data to JupyterHub environment.
        """
        # TODO: placeholder, not yet implemented
        return True

    # Wire the tasks together: setup -> harvest -> enrich -> merge -> publish.
    config = setup()
    sul_pub = fetch_sul_pub(config["last_harvest"], config["snapshot_dir"])
    dois = extract_doi(sul_pub)
    openalex = fetch_openalex(dois)
    dimensions = fetch_dimensions(dois)
    pubs = merge_publications(sul_pub, openalex, dimensions)
    contribs = merge_contributors(pubs)
    dataset = create_dataset(pubs, contribs)
    publish(dataset)


update_data()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
import csv | ||
import logging | ||
|
||
import requests | ||
|
||
|
||
# Columns written to the harvest CSV; only these keys are kept from each
# sul_pub API record.
sul_pub_fields = [
    "authorship",
    "title",
    "abstract",
    "author",
    "year",
    "type",
    "mesh_headings",
    "publisher",
    "journal",
    "provenance",
    "doi",
    "issn",
    "sulpubid",
    "sw_id",
    "pmid",
    "identifier",
    "last_updated",
    "pages",
    "date",
    "country",
    "booktitle",
    "edition",
    "series",
    "chapter",
    "editor",
]
|
||
|
||
def sul_pub_csv(csv_file, host, key, since=None, limit=None):
    """
    Harvest sul_pub publications into a CSV file.

    Parameters:
        csv_file: path of the CSV file to write.
        host: hostname of the sul_pub service.
        key: CAPKEY API token for authentication.
        since: optional datetime; only fetch records changed since then.
        limit: optional maximum number of records to write.
    """
    # newline="" is required by the csv module so the writer controls line
    # endings itself (otherwise blank rows appear on Windows).
    with open(csv_file, "w", newline="") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=sul_pub_fields)
        writer.writeheader()
        for row in harvest(host, key, since, limit):
            writer.writerow(row)
|
||
|
||
def harvest(host, key, since, limit):
    """
    Generate publication records from the paginated sul_pub API.

    Parameters:
        host: hostname of the sul_pub service.
        key: CAPKEY API token sent in the request headers.
        since: optional datetime; only records changed since this date.
        limit: optional int; stop after yielding this many records.

    Yields dicts containing only the keys listed in sul_pub_fields.
    """
    url = f"https://{host}/publications.json"

    http_headers = {"CAPKEY": key}

    params = {"per": 1000}
    if since:
        params["changedSince"] = since.strftime("%Y-%m-%d")

    page = 0
    record_count = 0
    more = True

    while more:
        page += 1
        params["page"] = page

        logging.info(f"fetching sul_pub results {url} {params}")
        # An explicit timeout prevents a hung connection from stalling the
        # harvest indefinitely (requests waits forever by default).
        resp = requests.get(url, params=params, headers=http_headers, timeout=300)
        resp.raise_for_status()

        records = resp.json()["records"]
        if len(records) == 0:
            more = False

        for record in records:
            record_count += 1
            if limit is not None and record_count > limit:
                logging.info(f"stopping with limit={limit}")
                more = False
                break

            # "field" (not "key") so the API key parameter isn't shadowed
            yield {field: record[field] for field in record if field in sul_pub_fields}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
import os | ||
import datetime | ||
|
||
def last_harvest():
    """
    Return the UTC datetime of the most recent harvest.

    TODO: look in the data_dir to determine the last harvest; currently a
    hard-coded placeholder.
    """
    return datetime.datetime(2024, 1, 1, tzinfo=datetime.timezone.utc)
|
||
def create_snapshot_dir(data_dir):
    """
    Create a new timestamped snapshot directory under data_dir.

    Returns the path of the created directory as a string. The directory name
    is the current local time formatted as YYYYMMDDHHMMSS.
    """
    timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    snapshot_dir = os.path.join(data_dir, timestamp)
    os.mkdir(snapshot_dir)
    return snapshot_dir
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
import os | ||
import datetime | ||
|
||
import dotenv | ||
import pandas | ||
import pytest | ||
|
||
from rialto_airflow.harvest.sul_pub import sul_pub_csv | ||
|
||
# Load connection settings from a local .env file, if one is present.
dotenv.load_dotenv()

sul_pub_host = os.environ.get("AIRFLOW_VAR_SUL_PUB_HOST")
sul_pub_key = os.environ.get("AIRFLOW_VAR_SUL_PUB_KEY")

# Live-API tests below are skipped unless both settings are non-empty.
no_auth = not all([sul_pub_host, sul_pub_key])
|
||
@pytest.mark.skipif(no_auth, reason="no sul_pub key")
def test_sul_pub_csv(tmpdir):
    """Harvest 2000 rows from the live sul_pub API and verify the CSV."""
    output = tmpdir / "sul_pub.csv"
    sul_pub_csv(output, sul_pub_host, sul_pub_key, limit=2000)
    assert output.isfile()

    frame = pandas.read_csv(output)
    assert len(frame) == 2000
    assert "title" in frame.columns
|
||
@pytest.mark.skip(reason="sul_pub changeSince broken")
@pytest.mark.skipif(no_auth, reason="no sul_pub key")
def test_sul_pub_csv_since(tmpdir):
    """Harvest with a since cutoff and confirm no rows predate it."""
    output = tmpdir / "sul_pub.csv"
    cutoff = datetime.datetime(2024, 1, 1, tzinfo=datetime.timezone.utc)
    sul_pub_csv(output, sul_pub_host, sul_pub_key, since=cutoff, limit=100)

    frame = pandas.read_csv(output, parse_dates=["last_updated"])
    assert (frame["last_updated"] < cutoff).sum() == 0