Skip to content

Commit

Permalink
Merge pull request #129 from sul-dlss/update-uv
Browse files Browse the repository at this point in the history
Update to uv and airflow
  • Loading branch information
jmartin-sul authored Feb 6, 2025
2 parents a0cfdb1 + 2af6a3c commit dc86d44
Show file tree
Hide file tree
Showing 10 changed files with 765 additions and 115 deletions.
8 changes: 3 additions & 5 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,11 @@ jobs:
with:
args: 'format --check'

- name: Install dependencies
run: |
pip install -r requirements.txt
pip install -r requirements-dev.txt
- name: Install uv
run: pip3 install uv

- name: Run tests
run: pytest
run: uv run pytest
env:
AIRFLOW_VAR_DIMENSIONS_API_USER: ${{ secrets.AIRFLOW_VAR_DIMENSIONS_API_USER }}
AIRFLOW_VAR_DIMENSIONS_API_PASS: ${{ secrets.AIRFLOW_VAR_DIMENSIONS_API_PASS }}
5 changes: 3 additions & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM apache/airflow:2.9.3-python3.12
FROM apache/airflow:2.10.4-python3.12

USER root
RUN apt-get update && apt-get install -y gcc git
Expand All @@ -8,6 +8,7 @@ ENV PYTHONPATH "${PYTHONPATH}:/opt/airflow/"
USER airflow

COPY rialto_airflow ./rialto_airflow
COPY requirements.txt ./

COPY requirements.txt .

RUN uv pip install --no-cache "apache-airflow==${AIRFLOW_VERSION}" -r requirements.txt
44 changes: 6 additions & 38 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -67,28 +67,10 @@ https://sul-rialto-dev.stanford.edu/authors?action=index&commit=Search&controlle
### Set-up

1. Install `uv` for dependency management as described in [the uv docs](https://github.com/astral-sh/uv?tab=readme-ov-file#getting-started).
2. Create a virtual environment:
```
uv venv
```

This will create the virtual environment at the default location of `.venv/`. `uv` automatically looks for a venv at this location when installing dependencies.

3. Activate the virtual environment:
```
source .venv/bin/activate
```


### Install dependencies
```
uv pip install -r requirements.txt
```

To add a dependency:
1. `uv pip install flask`
2. Add the dependency to `pyproject.toml`.
3. To re-generate the locked dependencies in `requirements.txt`:
1. `uv add flask`
2. Then re-generate the locked dependencies in `requirements.txt`, which is used in the `Dockerfile`:
```
uv pip compile pyproject.toml -o requirements.txt
```
Expand All @@ -103,29 +85,15 @@ uv pip compile pyproject.toml -o requirements.txt --upgrade

## Run Tests

First enable the virtual environment:

```
source .venv/bin/activate
```

Then ensure the app dependencies and dev dependencies are installed.

```
uv pip install -r requirements.txt -r requirements-dev.txt
```

Then run the tests:

```
pytest
uv run pytest
```

### Linting and formatting

1. Run linting: `ruff check`
2. Automatically fix linting: `ruff check --fix`
3. Run formatting: `ruff format` (or `ruff format --check` to identify any unformatted files)
1. Run linting: `uv run ruff check`
2. Automatically fix linting: `uv run ruff check --fix`
3. Run formatting: `uv run ruff format` (or `uv run ruff format --check` to identify any unformatted files)

## Deployment

Expand Down
21 changes: 16 additions & 5 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,23 +1,34 @@
[project]
name = "rialto-airflow"

version = "0.1.0"

description = "Airflow app for harvesting data for open access analysis and research intelligence."

authors = [ {name = "Laura Wrubel", email = "[email protected]"}, {name = "Ed Summers", email = "[email protected]"}, {name = "Jacob Hill", email = "[email protected]"}]
requires-python = ">= 3.12"

# Aligned with what is defined in Dockerfile and CI
requires-python = "== 3.12.*"

package-mode = false

dependencies = [
"pandas",
"requests",
"python-dotenv",
"dimcli",
"polars>=1.2",
"pyalex",
"more-itertools"
"more-itertools",
]

[tool.pytest.ini_options]
pythonpath = ["."]
addopts = "-v"

[build-system]
requires = ["setuptools"]
build-backend = "setuptools.build_meta"
[dependency-groups]
dev = [
"pytest>=8.3.4",
"python-dotenv>=1.0.1",
"ruff>=0.9.4",
]
4 changes: 0 additions & 4 deletions requirements-dev.txt

This file was deleted.

68 changes: 33 additions & 35 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,74 +1,74 @@
# This file was autogenerated by uv via the following command:
# uv pip compile pyproject.toml -o requirements.txt
alabaster==0.7.16
alabaster==1.0.0
# via sphinx
asttokens==2.4.1
asttokens==3.0.0
# via stack-data
babel==2.15.0
babel==2.17.0
# via sphinx
certifi==2024.7.4
certifi==2025.1.31
# via requests
charset-normalizer==3.3.2
charset-normalizer==3.4.1
# via requests
click==8.1.7
click==8.1.8
# via dimcli
commonmark==0.9.1
# via recommonmark
decorator==5.1.1
# via ipython
dimcli==1.3
dimcli==1.4
# via rialto-airflow (pyproject.toml)
docutils==0.21.2
# via
# recommonmark
# sphinx
executing==2.0.1
executing==2.2.0
# via stack-data
idna==3.7
idna==3.10
# via requests
imagesize==1.4.1
# via sphinx
ipython==8.26.0
ipython==8.32.0
# via dimcli
jedi==0.19.1
jedi==0.19.2
# via ipython
jinja2==3.1.4
jinja2==3.1.5
# via sphinx
markupsafe==2.1.5
markupsafe==3.0.2
# via jinja2
matplotlib-inline==0.1.7
# via ipython
more-itertools==10.3.0
more-itertools==10.6.0
# via rialto-airflow (pyproject.toml)
numpy==2.0.1
numpy==2.2.2
# via
# dimcli
# pandas
packaging==24.1
packaging==24.2
# via
# dimcli
# sphinx
pandas==2.2.2
pandas==2.2.3
# via
# rialto-airflow (pyproject.toml)
# dimcli
parso==0.8.4
# via jedi
pexpect==4.9.0
# via ipython
polars==1.2.1
polars==1.21.0
# via rialto-airflow (pyproject.toml)
prompt-toolkit==3.0.47
prompt-toolkit==3.0.50
# via
# dimcli
# ipython
ptyprocess==0.7.0
# via pexpect
pure-eval==0.2.3
# via stack-data
pyalex==0.14
pyalex==0.15.1
# via rialto-airflow (pyproject.toml)
pygments==2.18.0
pygments==2.19.1
# via
# dimcli
# ipython
Expand All @@ -77,7 +77,7 @@ python-dateutil==2.9.0.post0
# via pandas
python-dotenv==1.0.1
# via rialto-airflow (pyproject.toml)
pytz==2024.1
pytz==2025.1
# via pandas
recommonmark==0.7.1
# via dimcli
Expand All @@ -87,37 +87,35 @@ requests==2.32.3
# dimcli
# pyalex
# sphinx
six==1.16.0
# via
# asttokens
# python-dateutil
six==1.17.0
# via python-dateutil
snowballstemmer==2.2.0
# via sphinx
sphinx==7.4.7
sphinx==8.1.3
# via recommonmark
sphinxcontrib-applehelp==1.0.8
sphinxcontrib-applehelp==2.0.0
# via sphinx
sphinxcontrib-devhelp==1.0.6
sphinxcontrib-devhelp==2.0.0
# via sphinx
sphinxcontrib-htmlhelp==2.0.6
sphinxcontrib-htmlhelp==2.1.0
# via sphinx
sphinxcontrib-jsmath==1.0.1
# via sphinx
sphinxcontrib-qthelp==1.0.8
sphinxcontrib-qthelp==2.0.0
# via sphinx
sphinxcontrib-serializinghtml==1.1.10
sphinxcontrib-serializinghtml==2.0.0
# via sphinx
stack-data==0.6.3
# via ipython
tqdm==4.66.4
tqdm==4.67.1
# via dimcli
traitlets==5.14.3
# via
# ipython
# matplotlib-inline
tzdata==2024.1
tzdata==2025.1
# via pandas
urllib3==2.2.2
urllib3==2.3.0
# via
# pyalex
# requests
Expand Down
4 changes: 2 additions & 2 deletions rialto_airflow/harvest/openalex.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import os
import pickle
import time
from urllib.parse import quote

from more_itertools import batched
from pyalex import Authors, Works, config, api
Expand Down Expand Up @@ -92,7 +91,7 @@ def publications_from_dois(dois: list):
# TODO: do we need this to stay within 100,000 requests / day API quota?
time.sleep(1)

doi_list = quote("|".join([doi for doi in doi_batch]))
doi_list = "|".join([doi for doi in doi_batch])
try:
for page in Works().filter(doi=doi_list).paginate(per_page=200):
for pub in page:
Expand Down Expand Up @@ -124,6 +123,7 @@ def normalize_publication(pub) -> dict:

FIELDS = [
"abstract_inverted_index",
"abstract_inverted_index_v3",
"authorships",
"apc_list",
"apc_paid",
Expand Down
6 changes: 3 additions & 3 deletions test/harvest/test_contribs.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,9 @@ def test_create_contribs(pubs_parquet, doi_sunet, authors, tmp_path):
create_contribs(pubs_parquet, doi_sunet, authors, contribs_parquet)

df = pl.read_parquet(contribs_parquet)
assert set(df.columns) == set(
["doi", "sunetid", "title", "first_name"]
), "columns are correct"
assert set(df.columns) == set(["doi", "sunetid", "title", "first_name"]), (
"columns are correct"
)

# first publication got joined to authors
assert len(df.filter(pl.col("doi") == "0000/abc")) == 1
Expand Down
29 changes: 8 additions & 21 deletions test/harvest/test_openalex.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,32 +49,23 @@ def test_publications_from_dois():
# look up the publication metadata for them
pubs = list(openalex.publications_from_dois(dois))

# >= is used because sometimes there can be multiple works for a DOI!
assert len(pubs) >= 231, "should paginate (page size=200)"
# > 200 is used because some of the 231 DOIs have been removed from openalex 🤷
assert len(pubs) > 200, "should paginate (page size=200)"
assert set(openalex.FIELDS) == set(pubs[0].keys()), "All fields accounted for."
assert len(pubs[0].keys()) == 53, "first publication has 53 columns"
assert len(pubs[1].keys()) == 53, "second publication has 53 columns"
assert len(pubs[0].keys()) == 54, "first publication has 54 columns"
assert len(pubs[1].keys()) == 54, "second publication has 54 columns"


def test_publications_from_invalid_dois(caplog):
# Error may change if OpenAlex API or pyalex changes
invalid_dois = ["doi-with-comma,a", "10.1145/3442188.3445922"]
assert len(list(openalex.publications_from_dois(invalid_dois))) == 1
assert (
"OpenAlex QueryError for doi-with-comma,a: Invalid query parameter"
in caplog.text
), "logs error message"


def test_publications_from_invalid_with_comma(caplog):
# OpenAlex interprets a DOI string containing a comma as two DOIs, but it does
# not return a result for the first half even if it is valid, so an empty list is returned.
invalid_doi = ["10.1002/cncr.33546,-(wileyonlinelibrary.com)"]
assert len(list(openalex.publications_from_dois(invalid_doi))) == 0
assert (
"OpenAlex QueryError for 10.1002/cncr.33546,-(wileyonlinelibrary.com): Invalid query parameter"
in caplog.text
), "logs error message"


def test_publications_csv(tmp_path):
Expand Down Expand Up @@ -102,13 +93,9 @@ def test_publications_csv(tmp_path):


def test_pyalex_urlencoding():
# this might start working if https://github.com/J535D165/pyalex/issues/41 is fixed
with pytest.raises(pyalex.api.QueryError):
pyalex.Works().filter(doi="10.1207/s15327809jls0703&4_2").count() == 1

assert (
pyalex.Works().filter(doi="10.1207/s15327809jls0703%264_2").count() == 1
), "url encoding the & works with OpenAlex API"
assert pyalex.Works().filter(doi="10.1207/s15327809jls0703&4_2").count() == 1, (
"url encoding the & works with OpenAlex API"
)

assert (
len(
Expand All @@ -119,7 +106,7 @@ def test_pyalex_urlencoding():
)
)
== 2
), "we handle url URL encoding DOIs until pyalex does"
), "we handle URL encoding"


@pytest.mark.skip(reason="This record no longer exhibits the problem")
Expand Down
Loading

0 comments on commit dc86d44

Please sign in to comment.