Skip to content

Commit

Permalink
Merge pull request #90 from sul-dlss-labs/remove-authors
Browse files Browse the repository at this point in the history
Remove authors from output Parquet
  • Loading branch information
edsu authored Jul 25, 2024
2 parents fe2b6e7 + adc38ee commit 7e9860c
Show file tree
Hide file tree
Showing 2 changed files with 2 additions and 5 deletions.
5 changes: 1 addition & 4 deletions rialto_airflow/harvest/merge_pubs.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,6 @@ def dimensions_pubs_df(dimensions_pubs):
df = df.select(
pl.col("doi").map_elements(normalize_doi, return_dtype=pl.String),
pl.col(
- "authors",
"document_type",
"funders",
"funding_section",
Expand All @@ -80,9 +79,7 @@ def openalex_pubs_df(openalex_pubs):
)
df = df.select(
pl.col("doi").map_elements(normalize_doi, return_dtype=pl.String),
- pl.col(
- "apc_paid", "authorships", "grants", "publication_year", "title", "type"
- ),
+ pl.col("apc_paid", "grants", "publication_year", "title", "type"),
)
df = df.rename(lambda column_name: "openalex_" + column_name)
return df
Expand Down
2 changes: 1 addition & 1 deletion test/harvest/test_merge_pubs.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ def test_merge(tmp_path, sul_pubs_csv, openalex_pubs_csv, dimensions_pubs_csv):
assert output.is_file(), "output file has been created"
df = pl.read_parquet(output)
assert df.shape[0] == 5
- assert df.shape[1] == 25
+ assert df.shape[1] == 23
assert set(df["doi"].to_list()) == set(
["10.0000/aaaa", "10.0000/1234", "10.0000/cccc", "10.0000/dddd", "10.0000/eeee"]
)

0 comments on commit 7e9860c

Please sign in to comment.