Skip to content

Commit

Permalink
Merge pull request #85 from sul-dlss-labs/years-as-strings
Browse files (browse the repository at this point in the history)
Make pub year columns strings
  • Loading branch information
edsu authored Jul 18, 2024
2 parents bfad4e8 + 5ef7a19 commit a27a5a5
Showing 1 changed file with 5 additions and 3 deletions.
8 changes: 5 additions & 3 deletions rialto_airflow/harvest/merge_pubs.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -46,7 +46,9 @@ def dimensions_pubs_df(dimensions_pubs):
# Create a LazyFrame of dimension pubs to avoid loading all data into memory
"""
# Polars is inferring volume is an integer, but it should be a string e.g. "97-B"
df = pl.scan_csv(dimensions_pubs, schema_overrides={"volume": pl.String})
df = pl.scan_csv(
dimensions_pubs, schema_overrides={"volume": pl.String, "year": pl.String}
)
df = df.select(
pl.col("doi").map_elements(normalize_doi, return_dtype=pl.String),
pl.col(
Expand All @@ -71,7 +73,7 @@ def openalex_pubs_df(openalex_pubs):
"""
Create an openalex pubs LazyFrame and rename columns
"""
df = pl.scan_csv(openalex_pubs)
df = pl.scan_csv(openalex_pubs, schema_overrides={"publication_year": pl.String})
df = df.select(
pl.col("doi").map_elements(normalize_doi, return_dtype=pl.String),
pl.col(
Expand All @@ -86,7 +88,7 @@ def sulpub_df(sul_pub):
"""
Create a sulpub LazyFrame and rename columns
"""
df = pl.scan_csv(sul_pub, null_values="n/ a")
df = pl.scan_csv(sul_pub, schema_overrides={"year": pl.String})
df = df.drop_nulls("doi")
df = df.with_columns(
pl.col("doi").map_elements(normalize_doi, return_dtype=pl.String)
Expand Down

0 comments on commit a27a5a5

Please sign in to comment.