Add new authors.csv, adjust published columns and tests

sul-dlss · Sep 26, 2024 · d992d2c
1 parent 6d294e8
commit d992d2c
Show file tree

Hide file tree

Showing 5 changed files with 9 additions and 25 deletions.
diff --git a/rialto_airflow/harvest/dimensions.py b/rialto_airflow/harvest/dimensions.py
@@ -16,10 +16,10 @@
 def dois_from_orcid(orcid):
     logging.info(f"looking up dois for orcid {orcid}")
     q = """
-        search publications where researchers.orcid_id = "{}"
+        search publications where researchers.orcid_id = "{}" and year in [2018:{}]
         return publications [doi]
         limit 1000
-        """.format(orcid)
+        """.format(orcid, 2024)
 
     # The Dimensions API can flake out sometimes, so try to catch & retry.
     # TODO: Consider using retry param in query() instead

diff --git a/rialto_airflow/harvest/merge_pubs.py b/rialto_airflow/harvest/merge_pubs.py
@@ -57,11 +57,7 @@ def dimensions_pubs_df(dimensions_pubs):
             "document_type",
             "funders",
             "funding_section",
-            "linkout",
             "open_access",
-            "publisher",
-            "research_orgs",
-            "researchers",
             "title",
             "type",
             "year",
@@ -86,7 +82,7 @@ def openalex_pubs_df(openalex_pubs):
             "publication_year",
             "title",
             "type",
-            "best_oa_location",
+            "open_access",
         ),
     )
     df = df.rename(lambda column_name: "openalex_" + column_name)

diff --git a/rialto_airflow/harvest/openalex.py b/rialto_airflow/harvest/openalex.py
@@ -147,6 +147,7 @@ def normalize_publication(pub) -> dict:
     "id",
     "ids",
     "indexed_in",
+    "institution_assertions",
     "institutions_distinct_count",
     "is_authors_truncated",
     "is_paratext",

diff --git a/rialto_airflow/harvest/sul_pub.py b/rialto_airflow/harvest/sul_pub.py
@@ -7,29 +7,16 @@
 SUL_PUB_FIELDS = [
     "authorship",
     "title",
-    "abstract",
     "author",
     "year",
     "type",
-    "mesh_headings",
-    "publisher",
     "journal",
     "provenance",
     "doi",
-    "issn",
     "sulpubid",
-    "sw_id",
-    "pmid",
-    "identifier",
-    "last_updated",
-    "pages",
     "date",
     "country",
     "booktitle",
-    "edition",
-    "series",
-    "chapter",
-    "editor",
 ]
 
 

diff --git a/test/harvest/test_merge_pubs.py b/test/harvest/test_merge_pubs.py
@@ -84,7 +84,7 @@ def openalex_pubs_csv(tmp_path):
             "title",
             "type",
             "doi",
-            "best_oa_location",
+            "open_access",
         ]
         writer.writerow(header)
         writer.writerow(
@@ -97,7 +97,7 @@ def openalex_pubs_csv(tmp_path):
                 "A Publication",
                 "article",
                 "https://doi.org/10.0000/cccc",
-                '{is_oa: true, landing_page_url: "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1398957", pdf_url: null, source: { id: "https://openalex.org/S2764455111", display_name: "PubMed Central", issn_l: null, issn: null, host_organization: "https://openalex.org/I1299303238", type: "repository" }, license: null, version: "publishedVersion"}',
+                'green'
             ]
         )
         writer.writerow(
@@ -110,7 +110,7 @@ def openalex_pubs_csv(tmp_path):
                 "A Research Article",
                 "article",
                 "https://doi.org/10.0000/1234",
-                "",
+                "bronze"
             ]
         )
     return fixture_file
@@ -165,7 +165,7 @@ def test_openalex_pubs_df(openalex_pubs_csv):
     df = lazy_df.collect()
     assert df.shape[0] == 2
     assert "bogus" not in df.columns, "Unneeded columns have been dropped"
-    assert "openalex_best_oa_location" in df.columns
+    assert "openalex_open_access" in df.columns
     assert df["openalex_doi"].to_list() == ["10.0000/cccc", "10.0000/1234"]
 
 
@@ -193,7 +193,7 @@ def test_merge(tmp_path, sul_pubs_csv, openalex_pubs_csv, dimensions_pubs_csv):
     assert output.is_file(), "output file has been created"
     df = pl.read_parquet(output)
     assert df.shape[0] == 5
-    assert df.shape[1] == 25
+    assert df.shape[1] == 21
     assert set(df["doi"].to_list()) == set(
         ["10.0000/aaaa", "10.0000/1234", "10.0000/cccc", "10.0000/dddd", "10.0000/eeee"]
     )