From d992d2c7c52a6a44ec6d082dadd54892765f2134 Mon Sep 17 00:00:00 2001
From: jacobthill
Date: Thu, 26 Sep 2024 12:48:13 -0400
Subject: [PATCH 1/6] Add new authors.csv, adjust published columns and tests

---
 rialto_airflow/harvest/dimensions.py |  4 ++--
 rialto_airflow/harvest/merge_pubs.py |  6 +-----
 rialto_airflow/harvest/openalex.py   |  1 +
 rialto_airflow/harvest/sul_pub.py    | 13 -------------
 test/harvest/test_merge_pubs.py      | 10 +++++-----
 5 files changed, 9 insertions(+), 25 deletions(-)

diff --git a/rialto_airflow/harvest/dimensions.py b/rialto_airflow/harvest/dimensions.py
index 61f1f11..69d0a88 100644
--- a/rialto_airflow/harvest/dimensions.py
+++ b/rialto_airflow/harvest/dimensions.py
@@ -16,10 +16,10 @@ def dois_from_orcid(orcid):
     logging.info(f"looking up dois for orcid {orcid}")
     q = """
-        search publications where researchers.orcid_id = "{}"
+        search publications where researchers.orcid_id = "{}" and year in [2018:{}]
         return publications [doi]
         limit 1000
-        """.format(orcid)
+        """.format(orcid, 2024)

     # The Dimensions API can flake out sometimes, so try to catch & retry.
     # TODO: Consider using retry param in query() instead
diff --git a/rialto_airflow/harvest/merge_pubs.py b/rialto_airflow/harvest/merge_pubs.py
index 3ba11de..7f16ee0 100644
--- a/rialto_airflow/harvest/merge_pubs.py
+++ b/rialto_airflow/harvest/merge_pubs.py
@@ -57,11 +57,7 @@ def dimensions_pubs_df(dimensions_pubs):
             "document_type",
             "funders",
             "funding_section",
-            "linkout",
             "open_access",
-            "publisher",
-            "research_orgs",
-            "researchers",
             "title",
             "type",
             "year",
@@ -86,7 +82,7 @@ def openalex_pubs_df(openalex_pubs):
             "publication_year",
             "title",
             "type",
-            "best_oa_location",
+            "open_access",
         ),
     )
     df = df.rename(lambda column_name: "openalex_" + column_name)
diff --git a/rialto_airflow/harvest/openalex.py b/rialto_airflow/harvest/openalex.py
index c51368c..48cf0d0 100644
--- a/rialto_airflow/harvest/openalex.py
+++ b/rialto_airflow/harvest/openalex.py
@@ -147,6 +147,7 @@ def normalize_publication(pub) -> dict:
     "id",
     "ids",
     "indexed_in",
+    "institution_assertions",
     "institutions_distinct_count",
     "is_authors_truncated",
     "is_paratext",
diff --git a/rialto_airflow/harvest/sul_pub.py b/rialto_airflow/harvest/sul_pub.py
index 9395f1e..c937607 100644
--- a/rialto_airflow/harvest/sul_pub.py
+++ b/rialto_airflow/harvest/sul_pub.py
@@ -7,29 +7,16 @@
 SUL_PUB_FIELDS = [
     "authorship",
     "title",
-    "abstract",
     "author",
     "year",
     "type",
-    "mesh_headings",
-    "publisher",
     "journal",
     "provenance",
     "doi",
-    "issn",
     "sulpubid",
-    "sw_id",
-    "pmid",
-    "identifier",
-    "last_updated",
-    "pages",
     "date",
     "country",
     "booktitle",
-    "edition",
-    "series",
-    "chapter",
-    "editor",
 ]
diff --git a/test/harvest/test_merge_pubs.py b/test/harvest/test_merge_pubs.py
index 28c8807..4fd52b1 100644
--- a/test/harvest/test_merge_pubs.py
+++ b/test/harvest/test_merge_pubs.py
@@ -84,7 +84,7 @@ def openalex_pubs_csv(tmp_path):
         "title",
         "type",
         "doi",
-        "best_oa_location",
+        "open_access",
     ]
     writer.writerow(header)
     writer.writerow(
@@ -97,7 +97,7 @@ def openalex_pubs_csv(tmp_path):
             "A Publication",
             "article",
             "https://doi.org/10.0000/cccc",
-            '{is_oa: true, landing_page_url: "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1398957", pdf_url: null, source: { id: "https://openalex.org/S2764455111", display_name: "PubMed Central", issn_l: null, issn: null, host_organization: "https://openalex.org/I1299303238", type: "repository" }, license: null, version: "publishedVersion"}',
+            'green'
         ]
     )
     writer.writerow(
@@ -110,7 +110,7 @@ def openalex_pubs_csv(tmp_path):
             "A Research Article",
Research Article", "article", "https://doi.org/10.0000/1234", - "", + "bronze" ] ) return fixture_file @@ -165,7 +165,7 @@ def test_openalex_pubs_df(openalex_pubs_csv): df = lazy_df.collect() assert df.shape[0] == 2 assert "bogus" not in df.columns, "Unneeded columns have been dropped" - assert "openalex_best_oa_location" in df.columns + assert "openalex_open_access" in df.columns assert df["openalex_doi"].to_list() == ["10.0000/cccc", "10.0000/1234"] @@ -193,7 +193,7 @@ def test_merge(tmp_path, sul_pubs_csv, openalex_pubs_csv, dimensions_pubs_csv): assert output.is_file(), "output file has been created" df = pl.read_parquet(output) assert df.shape[0] == 5 - assert df.shape[1] == 25 + assert df.shape[1] == 21 assert set(df["doi"].to_list()) == set( ["10.0000/aaaa", "10.0000/1234", "10.0000/cccc", "10.0000/dddd", "10.0000/eeee"] ) From f388581928f4a34031805e72917e2fefb98d3e84 Mon Sep 17 00:00:00 2001 From: jacobthill Date: Thu, 26 Sep 2024 12:56:14 -0400 Subject: [PATCH 2/6] changing quotes --- test/harvest/test_merge_pubs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/harvest/test_merge_pubs.py b/test/harvest/test_merge_pubs.py index 4fd52b1..b3e3909 100644 --- a/test/harvest/test_merge_pubs.py +++ b/test/harvest/test_merge_pubs.py @@ -97,7 +97,7 @@ def openalex_pubs_csv(tmp_path): "A Publication", "article", "https://doi.org/10.0000/cccc", - 'green' + "green" ] ) writer.writerow( From 58982356c62bf60b15769fa1b995d76f7ef969b0 Mon Sep 17 00:00:00 2001 From: jacobthill Date: Thu, 26 Sep 2024 14:04:22 -0400 Subject: [PATCH 3/6] ruff format --- test/harvest/test_merge_pubs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/harvest/test_merge_pubs.py b/test/harvest/test_merge_pubs.py index b3e3909..c582b45 100644 --- a/test/harvest/test_merge_pubs.py +++ b/test/harvest/test_merge_pubs.py @@ -97,7 +97,7 @@ def openalex_pubs_csv(tmp_path): "A Publication", "article", "https://doi.org/10.0000/cccc", - "green" + "green", ] ) writer.writerow( @@ -110,7 +110,7 @@ def openalex_pubs_csv(tmp_path): "A Research Article", "article", "https://doi.org/10.0000/1234", - "bronze" + "bronze", ] ) return fixture_file From ccf68161dddb5b2cf48d5b5d5891297719aff6f6 Mon Sep 17 00:00:00 2001 From: jacobthill Date: Thu, 26 Sep 2024 15:45:49 -0400 Subject: [PATCH 4/6] Revert filtering Dimensions by year --- rialto_airflow/harvest/dimensions.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rialto_airflow/harvest/dimensions.py b/rialto_airflow/harvest/dimensions.py index 69d0a88..61f1f11 100644 --- a/rialto_airflow/harvest/dimensions.py +++ b/rialto_airflow/harvest/dimensions.py @@ -16,10 +16,10 @@ def dois_from_orcid(orcid): logging.info(f"looking up dois for orcid {orcid}") q = """ - search publications where researchers.orcid_id = "{}" and year in [2018:{}] + search publications where researchers.orcid_id = "{}" return publications [doi] limit 1000 - """.format(orcid, 2024) + """.format(orcid) # The Dimensions API can flake out sometimes, so try to catch & retry. # TODO: Consider using retry param in query() instead From 9ae9adce08e78cb5df569e115761abeceb4f39c3 Mon Sep 17 00:00:00 2001 From: Ed Summers Date: Thu, 26 Sep 2024 15:41:42 -0400 Subject: [PATCH 5/6] Account for multiple OpenAlex works with the same DOI The dois_from_orcid() now returns a list of unique DOIs, instead of an iterator of possiblly unique DOIs. This allows a failing test to pass. 

The test for openalex_publications_from_dois() was relaxed a bit to look
for 231 or more publications, since a lookup for an individual DOI can
sometimes pull back multiple works.

The number of columns for OpenAlex is now 53 because we added
`institution_assertions`.
---
 rialto_airflow/harvest/openalex.py | 15 +++++++++------
 test/harvest/test_openalex.py      | 11 ++++++-----
 2 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/rialto_airflow/harvest/openalex.py b/rialto_airflow/harvest/openalex.py
index 48cf0d0..93917e0 100644
--- a/rialto_airflow/harvest/openalex.py
+++ b/rialto_airflow/harvest/openalex.py
@@ -37,7 +37,7 @@ def doi_orcids_pickle(authors_csv, pickle_file, limit=None):

 def dois_from_orcid(orcid: str, limit=None):
     """
-    Pass in the ORCID ID and get back an iterator of DOIs for publications authored by that person.
+    Pass in the ORCID ID and get back a list of DOIs for publications authored by that person.
     """

     # TODO: I think we can maybe have this function take a list of orcids and
@@ -57,16 +57,19 @@ def dois_from_orcid(orcid: str, limit=None):
     author_id = authors[0]["id"]

     # get all the works for the openalex author id
-    work_count = 0
+    dois = set()
     for page in (
         Works().filter(author={"id": author_id}).select(["doi"]).paginate(per_page=200)
     ):
         for pub in page:
             if pub.get("doi"):
-                work_count += 1
-                if limit is not None and work_count > limit:
-                    return
-                yield pub.get("doi").replace("https://doi.org/", "")
+                doi = pub.get("doi").replace("https://doi.org/", "")
+                dois.add(doi)
+                if limit is not None and len(dois) == limit:
+                    return list(dois)
+
+    return list(dois)
+


 def publications_csv(dois: list, csv_file: str) -> None:
diff --git a/test/harvest/test_openalex.py b/test/harvest/test_openalex.py
index bbb0cab..9935ee8 100644
--- a/test/harvest/test_openalex.py
+++ b/test/harvest/test_openalex.py
@@ -18,7 +18,7 @@ def test_dois_from_orcid_paging():
     # for Shanhui Fan who has a lot of publications (> 1300)
     dois = list(openalex.dois_from_orcid("0000-0002-0081-9732", limit=300))
     assert len(dois) == 300, "paging is limiting to 200 works"
-    assert len(set(dois)) == 300, "the dois are unique"
+    assert len(set(dois)) == len(dois), "the dois are unique"


 def test_doi_orcids_pickle(tmp_path):
@@ -48,11 +48,12 @@ def test_publications_from_dois():

     # look up the publication metadata for them
     pubs = list(openalex.publications_from_dois(dois))
-    assert len(pubs) == 231, "should paginate (page size=200)"
-    assert len(pubs) == len(set([pub["doi"] for pub in pubs])), "DOIs are unique"
+
+    # >= is used because sometimes there can be multiple works for a DOI!
+    assert len(pubs) >= 231, "should paginate (page size=200)"
     assert set(openalex.FIELDS) == set(pubs[0].keys()), "All fields accounted for."
-    assert len(pubs[0].keys()) == 52, "first publication has 52 columns"
-    assert len(pubs[1].keys()) == 52, "second publication has 52 columns"
+    assert len(pubs[0].keys()) == 53, "first publication has 53 columns"
+    assert len(pubs[1].keys()) == 53, "second publication has 53 columns"


 def test_publications_from_invalid_dois(caplog):

From fd6807752fa98520cdb0d1640be3fb815279f939 Mon Sep 17 00:00:00 2001
From: Ed Summers
Date: Thu, 26 Sep 2024 15:49:41 -0400
Subject: [PATCH 6/6] Reformatted

---
 rialto_airflow/harvest/openalex.py | 1 -
 test/harvest/test_openalex.py      | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/rialto_airflow/harvest/openalex.py b/rialto_airflow/harvest/openalex.py
index 93917e0..b2f99ec 100644
--- a/rialto_airflow/harvest/openalex.py
+++ b/rialto_airflow/harvest/openalex.py
@@ -71,7 +71,6 @@ def dois_from_orcid(orcid: str, limit=None):
     return list(dois)

-
 def publications_csv(dois: list, csv_file: str) -> None:
     """
     Get publication records for a list of DOIs and create a CSV file.
diff --git a/test/harvest/test_openalex.py b/test/harvest/test_openalex.py
index 9935ee8..1b5e2ba 100644
--- a/test/harvest/test_openalex.py
+++ b/test/harvest/test_openalex.py
@@ -48,7 +48,7 @@ def test_publications_from_dois():

     # look up the publication metadata for them
     pubs = list(openalex.publications_from_dois(dois))
-
+
     # >= is used because sometimes there can be multiple works for a DOI!
     assert len(pubs) >= 231, "should paginate (page size=200)"
     assert set(openalex.FIELDS) == set(pubs[0].keys()), "All fields accounted for."