From d992d2c7c52a6a44ec6d082dadd54892765f2134 Mon Sep 17 00:00:00 2001
From: jacobthill
Date: Thu, 26 Sep 2024 12:48:13 -0400
Subject: [PATCH 1/6] Add new authors.csv, adjust published columns and tests

---
 rialto_airflow/harvest/dimensions.py |  4 ++--
 rialto_airflow/harvest/merge_pubs.py |  6 +-----
 rialto_airflow/harvest/openalex.py   |  1 +
 rialto_airflow/harvest/sul_pub.py    | 13 -------------
 test/harvest/test_merge_pubs.py      | 10 +++++-----
 5 files changed, 9 insertions(+), 25 deletions(-)

diff --git a/rialto_airflow/harvest/dimensions.py b/rialto_airflow/harvest/dimensions.py
index 61f1f11..69d0a88 100644
--- a/rialto_airflow/harvest/dimensions.py
+++ b/rialto_airflow/harvest/dimensions.py
@@ -16,10 +16,10 @@ def dois_from_orcid(orcid):
     logging.info(f"looking up dois for orcid {orcid}")
     q = """
-        search publications where researchers.orcid_id = "{}"
+        search publications where researchers.orcid_id = "{}" and year in [2018:{}]
         return publications [doi]
         limit 1000
-        """.format(orcid)
+        """.format(orcid, 2024)

     # The Dimensions API can flake out sometimes, so try to catch & retry.
     # TODO: Consider using retry param in query() instead
diff --git a/rialto_airflow/harvest/merge_pubs.py b/rialto_airflow/harvest/merge_pubs.py
index 3ba11de..7f16ee0 100644
--- a/rialto_airflow/harvest/merge_pubs.py
+++ b/rialto_airflow/harvest/merge_pubs.py
@@ -57,11 +57,7 @@ def dimensions_pubs_df(dimensions_pubs):
             "document_type",
             "funders",
             "funding_section",
-            "linkout",
             "open_access",
-            "publisher",
-            "research_orgs",
-            "researchers",
             "title",
             "type",
             "year",
@@ -86,7 +82,7 @@ def openalex_pubs_df(openalex_pubs):
             "publication_year",
             "title",
             "type",
-            "best_oa_location",
+            "open_access",
         ),
     )
     df = df.rename(lambda column_name: "openalex_" + column_name)
diff --git a/rialto_airflow/harvest/openalex.py b/rialto_airflow/harvest/openalex.py
index c51368c..48cf0d0 100644
--- a/rialto_airflow/harvest/openalex.py
+++ b/rialto_airflow/harvest/openalex.py
@@ -147,6 +147,7 @@ def normalize_publication(pub) -> dict:
     "id",
     "ids",
     "indexed_in",
+    "institution_assertions",
     "institutions_distinct_count",
     "is_authors_truncated",
     "is_paratext",
diff --git a/rialto_airflow/harvest/sul_pub.py b/rialto_airflow/harvest/sul_pub.py
index 9395f1e..c937607 100644
--- a/rialto_airflow/harvest/sul_pub.py
+++ b/rialto_airflow/harvest/sul_pub.py
@@ -7,29 +7,16 @@
 SUL_PUB_FIELDS = [
     "authorship",
     "title",
-    "abstract",
     "author",
     "year",
     "type",
-    "mesh_headings",
-    "publisher",
     "journal",
     "provenance",
     "doi",
-    "issn",
     "sulpubid",
-    "sw_id",
-    "pmid",
-    "identifier",
-    "last_updated",
-    "pages",
     "date",
     "country",
     "booktitle",
-    "edition",
-    "series",
-    "chapter",
-    "editor",
 ]
diff --git a/test/harvest/test_merge_pubs.py b/test/harvest/test_merge_pubs.py
index 28c8807..4fd52b1 100644
--- a/test/harvest/test_merge_pubs.py
+++ b/test/harvest/test_merge_pubs.py
@@ -84,7 +84,7 @@ def openalex_pubs_csv(tmp_path):
         "title",
         "type",
         "doi",
-        "best_oa_location",
+        "open_access",
     ]
     writer.writerow(header)
     writer.writerow(
@@ -97,7 +97,7 @@ def openalex_pubs_csv(tmp_path):
             "A Publication",
             "article",
             "https://doi.org/10.0000/cccc",
-            '{is_oa: true, landing_page_url: "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1398957", pdf_url: null, source: { id: "https://openalex.org/S2764455111", display_name: "PubMed Central", issn_l: null, issn: null, host_organization: "https://openalex.org/I1299303238", type: "repository" }, license: null, version: "publishedVersion"}',
+            'green'
         ]
     )
     writer.writerow(
@@ -110,7 +110,7 @@ def openalex_pubs_csv(tmp_path):
             "A Research Article",
Research Article", "article", "https://doi.org/10.0000/1234", - "", + "bronze" ] ) return fixture_file @@ -165,7 +165,7 @@ def test_openalex_pubs_df(openalex_pubs_csv): df = lazy_df.collect() assert df.shape[0] == 2 assert "bogus" not in df.columns, "Unneeded columns have been dropped" - assert "openalex_best_oa_location" in df.columns + assert "openalex_open_access" in df.columns assert df["openalex_doi"].to_list() == ["10.0000/cccc", "10.0000/1234"] @@ -193,7 +193,7 @@ def test_merge(tmp_path, sul_pubs_csv, openalex_pubs_csv, dimensions_pubs_csv): assert output.is_file(), "output file has been created" df = pl.read_parquet(output) assert df.shape[0] == 5 - assert df.shape[1] == 25 + assert df.shape[1] == 21 assert set(df["doi"].to_list()) == set( ["10.0000/aaaa", "10.0000/1234", "10.0000/cccc", "10.0000/dddd", "10.0000/eeee"] ) From f388581928f4a34031805e72917e2fefb98d3e84 Mon Sep 17 00:00:00 2001 From: jacobthill Date: Thu, 26 Sep 2024 12:56:14 -0400 Subject: [PATCH 2/6] changing quotes --- test/harvest/test_merge_pubs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/harvest/test_merge_pubs.py b/test/harvest/test_merge_pubs.py index 4fd52b1..b3e3909 100644 --- a/test/harvest/test_merge_pubs.py +++ b/test/harvest/test_merge_pubs.py @@ -97,7 +97,7 @@ def openalex_pubs_csv(tmp_path): "A Publication", "article", "https://doi.org/10.0000/cccc", - 'green' + "green" ] ) writer.writerow( From 58982356c62bf60b15769fa1b995d76f7ef969b0 Mon Sep 17 00:00:00 2001 From: jacobthill Date: Thu, 26 Sep 2024 14:04:22 -0400 Subject: [PATCH 3/6] ruff format --- test/harvest/test_merge_pubs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/harvest/test_merge_pubs.py b/test/harvest/test_merge_pubs.py index b3e3909..c582b45 100644 --- a/test/harvest/test_merge_pubs.py +++ b/test/harvest/test_merge_pubs.py @@ -97,7 +97,7 @@ def openalex_pubs_csv(tmp_path): "A Publication", "article", "https://doi.org/10.0000/cccc", - "green" + "green", ] ) writer.writerow( @@ -110,7 +110,7 @@ def openalex_pubs_csv(tmp_path): "A Research Article", "article", "https://doi.org/10.0000/1234", - "bronze" + "bronze", ] ) return fixture_file From ccf68161dddb5b2cf48d5b5d5891297719aff6f6 Mon Sep 17 00:00:00 2001 From: jacobthill Date: Thu, 26 Sep 2024 15:45:49 -0400 Subject: [PATCH 4/6] Revert filtering Dimensions by year --- rialto_airflow/harvest/dimensions.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rialto_airflow/harvest/dimensions.py b/rialto_airflow/harvest/dimensions.py index 69d0a88..61f1f11 100644 --- a/rialto_airflow/harvest/dimensions.py +++ b/rialto_airflow/harvest/dimensions.py @@ -16,10 +16,10 @@ def dois_from_orcid(orcid): logging.info(f"looking up dois for orcid {orcid}") q = """ - search publications where researchers.orcid_id = "{}" and year in [2018:{}] + search publications where researchers.orcid_id = "{}" return publications [doi] limit 1000 - """.format(orcid, 2024) + """.format(orcid) # The Dimensions API can flake out sometimes, so try to catch & retry. # TODO: Consider using retry param in query() instead From 9ae9adce08e78cb5df569e115761abeceb4f39c3 Mon Sep 17 00:00:00 2001 From: Ed Summers Date: Thu, 26 Sep 2024 15:41:42 -0400 Subject: [PATCH 5/6] Account for multiple OpenAlex works with the same DOI The dois_from_orcid() now returns a list of unique DOIs, instead of an iterator of possiblly unique DOIs. This allows a failing test to pass. 

The test for openalex_publications_from_dois() was relaxed a bit to look
for 231 or more publications, since a lookup for an individual DOI can
sometimes pull back multiple works.

The number of columns for OpenAlex is now 53 because we added
`institution_assertions`.
---
 rialto_airflow/harvest/openalex.py | 15 +++++++++------
 test/harvest/test_openalex.py      | 11 ++++++-----
 2 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/rialto_airflow/harvest/openalex.py b/rialto_airflow/harvest/openalex.py
index 48cf0d0..93917e0 100644
--- a/rialto_airflow/harvest/openalex.py
+++ b/rialto_airflow/harvest/openalex.py
@@ -37,7 +37,7 @@ def doi_orcids_pickle(authors_csv, pickle_file, limit=None):

 def dois_from_orcid(orcid: str, limit=None):
     """
-    Pass in the ORCID ID and get back an iterator of DOIs for publications authored by that person.
+    Pass in the ORCID ID and get back a list of DOIs for publications authored by that person.
     """

     # TODO: I think we can maybe have this function take a list of orcids and
@@ -57,16 +57,19 @@ def dois_from_orcid(orcid: str, limit=None):
     author_id = authors[0]["id"]

     # get all the works for the openalex author id
-    work_count = 0
+    dois = set()
     for page in (
         Works().filter(author={"id": author_id}).select(["doi"]).paginate(per_page=200)
     ):
         for pub in page:
             if pub.get("doi"):
-                work_count += 1
-                if limit is not None and work_count > limit:
-                    return
-                yield pub.get("doi").replace("https://doi.org/", "")
+                doi = pub.get("doi").replace("https://doi.org/", "")
+                dois.add(doi)
+                if limit is not None and len(dois) == limit:
+                    return list(dois)
+
+    return list(dois)
+


 def publications_csv(dois: list, csv_file: str) -> None:
diff --git a/test/harvest/test_openalex.py b/test/harvest/test_openalex.py
index bbb0cab..9935ee8 100644
--- a/test/harvest/test_openalex.py
+++ b/test/harvest/test_openalex.py
@@ -18,7 +18,7 @@ def test_dois_from_orcid_paging():
     # for Shanhui Fan who has a lot of publications (> 1300)
     dois = list(openalex.dois_from_orcid("0000-0002-0081-9732", limit=300))
     assert len(dois) == 300, "paging is limiting to 200 works"
-    assert len(set(dois)) == 300, "the dois are unique"
+    assert len(set(dois)) == len(dois), "the dois are unique"


 def test_doi_orcids_pickle(tmp_path):
@@ -48,11 +48,12 @@ def test_publications_from_dois():

     # look up the publication metadata for them
     pubs = list(openalex.publications_from_dois(dois))
-    assert len(pubs) == 231, "should paginate (page size=200)"
-    assert len(pubs) == len(set([pub["doi"] for pub in pubs])), "DOIs are unique"
+
+    # >= is used because sometimes there can be multiple works for a DOI!
+    assert len(pubs) >= 231, "should paginate (page size=200)"
     assert set(openalex.FIELDS) == set(pubs[0].keys()), "All fields accounted for."
-    assert len(pubs[0].keys()) == 52, "first publication has 52 columns"
-    assert len(pubs[1].keys()) == 52, "second publication has 52 columns"
+    assert len(pubs[0].keys()) == 53, "first publication has 53 columns"
+    assert len(pubs[1].keys()) == 53, "second publication has 53 columns"


 def test_publications_from_invalid_dois(caplog):

From fd6807752fa98520cdb0d1640be3fb815279f939 Mon Sep 17 00:00:00 2001
From: Ed Summers
Date: Thu, 26 Sep 2024 15:49:41 -0400
Subject: [PATCH 6/6] Reformatted

---
 rialto_airflow/harvest/openalex.py | 1 -
 test/harvest/test_openalex.py      | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/rialto_airflow/harvest/openalex.py b/rialto_airflow/harvest/openalex.py
index 93917e0..b2f99ec 100644
--- a/rialto_airflow/harvest/openalex.py
+++ b/rialto_airflow/harvest/openalex.py
@@ -71,7 +71,6 @@ def dois_from_orcid(orcid: str, limit=None):
     return list(dois)

-
 def publications_csv(dois: list, csv_file: str) -> None:
     """
     Get publication records for a list of DOIs and create a CSV file.
diff --git a/test/harvest/test_openalex.py b/test/harvest/test_openalex.py
index 9935ee8..1b5e2ba 100644
--- a/test/harvest/test_openalex.py
+++ b/test/harvest/test_openalex.py
@@ -48,7 +48,7 @@ def test_publications_from_dois():

     # look up the publication metadata for them
     pubs = list(openalex.publications_from_dois(dois))
-
+
     # >= is used because sometimes there can be multiple works for a DOI!
     assert len(pubs) >= 231, "should paginate (page size=200)"
     assert set(openalex.FIELDS) == set(pubs[0].keys()), "All fields accounted for."