From 8f36592b79c16a60f0e93e3732ee8b22e7c27564 Mon Sep 17 00:00:00 2001
From: "Mark A. Miller"
Date: Mon, 27 Jan 2025 11:29:06 -0500
Subject: [PATCH 1/6] minimal aggregation of value sets

---
 .../scripts/aggregate_value_sets.py           | 77 +++++++++++++++++++
 1 file changed, 77 insertions(+)
 create mode 100644 src/nmdc_submission_schema/scripts/aggregate_value_sets.py

diff --git a/src/nmdc_submission_schema/scripts/aggregate_value_sets.py b/src/nmdc_submission_schema/scripts/aggregate_value_sets.py
new file mode 100644
index 00000000..50aeb447
--- /dev/null
+++ b/src/nmdc_submission_schema/scripts/aggregate_value_sets.py
@@ -0,0 +1,77 @@
+import os
+import pandas as pd
+import click
+
+
+@click.command()
+@click.option(
+    '--file', '-f',
+    type=click.Path(exists=True),
+    multiple=True,
+    help="Paths to the input TSV files. This option can be used multiple times to specify multiple files."
+)
+@click.option(
+    '--output', '-o',
+    type=click.Path(),
+    default="Combined_Environmental_Context_Data.tsv",
+    help="Path to the output TSV file. Defaults to 'Combined_Environmental_Context_Data.tsv'."
+)
+def combine_context_files(file, output):
+    """Combine environmental context files into a single TSV file with Extension, Context Field, ENVO Class CURIE, and Label columns.
+
+    \f
+    Everything after the form feed above is hidden from the Click help text.
+
+    The extension is the fourth underscore-separated token of each file name,
+    capitalized (so "plant_associated" yields "Plant"). The context field is
+    chosen by whether "broad", "local" or "medium" occurs in the file name.
+    Inputs lacking an "id" or "label" column contribute no rows.
+
+    :param file: paths of the input TSV files
+    :param output: path of the combined TSV file to write
+    :raises ValueError: if the extension or context field cannot be
+        determined from a file name
+    """
+    combined_data = []
+
+    for file_path in file:
+        # Parse extension and context field from the filename
+        filename = os.path.basename(file_path)
+        parts = filename.split("_")
+        if len(parts) < 4:
+            # Fail with a clear message instead of a bare IndexError
+            raise ValueError(f"Could not determine extension from filename: {filename}")
+        # Only the fourth token is used, so "plant_associated" becomes "Plant"
+        extension = parts[3].capitalize()  # e.g., "Soil" or "Water"
+
+        # Map the context field keyword in the filename to its full name
+        if "broad" in filename:
+            context_field = "env_broad_scale"
+        elif "local" in filename:
+            context_field = "env_local_scale"
+        elif "medium" in filename:
+            context_field = "env_medium"
+        else:
+            raise ValueError(f"Could not determine context field from filename: {filename}")
+
+        # Load the file into a DataFrame
+        df = pd.read_csv(file_path, sep="\t")
+
+        # Extract CURIEs and labels from the "id" and "label" columns
+        if "id" in df.columns and "label" in df.columns:
+            # Drop incomplete rows as a unit: dropping NaNs from each column
+            # separately shifts the survivors and mispairs CURIEs and labels
+            complete_rows = df.dropna(subset=["id", "label"])
+            for curie, label in zip(complete_rows["id"], complete_rows["label"]):
+                combined_data.append((extension, context_field, curie, label))
+
+    # Convert the combined data into a DataFrame
+    final_df = pd.DataFrame(combined_data, columns=["extension", "env_context_field", "class_curie", "label"])
+
+    # Save the DataFrame to a TSV file
+    final_df.to_csv(output, sep="\t", index=False)
+    click.echo(f"Combined data saved to: {output}")
+
+
+if __name__ == "__main__":
+    combine_context_files()

From 798f9cfe980d8ff465ac1d207ae01e61b5854619 Mon Sep 17 00:00:00 2001
From: "Mark A.
Miller" Date: Mon, 27 Jan 2025 16:52:45 -0500 Subject: [PATCH 2/6] makefile target --- project.Makefile | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/project.Makefile b/project.Makefile index 73ccf0a1..f8d9bdea 100644 --- a/project.Makefile +++ b/project.Makefile @@ -319,4 +319,21 @@ notebooks/environmental_context_value_sets/nmdc_env_context_subset_membership.ts $(RUN) python src/nmdc_submission_schema/scripts/create_env_context_robot_template.py \ --envo-owl-path notebooks/environmental_context_value_sets/envo.owl \ --schema-path $< \ - --output-file $@ \ No newline at end of file + --output-file $@ + +local/aggregate_value_sets.tsv: + $(RUN) python src/nmdc_submission_schema/scripts/aggregate_value_sets.py \ + --output $@ \ + --file notebooks/environmental_context_value_sets/water/env_local_scale/post_google_sheets_water_env_local_scale.tsv + +# notebooks/environmental_context_value_sets/plant_associated/env_broad_scale/post_google_sheets_plant_associated_env_broad_scale.tsv + #notebooks/environmental_context_value_sets/plant_associated/env_medium/post_google_sheets_plant_associated_env_medium.tsv + #notebooks/environmental_context_value_sets/soil/env_local_scale/post_google_sheets_soil_env_local_scale.tsv + #notebooks/environmental_context_value_sets/soil/env_broad_scale/post_google_sheets_soil_env_broad_scale.tsv + #notebooks/environmental_context_value_sets/soil/env_medium/discover_excludable_soils_curated.tsv + #notebooks/environmental_context_value_sets/soil/env_medium/post_google_sheets_soil_env_medium.tsv + #notebooks/environmental_context_value_sets/soil/env_medium/discover_excludable_soils.tsv + #notebooks/environmental_context_value_sets/nmdc_env_context_subset_membership.tsv + #notebooks/environmental_context_value_sets/water/env_local_scale/post_google_sheets_water_env_local_scale.tsv + #notebooks/environmental_context_value_sets/water/env_broad_scale/post_google_sheets_water_env_broad_scale.tsv + 
#notebooks/environmental_context_value_sets/water/env_medium/post_google_sheets_water_env_medium.tsv \ No newline at end of file From 8a536e0f2c54438232ab9045bda7d21ef01f6d06 Mon Sep 17 00:00:00 2001 From: "Mark A. Miller" Date: Mon, 27 Jan 2025 17:35:32 -0500 Subject: [PATCH 3/6] enrichment targets --- project.Makefile | 126 ++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 113 insertions(+), 13 deletions(-) diff --git a/project.Makefile b/project.Makefile index f8d9bdea..678af840 100644 --- a/project.Makefile +++ b/project.Makefile @@ -324,16 +324,116 @@ notebooks/environmental_context_value_sets/nmdc_env_context_subset_membership.ts local/aggregate_value_sets.tsv: $(RUN) python src/nmdc_submission_schema/scripts/aggregate_value_sets.py \ --output $@ \ - --file notebooks/environmental_context_value_sets/water/env_local_scale/post_google_sheets_water_env_local_scale.tsv - -# notebooks/environmental_context_value_sets/plant_associated/env_broad_scale/post_google_sheets_plant_associated_env_broad_scale.tsv - #notebooks/environmental_context_value_sets/plant_associated/env_medium/post_google_sheets_plant_associated_env_medium.tsv - #notebooks/environmental_context_value_sets/soil/env_local_scale/post_google_sheets_soil_env_local_scale.tsv - #notebooks/environmental_context_value_sets/soil/env_broad_scale/post_google_sheets_soil_env_broad_scale.tsv - #notebooks/environmental_context_value_sets/soil/env_medium/discover_excludable_soils_curated.tsv - #notebooks/environmental_context_value_sets/soil/env_medium/post_google_sheets_soil_env_medium.tsv - #notebooks/environmental_context_value_sets/soil/env_medium/discover_excludable_soils.tsv - #notebooks/environmental_context_value_sets/nmdc_env_context_subset_membership.tsv - #notebooks/environmental_context_value_sets/water/env_local_scale/post_google_sheets_water_env_local_scale.tsv - #notebooks/environmental_context_value_sets/water/env_broad_scale/post_google_sheets_water_env_broad_scale.tsv - 
#notebooks/environmental_context_value_sets/water/env_medium/post_google_sheets_water_env_medium.tsv \ No newline at end of file + --file notebooks/environmental_context_value_sets/plant_associated/env_broad_scale/post_google_sheets_plant_associated_env_broad_scale.tsv \ + --file notebooks/environmental_context_value_sets/plant_associated/env_local_scale/post_google_sheets_plant_associated_env_local_scale.tsv \ + --file notebooks/environmental_context_value_sets/plant_associated/env_medium/post_google_sheets_plant_associated_env_medium.tsv \ + --file notebooks/environmental_context_value_sets/sediment/env_broad_scale/post_google_sheets_sediment_env_broad_scale.tsv \ + --file notebooks/environmental_context_value_sets/sediment/env_local_scale/post_google_sheets_sediment_env_local_scale.tsv \ + --file notebooks/environmental_context_value_sets/sediment/env_medium/post_google_sheets_sediment_env_medium.tsv \ + --file notebooks/environmental_context_value_sets/soil/env_broad_scale/post_google_sheets_soil_env_broad_scale.tsv \ + --file notebooks/environmental_context_value_sets/soil/env_local_scale/post_google_sheets_soil_env_local_scale.tsv \ + --file notebooks/environmental_context_value_sets/soil/env_medium/post_google_sheets_soil_env_medium.tsv \ + --file notebooks/environmental_context_value_sets/water/env_broad_scale/post_google_sheets_water_env_broad_scale.tsv \ + --file notebooks/environmental_context_value_sets/water/env_local_scale/post_google_sheets_water_env_local_scale.tsv \ + --file notebooks/environmental_context_value_sets/water/env_medium/post_google_sheets_water_env_medium.tsv + +local/soil_env_broad_scale_curies.txt: local/aggregate_value_sets.tsv + egrep 'Soil env_broad_scale' $< | cut -f 3 > $@ + +local/soil_env_local_scale_curies.txt: local/aggregate_value_sets.tsv + egrep 'Soil env_local_scale' $< | cut -f 3 > $@ + +local/soil_env_medium_curies.txt: local/aggregate_value_sets.tsv + egrep 'Soil env_medium' $< | cut -f 3 > $@ + 
+local/water_env_broad_scale_curies.txt: local/aggregate_value_sets.tsv + egrep 'Water env_broad_scale' $< | cut -f 3 > $@ + +local/water_env_local_scale_curies.txt: local/aggregate_value_sets.tsv + egrep 'Water env_local_scale' $< | cut -f 3 > $@ + +local/water_env_medium_curies.txt: local/aggregate_value_sets.tsv + egrep 'Water env_medium' $< | cut -f 3 > $@ + +local/plant_associated_env_broad_scale_curies.txt: local/aggregate_value_sets.tsv + egrep 'Plant env_broad_scale' $< | cut -f 3 > $@ + +local/plant_associated_env_local_scale_curies.txt: local/aggregate_value_sets.tsv + egrep 'Plant env_local_scale' $< | cut -f 3 > $@ + +local/plant_associated_env_medium_curies.txt: local/aggregate_value_sets.tsv + egrep 'Plant env_medium' $< | cut -f 3 > $@ + + +local/soil_env_broad_scale_enrichment.tsv: local/soil_env_broad_scale_curies.txt + $(RUN) runoak -i sqlite:obo:envo enrichment \ + --ontology-only \ + -p i \ + -O tsv \ + -o $@ .idfile $< + +local/soil_env_local_scale_enrichment.tsv: local/soil_env_local_scale_curies.txt + $(RUN) runoak -i sqlite:obo:envo enrichment \ + --ontology-only \ + -p i \ + -O tsv \ + -o $@ .idfile $< + +local/soil_env_medium_enrichment.tsv: local/soil_env_medium_curies.txt + $(RUN) runoak -i sqlite:obo:envo enrichment \ + --ontology-only \ + -p i \ + -O tsv \ + -o $@ .idfile $< + +local/water_env_broad_scale_enrichment.tsv: local/water_env_broad_scale_curies.txt + $(RUN) runoak -i sqlite:obo:envo enrichment \ + --ontology-only \ + -p i \ + -O tsv \ + -o $@ .idfile $< + +local/water_env_local_scale_enrichment.tsv: local/water_env_local_scale_curies.txt + $(RUN) runoak -i sqlite:obo:envo enrichment \ + --ontology-only \ + -p i \ + -O tsv \ + -o $@ .idfile $< + +local/water_env_medium_enrichment.tsv: local/water_env_medium_curies.txt + # --filter-redundant limits this to liquid water + $(RUN) runoak -i sqlite:obo:envo enrichment \ + --ontology-only \ + -p i \ + -O tsv \ + -o $@ .idfile $< + 
+local/plant_associated_env_broad_scale_enrichment.tsv: local/plant_associated_env_broad_scale_curies.txt + $(RUN) runoak -i sqlite:obo:envo enrichment \ + --ontology-only \ + -p i \ + -O tsv \ + -o $@ .idfile $< + +local/plant_associated_env_local_scale_enrichment.tsv: local/plant_associated_env_local_scale_curies.txt + $(RUN) runoak -i sqlite:obo:envo enrichment \ + --ontology-only \ + -p i \ + -O tsv \ + -o $@ .idfile $< + +local/plant_associated_env_medium_enrichment_by_envo.tsv: local/plant_associated_env_medium_curies.txt + $(RUN) runoak -i sqlite:obo:envo enrichment \ + --ontology-only \ + -p i \ + -O tsv \ + -o $@ .idfile $< + +local/plant_associated_env_medium_enrichment_by_po.tsv: local/plant_associated_env_medium_curies.txt + $(RUN) runoak -i sqlite:obo:po enrichment \ + --ontology-only \ + -p i \ + -O tsv \ + -o $@ .idfile $< + + From 0e29e8e6db72c211055f2aad89ef89b390af0df9 Mon Sep 17 00:00:00 2001 From: "Mark A. Miller" Date: Mon, 27 Jan 2025 17:49:58 -0500 Subject: [PATCH 4/6] math domain error with repeated --ontology-only observations --- project.Makefile | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/project.Makefile b/project.Makefile index 678af840..5ab2d309 100644 --- a/project.Makefile +++ b/project.Makefile @@ -364,6 +364,17 @@ local/plant_associated_env_local_scale_curies.txt: local/aggregate_value_sets.ts local/plant_associated_env_medium_curies.txt: local/aggregate_value_sets.tsv egrep 'Plant env_medium' $< | cut -f 3 > $@ +local/sediment_env_broad_scale_curies.txt: local/aggregate_value_sets.tsv + egrep 'Sediment env_broad_scale' $< | cut -f 3 > $@ + +local/sediment_env_local_scale_curies.txt: local/aggregate_value_sets.tsv + egrep 'Sediment env_local_scale' $< | cut -f 3 > $@ + +local/sediment_env_medium_curies.txt: local/aggregate_value_sets.tsv + egrep 'Sediment env_medium' $< | cut -f 3 > $@ + +local/env_medium_curies.txt: local/aggregate_value_sets.tsv + egrep 'env_medium' $< | cut -f 3 > $@ 
local/soil_env_broad_scale_enrichment.tsv: local/soil_env_broad_scale_curies.txt $(RUN) runoak -i sqlite:obo:envo enrichment \ @@ -436,4 +447,30 @@ local/plant_associated_env_medium_enrichment_by_po.tsv: local/plant_associated_e -O tsv \ -o $@ .idfile $< +local/sediment_env_medium_enrichment.tsv: local/sediment_env_medium_curies.txt + $(RUN) runoak -i sqlite:obo:envo enrichment \ + --ontology-only \ + -p i \ + -O tsv \ + -o $@ .idfile $< +local/sediment_env_broad_scale_enrichment.tsv: local/sediment_env_broad_scale_curies.txt + $(RUN) runoak -i sqlite:obo:envo enrichment \ + --ontology-only \ + -p i \ + -O tsv \ + -o $@ .idfile $< + +local/sediment_env_local_scale_enrichment.tsv: local/sediment_env_local_scale_curies.txt + $(RUN) runoak -i sqlite:obo:envo enrichment \ + --ontology-only \ + -p i \ + -O tsv \ + -o $@ .idfile $< + +local/env_medium_enrichment.tsv: local/env_medium_curies.txt + $(RUN) runoak -i sqlite:obo:envo enrichment \ + --ontology-only \ + -p i \ + -O tsv \ + -o $@ .idfile $< From 61355a36e914689ff47025b1b8d9b01f21ef0c62 Mon Sep 17 00:00:00 2001 From: "Mark A. 
Miller" Date: Wed, 29 Jan 2025 09:56:04 -0500 Subject: [PATCH 5/6] even longer project.Makefile --- project.Makefile | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/project.Makefile b/project.Makefile index 5ab2d309..2edc0857 100644 --- a/project.Makefile +++ b/project.Makefile @@ -373,8 +373,12 @@ local/sediment_env_local_scale_curies.txt: local/aggregate_value_sets.tsv local/sediment_env_medium_curies.txt: local/aggregate_value_sets.tsv egrep 'Sediment env_medium' $< | cut -f 3 > $@ -local/env_medium_curies.txt: local/aggregate_value_sets.tsv - egrep 'env_medium' $< | cut -f 3 > $@ +local/env_broad_scale_curies.txt: local/aggregate_value_sets.tsv + egrep 'env_broad_scale' $< | cut -f 3 | sort | uniq > $@ + +local/env_local_scale_curies.txt: local/aggregate_value_sets.tsv + egrep 'env_local_scale' $< | cut -f 3 | sort | uniq > $@ + local/soil_env_broad_scale_enrichment.tsv: local/soil_env_broad_scale_curies.txt $(RUN) runoak -i sqlite:obo:envo enrichment \ @@ -468,7 +472,14 @@ local/sediment_env_local_scale_enrichment.tsv: local/sediment_env_local_scale_cu -O tsv \ -o $@ .idfile $< -local/env_medium_enrichment.tsv: local/env_medium_curies.txt +local/env_broad_scale_enrichment.tsv: local/env_broad_scale_curies.txt + $(RUN) runoak -i sqlite:obo:envo enrichment \ + --ontology-only \ + -p i \ + -O tsv \ + -o $@ .idfile $< + +local/env_local_scale_enrichment.tsv: local/env_local_scale_curies.txt $(RUN) runoak -i sqlite:obo:envo enrichment \ --ontology-only \ -p i \ From b817b2fc84f298d3026deabfa297f2db4d0ed2b5 Mon Sep 17 00:00:00 2001 From: "Mark A. 
Miller" Date: Wed, 29 Jan 2025 09:58:27 -0500 Subject: [PATCH 6/6] ignore envo.owl --- notebooks/environmental_context_value_sets/.gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/notebooks/environmental_context_value_sets/.gitignore b/notebooks/environmental_context_value_sets/.gitignore index d01f5cc3..03312ed5 100644 --- a/notebooks/environmental_context_value_sets/.gitignore +++ b/notebooks/environmental_context_value_sets/.gitignore @@ -1 +1,3 @@ env-context-voting-sheets-29d0d970ca1d.json +envo.owl +