[REF] Refactor utilities #385

Merged: 15 commits, merged on Nov 11, 2024
bagel/cli.py (122 changes: 57 additions & 65 deletions)
@@ -1,22 +1,18 @@
 import json
 from pathlib import Path
 
 import typer
 from bids import BIDSLayout
 
-import bagel.bids_utils as butil
-import bagel.derivatives_utils as dutil
-import bagel.file_utils as futil
-import bagel.pheno_utils as putil
 from bagel import mappings, models
-from bagel.derivatives_utils import PROC_STATUS_COLS
-from bagel.utility import (
-    confirm_subs_match_pheno_data,
-    extract_and_validate_jsonld_dataset,
-    generate_context,
-    get_imaging_session_instances,
-    get_subject_instances,
-)
+from .utilities import (
+    bids_utils,
+    derivative_utils,
+    file_utils,
+    model_utils,
+    pheno_utils,
+)
+from .utilities.derivative_utils import PROC_STATUS_COLS
 
 CUSTOM_SESSION_LABEL = "ses-unnamed"
 
@@ -66,7 +62,7 @@ def pheno(
         None,
         "--portal",
         "-u",  # for URL
-        callback=putil.validate_portal_uri,
+        callback=pheno_utils.validate_portal_uri,
         help="URL (HTTP/HTTPS) to a website or page that describes the dataset and access instructions (if available).",
     ),
     output: Path = typer.Option(
@@ -94,11 +90,11 @@ def pheno(
     graph data model for the provided phenotypic file in the .jsonld format.
     You can upload this .jsonld file to the Neurobagel graph.
     """
-    futil.check_overwrite(output, overwrite)
+    file_utils.check_overwrite(output, overwrite)
 
-    data_dictionary = futil.load_json(dictionary)
-    pheno_df = futil.load_tabular(pheno)
-    putil.validate_inputs(data_dictionary, pheno_df)
+    data_dictionary = file_utils.load_json(dictionary)
+    pheno_df = file_utils.load_tabular(pheno)
+    pheno_utils.validate_inputs(data_dictionary, pheno_df)
 
     # NOTE: `space` determines the amount of padding (in num. characters) before the file paths in the print statement.
     # It is currently calculated as = (length of the longer string, including the 3 leading spaces) + (2 extra spaces)
@@ -111,8 +107,8 @@ def pheno(
 
     subject_list = []
 
-    column_mapping = putil.map_categories_to_columns(data_dictionary)
-    tool_mapping = putil.map_tools_to_columns(data_dictionary)
+    column_mapping = pheno_utils.map_categories_to_columns(data_dictionary)
+    tool_mapping = pheno_utils.map_tools_to_columns(data_dictionary)
 
     # TODO: needs refactoring once we handle multiple participant IDs
     participants = column_mapping.get("participant")[0]
@@ -140,15 +136,15 @@ def pheno(
             _ses_pheno = session_row
 
             if "sex" in column_mapping.keys():
-                _sex_vals = putil.get_transformed_values(
+                _sex_vals = pheno_utils.get_transformed_values(
                     column_mapping["sex"], _ses_pheno, data_dictionary
                 )
                 if _sex_vals:
                     # NOTE: Our data model only allows a single sex value, so we only take the first instance if multiple columns are about sex
                     session.hasSex = models.Sex(identifier=_sex_vals[0])
 
             if "diagnosis" in column_mapping.keys():
-                _dx_vals = putil.get_transformed_values(
+                _dx_vals = pheno_utils.get_transformed_values(
                     column_mapping["diagnosis"], _ses_pheno, data_dictionary
                 )
                 if not _dx_vals:
@@ -166,7 +162,7 @@ def pheno(
                 ]
 
             if "age" in column_mapping.keys():
-                _age_vals = putil.get_transformed_values(
+                _age_vals = pheno_utils.get_transformed_values(
                     column_mapping["age"], _ses_pheno, data_dictionary
                 )
                 if _age_vals:
@@ -177,7 +173,7 @@ def pheno(
             _assessments = [
                 models.Assessment(identifier=tool)
                 for tool, columns in tool_mapping.items()
-                if putil.are_any_available(
+                if pheno_utils.are_any_available(
                     columns, _ses_pheno, data_dictionary
                 )
             ]
@@ -197,16 +193,10 @@ def pheno(
         hasSamples=subject_list,
     )
 
-    context = generate_context()
-    # We can't just exclude_unset here because the identifier and schemaKey
-    # for each instance are created as default values and so technically are never set
-    # TODO: we should revisit this because there may be reasons to have None be meaningful in the future
-    context.update(**dataset.dict(exclude_none=True))
-
-    with open(output, "w") as f:
-        f.write(json.dumps(context, indent=2))
-
-    print(f"Saved output to: {output}")
+    file_utils.save_jsonld(
+        data=model_utils.add_context_to_graph_dataset(dataset),
+        filename=output,
+    )
 
 
 @bagel.command()
@@ -258,7 +248,7 @@ def bids(
     graph data model for the combined metadata in the .jsonld format.
     You can upload this .jsonld file to the Neurobagel graph.
     """
-    futil.check_overwrite(output, overwrite)
+    file_utils.check_overwrite(output, overwrite)
 
     space = 51
     print(
@@ -267,13 +257,15 @@ def bids(
         f" {'BIDS dataset directory:' : <{space}} {bids_dir}"
     )
 
-    jsonld_dataset = extract_and_validate_jsonld_dataset(jsonld_path)
+    jsonld_dataset = model_utils.extract_and_validate_jsonld_dataset(
+        jsonld_path
+    )
 
-    existing_subs_dict = get_subject_instances(jsonld_dataset)
+    existing_subs_dict = model_utils.get_subject_instances(jsonld_dataset)
 
     # TODO: Revert to using Layout.get_subjects() to get BIDS subjects once pybids performance is improved
-    confirm_subs_match_pheno_data(
-        subjects=butil.get_bids_subjects_simple(bids_dir),
+    model_utils.confirm_subs_match_pheno_data(
+        subjects=bids_utils.get_bids_subjects_simple(bids_dir),
         subject_source_for_err="BIDS directory",
         pheno_subjects=existing_subs_dict.keys(),
     )
@@ -287,7 +279,7 @@ def bids(
     print("Merging BIDS metadata with existing subject annotations...\n")
     for bids_sub_id in layout.get_subjects():
         existing_subject = existing_subs_dict.get(f"sub-{bids_sub_id}")
-        existing_sessions_dict = get_imaging_session_instances(
+        existing_sessions_dict = model_utils.get_imaging_session_instances(
             existing_subject
         )
 
@@ -300,7 +292,7 @@ def bids(
         # For some reason .get_sessions() doesn't always follow alphanumeric order
         # By default (without sorting) the session lists look like ["02", "01"] per subject
         for session_id in sorted(bids_sessions):
-            image_list = butil.create_acquisitions(
+            image_list = bids_utils.create_acquisitions(
                 layout=layout,
                 bids_sub_id=bids_sub_id,
                 session=session_id,
@@ -321,7 +313,7 @@ def bids(
                 if session_id is None
                 else f"ses-{session_id}"
             )
-            session_path = butil.get_session_path(
+            session_path = bids_utils.get_session_path(
                 layout=layout,
                 bids_dir=bids_dir,
                 bids_sub_id=bids_sub_id,
@@ -344,13 +336,10 @@ def bids(
             )
             existing_subject.hasSession.append(new_imaging_session)
 
-    context = generate_context()
-    merged_dataset = {**context, **jsonld_dataset.dict(exclude_none=True)}
-
-    with open(output, "w") as f:
-        f.write(json.dumps(merged_dataset, indent=2))
-
-    print(f"Saved output to: {output}")
+    file_utils.save_jsonld(
+        data=model_utils.add_context_to_graph_dataset(jsonld_dataset),
+        filename=output,
+    )
 
 
 @bagel.command()
@@ -402,7 +391,7 @@ def derivatives(
    graph data model for the combined metadata in the .jsonld format.
    You can upload this .jsonld file to the Neurobagel graph.
    """
-    futil.check_overwrite(output, overwrite)
+    file_utils.check_overwrite(output, overwrite)
 
    space = 51
    print(
@@ -411,10 +400,12 @@ def derivatives(
         f" {'Processing status file (.tsv):' : <{space}}{tabular}"
     )
 
-    status_df = futil.load_tabular(tabular, input_type="processing status")
+    status_df = file_utils.load_tabular(
+        tabular, input_type="processing status"
+    )
 
     # We don't allow empty values in the participant ID column
-    if row_indices := putil.get_rows_with_empty_strings(
+    if row_indices := pheno_utils.get_rows_with_empty_strings(
         status_df, [PROC_STATUS_COLS["participant"]]
     ):
         raise LookupError(
@@ -424,21 +415,25 @@ def derivatives(
         )
 
     pipelines = status_df[PROC_STATUS_COLS["pipeline_name"]].unique()
-    dutil.check_pipelines_are_recognized(pipelines)
+    derivative_utils.check_pipelines_are_recognized(pipelines)
 
     # TODO: Do we need to check all versions across all pipelines first, and report all unrecognized versions together?
     for pipeline in pipelines:
         versions = status_df[
             status_df[PROC_STATUS_COLS["pipeline_name"]] == pipeline
         ][PROC_STATUS_COLS["pipeline_version"]].unique()
 
-        dutil.check_pipeline_versions_are_recognized(pipeline, versions)
+        derivative_utils.check_pipeline_versions_are_recognized(
+            pipeline, versions
+        )
 
-    jsonld_dataset = extract_and_validate_jsonld_dataset(jsonld_path)
+    jsonld_dataset = model_utils.extract_and_validate_jsonld_dataset(
+        jsonld_path
+    )
 
-    existing_subs_dict = get_subject_instances(jsonld_dataset)
+    existing_subs_dict = model_utils.get_subject_instances(jsonld_dataset)
 
-    confirm_subs_match_pheno_data(
+    model_utils.confirm_subs_match_pheno_data(
         subjects=status_df[PROC_STATUS_COLS["participant"]].unique(),
         subject_source_for_err="processing status file",
         pheno_subjects=existing_subs_dict.keys(),
@@ -451,14 +446,14 @@ def derivatives(
         existing_subject = existing_subs_dict.get(subject)
 
         # Note: Dictionary of existing imaging sessions can be empty if only bagel pheno was run
-        existing_sessions_dict = get_imaging_session_instances(
+        existing_sessions_dict = model_utils.get_imaging_session_instances(
             existing_subject
         )
 
         for session_label, sub_ses_proc_df in sub_proc_df.groupby(
             PROC_STATUS_COLS["session"]
         ):
-            completed_pipelines = dutil.create_completed_pipelines(
+            completed_pipelines = derivative_utils.create_completed_pipelines(
                 sub_ses_proc_df
             )
 
@@ -480,10 +475,7 @@ def derivatives(
            )
            existing_subject.hasSession.append(new_img_session)
 
-    context = generate_context()
-    merged_dataset = {**context, **jsonld_dataset.dict(exclude_none=True)}
-
-    with open(output, "w") as f:
-        f.write(json.dumps(merged_dataset, indent=2))
-
-    print(f"Saved output to: {output}")
+    file_utils.save_jsonld(
+        data=model_utils.add_context_to_graph_dataset(jsonld_dataset),
+        filename=output,
+    )
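Aside: the derivatives command above guards the participant ID column with a walrus-operator check before doing any work. A self-contained sketch of that pattern; get_rows_with_empty_strings is re-implemented here as an illustrative stand-in, not the actual pheno_utils code:

import pandas as pd

def get_rows_with_empty_strings(df: pd.DataFrame, columns: list) -> list:
    """Return the indices of rows with an empty string in any of the given columns."""
    return df.index[(df[columns] == "").any(axis=1)].tolist()

status_df = pd.DataFrame({"participant_id": ["sub-01", "", "sub-03"]})
if row_indices := get_rows_with_empty_strings(status_df, ["participant_id"]):
    print(f"Rows with empty participant IDs: {row_indices}")  # prints [1]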
bagel/mappings.py (6 changes: 3 additions & 3 deletions)
@@ -1,7 +1,7 @@
 from collections import namedtuple
 from pathlib import Path
 
-import bagel.file_utils as futil
+from .utilities import file_utils
 
 Namespace = namedtuple("Namespace", ["pf", "url"])
 COGATLAS = Namespace("cogatlas", "https://www.cognitiveatlas.org/task/id/")
@@ -51,7 +51,7 @@ def get_pipeline_uris() -> dict:
     """
     output_dict = {}
     for pipe_file in PROCESSING_PIPELINE_PATH.glob("*.json"):
-        pipe = futil.load_json(pipe_file)
+        pipe = file_utils.load_json(pipe_file)
         output_dict[pipe["name"]] = f"{NP.pf}:{pipe['name']}"
 
     return output_dict
@@ -64,7 +64,7 @@ def get_pipeline_versions() -> dict:
     """
     output_dict = {}
     for pipe_file in PROCESSING_PIPELINE_PATH.glob("*.json"):
-        pipe = futil.load_json(pipe_file)
+        pipe = file_utils.load_json(pipe_file)
         output_dict[pipe["name"]] = pipe["versions"]
 
     return output_dict
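For context, get_pipeline_uris() above turns each pipeline JSON into a prefixed term using the Namespace namedtuple defined at the top of mappings.py. A standalone sketch of that mechanic; the namespace URL and pipeline entry here are illustrative, not the repository's actual values:

from collections import namedtuple

Namespace = namedtuple("Namespace", ["pf", "url"])
NP = Namespace("np", "https://example.org/pipelines/")  # illustrative URL
pipe = {"name": "fmriprep", "versions": ["20.2.7", "23.1.3"]}

# Mirrors the loop body in get_pipeline_uris(): pipeline name -> prefixed term
output_dict = {pipe["name"]: f"{NP.pf}:{pipe['name']}"}
print(output_dict)  # {'fmriprep': 'np:fmriprep'}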
Empty file added: bagel/utilities/__init__.py
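Note the import styles on either side of the new package boundary: bagel/cli.py and bagel/mappings.py use the relative form (from .utilities import ...), while bagel/utilities/model_utils.py imports its sibling with an absolute path (from bagel.utilities import file_utils). Both resolve to the same modules; a minimal consumer-side sketch, assuming the bagel package is importable:

from bagel.utilities import (
    bids_utils,
    derivative_utils,
    file_utils,
    model_utils,
    pheno_utils,
)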
File renamed without changes.
File renamed without changes.
bagel/file_utils.py → bagel/utilities/file_utils.py (6 changes: 6 additions & 0 deletions)
@@ -94,3 +94,9 @@ def check_overwrite(output: Path, overwrite: bool):
             fg=typer.colors.RED,
         )
     )
+
+
+def save_jsonld(data: dict, filename: Path):
+    with open(filename, "w") as f:
+        f.write(json.dumps(data, indent=2))
+    print(f"Saved output to: {filename}")
bagel/utility.py → bagel/utilities/model_utils.py (13 changes: 11 additions & 2 deletions)
@@ -5,9 +5,9 @@
 import typer
 from pydantic import ValidationError
 
-import bagel.file_utils as futil
 from bagel import models
 from bagel.mappings import ALL_NAMESPACES, NB
+from bagel.utilities import file_utils
 
 
 def generate_context():
@@ -35,6 +35,15 @@ def generate_context():
     return {"@context": field_preamble}
 
 
+def add_context_to_graph_dataset(dataset: models.Dataset) -> dict:
+    """Add the Neurobagel context to a graph-ready dataset to form a JSONLD dictionary."""
+    context = generate_context()
+    # We can't just exclude_unset here because the identifier and schemaKey
+    # for each instance are created as default values and so technically are never set
+    # TODO: we should revisit this because there may be reasons to have None be meaningful in the future
+    return {**context, **dataset.dict(exclude_none=True)}
+
+
 def get_subs_missing_from_pheno_data(
     subjects: Iterable, pheno_subjects: Iterable
 ) -> list:
@@ -68,7 +77,7 @@ def extract_and_validate_jsonld_dataset(file_path: Path) -> models.Dataset:
     Strip the context from a user-provided JSONLD and validate the remaining contents
     against the data model for a Neurobagel dataset.
     """
-    jsonld = futil.load_json(file_path)
+    jsonld = file_utils.load_json(file_path)
     jsonld.pop("@context")
     try:
         jsonld_dataset = models.Dataset.parse_obj(jsonld)
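The merge in add_context_to_graph_dataset relies on plain dict unpacking: the generated context is spread first, so any key that also appeared in the dataset dump would take precedence. A self-contained illustration with stand-in values:

context = {"@context": {"nb": "http://neurobagel.org/vocab/"}}  # stand-in for generate_context()
dataset_dict = {"hasLabel": "my-dataset", "hasSamples": []}  # stand-in for dataset.dict(exclude_none=True)

merged = {**context, **dataset_dict}
print(merged)
# {'@context': {'nb': 'http://neurobagel.org/vocab/'}, 'hasLabel': 'my-dataset', 'hasSamples': []}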
File renamed without changes.
File renamed without changes.
File renamed without changes.