use full name of util modules & rename derivative_utils
alyssadai committed Nov 11, 2024
1 parent 0727921 commit 2eea196
Showing 9 changed files with 99 additions and 79 deletions.
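In short, the commit replaces the abbreviated utility-module aliases (butil, dutil, futil, mutil, putil) with full module names at every call site, and renames the derivatives_utils module to derivative_utils. A minimal sketch of the import pattern being applied, reusing the pheno_utils.validate_inputs call that appears in this diff (data_dictionary and pheno_df are stand-ins for the loaded inputs):

# Before: a short alias hides which utilities module a call site uses
from bagel.utilities import pheno_utils as putil
putil.validate_inputs(data_dictionary, pheno_df)

# After: the full module name makes the call site self-documenting
from bagel.utilities import pheno_utils
pheno_utils.validate_inputs(data_dictionary, pheno_df)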
92 changes: 51 additions & 41 deletions bagel/cli.py
@@ -5,12 +5,14 @@

from bagel import mappings, models

-from .utilities import bids_utils as butil
-from .utilities import derivatives_utils as dutil
-from .utilities import file_utils as futil
-from .utilities import model_utils as mutil
-from .utilities import pheno_utils as putil
-from .utilities.derivatives_utils import PROC_STATUS_COLS
+from .utilities import (
+bids_utils,
+derivative_utils,
+file_utils,
+model_utils,
+pheno_utils,
+)
+from .utilities.derivative_utils import PROC_STATUS_COLS

CUSTOM_SESSION_LABEL = "ses-unnamed"

@@ -60,7 +62,7 @@ def pheno(
None,
"--portal",
"-u", # for URL
-callback=putil.validate_portal_uri,
+callback=pheno_utils.validate_portal_uri,
help="URL (HTTP/HTTPS) to a website or page that describes the dataset and access instructions (if available).",
),
output: Path = typer.Option(
@@ -88,11 +90,11 @@ def pheno(
graph data model for the provided phenotypic file in the .jsonld format.
You can upload this .jsonld file to the Neurobagel graph.
"""
-futil.check_overwrite(output, overwrite)
+file_utils.check_overwrite(output, overwrite)

-data_dictionary = futil.load_json(dictionary)
-pheno_df = futil.load_tabular(pheno)
-putil.validate_inputs(data_dictionary, pheno_df)
+data_dictionary = file_utils.load_json(dictionary)
+pheno_df = file_utils.load_tabular(pheno)
+pheno_utils.validate_inputs(data_dictionary, pheno_df)

# NOTE: `space` determines the amount of padding (in num. characters) before the file paths in the print statement.
# It is currently calculated as = (length of the longer string, including the 3 leading spaces) + (2 extra spaces)
@@ -105,8 +107,8 @@

subject_list = []

-column_mapping = putil.map_categories_to_columns(data_dictionary)
-tool_mapping = putil.map_tools_to_columns(data_dictionary)
+column_mapping = pheno_utils.map_categories_to_columns(data_dictionary)
+tool_mapping = pheno_utils.map_tools_to_columns(data_dictionary)

# TODO: needs refactoring once we handle multiple participant IDs
participants = column_mapping.get("participant")[0]
@@ -134,15 +136,15 @@ def pheno(
_ses_pheno = session_row

if "sex" in column_mapping.keys():
-_sex_vals = putil.get_transformed_values(
+_sex_vals = pheno_utils.get_transformed_values(
column_mapping["sex"], _ses_pheno, data_dictionary
)
if _sex_vals:
# NOTE: Our data model only allows a single sex value, so we only take the first instance if multiple columns are about sex
session.hasSex = models.Sex(identifier=_sex_vals[0])

if "diagnosis" in column_mapping.keys():
-_dx_vals = putil.get_transformed_values(
+_dx_vals = pheno_utils.get_transformed_values(
column_mapping["diagnosis"], _ses_pheno, data_dictionary
)
if not _dx_vals:
@@ -160,7 +162,7 @@
]

if "age" in column_mapping.keys():
-_age_vals = putil.get_transformed_values(
+_age_vals = pheno_utils.get_transformed_values(
column_mapping["age"], _ses_pheno, data_dictionary
)
if _age_vals:
@@ -171,7 +173,7 @@
_assessments = [
models.Assessment(identifier=tool)
for tool, columns in tool_mapping.items()
-if putil.are_any_available(
+if pheno_utils.are_any_available(
columns, _ses_pheno, data_dictionary
)
]
@@ -191,8 +193,8 @@
hasSamples=subject_list,
)

-futil.save_jsonld(
-data=mutil.add_context_to_graph_dataset(dataset),
+file_utils.save_jsonld(
+data=model_utils.add_context_to_graph_dataset(dataset),
filename=output,
)

@@ -246,7 +248,7 @@ def bids(
graph data model for the combined metadata in the .jsonld format.
You can upload this .jsonld file to the Neurobagel graph.
"""
-futil.check_overwrite(output, overwrite)
+file_utils.check_overwrite(output, overwrite)

space = 51
print(
@@ -255,13 +257,15 @@
f" {'BIDS dataset directory:' : <{space}} {bids_dir}"
)

-jsonld_dataset = mutil.extract_and_validate_jsonld_dataset(jsonld_path)
+jsonld_dataset = model_utils.extract_and_validate_jsonld_dataset(
+jsonld_path
+)

-existing_subs_dict = mutil.get_subject_instances(jsonld_dataset)
+existing_subs_dict = model_utils.get_subject_instances(jsonld_dataset)

# TODO: Revert to using Layout.get_subjects() to get BIDS subjects once pybids performance is improved
-mutil.confirm_subs_match_pheno_data(
-subjects=butil.get_bids_subjects_simple(bids_dir),
+model_utils.confirm_subs_match_pheno_data(
+subjects=bids_utils.get_bids_subjects_simple(bids_dir),
subject_source_for_err="BIDS directory",
pheno_subjects=existing_subs_dict.keys(),
)
@@ -275,7 +279,7 @@
print("Merging BIDS metadata with existing subject annotations...\n")
for bids_sub_id in layout.get_subjects():
existing_subject = existing_subs_dict.get(f"sub-{bids_sub_id}")
-existing_sessions_dict = mutil.get_imaging_session_instances(
+existing_sessions_dict = model_utils.get_imaging_session_instances(
existing_subject
)

@@ -288,7 +292,7 @@
# For some reason .get_sessions() doesn't always follow alphanumeric order
# By default (without sorting) the session lists look like ["02", "01"] per subject
for session_id in sorted(bids_sessions):
-image_list = butil.create_acquisitions(
+image_list = bids_utils.create_acquisitions(
layout=layout,
bids_sub_id=bids_sub_id,
session=session_id,
@@ -309,7 +313,7 @@
if session_id is None
else f"ses-{session_id}"
)
-session_path = butil.get_session_path(
+session_path = bids_utils.get_session_path(
layout=layout,
bids_dir=bids_dir,
bids_sub_id=bids_sub_id,
@@ -332,8 +336,8 @@
)
existing_subject.hasSession.append(new_imaging_session)

-futil.save_jsonld(
-data=mutil.add_context_to_graph_dataset(jsonld_dataset),
+file_utils.save_jsonld(
+data=model_utils.add_context_to_graph_dataset(jsonld_dataset),
filename=output,
)

@@ -387,7 +391,7 @@ def derivatives(
graph data model for the combined metadata in the .jsonld format.
You can upload this .jsonld file to the Neurobagel graph.
"""
-futil.check_overwrite(output, overwrite)
+file_utils.check_overwrite(output, overwrite)

space = 51
print(
@@ -396,10 +400,12 @@
f" {'Processing status file (.tsv):' : <{space}}{tabular}"
)

-status_df = futil.load_tabular(tabular, input_type="processing status")
+status_df = file_utils.load_tabular(
+tabular, input_type="processing status"
+)

# We don't allow empty values in the participant ID column
-if row_indices := putil.get_rows_with_empty_strings(
+if row_indices := pheno_utils.get_rows_with_empty_strings(
status_df, [PROC_STATUS_COLS["participant"]]
):
raise LookupError(
@@ -409,21 +415,25 @@
)

pipelines = status_df[PROC_STATUS_COLS["pipeline_name"]].unique()
-dutil.check_pipelines_are_recognized(pipelines)
+derivative_utils.check_pipelines_are_recognized(pipelines)

# TODO: Do we need to check all versions across all pipelines first, and report all unrecognized versions together?
for pipeline in pipelines:
versions = status_df[
status_df[PROC_STATUS_COLS["pipeline_name"]] == pipeline
][PROC_STATUS_COLS["pipeline_version"]].unique()

-dutil.check_pipeline_versions_are_recognized(pipeline, versions)
+derivative_utils.check_pipeline_versions_are_recognized(
+pipeline, versions
+)

-jsonld_dataset = mutil.extract_and_validate_jsonld_dataset(jsonld_path)
+jsonld_dataset = model_utils.extract_and_validate_jsonld_dataset(
+jsonld_path
+)

-existing_subs_dict = mutil.get_subject_instances(jsonld_dataset)
+existing_subs_dict = model_utils.get_subject_instances(jsonld_dataset)

-mutil.confirm_subs_match_pheno_data(
+model_utils.confirm_subs_match_pheno_data(
subjects=status_df[PROC_STATUS_COLS["participant"]].unique(),
subject_source_for_err="processing status file",
pheno_subjects=existing_subs_dict.keys(),
@@ -436,14 +446,14 @@
existing_subject = existing_subs_dict.get(subject)

# Note: Dictionary of existing imaging sessions can be empty if only bagel pheno was run
-existing_sessions_dict = mutil.get_imaging_session_instances(
+existing_sessions_dict = model_utils.get_imaging_session_instances(
existing_subject
)

for session_label, sub_ses_proc_df in sub_proc_df.groupby(
PROC_STATUS_COLS["session"]
):
-completed_pipelines = dutil.create_completed_pipelines(
+completed_pipelines = derivative_utils.create_completed_pipelines(
sub_ses_proc_df
)

@@ -465,7 +475,7 @@
)
existing_subject.hasSession.append(new_img_session)

-futil.save_jsonld(
-data=mutil.add_context_to_graph_dataset(jsonld_dataset),
+file_utils.save_jsonld(
+data=model_utils.add_context_to_graph_dataset(jsonld_dataset),
filename=output,
)
6 changes: 3 additions & 3 deletions bagel/mappings.py
@@ -1,7 +1,7 @@
from collections import namedtuple
from pathlib import Path

-from .utilities import file_utils as futil
+from .utilities import file_utils

Namespace = namedtuple("Namespace", ["pf", "url"])
COGATLAS = Namespace("cogatlas", "https://www.cognitiveatlas.org/task/id/")
@@ -51,7 +51,7 @@ def get_pipeline_uris() -> dict:
"""
output_dict = {}
for pipe_file in PROCESSING_PIPELINE_PATH.glob("*.json"):
-pipe = futil.load_json(pipe_file)
+pipe = file_utils.load_json(pipe_file)
output_dict[pipe["name"]] = f"{NP.pf}:{pipe['name']}"

return output_dict
@@ -64,7 +64,7 @@ def get_pipeline_versions() -> dict:
"""
output_dict = {}
for pipe_file in PROCESSING_PIPELINE_PATH.glob("*.json"):
-pipe = futil.load_json(pipe_file)
+pipe = file_utils.load_json(pipe_file)
output_dict[pipe["name"]] = pipe["versions"]

return output_dict
bagel/utilities/derivatives_utils.py → bagel/utilities/derivative_utils.py
File renamed without changes.
4 changes: 2 additions & 2 deletions bagel/utilities/model_utils.py
@@ -8,7 +8,7 @@
from bagel import models
from bagel.mappings import ALL_NAMESPACES, NB

-from . import file_utils as futil
+from . import file_utils


def generate_context():
@@ -78,7 +78,7 @@ def extract_and_validate_jsonld_dataset(file_path: Path) -> models.Dataset:
Strip the context from a user-provided JSONLD and validate the remaining contents
against the data model for a Neurobagel dataset.
"""
-jsonld = futil.load_json(file_path)
+jsonld = file_utils.load_json(file_path)
jsonld.pop("@context")
try:
jsonld_dataset = models.Dataset.parse_obj(jsonld)
12 changes: 7 additions & 5 deletions tests/unit/test_butil.py
@@ -4,7 +4,7 @@
import pytest
from bids import BIDSLayout

-from bagel.utilities import bids_utils as butil
+from bagel.utilities import bids_utils


@pytest.mark.parametrize(
@@ -13,7 +13,9 @@
)
def test_get_bids_subjects_simple(bids_path, bids_dir):
"""Test that get_bids_subjects_simple() correctly extracts subject IDs from a BIDS directory."""
-bids_subject_list = butil.get_bids_subjects_simple(bids_path / bids_dir)
+bids_subject_list = bids_utils.get_bids_subjects_simple(
+bids_path / bids_dir
+)
expected_subjects = [
f"sub-{sub_id}"
for sub_id in BIDSLayout(
@@ -45,7 +47,7 @@
)
def test_create_acquisitions(bids_path, bids_dir, acquisitions, bids_session):
"""Given a BIDS dataset, creates a list of acquisitions matching the image files found on disk."""
-image_list = butil.create_acquisitions(
+image_list = bids_utils.create_acquisitions(
layout=BIDSLayout(bids_path / bids_dir, validate=True),
bids_sub_id="01",
session=bids_session,
@@ -70,7 +72,7 @@ def test_get_session_path_when_session_exists(
Test that given a subject and session ID (i.e. when BIDS session layer exists for dataset),
get_session_path() returns a path to the subject's session directory.
"""
-session_path = butil.get_session_path(
+session_path = bids_utils.get_session_path(
layout=BIDSLayout(bids_synthetic, validate=True),
bids_dir=bids_synthetic,
bids_sub_id=bids_sub_id,
@@ -90,7 +92,7 @@ def test_get_session_path_when_session_missing(bids_sub_id, bids_path):
get_session_path() returns the path to the subject directory.
"""
bids_dir = bids_path / "ds001"
-session_path = butil.get_session_path(
+session_path = bids_utils.get_session_path(
layout=BIDSLayout(bids_dir, validate=True),
bids_dir=bids_dir,
bids_sub_id=bids_sub_id,
10 changes: 6 additions & 4 deletions tests/unit/test_dutil.py
@@ -2,7 +2,7 @@
import pytest

from bagel import mappings
-from bagel.utilities import derivatives_utils as dutil
+from bagel.utilities import derivative_utils


def test_pipeline_uris_are_loaded():
@@ -35,7 +35,7 @@ def test_unrecognized_pipeline_names_raise_error(pipelines, unrecog_pipelines):
def test_unrecognized_pipeline_names_raise_error(pipelines, unrecog_pipelines):
"""Test that pipeline names not found in the pipeline catalog raise an informative error."""
with pytest.raises(LookupError) as e:
-dutil.check_pipelines_are_recognized(pipelines)
+derivative_utils.check_pipelines_are_recognized(pipelines)

assert all(
substr in str(e.value)
@@ -55,7 +55,7 @@ def test_unrecognized_pipeline_versions_raise_error(
):
"""Test that versions of a pipeline not found in the pipeline catalog raise an informative error."""
with pytest.raises(LookupError) as e:
-dutil.check_pipeline_versions_are_recognized(
+derivative_utils.check_pipeline_versions_are_recognized(
"fmriprep", fmriprep_versions
)

@@ -116,7 +116,7 @@ def test_create_completed_pipelines():
],
data=sub_ses_data,
)
-completed_pipelines = dutil.create_completed_pipelines(example_ses_proc_df)
+completed_pipelines = derivative_utils.create_completed_pipelines(
+example_ses_proc_df
+)

assert len(completed_pipelines) == 1
assert (