[REF] Refactor utilities #385

Merged: 15 commits, merged on Nov 11, 2024
bagel/cli.py (122 changes: 57 additions & 65 deletions)
@@ -1,22 +1,18 @@
 import json
 from pathlib import Path
 
 import typer
 from bids import BIDSLayout
 
-import bagel.bids_utils as butil
-import bagel.derivatives_utils as dutil
-import bagel.file_utils as futil
-import bagel.pheno_utils as putil
 from bagel import mappings, models
-from bagel.derivatives_utils import PROC_STATUS_COLS
-from bagel.utility import (
-    confirm_subs_match_pheno_data,
-    extract_and_validate_jsonld_dataset,
-    generate_context,
-    get_imaging_session_instances,
-    get_subject_instances,
-)
+from .utilities import (
+    bids_utils,
+    derivative_utils,
+    file_utils,
+    model_utils,
+    pheno_utils,
+)
+from .utilities.derivative_utils import PROC_STATUS_COLS
 
 CUSTOM_SESSION_LABEL = "ses-unnamed"
 
@@ -66,7 +62,7 @@ def pheno(
         None,
         "--portal",
         "-u",  # for URL
-        callback=putil.validate_portal_uri,
+        callback=pheno_utils.validate_portal_uri,
         help="URL (HTTP/HTTPS) to a website or page that describes the dataset and access instructions (if available).",
     ),
     output: Path = typer.Option(
@@ -94,11 +90,11 @@ def pheno(
     graph data model for the provided phenotypic file in the .jsonld format.
     You can upload this .jsonld file to the Neurobagel graph.
     """
-    futil.check_overwrite(output, overwrite)
+    file_utils.check_overwrite(output, overwrite)
 
-    data_dictionary = futil.load_json(dictionary)
-    pheno_df = futil.load_tabular(pheno)
-    putil.validate_inputs(data_dictionary, pheno_df)
+    data_dictionary = file_utils.load_json(dictionary)
+    pheno_df = file_utils.load_tabular(pheno)
+    pheno_utils.validate_inputs(data_dictionary, pheno_df)
 
     # NOTE: `space` determines the amount of padding (in num. characters) before the file paths in the print statement.
     # It is currently calculated as = (length of the longer string, including the 3 leading spaces) + (2 extra spaces)
@@ -111,8 +107,8 @@ def pheno(
 
     subject_list = []
 
-    column_mapping = putil.map_categories_to_columns(data_dictionary)
-    tool_mapping = putil.map_tools_to_columns(data_dictionary)
+    column_mapping = pheno_utils.map_categories_to_columns(data_dictionary)
+    tool_mapping = pheno_utils.map_tools_to_columns(data_dictionary)
 
     # TODO: needs refactoring once we handle multiple participant IDs
     participants = column_mapping.get("participant")[0]
@@ -140,15 +136,15 @@ def pheno(
             _ses_pheno = session_row
 
             if "sex" in column_mapping.keys():
-                _sex_vals = putil.get_transformed_values(
+                _sex_vals = pheno_utils.get_transformed_values(
                     column_mapping["sex"], _ses_pheno, data_dictionary
                 )
                 if _sex_vals:
                     # NOTE: Our data model only allows a single sex value, so we only take the first instance if multiple columns are about sex
                     session.hasSex = models.Sex(identifier=_sex_vals[0])
 
             if "diagnosis" in column_mapping.keys():
-                _dx_vals = putil.get_transformed_values(
+                _dx_vals = pheno_utils.get_transformed_values(
                     column_mapping["diagnosis"], _ses_pheno, data_dictionary
                 )
                 if not _dx_vals:
@@ -166,7 +162,7 @@ def pheno(
                 ]
 
             if "age" in column_mapping.keys():
-                _age_vals = putil.get_transformed_values(
+                _age_vals = pheno_utils.get_transformed_values(
                     column_mapping["age"], _ses_pheno, data_dictionary
                 )
                 if _age_vals:
@@ -177,7 +173,7 @@ def pheno(
             _assessments = [
                 models.Assessment(identifier=tool)
                 for tool, columns in tool_mapping.items()
-                if putil.are_any_available(
+                if pheno_utils.are_any_available(
                     columns, _ses_pheno, data_dictionary
                 )
             ]
@@ -197,16 +193,10 @@ def pheno(
         hasSamples=subject_list,
     )
 
-    context = generate_context()
-    # We can't just exclude_unset here because the identifier and schemaKey
-    # for each instance are created as default values and so technically are never set
-    # TODO: we should revisit this because there may be reasons to have None be meaningful in the future
-    context.update(**dataset.dict(exclude_none=True))
-
-    with open(output, "w") as f:
-        f.write(json.dumps(context, indent=2))
-
-    print(f"Saved output to: {output}")
+    file_utils.save_jsonld(
+        data=model_utils.add_context_to_graph_dataset(dataset),
+        filename=output,
+    )
 
 
 @bagel.command()
@@ -258,7 +248,7 @@ def bids(
     graph data model for the combined metadata in the .jsonld format.
     You can upload this .jsonld file to the Neurobagel graph.
     """
-    futil.check_overwrite(output, overwrite)
+    file_utils.check_overwrite(output, overwrite)
 
     space = 51
     print(
@@ -267,13 +257,15 @@ def bids(
         f" {'BIDS dataset directory:' : <{space}} {bids_dir}"
     )
 
-    jsonld_dataset = extract_and_validate_jsonld_dataset(jsonld_path)
+    jsonld_dataset = model_utils.extract_and_validate_jsonld_dataset(
+        jsonld_path
+    )
 
-    existing_subs_dict = get_subject_instances(jsonld_dataset)
+    existing_subs_dict = model_utils.get_subject_instances(jsonld_dataset)
 
     # TODO: Revert to using Layout.get_subjects() to get BIDS subjects once pybids performance is improved
-    confirm_subs_match_pheno_data(
-        subjects=butil.get_bids_subjects_simple(bids_dir),
+    model_utils.confirm_subs_match_pheno_data(
+        subjects=bids_utils.get_bids_subjects_simple(bids_dir),
         subject_source_for_err="BIDS directory",
         pheno_subjects=existing_subs_dict.keys(),
     )
@@ -287,7 +279,7 @@ def bids(
     print("Merging BIDS metadata with existing subject annotations...\n")
     for bids_sub_id in layout.get_subjects():
         existing_subject = existing_subs_dict.get(f"sub-{bids_sub_id}")
-        existing_sessions_dict = get_imaging_session_instances(
+        existing_sessions_dict = model_utils.get_imaging_session_instances(
             existing_subject
         )
 
@@ -300,7 +292,7 @@ def bids(
         # For some reason .get_sessions() doesn't always follow alphanumeric order
         # By default (without sorting) the session lists look like ["02", "01"] per subject
         for session_id in sorted(bids_sessions):
-            image_list = butil.create_acquisitions(
+            image_list = bids_utils.create_acquisitions(
                 layout=layout,
                 bids_sub_id=bids_sub_id,
                 session=session_id,
@@ -321,7 +313,7 @@ def bids(
                 if session_id is None
                 else f"ses-{session_id}"
             )
-            session_path = butil.get_session_path(
+            session_path = bids_utils.get_session_path(
                 layout=layout,
                 bids_dir=bids_dir,
                 bids_sub_id=bids_sub_id,
@@ -344,13 +336,10 @@ def bids(
             )
             existing_subject.hasSession.append(new_imaging_session)
 
-    context = generate_context()
-    merged_dataset = {**context, **jsonld_dataset.dict(exclude_none=True)}
-
-    with open(output, "w") as f:
-        f.write(json.dumps(merged_dataset, indent=2))
-
-    print(f"Saved output to: {output}")
+    file_utils.save_jsonld(
+        data=model_utils.add_context_to_graph_dataset(jsonld_dataset),
+        filename=output,
+    )
 
 
 @bagel.command()
@@ -402,7 +391,7 @@ def derivatives(
    graph data model for the combined metadata in the .jsonld format.
    You can upload this .jsonld file to the Neurobagel graph.
    """
-    futil.check_overwrite(output, overwrite)
+    file_utils.check_overwrite(output, overwrite)
 
    space = 51
    print(
@@ -411,10 +400,12 @@ def derivatives(
         f" {'Processing status file (.tsv):' : <{space}}{tabular}"
     )
 
-    status_df = futil.load_tabular(tabular, input_type="processing status")
+    status_df = file_utils.load_tabular(
+        tabular, input_type="processing status"
+    )
 
     # We don't allow empty values in the participant ID column
-    if row_indices := putil.get_rows_with_empty_strings(
+    if row_indices := pheno_utils.get_rows_with_empty_strings(
         status_df, [PROC_STATUS_COLS["participant"]]
     ):
         raise LookupError(
@@ -424,21 +415,25 @@ def derivatives(
         )
 
     pipelines = status_df[PROC_STATUS_COLS["pipeline_name"]].unique()
-    dutil.check_pipelines_are_recognized(pipelines)
+    derivative_utils.check_pipelines_are_recognized(pipelines)
 
     # TODO: Do we need to check all versions across all pipelines first, and report all unrecognized versions together?
     for pipeline in pipelines:
         versions = status_df[
             status_df[PROC_STATUS_COLS["pipeline_name"]] == pipeline
         ][PROC_STATUS_COLS["pipeline_version"]].unique()
 
-        dutil.check_pipeline_versions_are_recognized(pipeline, versions)
+        derivative_utils.check_pipeline_versions_are_recognized(
+            pipeline, versions
+        )
 
-    jsonld_dataset = extract_and_validate_jsonld_dataset(jsonld_path)
+    jsonld_dataset = model_utils.extract_and_validate_jsonld_dataset(
+        jsonld_path
+    )
 
-    existing_subs_dict = get_subject_instances(jsonld_dataset)
+    existing_subs_dict = model_utils.get_subject_instances(jsonld_dataset)
 
-    confirm_subs_match_pheno_data(
+    model_utils.confirm_subs_match_pheno_data(
         subjects=status_df[PROC_STATUS_COLS["participant"]].unique(),
         subject_source_for_err="processing status file",
         pheno_subjects=existing_subs_dict.keys(),
@@ -451,14 +446,14 @@ def derivatives(
         existing_subject = existing_subs_dict.get(subject)
 
         # Note: Dictionary of existing imaging sessions can be empty if only bagel pheno was run
-        existing_sessions_dict = get_imaging_session_instances(
+        existing_sessions_dict = model_utils.get_imaging_session_instances(
             existing_subject
         )
 
         for session_label, sub_ses_proc_df in sub_proc_df.groupby(
             PROC_STATUS_COLS["session"]
         ):
-            completed_pipelines = dutil.create_completed_pipelines(
+            completed_pipelines = derivative_utils.create_completed_pipelines(
                 sub_ses_proc_df
             )
 
@@ -480,10 +475,7 @@ def derivatives(
            )
            existing_subject.hasSession.append(new_img_session)
 
-    context = generate_context()
-    merged_dataset = {**context, **jsonld_dataset.dict(exclude_none=True)}
-
-    with open(output, "w") as f:
-        f.write(json.dumps(merged_dataset, indent=2))
-
-    print(f"Saved output to: {output}")
+    file_utils.save_jsonld(
+        data=model_utils.add_context_to_graph_dataset(jsonld_dataset),
+        filename=output,
+    )
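Aside: the derivatives command above guards the participant ID column with a walrus-operator check before doing any work. A self-contained sketch of that pattern; get_rows_with_empty_strings is re-implemented here as an illustrative stand-in, not the actual pheno_utils code:

import pandas as pd

def get_rows_with_empty_strings(df: pd.DataFrame, columns: list) -> list:
    """Return the indices of rows with an empty string in any of the given columns."""
    return df.index[(df[columns] == "").any(axis=1)].tolist()

status_df = pd.DataFrame({"participant_id": ["sub-01", "", "sub-03"]})
if row_indices := get_rows_with_empty_strings(status_df, ["participant_id"]):
    print(f"Rows with empty participant IDs: {row_indices}")  # prints [1]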
bagel/mappings.py (6 changes: 3 additions & 3 deletions)
@@ -1,7 +1,7 @@
 from collections import namedtuple
 from pathlib import Path
 
-import bagel.file_utils as futil
+from .utilities import file_utils
 
 Namespace = namedtuple("Namespace", ["pf", "url"])
 COGATLAS = Namespace("cogatlas", "https://www.cognitiveatlas.org/task/id/")
@@ -51,7 +51,7 @@ def get_pipeline_uris() -> dict:
     """
     output_dict = {}
     for pipe_file in PROCESSING_PIPELINE_PATH.glob("*.json"):
-        pipe = futil.load_json(pipe_file)
+        pipe = file_utils.load_json(pipe_file)
         output_dict[pipe["name"]] = f"{NP.pf}:{pipe['name']}"
 
     return output_dict
@@ -64,7 +64,7 @@ def get_pipeline_versions() -> dict:
     """
     output_dict = {}
     for pipe_file in PROCESSING_PIPELINE_PATH.glob("*.json"):
-        pipe = futil.load_json(pipe_file)
+        pipe = file_utils.load_json(pipe_file)
         output_dict[pipe["name"]] = pipe["versions"]
 
     return output_dict
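For context, get_pipeline_uris() above turns each pipeline JSON into a prefixed term using the Namespace namedtuple defined at the top of mappings.py. A standalone sketch of that mechanic; the namespace URL and pipeline entry here are illustrative, not the repository's actual values:

from collections import namedtuple

Namespace = namedtuple("Namespace", ["pf", "url"])
NP = Namespace("np", "https://example.org/pipelines/")  # illustrative URL
pipe = {"name": "fmriprep", "versions": ["20.2.7", "23.1.3"]}

# Mirrors the loop body in get_pipeline_uris(): pipeline name -> prefixed term
output_dict = {pipe["name"]: f"{NP.pf}:{pipe['name']}"}
print(output_dict)  # {'fmriprep': 'np:fmriprep'}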
Empty file added: bagel/utilities/__init__.py
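Note the import styles on either side of the new package boundary: bagel/cli.py and bagel/mappings.py use the relative form (from .utilities import ...), while bagel/utilities/model_utils.py imports its sibling with an absolute path (from bagel.utilities import file_utils). Both resolve to the same modules; a minimal consumer-side sketch, assuming the bagel package is importable:

from bagel.utilities import (
    bids_utils,
    derivative_utils,
    file_utils,
    model_utils,
    pheno_utils,
)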
File renamed without changes.
File renamed without changes.
bagel/file_utils.py → bagel/utilities/file_utils.py (6 changes: 6 additions & 0 deletions)
@@ -94,3 +94,9 @@ def check_overwrite(output: Path, overwrite: bool):
             fg=typer.colors.RED,
         )
     )
+
+
+def save_jsonld(data: dict, filename: Path):
+    with open(filename, "w") as f:
+        f.write(json.dumps(data, indent=2))
+    print(f"Saved output to: {filename}")
bagel/utility.py → bagel/utilities/model_utils.py (13 changes: 11 additions & 2 deletions)
@@ -5,9 +5,9 @@
 import typer
 from pydantic import ValidationError
 
-import bagel.file_utils as futil
 from bagel import models
 from bagel.mappings import ALL_NAMESPACES, NB
+from bagel.utilities import file_utils
 
 
 def generate_context():
@@ -35,6 +35,15 @@ def generate_context():
     return {"@context": field_preamble}
 
 
+def add_context_to_graph_dataset(dataset: models.Dataset) -> dict:
+    """Add the Neurobagel context to a graph-ready dataset to form a JSONLD dictionary."""
+    context = generate_context()
+    # We can't just exclude_unset here because the identifier and schemaKey
+    # for each instance are created as default values and so technically are never set
+    # TODO: we should revisit this because there may be reasons to have None be meaningful in the future
+    return {**context, **dataset.dict(exclude_none=True)}
+
+
 def get_subs_missing_from_pheno_data(
     subjects: Iterable, pheno_subjects: Iterable
 ) -> list:
@@ -68,7 +77,7 @@ def extract_and_validate_jsonld_dataset(file_path: Path) -> models.Dataset:
     Strip the context from a user-provided JSONLD and validate the remaining contents
     against the data model for a Neurobagel dataset.
     """
-    jsonld = futil.load_json(file_path)
+    jsonld = file_utils.load_json(file_path)
     jsonld.pop("@context")
     try:
         jsonld_dataset = models.Dataset.parse_obj(jsonld)
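The merge in add_context_to_graph_dataset relies on plain dict unpacking: the generated context is spread first, so any key that also appeared in the dataset dump would take precedence. A self-contained illustration with stand-in values:

context = {"@context": {"nb": "http://neurobagel.org/vocab/"}}  # stand-in for generate_context()
dataset_dict = {"hasLabel": "my-dataset", "hasSamples": []}  # stand-in for dataset.dict(exclude_none=True)

merged = {**context, **dataset_dict}
print(merged)
# {'@context': {'nb': 'http://neurobagel.org/vocab/'}, 'hasLabel': 'my-dataset', 'hasSamples': []}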
File renamed without changes.
File renamed without changes.
File renamed without changes.