diff --git a/bagel/cli.py b/bagel/cli.py index 5bd0e481..b793c7dc 100644 --- a/bagel/cli.py +++ b/bagel/cli.py @@ -1,22 +1,18 @@ -import json from pathlib import Path import typer from bids import BIDSLayout -import bagel.bids_utils as butil -import bagel.derivatives_utils as dutil -import bagel.file_utils as futil -import bagel.pheno_utils as putil from bagel import mappings, models -from bagel.derivatives_utils import PROC_STATUS_COLS -from bagel.utility import ( - confirm_subs_match_pheno_data, - extract_and_validate_jsonld_dataset, - generate_context, - get_imaging_session_instances, - get_subject_instances, + +from .utilities import ( + bids_utils, + derivative_utils, + file_utils, + model_utils, + pheno_utils, ) +from .utilities.derivative_utils import PROC_STATUS_COLS CUSTOM_SESSION_LABEL = "ses-unnamed" @@ -66,7 +62,7 @@ def pheno( None, "--portal", "-u", # for URL - callback=putil.validate_portal_uri, + callback=pheno_utils.validate_portal_uri, help="URL (HTTP/HTTPS) to a website or page that describes the dataset and access instructions (if available).", ), output: Path = typer.Option( @@ -94,11 +90,11 @@ def pheno( graph data model for the provided phenotypic file in the .jsonld format. You can upload this .jsonld file to the Neurobagel graph. """ - futil.check_overwrite(output, overwrite) + file_utils.check_overwrite(output, overwrite) - data_dictionary = futil.load_json(dictionary) - pheno_df = futil.load_tabular(pheno) - putil.validate_inputs(data_dictionary, pheno_df) + data_dictionary = file_utils.load_json(dictionary) + pheno_df = file_utils.load_tabular(pheno) + pheno_utils.validate_inputs(data_dictionary, pheno_df) # NOTE: `space` determines the amount of padding (in num. characters) before the file paths in the print statement. # It is currently calculated as = (length of the longer string, including the 3 leading spaces) + (2 extra spaces) @@ -111,8 +107,8 @@ def pheno( subject_list = [] - column_mapping = putil.map_categories_to_columns(data_dictionary) - tool_mapping = putil.map_tools_to_columns(data_dictionary) + column_mapping = pheno_utils.map_categories_to_columns(data_dictionary) + tool_mapping = pheno_utils.map_tools_to_columns(data_dictionary) # TODO: needs refactoring once we handle multiple participant IDs participants = column_mapping.get("participant")[0] @@ -140,7 +136,7 @@ def pheno( _ses_pheno = session_row if "sex" in column_mapping.keys(): - _sex_vals = putil.get_transformed_values( + _sex_vals = pheno_utils.get_transformed_values( column_mapping["sex"], _ses_pheno, data_dictionary ) if _sex_vals: @@ -148,7 +144,7 @@ def pheno( session.hasSex = models.Sex(identifier=_sex_vals[0]) if "diagnosis" in column_mapping.keys(): - _dx_vals = putil.get_transformed_values( + _dx_vals = pheno_utils.get_transformed_values( column_mapping["diagnosis"], _ses_pheno, data_dictionary ) if not _dx_vals: @@ -166,7 +162,7 @@ def pheno( ] if "age" in column_mapping.keys(): - _age_vals = putil.get_transformed_values( + _age_vals = pheno_utils.get_transformed_values( column_mapping["age"], _ses_pheno, data_dictionary ) if _age_vals: @@ -177,7 +173,7 @@ def pheno( _assessments = [ models.Assessment(identifier=tool) for tool, columns in tool_mapping.items() - if putil.are_any_available( + if pheno_utils.are_any_available( columns, _ses_pheno, data_dictionary ) ] @@ -197,16 +193,10 @@ def pheno( hasSamples=subject_list, ) - context = generate_context() - # We can't just exclude_unset here because the identifier and schemaKey - # for each instance are created as default values 
and so technically are never set - # TODO: we should revisit this because there may be reasons to have None be meaningful in the future - context.update(**dataset.dict(exclude_none=True)) - - with open(output, "w") as f: - f.write(json.dumps(context, indent=2)) - - print(f"Saved output to: {output}") + file_utils.save_jsonld( + data=model_utils.add_context_to_graph_dataset(dataset), + filename=output, + ) @bagel.command() @@ -258,7 +248,7 @@ def bids( graph data model for the combined metadata in the .jsonld format. You can upload this .jsonld file to the Neurobagel graph. """ - futil.check_overwrite(output, overwrite) + file_utils.check_overwrite(output, overwrite) space = 51 print( @@ -267,13 +257,15 @@ def bids( f" {'BIDS dataset directory:' : <{space}} {bids_dir}" ) - jsonld_dataset = extract_and_validate_jsonld_dataset(jsonld_path) + jsonld_dataset = model_utils.extract_and_validate_jsonld_dataset( + jsonld_path + ) - existing_subs_dict = get_subject_instances(jsonld_dataset) + existing_subs_dict = model_utils.get_subject_instances(jsonld_dataset) # TODO: Revert to using Layout.get_subjects() to get BIDS subjects once pybids performance is improved - confirm_subs_match_pheno_data( - subjects=butil.get_bids_subjects_simple(bids_dir), + model_utils.confirm_subs_match_pheno_data( + subjects=bids_utils.get_bids_subjects_simple(bids_dir), subject_source_for_err="BIDS directory", pheno_subjects=existing_subs_dict.keys(), ) @@ -287,7 +279,7 @@ def bids( print("Merging BIDS metadata with existing subject annotations...\n") for bids_sub_id in layout.get_subjects(): existing_subject = existing_subs_dict.get(f"sub-{bids_sub_id}") - existing_sessions_dict = get_imaging_session_instances( + existing_sessions_dict = model_utils.get_imaging_session_instances( existing_subject ) @@ -300,7 +292,7 @@ def bids( # For some reason .get_sessions() doesn't always follow alphanumeric order # By default (without sorting) the session lists look like ["02", "01"] per subject for session_id in sorted(bids_sessions): - image_list = butil.create_acquisitions( + image_list = bids_utils.create_acquisitions( layout=layout, bids_sub_id=bids_sub_id, session=session_id, @@ -321,7 +313,7 @@ def bids( if session_id is None else f"ses-{session_id}" ) - session_path = butil.get_session_path( + session_path = bids_utils.get_session_path( layout=layout, bids_dir=bids_dir, bids_sub_id=bids_sub_id, @@ -344,13 +336,10 @@ def bids( ) existing_subject.hasSession.append(new_imaging_session) - context = generate_context() - merged_dataset = {**context, **jsonld_dataset.dict(exclude_none=True)} - - with open(output, "w") as f: - f.write(json.dumps(merged_dataset, indent=2)) - - print(f"Saved output to: {output}") + file_utils.save_jsonld( + data=model_utils.add_context_to_graph_dataset(jsonld_dataset), + filename=output, + ) @bagel.command() @@ -402,7 +391,7 @@ def derivatives( graph data model for the combined metadata in the .jsonld format. You can upload this .jsonld file to the Neurobagel graph. 
""" - futil.check_overwrite(output, overwrite) + file_utils.check_overwrite(output, overwrite) space = 51 print( @@ -411,10 +400,12 @@ def derivatives( f" {'Processing status file (.tsv):' : <{space}}{tabular}" ) - status_df = futil.load_tabular(tabular, input_type="processing status") + status_df = file_utils.load_tabular( + tabular, input_type="processing status" + ) # We don't allow empty values in the participant ID column - if row_indices := putil.get_rows_with_empty_strings( + if row_indices := pheno_utils.get_rows_with_empty_strings( status_df, [PROC_STATUS_COLS["participant"]] ): raise LookupError( @@ -424,7 +415,7 @@ def derivatives( ) pipelines = status_df[PROC_STATUS_COLS["pipeline_name"]].unique() - dutil.check_pipelines_are_recognized(pipelines) + derivative_utils.check_pipelines_are_recognized(pipelines) # TODO: Do we need to check all versions across all pipelines first, and report all unrecognized versions together? for pipeline in pipelines: @@ -432,13 +423,17 @@ def derivatives( status_df[PROC_STATUS_COLS["pipeline_name"]] == pipeline ][PROC_STATUS_COLS["pipeline_version"]].unique() - dutil.check_pipeline_versions_are_recognized(pipeline, versions) + derivative_utils.check_pipeline_versions_are_recognized( + pipeline, versions + ) - jsonld_dataset = extract_and_validate_jsonld_dataset(jsonld_path) + jsonld_dataset = model_utils.extract_and_validate_jsonld_dataset( + jsonld_path + ) - existing_subs_dict = get_subject_instances(jsonld_dataset) + existing_subs_dict = model_utils.get_subject_instances(jsonld_dataset) - confirm_subs_match_pheno_data( + model_utils.confirm_subs_match_pheno_data( subjects=status_df[PROC_STATUS_COLS["participant"]].unique(), subject_source_for_err="processing status file", pheno_subjects=existing_subs_dict.keys(), @@ -451,14 +446,14 @@ def derivatives( existing_subject = existing_subs_dict.get(subject) # Note: Dictionary of existing imaging sessions can be empty if only bagel pheno was run - existing_sessions_dict = get_imaging_session_instances( + existing_sessions_dict = model_utils.get_imaging_session_instances( existing_subject ) for session_label, sub_ses_proc_df in sub_proc_df.groupby( PROC_STATUS_COLS["session"] ): - completed_pipelines = dutil.create_completed_pipelines( + completed_pipelines = derivative_utils.create_completed_pipelines( sub_ses_proc_df ) @@ -480,10 +475,7 @@ def derivatives( ) existing_subject.hasSession.append(new_img_session) - context = generate_context() - merged_dataset = {**context, **jsonld_dataset.dict(exclude_none=True)} - - with open(output, "w") as f: - f.write(json.dumps(merged_dataset, indent=2)) - - print(f"Saved output to: {output}") + file_utils.save_jsonld( + data=model_utils.add_context_to_graph_dataset(jsonld_dataset), + filename=output, + ) diff --git a/bagel/mappings.py b/bagel/mappings.py index 0acb5b5c..1d82c598 100644 --- a/bagel/mappings.py +++ b/bagel/mappings.py @@ -1,7 +1,7 @@ from collections import namedtuple from pathlib import Path -import bagel.file_utils as futil +from .utilities import file_utils Namespace = namedtuple("Namespace", ["pf", "url"]) COGATLAS = Namespace("cogatlas", "https://www.cognitiveatlas.org/task/id/") @@ -51,7 +51,7 @@ def get_pipeline_uris() -> dict: """ output_dict = {} for pipe_file in PROCESSING_PIPELINE_PATH.glob("*.json"): - pipe = futil.load_json(pipe_file) + pipe = file_utils.load_json(pipe_file) output_dict[pipe["name"]] = f"{NP.pf}:{pipe['name']}" return output_dict @@ -64,7 +64,7 @@ def get_pipeline_versions() -> dict: """ output_dict = {} for pipe_file 
in PROCESSING_PIPELINE_PATH.glob("*.json"): - pipe = futil.load_json(pipe_file) + pipe = file_utils.load_json(pipe_file) output_dict[pipe["name"]] = pipe["versions"] return output_dict diff --git a/bagel/utilities/__init__.py b/bagel/utilities/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/bagel/bids_utils.py b/bagel/utilities/bids_utils.py similarity index 100% rename from bagel/bids_utils.py rename to bagel/utilities/bids_utils.py diff --git a/bagel/derivatives_utils.py b/bagel/utilities/derivative_utils.py similarity index 100% rename from bagel/derivatives_utils.py rename to bagel/utilities/derivative_utils.py diff --git a/bagel/file_utils.py b/bagel/utilities/file_utils.py similarity index 95% rename from bagel/file_utils.py rename to bagel/utilities/file_utils.py index 385af12c..0fbabe46 100644 --- a/bagel/file_utils.py +++ b/bagel/utilities/file_utils.py @@ -94,3 +94,9 @@ def check_overwrite(output: Path, overwrite: bool): fg=typer.colors.RED, ) ) + + +def save_jsonld(data: dict, filename: Path): + with open(filename, "w") as f: + f.write(json.dumps(data, indent=2)) + print(f"Saved output to: {filename}") diff --git a/bagel/utility.py b/bagel/utilities/model_utils.py similarity index 86% rename from bagel/utility.py rename to bagel/utilities/model_utils.py index 4d7a7236..fa47f022 100644 --- a/bagel/utility.py +++ b/bagel/utilities/model_utils.py @@ -5,9 +5,9 @@ import typer from pydantic import ValidationError -import bagel.file_utils as futil from bagel import models from bagel.mappings import ALL_NAMESPACES, NB +from bagel.utilities import file_utils def generate_context(): @@ -35,6 +35,15 @@ def generate_context(): return {"@context": field_preamble} +def add_context_to_graph_dataset(dataset: models.Dataset) -> dict: + """Add the Neurobagel context to a graph-ready dataset to form a JSONLD dictionary.""" + context = generate_context() + # We can't just exclude_unset here because the identifier and schemaKey + # for each instance are created as default values and so technically are never set + # TODO: we should revisit this because there may be reasons to have None be meaningful in the future + return {**context, **dataset.dict(exclude_none=True)} + + def get_subs_missing_from_pheno_data( subjects: Iterable, pheno_subjects: Iterable ) -> list: @@ -68,7 +77,7 @@ def extract_and_validate_jsonld_dataset(file_path: Path) -> models.Dataset: Strip the context from a user-provided JSONLD and validate the remaining contents against the data model for a Neurobagel dataset. 
""" - jsonld = futil.load_json(file_path) + jsonld = file_utils.load_json(file_path) jsonld.pop("@context") try: jsonld_dataset = models.Dataset.parse_obj(jsonld) diff --git a/bagel/pheno_utils.py b/bagel/utilities/pheno_utils.py similarity index 100% rename from bagel/pheno_utils.py rename to bagel/utilities/pheno_utils.py diff --git a/tests/test_cli_bids.py b/tests/integration/test_cli_bids.py similarity index 100% rename from tests/test_cli_bids.py rename to tests/integration/test_cli_bids.py diff --git a/tests/test_cli_derivatives.py b/tests/integration/test_cli_derivatives.py similarity index 100% rename from tests/test_cli_derivatives.py rename to tests/integration/test_cli_derivatives.py diff --git a/tests/test_cli_pheno.py b/tests/integration/test_cli_pheno.py similarity index 100% rename from tests/test_cli_pheno.py rename to tests/integration/test_cli_pheno.py diff --git a/tests/test_utility.py b/tests/test_utility.py deleted file mode 100644 index 907a576e..00000000 --- a/tests/test_utility.py +++ /dev/null @@ -1,800 +0,0 @@ -from collections import Counter -from pathlib import Path - -import pandas as pd -import pytest -import typer -from bids import BIDSLayout - -import bagel.bids_utils as butil -import bagel.derivatives_utils as dutil -import bagel.file_utils as futil -import bagel.pheno_utils as putil -from bagel import mappings, models -from bagel.utility import ( - generate_context, - get_imaging_session_instances, - get_subject_instances, - get_subs_missing_from_pheno_data, -) - - -@pytest.fixture -def get_test_context(): - """Generate an @context dictionary to test against.""" - return generate_context() - - -@pytest.fixture -def get_values_by_key(): - """ - Get values of all instances of a specified key in a dictionary. Will also look inside lists of dictionaries and nested dictionaries. - """ - - def _find_by_key(data, target): - if isinstance(data, dict): - for key, value in data.items(): - if isinstance(value, (dict, list)): - yield from _find_by_key(value, target) - elif key == target: - yield value - elif isinstance(data, list): - for item in data: - yield from _find_by_key(item, target) - - return _find_by_key - - -def test_all_used_namespaces_have_urls( - get_test_context, get_values_by_key, load_test_json, test_data_upload_path -): - """Test that all namespace prefixes used in a comprehensive data dictionary have a corresponding URL in the @context.""" - data_dict = load_test_json( - test_data_upload_path / "example_synthetic.json" - ) - - prefixes = list( - map( - lambda term: term.split(":")[0], - get_values_by_key(data_dict, "TermURL"), - ) - ) - - # add nidm to the list of tested prefixes manually since it is the only one not present in the data dictionary - # but is used automatically during the bids step - for prefix in set(prefixes + ["nidm"]): - assert ( - prefix in get_test_context["@context"] - ), f"The namespace '{prefix}' was used in the data dictionary, but was not defined in the @context." 
- - -@pytest.mark.parametrize( - "partial_data_dict, invalid_column_name", - [ - # sex column missing Levels - ( - { - "participant_id": { - "Description": "A participant ID", - "Annotations": { - "IsAbout": { - "TermURL": "nb:ParticipantID", - "Label": "Unique participant identifier", - }, - "Identifies": "participant", - }, - }, - "sex": { - "Description": "Participant sex", - "Annotations": { - "IsAbout": {"TermURL": "nb:Sex", "Label": ""} - }, - }, - }, - "sex", - ), - # age column missing Transformation - ( - { - "participant_id": { - "Description": "A participant ID", - "Annotations": { - "IsAbout": { - "TermURL": "nb:ParticipantID", - "Label": "Unique participant identifier", - }, - "Identifies": "participant", - }, - }, - "age": { - "Description": "Participant age", - "Annotations": { - "IsAbout": { - "TermURL": "nb:Age", - "Label": "Chronological age", - } - }, - }, - }, - "age", - ), - ], -) -def test_schema_invalid_column_raises_error( - partial_data_dict, invalid_column_name -): - """ - Test that when an input data dictionary contains a schema invalid column annotation, - an informative error is raised which includes the name of the offending column. - """ - with pytest.raises(ValueError) as e: - putil.validate_data_dict(partial_data_dict) - - for substring in [ - "not a valid Neurobagel data dictionary", - invalid_column_name, - ]: - assert substring in str(e.value) - - -def test_get_columns_that_are_about_concept(test_data, load_test_json): - """Test that matching annotated columns are returned as a list, - and that empty list is returned if nothing matches""" - data_dict = load_test_json(test_data / "example14.json") - - assert ["participant_id"] == putil.get_columns_about( - data_dict, concept=mappings.NEUROBAGEL["participant"] - ) - assert [] == putil.get_columns_about( - data_dict, concept="does not exist concept" - ) - - -def test_get_columns_with_annotations(): - example = { - "someOtherColumn": { - "Description": "This is cool in BIDS, but not in Neurobagel" - }, - "participant_id": { - "Description": "A participant ID", - "Annotations": { - "IsAbout": { - "TermURL": "nb:ParticipantID", - "Label": "Unique participant identifier", - } - }, - }, - } - result = putil.get_annotated_columns(example)[0] - assert result[0] == "participant_id" - assert result[1] == example["participant_id"] - - -def test_map_categories_to_columns(test_data, load_test_json): - """Test that inverse mapping of concepts to columns is correctly created""" - data_dict = load_test_json(test_data / "example2.json") - - result = putil.map_categories_to_columns(data_dict) - - assert {"participant", "session", "sex"}.issubset(result.keys()) - assert ["participant_id"] == result["participant"] - assert ["session_id"] == result["session"] - assert ["sex"] == result["sex"] - - -@pytest.mark.parametrize( - "tool, columns", - [ - ("cogatlas:1234", ["tool_item1", "tool_item2"]), - ("cogatlas:4321", ["other_tool_item1"]), - ], -) -def test_map_tools_to_columns(test_data, load_test_json, tool, columns): - data_dict = load_test_json(test_data / "example6.json") - - result = putil.map_tools_to_columns(data_dict) - - assert result[tool] == columns - - -@pytest.mark.parametrize( - "example, column_list, expected_values", - [ - ("example2", ["sex"], ["snomed:248153007"]), - ( - "example19", - ["group", "diagnosis"], - ["snomed:49049000", "snomed:724761004"], - ), - ], -) -def test_get_transformed_categorical_values( - test_data, load_test_json, example, column_list, expected_values -): - """Test that the correct 
transformed values are returned for a categorical variable""" - data_dict = load_test_json(test_data / f"{example}.json") - pheno = pd.read_csv(test_data / f"{example}.tsv", sep="\t") - - assert expected_values == putil.get_transformed_values( - columns=column_list, - row=pheno.iloc[0], - data_dict=data_dict, - ) - - -@pytest.mark.parametrize( - "example,expected_result", - [ - ( - { - "column": { - "Annotations": { - "IsAbout": {"TermURL": "something", "Labels": "other"}, - "Levels": { - "val1": {"TermURL": "something", "Label": "other"} - }, - } - } - }, - True, - ), - ( - { - "column": { - "Levels": {"val1": "some description"}, - "Annotations": { - "IsAbout": {"TermURL": "something", "Labels": "other"} - }, - } - }, - False, - ), - ], -) -def test_detect_categorical_column(example, expected_result): - result = putil.is_column_categorical(column="column", data_dict=example) - - assert result is expected_result - - -@pytest.mark.parametrize( - "value,column,expected", - [ - ("test_value", "test_column", True), - ("does not exist", "test_column", False), - ("my_value", "empty_column", False), - ], -) -def test_missing_values(value, column, expected): - """Test that missing values are correctly detected""" - test_data_dict = { - "test_column": {"Annotations": {"MissingValues": ["test_value"]}}, - "empty_column": {"Annotations": {}}, - } - - assert putil.is_missing_value(value, column, test_data_dict) is expected - - -@pytest.mark.parametrize( - "subject_idx, is_avail", - [(0, True), (2, False), (4, True)], -) -def test_get_assessment_tool_availability( - test_data, load_test_json, subject_idx, is_avail -): - """ - Ensure that subjects who have one or more missing values in columns mapped to an assessment - tool are correctly identified as not having this assessment tool - """ - data_dict = load_test_json(test_data / "example6.json") - pheno = pd.read_csv(test_data / "example6.tsv", sep="\t") - test_columns = ["tool_item1", "tool_item2"] - - assert ( - putil.are_any_available( - test_columns, pheno.iloc[subject_idx], data_dict - ) - is is_avail - ) - - -@pytest.mark.parametrize( - "columns, expected_indices", - [(["participant_id"], [0]), (["session_id"], [2])], -) -def test_missing_ids_in_columns(test_data, columns, expected_indices): - """ - When a participant or session labeled column has missing values, - we raise and provide the list of offending row indices - """ - pheno = pd.read_csv( - test_data / "example11.tsv", sep="\t", keep_default_na=False, dtype=str - ) - assert expected_indices == putil.get_rows_with_empty_strings( - pheno, columns=columns - ) - - -@pytest.mark.parametrize( - "raw_age,expected_age,heuristic", - [ - ("11.0", 11.0, "nb:FromFloat"), - ("11", 11.0, "nb:FromInt"), - ("11,0", 11.0, "nb:FromEuro"), - ("90+", 90.0, "nb:FromBounded"), - ("20Y6M", 20.5, "nb:FromISO8601"), - ("P20Y6M", 20.5, "nb:FromISO8601"), - ("20Y9M", 20.75, "nb:FromISO8601"), - ], -) -def test_age_gets_converted(raw_age, expected_age, heuristic): - assert expected_age == putil.transform_age(raw_age, heuristic) - - -@pytest.mark.parametrize( - "raw_age, incorrect_heuristic", - [ - ("11,0", "nb:FromFloat"), - ("11.0", "nb:FromISO8601"), - ("20-30", "nb:FromBounded"), - ], -) -def test_incorrect_age_heuristic(raw_age, incorrect_heuristic): - """Given an age transformation that does not match the type of age value provided, returns an informative error.""" - with pytest.raises(ValueError) as e: - putil.transform_age(raw_age, incorrect_heuristic) - - assert ( - f"problem with applying the age 
transformation: {incorrect_heuristic}." - in str(e.value) - ) - - -def test_invalid_age_heuristic(): - """Given an age transformation that is not recognized, returns an informative ValueError.""" - with pytest.raises(ValueError) as e: - putil.transform_age("11,0", "nb:birthyear") - - assert "unrecognized age transformation: nb:birthyear" in str(e.value) - - -# TODO: See if we can remove this test: it's a little hard to maintain and -# essentially replicates the logic of the function being tested -# Instead, see test_all_used_namespaces_have_urls and test_used_namespaces_in_context -@pytest.mark.parametrize( - "model, attributes", - [ - ("Bagel", ["identifier"]), - ("ControlledTerm", ["identifier", "schemaKey"]), - ("Sex", ["schemaKey"]), - ("Diagnosis", ["schemaKey"]), - ("SubjectGroup", ["schemaKey"]), - ("Assessment", ["schemaKey"]), - ("Image", ["schemaKey"]), - ("Acquisition", ["hasContrastType", "schemaKey"]), - ("Pipeline", ["schemaKey"]), - ( - "CompletedPipeline", - ["hasPipelineVersion", "hasPipelineName", "schemaKey"], - ), - ("Session", ["hasLabel"]), - ( - "PhenotypicSession", - [ - "hasAge", - "hasSex", - "isSubjectGroup", - "hasDiagnosis", - "hasAssessment", - "schemaKey", - ], - ), - ( - "ImagingSession", - [ - "hasFilePath", - "hasAcquisition", - "hasCompletedPipeline", - "schemaKey", - ], - ), - ( - "Subject", - [ - "hasLabel", - "hasSession", - "schemaKey", - ], - ), - ( - "Dataset", - [ - "hasLabel", - "hasPortalURI", - "hasSamples", - "schemaKey", - ], - ), - ], -) -def test_generate_context(get_test_context, model, attributes): - """Test that each model and its set of attributes have corresponding entries in @context.""" - assert model in get_test_context["@context"] - for attribute in attributes: - assert attribute in get_test_context["@context"] - - -@pytest.mark.parametrize( - "bids_dir", - ["synthetic", "ds000248"], -) -def test_get_bids_subjects_simple(bids_path, bids_dir): - """Test that get_bids_subjects_simple() correctly extracts subject IDs from a BIDS directory.""" - bids_subject_list = butil.get_bids_subjects_simple(bids_path / bids_dir) - expected_subjects = [ - f"sub-{sub_id}" - for sub_id in BIDSLayout( - bids_path / bids_dir, validate=True - ).get_subjects() - ] - assert sorted(bids_subject_list) == sorted(expected_subjects) - - -@pytest.mark.parametrize( - "bids_list, expected_bids_exclusive_subs", - [ - (["sub-01", "sub-02", "sub-03"], []), - ( - ["sub-01", "sub-02", "sub-03", "sub-04", "sub-05"], - ["sub-04", "sub-05"], - ), - ( - ["sub-cbm001", "sub-cbm002", "sub-cbm003"], - ["sub-cbm001", "sub-cbm002", "sub-cbm003"], - ), - ( - ["sub-pd123", "sub-pd234"], - ["sub-pd123", "sub-pd234"], - ), - ], -) -def test_get_subjects_missing_from_pheno_data( - bids_list, expected_bids_exclusive_subs -): - """ - Given a list of BIDS subject IDs, test that IDs not found in the phenotypic subject list are returned. 
- """ - pheno_list = ["sub-01", "sub-02", "sub-03", "sub-PD123", "sub-PD234"] - bids_exclusive_subs = get_subs_missing_from_pheno_data( - pheno_subjects=pheno_list, subjects=bids_list - ) - - # We sort the list for comparison since the order of the missing subjects is not guaranteed - # due to using set operations - assert sorted(bids_exclusive_subs) == expected_bids_exclusive_subs - - -@pytest.mark.parametrize( - "bids_dir, acquisitions, bids_session", - [ - ( - "synthetic", - {"nidm:T1Weighted": 1, "nidm:FlowWeighted": 3}, - "01", - ), - ( - "ds001", - { - "nidm:T2Weighted": 1, - "nidm:T1Weighted": 1, - "nidm:FlowWeighted": 3, - }, - None, - ), - ("eeg_ds000117", {"nidm:T1Weighted": 1}, None), - ], -) -def test_create_acquisitions(bids_path, bids_dir, acquisitions, bids_session): - """Given a BIDS dataset, creates a list of acquisitions matching the image files found on disk.""" - image_list = butil.create_acquisitions( - layout=BIDSLayout(bids_path / bids_dir, validate=True), - bids_sub_id="01", - session=bids_session, - ) - - image_counts = Counter( - [image.hasContrastType.identifier for image in image_list] - ) - - for contrast, count in acquisitions.items(): - assert image_counts[contrast] == count - - -@pytest.mark.parametrize( - "bids_sub_id, session", - [("01", "01"), ("02", "02"), ("03", "01")], -) -def test_get_session_path_when_session_exists( - bids_sub_id, session, bids_synthetic -): - """ - Test that given a subject and session ID (i.e. when BIDS session layer exists for dataset), - get_session_path() returns a path to the subject's session directory. - """ - session_path = butil.get_session_path( - layout=BIDSLayout(bids_synthetic, validate=True), - bids_dir=bids_synthetic, - bids_sub_id=bids_sub_id, - session=session, - ) - - assert f"sub-{bids_sub_id}" in session_path - assert f"ses-{session}" in session_path - assert Path(session_path).is_absolute() - assert Path(session_path).is_dir() - - -@pytest.mark.parametrize("bids_sub_id", ["01", "03", "05"]) -def test_get_session_path_when_session_missing(bids_sub_id, bids_path): - """ - Test that given only a subject ID (i.e., when BIDS session layer is missing for dataset), - get_session_path() returns the path to the subject directory. 
- """ - bids_dir = bids_path / "ds001" - session_path = butil.get_session_path( - layout=BIDSLayout(bids_dir, validate=True), - bids_dir=bids_dir, - bids_sub_id=bids_sub_id, - session=None, - ) - - assert session_path.endswith(f"sub-{bids_sub_id}") - assert Path(session_path).is_absolute() - assert Path(session_path).is_dir() - - -@pytest.mark.parametrize( - "unreadable_json,expected_message", - [ - ("example_iso88591.json", "Failed to decode the input file"), - ("example_invalid_json.json", "not valid JSON"), - ], -) -def test_failed_json_reading_raises_informative_error( - test_data, unreadable_json, expected_message, capsys -): - """Test that when there is an issue reading an input JSON file, the CLI exits with an informative error message.""" - with pytest.raises(typer.Exit): - futil.load_json(test_data / unreadable_json) - captured = capsys.readouterr() - - assert expected_message in captured.err - - -def test_unsupported_tsv_encoding_raises_informative_error(test_data, capsys): - """Test that given an input phenotypic TSV with an unsupported encoding, the CLI exits with an informative error message.""" - with pytest.raises(typer.Exit): - futil.load_tabular(test_data / "example_iso88591.tsv") - captured = capsys.readouterr() - - assert "Failed to decode the input file" in captured.err - - -def test_get_subject_instances(): - """Test that subjects are correctly extracted from a Neurobagel dataset instance.""" - dataset = models.Dataset( - hasLabel="test_dataset", - hasSamples=[ - models.Subject( - hasLabel="sub-01", - hasSession=[ - models.PhenotypicSession( - hasLabel="ses-01", - hasAge=26, - ), - ], - ), - models.Subject( - hasLabel="sub-02", - hasSession=[ - models.PhenotypicSession( - hasLabel="ses-01", - hasAge=30, - ), - ], - ), - ], - ) - - subjects = get_subject_instances(dataset) - assert list(subjects.keys()) == ["sub-01", "sub-02"] - - -def test_pipeline_uris_are_loaded(): - """Test that pipeline URIs are loaded from the pipeline-catalog submodule.""" - - pipeline_dict = mappings.get_pipeline_uris() - assert all( - ((mappings.NP.pf in pipe_uri) and (" " not in pipe_uri)) - for pipe_uri in pipeline_dict.values() - ) - - -def test_pipeline_versions_are_loaded(): - """Test that pipeline versions are loaded from the pipeline-catalog submodule.""" - - pipeline_dict = mappings.get_pipeline_versions() - assert all( - isinstance(pipe_versions, list) and len(pipe_versions) > 0 - for pipe_versions in pipeline_dict.values() - ) - - -@pytest.mark.parametrize( - "pipelines, unrecog_pipelines", - [ - (["fmriprep", "pipeline1"], ["pipeline1"]), - (["pipelineA", "pipelineB"], ["pipelineA", "pipelineB"]), - ], -) -def test_unrecognized_pipeline_names_raise_error(pipelines, unrecog_pipelines): - """Test that pipeline names not found in the pipeline catalog raise an informative error.""" - with pytest.raises(LookupError) as e: - dutil.check_pipelines_are_recognized(pipelines) - - assert all( - substr in str(e.value) - for substr in ["unrecognized pipelines"] + unrecog_pipelines - ) - - -@pytest.mark.parametrize( - "fmriprep_versions, unrecog_versions", - [ - (["20.2.7", "vA.B"], ["vA.B"]), - (["C.D.E", "F.G.H"], ["C.D.E", "F.G.H"]), - ], -) -def test_unrecognized_pipeline_versions_raise_error( - fmriprep_versions, unrecog_versions -): - """Test that versions of a pipeline not found in the pipeline catalog raise an informative error.""" - with pytest.raises(LookupError) as e: - dutil.check_pipeline_versions_are_recognized( - "fmriprep", fmriprep_versions - ) - - assert all( - substr in 
str(e.value) - for substr in ["unrecognized fmriprep versions"] + unrecog_versions - ) - - -def test_get_imaging_session_instances(): - """Test that get_imaging_session_instances() correctly returns existing imaging sessions for a given subject.""" - example_subject_jsonld = { - "identifier": "nb:34ec1e2d-9a81-4a50-bcd0-eb22c88d11e1", - "hasLabel": "sub-01", - "hasSession": [ - { - "identifier": "nb:85c7473c-6122-4999-ad3b-5cd57a883c87", - "hasLabel": "ses-01", - "hasAge": 34.1, - "hasSex": { - "identifier": "snomed:248152002", - "schemaKey": "Sex", - }, - "schemaKey": "PhenotypicSession", - }, - { - "identifier": "nb:eb57d0c1-fb96-4c04-8c16-1f29f7f40db4", - "hasLabel": "ses-02", - "hasAge": 35.3, - "hasSex": { - "identifier": "snomed:248152002", - "schemaKey": "Sex", - }, - "schemaKey": "PhenotypicSession", - }, - { - "identifier": "nb:e67fd08b-9bf9-4ed8-b4cc-d0142cd27789", - "hasLabel": "ses-im01", - "hasFilePath": "/data/neurobagel/bagel-cli/bids-examples/synthetic/sub-01/ses-01", - "hasAcquisition": [ - { - "identifier": "nb:5dc2e11e-4f7a-4b0e-9488-843f0a607f4b", - "hasContrastType": { - "identifier": "nidm:T1Weighted", - "schemaKey": "Image", - }, - "schemaKey": "Acquisition", - }, - ], - "schemaKey": "ImagingSession", - }, - ], - "schemaKey": "Subject", - } - example_subject = models.Subject(**example_subject_jsonld) - imaging_sessions = get_imaging_session_instances(example_subject) - - assert list(imaging_sessions.keys()) == ["ses-im01"] - - -def test_create_completed_pipelines(): - """ - Test that completed pipelines for a subject-session are accurately identified, - where a completed pipeline is one meeting the condition that *all* steps of that pipeline - that were run for the session are marked with a status of "SUCCESS". - """ - sub_ses_data = [ - [ - "01", - "sub-01", - "01", - "ses-01", - "fmriprep", - "20.2.7", - "step1", - "SUCCESS", - ], - [ - "01", - "sub-01", - "01", - "ses-01", - "fmriprep", - "20.2.7", - "step2", - "FAIL", - ], - [ - "01", - "sub-01", - "01", - "ses-01", - "fmriprep", - "23.1.3", - "default", - "SUCCESS", - ], - ] - example_ses_proc_df = pd.DataFrame.from_records( - columns=[ - "participant_id", - "bids_participant_id", - "session_id", - "bids_session_id", - "pipeline_name", - "pipeline_version", - "pipeline_step", - "status", - ], - data=sub_ses_data, - ) - completed_pipelines = dutil.create_completed_pipelines(example_ses_proc_df) - - assert len(completed_pipelines) == 1 - assert ( - completed_pipelines[0].hasPipelineName.identifier - == f"{mappings.NP.pf}:fmriprep" - ) - assert completed_pipelines[0].hasPipelineVersion == "23.1.3" - - -def test_used_namespaces_in_context(test_data_upload_path, load_test_json): - """ - Test that all namespaces used internally by the CLI for JSONLD dataset creation are defined - in the @context of reference example .jsonld files. - """ - # Fetch all .jsonld files to avoid having to add a test parameter whenever we add a new JSONLD - example_jsonld_files = list(test_data_upload_path.rglob("*.jsonld")) - for jsonld in example_jsonld_files: - jsonld_context = load_test_json(test_data_upload_path / jsonld)[ - "@context" - ] - - for ns in mappings.ALL_NAMESPACES: - assert ( - ns.pf in jsonld_context.keys() - ), f"The namespace '{ns.pf}' was not found in the @context of {jsonld}." 
diff --git a/tests/unit/test_bids_utils.py b/tests/unit/test_bids_utils.py new file mode 100644 index 00000000..114842d9 --- /dev/null +++ b/tests/unit/test_bids_utils.py @@ -0,0 +1,104 @@ +from collections import Counter +from pathlib import Path + +import pytest +from bids import BIDSLayout + +from bagel.utilities import bids_utils + + +@pytest.mark.parametrize( + "bids_dir", + ["synthetic", "ds000248"], +) +def test_get_bids_subjects_simple(bids_path, bids_dir): + """Test that get_bids_subjects_simple() correctly extracts subject IDs from a BIDS directory.""" + bids_subject_list = bids_utils.get_bids_subjects_simple( + bids_path / bids_dir + ) + expected_subjects = [ + f"sub-{sub_id}" + for sub_id in BIDSLayout( + bids_path / bids_dir, validate=True + ).get_subjects() + ] + assert sorted(bids_subject_list) == sorted(expected_subjects) + + +@pytest.mark.parametrize( + "bids_dir, acquisitions, bids_session", + [ + ( + "synthetic", + {"nidm:T1Weighted": 1, "nidm:FlowWeighted": 3}, + "01", + ), + ( + "ds001", + { + "nidm:T2Weighted": 1, + "nidm:T1Weighted": 1, + "nidm:FlowWeighted": 3, + }, + None, + ), + ("eeg_ds000117", {"nidm:T1Weighted": 1}, None), + ], +) +def test_create_acquisitions(bids_path, bids_dir, acquisitions, bids_session): + """Given a BIDS dataset, creates a list of acquisitions matching the image files found on disk.""" + image_list = bids_utils.create_acquisitions( + layout=BIDSLayout(bids_path / bids_dir, validate=True), + bids_sub_id="01", + session=bids_session, + ) + + image_counts = Counter( + [image.hasContrastType.identifier for image in image_list] + ) + + for contrast, count in acquisitions.items(): + assert image_counts[contrast] == count + + +@pytest.mark.parametrize( + "bids_sub_id, session", + [("01", "01"), ("02", "02"), ("03", "01")], +) +def test_get_session_path_when_session_exists( + bids_sub_id, session, bids_synthetic +): + """ + Test that given a subject and session ID (i.e. when BIDS session layer exists for dataset), + get_session_path() returns a path to the subject's session directory. + """ + session_path = bids_utils.get_session_path( + layout=BIDSLayout(bids_synthetic, validate=True), + bids_dir=bids_synthetic, + bids_sub_id=bids_sub_id, + session=session, + ) + + assert f"sub-{bids_sub_id}" in session_path + assert f"ses-{session}" in session_path + assert Path(session_path).is_absolute() + assert Path(session_path).is_dir() + + +@pytest.mark.parametrize("bids_sub_id", ["01", "03", "05"]) +def test_get_session_path_when_session_missing(bids_sub_id, bids_path): + """ + Test that given only a subject ID (i.e., when BIDS session layer is missing for dataset), + get_session_path() returns the path to the subject directory. 
+ """ + bids_dir = bids_path / "ds001" + session_path = bids_utils.get_session_path( + layout=BIDSLayout(bids_dir, validate=True), + bids_dir=bids_dir, + bids_sub_id=bids_sub_id, + session=None, + ) + + assert session_path.endswith(f"sub-{bids_sub_id}") + assert Path(session_path).is_absolute() + assert Path(session_path).is_dir() diff --git a/tests/unit/test_derivative_utils.py b/tests/unit/test_derivative_utils.py new file mode 100644 index 00000000..655fb7a0 --- /dev/null +++ b/tests/unit/test_derivative_utils.py @@ -0,0 +1,128 @@ +import pandas as pd +import pytest + +from bagel import mappings +from bagel.utilities import derivative_utils + + +def test_pipeline_uris_are_loaded(): + """Test that pipeline URIs are loaded from the pipeline-catalog submodule.""" + + pipeline_dict = mappings.get_pipeline_uris() + assert all( + ((mappings.NP.pf in pipe_uri) and (" " not in pipe_uri)) + for pipe_uri in pipeline_dict.values() + ) + + +def test_pipeline_versions_are_loaded(): + """Test that pipeline versions are loaded from the pipeline-catalog submodule.""" + + pipeline_dict = mappings.get_pipeline_versions() + assert all( + isinstance(pipe_versions, list) and len(pipe_versions) > 0 + for pipe_versions in pipeline_dict.values() + ) + + +@pytest.mark.parametrize( + "pipelines, unrecog_pipelines", + [ + (["fmriprep", "pipeline1"], ["pipeline1"]), + (["pipelineA", "pipelineB"], ["pipelineA", "pipelineB"]), + ], +) +def test_unrecognized_pipeline_names_raise_error(pipelines, unrecog_pipelines): + """Test that pipeline names not found in the pipeline catalog raise an informative error.""" + with pytest.raises(LookupError) as e: + derivative_utils.check_pipelines_are_recognized(pipelines) + + assert all( + substr in str(e.value) + for substr in ["unrecognized pipelines"] + unrecog_pipelines + ) + + +@pytest.mark.parametrize( + "fmriprep_versions, unrecog_versions", + [ + (["20.2.7", "vA.B"], ["vA.B"]), + (["C.D.E", "F.G.H"], ["C.D.E", "F.G.H"]), + ], +) +def test_unrecognized_pipeline_versions_raise_error( + fmriprep_versions, unrecog_versions +): + """Test that versions of a pipeline not found in the pipeline catalog raise an informative error.""" + with pytest.raises(LookupError) as e: + derivative_utils.check_pipeline_versions_are_recognized( + "fmriprep", fmriprep_versions + ) + + assert all( + substr in str(e.value) + for substr in ["unrecognized fmriprep versions"] + unrecog_versions + ) + + +def test_create_completed_pipelines(): + """ + Test that completed pipelines for a subject-session are accurately identified, + where a completed pipeline is one meeting the condition that *all* steps of that pipeline + that were run for the session are marked with a status of "SUCCESS". 
+ """ + sub_ses_data = [ + [ + "01", + "sub-01", + "01", + "ses-01", + "fmriprep", + "20.2.7", + "step1", + "SUCCESS", + ], + [ + "01", + "sub-01", + "01", + "ses-01", + "fmriprep", + "20.2.7", + "step2", + "FAIL", + ], + [ + "01", + "sub-01", + "01", + "ses-01", + "fmriprep", + "23.1.3", + "default", + "SUCCESS", + ], + ] + example_ses_proc_df = pd.DataFrame.from_records( + columns=[ + "participant_id", + "bids_participant_id", + "session_id", + "bids_session_id", + "pipeline_name", + "pipeline_version", + "pipeline_step", + "status", + ], + data=sub_ses_data, + ) + completed_pipelines = derivative_utils.create_completed_pipelines( + example_ses_proc_df + ) + + assert len(completed_pipelines) == 1 + assert ( + completed_pipelines[0].hasPipelineName.identifier + == f"{mappings.NP.pf}:fmriprep" + ) + assert completed_pipelines[0].hasPipelineVersion == "23.1.3" diff --git a/tests/unit/test_file_utils.py b/tests/unit/test_file_utils.py new file mode 100644 index 00000000..fe6cec04 --- /dev/null +++ b/tests/unit/test_file_utils.py @@ -0,0 +1,31 @@ +import pytest +import typer + +from bagel.utilities import file_utils + + +@pytest.mark.parametrize( + "unreadable_json,expected_message", + [ + ("example_iso88591.json", "Failed to decode the input file"), + ("example_invalid_json.json", "not valid JSON"), + ], +) +def test_failed_json_reading_raises_informative_error( + test_data, unreadable_json, expected_message, capsys +): + """Test that when there is an issue reading an input JSON file, the CLI exits with an informative error message.""" + with pytest.raises(typer.Exit): + file_utils.load_json(test_data / unreadable_json) + captured = capsys.readouterr() + + assert expected_message in captured.err + + +def test_unsupported_tsv_encoding_raises_informative_error(test_data, capsys): + """Test that given an input phenotypic TSV with an unsupported encoding, the CLI exits with an informative error message.""" + with pytest.raises(typer.Exit): + file_utils.load_tabular(test_data / "example_iso88591.tsv") + captured = capsys.readouterr() + + assert "Failed to decode the input file" in captured.err diff --git a/tests/unit/test_model_utils.py b/tests/unit/test_model_utils.py new file mode 100644 index 00000000..41274eeb --- /dev/null +++ b/tests/unit/test_model_utils.py @@ -0,0 +1,220 @@ +import pytest + +from bagel import mappings, models +from bagel.utilities import model_utils + + +@pytest.fixture +def get_test_context(): + """Generate an @context dictionary to test against.""" + return model_utils.generate_context() + + +@pytest.fixture +def get_values_by_key(): + """ + Get values of all instances of a specified key in a dictionary. Will also look inside lists of dictionaries and nested dictionaries. 
+ """ + + def _find_by_key(data, target): + if isinstance(data, dict): + for key, value in data.items(): + if isinstance(value, (dict, list)): + yield from _find_by_key(value, target) + elif key == target: + yield value + elif isinstance(data, list): + for item in data: + yield from _find_by_key(item, target) + + return _find_by_key + + +def test_all_used_namespaces_have_urls( + get_test_context, get_values_by_key, load_test_json, test_data_upload_path +): + """Test that all namespace prefixes used in a comprehensive data dictionary have a corresponding URL in the @context.""" + data_dict = load_test_json( + test_data_upload_path / "example_synthetic.json" + ) + + prefixes = list( + map( + lambda term: term.split(":")[0], + get_values_by_key(data_dict, "TermURL"), + ) + ) + + # add nidm to the list of tested prefixes manually since it is the only one not present in the data dictionary + # but is used automatically during the bids step + for prefix in set(prefixes + ["nidm"]): + assert ( + prefix in get_test_context["@context"] + ), f"The namespace '{prefix}' was used in the data dictionary, but was not defined in the @context." + + +@pytest.mark.parametrize( + "bids_list, expected_bids_exclusive_subs", + [ + (["sub-01", "sub-02", "sub-03"], []), + ( + ["sub-01", "sub-02", "sub-03", "sub-04", "sub-05"], + ["sub-04", "sub-05"], + ), + ( + ["sub-cbm001", "sub-cbm002", "sub-cbm003"], + ["sub-cbm001", "sub-cbm002", "sub-cbm003"], + ), + ( + ["sub-pd123", "sub-pd234"], + ["sub-pd123", "sub-pd234"], + ), + ], +) +def test_get_subjects_missing_from_pheno_data( + bids_list, expected_bids_exclusive_subs +): + """ + Given a list of BIDS subject IDs, test that IDs not found in the phenotypic subject list are returned. + """ + pheno_list = ["sub-01", "sub-02", "sub-03", "sub-PD123", "sub-PD234"] + bids_exclusive_subs = model_utils.get_subs_missing_from_pheno_data( + pheno_subjects=pheno_list, subjects=bids_list + ) + + # We sort the list for comparison since the order of the missing subjects is not guaranteed + # due to using set operations + assert sorted(bids_exclusive_subs) == expected_bids_exclusive_subs + + +def test_get_subject_instances(): + """Test that subjects are correctly extracted from a Neurobagel dataset instance.""" + dataset = models.Dataset( + hasLabel="test_dataset", + hasSamples=[ + models.Subject( + hasLabel="sub-01", + hasSession=[ + models.PhenotypicSession( + hasLabel="ses-01", + hasAge=26, + ), + ], + ), + models.Subject( + hasLabel="sub-02", + hasSession=[ + models.PhenotypicSession( + hasLabel="ses-01", + hasAge=30, + ), + ], + ), + ], + ) + + subjects = model_utils.get_subject_instances(dataset) + assert list(subjects.keys()) == ["sub-01", "sub-02"] + + +def test_get_imaging_session_instances(): + """Test that get_imaging_session_instances() correctly returns existing imaging sessions for a given subject.""" + example_subject_jsonld = { + "identifier": "nb:34ec1e2d-9a81-4a50-bcd0-eb22c88d11e1", + "hasLabel": "sub-01", + "hasSession": [ + { + "identifier": "nb:85c7473c-6122-4999-ad3b-5cd57a883c87", + "hasLabel": "ses-01", + "hasAge": 34.1, + "hasSex": { + "identifier": "snomed:248152002", + "schemaKey": "Sex", + }, + "schemaKey": "PhenotypicSession", + }, + { + "identifier": "nb:eb57d0c1-fb96-4c04-8c16-1f29f7f40db4", + "hasLabel": "ses-02", + "hasAge": 35.3, + "hasSex": { + "identifier": "snomed:248152002", + "schemaKey": "Sex", + }, + "schemaKey": "PhenotypicSession", + }, + { + "identifier": "nb:e67fd08b-9bf9-4ed8-b4cc-d0142cd27789", + "hasLabel": "ses-im01", + 
"hasFilePath": "/data/neurobagel/bagel-cli/bids-examples/synthetic/sub-01/ses-01", + "hasAcquisition": [ + { + "identifier": "nb:5dc2e11e-4f7a-4b0e-9488-843f0a607f4b", + "hasContrastType": { + "identifier": "nidm:T1Weighted", + "schemaKey": "Image", + }, + "schemaKey": "Acquisition", + }, + ], + "schemaKey": "ImagingSession", + }, + ], + "schemaKey": "Subject", + } + example_subject = models.Subject(**example_subject_jsonld) + imaging_sessions = model_utils.get_imaging_session_instances( + example_subject + ) + + assert list(imaging_sessions.keys()) == ["ses-im01"] + + +def test_used_namespaces_in_context(test_data_upload_path, load_test_json): + """ + Test that all namespaces used internally by the CLI for JSONLD dataset creation are defined + in the @context of reference example .jsonld files. + """ + # Fetch all .jsonld files to avoid having to add a test parameter whenever we add a new JSONLD + example_jsonld_files = list(test_data_upload_path.rglob("*.jsonld")) + for jsonld in example_jsonld_files: + jsonld_context = load_test_json(test_data_upload_path / jsonld)[ + "@context" + ] + + for ns in mappings.ALL_NAMESPACES: + assert ( + ns.pf in jsonld_context.keys() + ), f"The namespace '{ns.pf}' was not found in the @context of {jsonld}." + + +def test_add_context_to_graph_dataset(): + """Test that add_context_to_graph_dataset() correctly adds the @context to a graph dataset instance.""" + dataset = models.Dataset( + hasLabel="test_dataset", + hasSamples=[ + models.Subject( + hasLabel="sub-01", + hasSession=[ + models.PhenotypicSession( + hasLabel="ses-01", + hasAge=26, + ), + ], + ), + models.Subject( + hasLabel="sub-02", + hasSession=[ + models.PhenotypicSession( + hasLabel="ses-01", + hasAge=30, + ), + ], + ), + ], + ) + + jsonld = model_utils.add_context_to_graph_dataset(dataset=dataset) + + assert "@context" in jsonld.keys() + assert len(jsonld["hasSamples"]) == 2 diff --git a/tests/unit/test_pheno_utils.py b/tests/unit/test_pheno_utils.py new file mode 100644 index 00000000..15988f8c --- /dev/null +++ b/tests/unit/test_pheno_utils.py @@ -0,0 +1,299 @@ +import pandas as pd +import pytest + +from bagel import mappings +from bagel.utilities import pheno_utils + + +@pytest.mark.parametrize( + "partial_data_dict, invalid_column_name", + [ + # sex column missing Levels + ( + { + "participant_id": { + "Description": "A participant ID", + "Annotations": { + "IsAbout": { + "TermURL": "nb:ParticipantID", + "Label": "Unique participant identifier", + }, + "Identifies": "participant", + }, + }, + "sex": { + "Description": "Participant sex", + "Annotations": { + "IsAbout": {"TermURL": "nb:Sex", "Label": ""} + }, + }, + }, + "sex", + ), + # age column missing Transformation + ( + { + "participant_id": { + "Description": "A participant ID", + "Annotations": { + "IsAbout": { + "TermURL": "nb:ParticipantID", + "Label": "Unique participant identifier", + }, + "Identifies": "participant", + }, + }, + "age": { + "Description": "Participant age", + "Annotations": { + "IsAbout": { + "TermURL": "nb:Age", + "Label": "Chronological age", + } + }, + }, + }, + "age", + ), + ], +) +def test_schema_invalid_column_raises_error( + partial_data_dict, invalid_column_name +): + """ + Test that when an input data dictionary contains a schema invalid column annotation, + an informative error is raised which includes the name of the offending column. 
+ """ + with pytest.raises(ValueError) as e: + pheno_utils.validate_data_dict(partial_data_dict) + + for substring in [ + "not a valid Neurobagel data dictionary", + invalid_column_name, + ]: + assert substring in str(e.value) + + +def test_get_columns_that_are_about_concept(test_data, load_test_json): + """Test that matching annotated columns are returned as a list, + and that empty list is returned if nothing matches""" + data_dict = load_test_json(test_data / "example14.json") + + assert ["participant_id"] == pheno_utils.get_columns_about( + data_dict, concept=mappings.NEUROBAGEL["participant"] + ) + assert [] == pheno_utils.get_columns_about( + data_dict, concept="does not exist concept" + ) + + +def test_get_columns_with_annotations(): + example = { + "someOtherColumn": { + "Description": "This is cool in BIDS, but not in Neurobagel" + }, + "participant_id": { + "Description": "A participant ID", + "Annotations": { + "IsAbout": { + "TermURL": "nb:ParticipantID", + "Label": "Unique participant identifier", + } + }, + }, + } + result = pheno_utils.get_annotated_columns(example)[0] + assert result[0] == "participant_id" + assert result[1] == example["participant_id"] + + +def test_map_categories_to_columns(test_data, load_test_json): + """Test that inverse mapping of concepts to columns is correctly created""" + data_dict = load_test_json(test_data / "example2.json") + + result = pheno_utils.map_categories_to_columns(data_dict) + + assert {"participant", "session", "sex"}.issubset(result.keys()) + assert ["participant_id"] == result["participant"] + assert ["session_id"] == result["session"] + assert ["sex"] == result["sex"] + + +@pytest.mark.parametrize( + "tool, columns", + [ + ("cogatlas:1234", ["tool_item1", "tool_item2"]), + ("cogatlas:4321", ["other_tool_item1"]), + ], +) +def test_map_tools_to_columns(test_data, load_test_json, tool, columns): + data_dict = load_test_json(test_data / "example6.json") + + result = pheno_utils.map_tools_to_columns(data_dict) + + assert result[tool] == columns + + +@pytest.mark.parametrize( + "example, column_list, expected_values", + [ + ("example2", ["sex"], ["snomed:248153007"]), + ( + "example19", + ["group", "diagnosis"], + ["snomed:49049000", "snomed:724761004"], + ), + ], +) +def test_get_transformed_categorical_values( + test_data, load_test_json, example, column_list, expected_values +): + """Test that the correct transformed values are returned for a categorical variable""" + data_dict = load_test_json(test_data / f"{example}.json") + pheno = pd.read_csv(test_data / f"{example}.tsv", sep="\t") + + assert expected_values == pheno_utils.get_transformed_values( + columns=column_list, + row=pheno.iloc[0], + data_dict=data_dict, + ) + + +@pytest.mark.parametrize( + "example,expected_result", + [ + ( + { + "column": { + "Annotations": { + "IsAbout": {"TermURL": "something", "Labels": "other"}, + "Levels": { + "val1": {"TermURL": "something", "Label": "other"} + }, + } + } + }, + True, + ), + ( + { + "column": { + "Levels": {"val1": "some description"}, + "Annotations": { + "IsAbout": {"TermURL": "something", "Labels": "other"} + }, + } + }, + False, + ), + ], +) +def test_detect_categorical_column(example, expected_result): + result = pheno_utils.is_column_categorical( + column="column", data_dict=example + ) + + assert result is expected_result + + +@pytest.mark.parametrize( + "value,column,expected", + [ + ("test_value", "test_column", True), + ("does not exist", "test_column", False), + ("my_value", "empty_column", False), + ], +) +def 
test_missing_values(value, column, expected): + """Test that missing values are correctly detected""" + test_data_dict = { + "test_column": {"Annotations": {"MissingValues": ["test_value"]}}, + "empty_column": {"Annotations": {}}, + } + + assert ( + pheno_utils.is_missing_value(value, column, test_data_dict) is expected + ) + + +@pytest.mark.parametrize( + "subject_idx, is_avail", + [(0, True), (2, False), (4, True)], +) +def test_get_assessment_tool_availability( + test_data, load_test_json, subject_idx, is_avail +): + """ + Ensure that subjects who have one or more missing values in columns mapped to an assessment + tool are correctly identified as not having this assessment tool + """ + data_dict = load_test_json(test_data / "example6.json") + pheno = pd.read_csv(test_data / "example6.tsv", sep="\t") + test_columns = ["tool_item1", "tool_item2"] + + assert ( + pheno_utils.are_any_available( + test_columns, pheno.iloc[subject_idx], data_dict + ) + is is_avail + ) + + +@pytest.mark.parametrize( + "columns, expected_indices", + [(["participant_id"], [0]), (["session_id"], [2])], +) +def test_missing_ids_in_columns(test_data, columns, expected_indices): + """ + When a participant or session labeled column has missing values, + we raise and provide the list of offending row indices + """ + pheno = pd.read_csv( + test_data / "example11.tsv", sep="\t", keep_default_na=False, dtype=str + ) + assert expected_indices == pheno_utils.get_rows_with_empty_strings( + pheno, columns=columns + ) + + +@pytest.mark.parametrize( + "raw_age,expected_age,heuristic", + [ + ("11.0", 11.0, "nb:FromFloat"), + ("11", 11.0, "nb:FromInt"), + ("11,0", 11.0, "nb:FromEuro"), + ("90+", 90.0, "nb:FromBounded"), + ("20Y6M", 20.5, "nb:FromISO8601"), + ("P20Y6M", 20.5, "nb:FromISO8601"), + ("20Y9M", 20.75, "nb:FromISO8601"), + ], +) +def test_age_gets_converted(raw_age, expected_age, heuristic): + assert expected_age == pheno_utils.transform_age(raw_age, heuristic) + + +@pytest.mark.parametrize( + "raw_age, incorrect_heuristic", + [ + ("11,0", "nb:FromFloat"), + ("11.0", "nb:FromISO8601"), + ("20-30", "nb:FromBounded"), + ], +) +def test_incorrect_age_heuristic(raw_age, incorrect_heuristic): + """Given an age transformation that does not match the type of age value provided, returns an informative error.""" + with pytest.raises(ValueError) as e: + pheno_utils.transform_age(raw_age, incorrect_heuristic) + + assert ( + f"problem with applying the age transformation: {incorrect_heuristic}." + in str(e.value) + ) + + +def test_invalid_age_heuristic(): + """Given an age transformation that is not recognized, returns an informative ValueError.""" + with pytest.raises(ValueError) as e: + pheno_utils.transform_age("11,0", "nb:birthyear") + + assert "unrecognized age transformation: nb:birthyear" in str(e.value)
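
For reference, a minimal usage sketch of the two helpers this refactor introduces — model_utils.add_context_to_graph_dataset() and file_utils.save_jsonld() — which together replace the generate_context()/json.dumps boilerplate previously duplicated across the pheno, bids, and derivatives commands. The dataset contents and the output filename below are illustrative only; the composition itself mirrors the calls now used in bagel/cli.py and the fixtures in tests/unit/test_model_utils.py:

    from pathlib import Path

    from bagel import models
    from bagel.utilities import file_utils, model_utils

    # Build a small graph-ready dataset, mirroring the unit-test fixtures above.
    dataset = models.Dataset(
        hasLabel="example_dataset",
        hasSamples=[
            models.Subject(
                hasLabel="sub-01",
                hasSession=[
                    models.PhenotypicSession(hasLabel="ses-01", hasAge=26),
                ],
            ),
        ],
    )

    # add_context_to_graph_dataset() merges the @context from generate_context()
    # with the model serialized via dict(exclude_none=True); save_jsonld() writes
    # the result with indent=2 and prints the output path.
    file_utils.save_jsonld(
        data=model_utils.add_context_to_graph_dataset(dataset),
        filename=Path("example.jsonld"),  # illustrative output path
    )

Because all three CLI commands now funnel through save_jsonld(), their output serialization (indentation, exclude_none behavior, and the "Saved output to:" message) can no longer drift apart.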