From e187e8e8ce14fa7cb941678d1b3cd87e25985fdb Mon Sep 17 00:00:00 2001 From: Alyssa Dai Date: Mon, 8 Jan 2024 11:22:06 -0500 Subject: [PATCH] [ENH] Handle multi-column attribute annotations (#264) * update docstrings and type hints for clarity * add example with multiple diagnosis cols * add new example with two diagnosis columns * update test of categorical value transform * update get_transformed_values to return a list * add multi-diagnosis example subject with healthy control value * update transformed value storing for attributes that expect a single value * test subject level output with multicolumn diagnosis handling * add comment and update test data README * add example with multiple columns about age and sex * add multicolumn annotation examples to bagel pheno smoke test * add example 20 to README * remove addressed TODOs * add and test warning for multiple age/sex columns --- bagel/cli.py | 22 +++-- bagel/pheno_utils.py | 55 ++++++++---- bagel/tests/data/README.md | 2 + bagel/tests/data/example19.json | 109 ++++++++++++++++++++++++ bagel/tests/data/example19.tsv | 4 + bagel/tests/data/example20.json | 145 ++++++++++++++++++++++++++++++++ bagel/tests/data/example20.tsv | 4 + bagel/tests/test_cli_pheno.py | 108 +++++++++++++++++++++++- bagel/tests/test_utility.py | 25 ++++-- 9 files changed, 439 insertions(+), 35 deletions(-) create mode 100644 bagel/tests/data/example19.json create mode 100644 bagel/tests/data/example19.tsv create mode 100644 bagel/tests/data/example20.json create mode 100644 bagel/tests/data/example20.tsv diff --git a/bagel/cli.py b/bagel/cli.py index 36772b12..23e25fb5 100644 --- a/bagel/cli.py +++ b/bagel/cli.py @@ -112,31 +112,38 @@ def pheno( _ses_pheno = session_row if "sex" in column_mapping.keys(): - _sex_val = putil.get_transformed_values( + _sex_vals = putil.get_transformed_values( column_mapping["sex"], _ses_pheno, data_dictionary ) - if _sex_val: - session.hasSex = models.Sex(identifier=_sex_val) + if _sex_vals: + # NOTE: Our data model only allows a single sex value, so we only take the first instance if multiple columns are about sex + session.hasSex = models.Sex(identifier=_sex_vals[0]) if "diagnosis" in column_mapping.keys(): - _dx_val = putil.get_transformed_values( + _dx_vals = putil.get_transformed_values( column_mapping["diagnosis"], _ses_pheno, data_dictionary ) - if _dx_val is None: + if not _dx_vals: pass - elif _dx_val == mappings.NEUROBAGEL["healthy_control"]: + # NOTE: If the subject has both a diagnosis value and a value of healthy control, we assume the healthy control designation is more important + # and do not assign diagnoses to the subject + elif mappings.NEUROBAGEL["healthy_control"] in _dx_vals: session.isSubjectGroup = models.SubjectGroup( identifier=mappings.NEUROBAGEL["healthy_control"], ) else: session.hasDiagnosis = [ models.Diagnosis(identifier=_dx_val) + for _dx_val in _dx_vals ] if "age" in column_mapping.keys(): - session.hasAge = putil.get_transformed_values( + _age_vals = putil.get_transformed_values( column_mapping["age"], _ses_pheno, data_dictionary ) + if _age_vals: + # NOTE: Our data model only allows a single age value, so we only take the first instance if multiple columns are about age + session.hasAge = _age_vals[0] if tool_mapping: _assessments = [ @@ -288,7 +295,6 @@ def bids( session=session, ) - # TODO: needs refactoring once we also handle phenotypic information at the session level session_list.append( # Add back "ses" prefix because pybids stripped it models.ImagingSession( diff --git a/bagel/pheno_utils.py b/bagel/pheno_utils.py index 2e839804..98cd4010 100644 --- a/bagel/pheno_utils.py +++ b/bagel/pheno_utils.py @@ -98,7 +98,7 @@ def generate_context(): def get_columns_about(data_dict: dict, concept: str) -> list: """ - Returns column names that have been annotated as "IsAbout" the desired concept. + Returns all column names that have been annotated as "IsAbout" the desired concept. Parameters ---------- data_dict: dict @@ -120,7 +120,12 @@ def get_columns_about(data_dict: dict, concept: str) -> list: ] -def get_annotated_columns(data_dict: dict) -> list: +def get_annotated_columns(data_dict: dict) -> list(tuple[str, dict]): + """ + Return a list of all columns that have Neurobagel 'Annotations' in a data dictionary, + where each column is represented as a tuple of the column name (dictionary key from the data dictionary) and + properties (all dictionary contents from the data dictionary). + """ return [ (col, content) for col, content in data_dict.items() @@ -130,8 +135,10 @@ def get_annotated_columns(data_dict: dict) -> list: def map_categories_to_columns(data_dict: dict) -> dict: """ - Maps all pre-defined Neurobagel categories (e.g. "Sex") to a list of column names (if any) that + Maps all pre-defined Neurobagel categories (e.g. "Sex") to a list containing all column names (if any) that have been linked to this category. + + Returns a dictionary where the keys are the Neurobagel categories and the values are lists of column names. """ return { cat_name: get_columns_about(data_dict, cat_iri) @@ -144,6 +151,8 @@ def map_tools_to_columns(data_dict: dict) -> dict: """ Return a mapping of all assessment tools described in the data dictionary to the columns that are mapped to it. + + Returns a dictionary where the keys are the assessment tool IRIs and the values are lists of column names. """ out_dict = defaultdict(list) for col, content in get_annotated_columns(data_dict): @@ -212,31 +221,23 @@ def transform_age(value: str, heuristic: str) -> float: def get_transformed_values( columns: list, row: pd.Series, data_dict: dict -) -> Union[str, None]: - """Convert a raw phenotypic value to the corresponding controlled term""" - transf_val = [] - # TODO: Currently, this function accepts a list of columns + populates a list of transformed values because multiple columns should in theory - # be able to be annotated as being about a single Neurobagel concept/variable. However, we don't yet have a proper way to support multiple transformed values - # so this function returns just a single value or None. - # In future, we need to implement a way to handle cases where more than one column contains information. - for col in columns[:1]: +) -> list: + """Convert a list of raw phenotypic values to the corresponding controlled terms, from columns that have not been annotated as being about an assessment tool.""" + transf_vals = [] + for col in columns: value = row[col] if is_missing_value(value, col, data_dict): continue if is_column_categorical(col, data_dict): - transf_val.append(map_cat_val_to_term(value, col, data_dict)) + transf_vals.append(map_cat_val_to_term(value, col, data_dict)) else: # TODO: replace with more flexible solution when we have more # continuous variables than just age - transf_val.append( + transf_vals.append( transform_age(str(value), get_age_heuristic(col, data_dict)) ) - # TODO: once we can handle multiple columns, this section should be removed - # and we should just return an empty list if no transform can be generated - if not transf_val: - return None - return transf_val[0] + return transf_vals # TODO: Check all columns and then return list of offending columns' names @@ -394,6 +395,24 @@ def validate_data_dict(data_dict: dict) -> None: "Please make sure that only one column is annotated for participant and session IDs." ) + if ( + len(get_columns_about(data_dict, concept=mappings.NEUROBAGEL["sex"])) + > 1 + ): + warnings.warn( + "The provided data dictionary indicates more than one column about sex. " + "Neurobagel cannot resolve multiple sex values per subject-session, and so will only consider the first of these columns for sex data." + ) + + if ( + len(get_columns_about(data_dict, concept=mappings.NEUROBAGEL["age"])) + > 1 + ): + warnings.warn( + "The provided data dictionary indicates more than one column about age. " + "Neurobagel cannot resolve multiple sex values per subject-session, so will only consider the first of these columns for age data." + ) + if not categorical_cols_have_bids_levels(data_dict): warnings.warn( "The data dictionary contains at least one column that looks categorical but lacks a BIDS 'Levels' attribute." diff --git a/bagel/tests/data/README.md b/bagel/tests/data/README.md index 3c77a0ed..a83ab40f 100644 --- a/bagel/tests/data/README.md +++ b/bagel/tests/data/README.md @@ -23,6 +23,8 @@ Example inputs to the CLI | 16 | Invalid, same as example2.csv, but with a sneaky .tsv file ending | Valid, same as example2 | fail | | 17 | Valid, contains data for three subjects, but no session column | Same as example 2 JSON, without `session_id` column | pass | | 18 | Invalid, example2.tsv without `session_id` column, so there are non-unique participant rows | Same as example 2 JSON, without session_id column | fail | +| 19 | Example with two columns about diagnosis | Valid | pass | +| 20 | Valid, based on example 19 but contains multiple annotated columns about age and sex | Valid | pass | `* this is expected to fail until we enable multiple participant_ID handling`. diff --git a/bagel/tests/data/example19.json b/bagel/tests/data/example19.json new file mode 100644 index 00000000..5e038001 --- /dev/null +++ b/bagel/tests/data/example19.json @@ -0,0 +1,109 @@ +{ + "participant_id": { + "Description": "A participant ID", + "Annotations": { + "IsAbout": { + "TermURL": "nb:ParticipantID", + "Label": "Unique participant identifier" + }, + "Identifies": "participant" + } + }, + "session_id": { + "Description": "A session ID", + "Annotations": { + "IsAbout": { + "TermURL": "nb:SessionID", + "Label": "Unique session identifier" + }, + "Identifies": "session" + } + }, + "group": { + "Description": "Group variable", + "Levels": { + "PD": "Parkinson's disease", + "CTRL": "Control subject" + }, + "Annotations": { + "IsAbout": { + "TermURL": "nb:Diagnosis", + "Label": "Diagnosis" + }, + "Levels": { + "PD": { + "TermURL": "snomed:49049000", + "Label": "Parkinson's disease" + }, + "CTRL": { + "TermURL": "ncit:C94342", + "Label": "Healthy Control" + } + } + } + }, + "diagnosis": { + "Description": "Diagnosis at baseline visit", + "Levels": { + "PD": "Parkinson's disease", + "SPD": "Sporadic Parkinson disease" + }, + "Annotations": { + "IsAbout": { + "TermURL": "nb:Diagnosis", + "Label": "Diagnosis" + }, + "Levels": { + "PD": { + "TermURL": "snomed:49049000", + "Label": "Parkinson's disease" + }, + "SPD": { + "TermURL": "snomed:724761004", + "Label": "Sporadic Parkinson disease" + }, + "AD": { + "TermURL": "snomed:26929004", + "Label": "Alzheimer's disease" + } + }, + "MissingValues": [""] + } + }, + "sex": { + "Description": "Sex variable", + "Levels": { + "M": "Male", + "F": "Female" + }, + "Annotations": { + "IsAbout": { + "TermURL": "nb:Sex", + "Label": "Sex" + }, + "Levels": { + "M": { + "TermURL": "snomed:248153007", + "Label": "Male" + }, + "F": { + "TermURL": "snomed:248152002", + "Label": "Female" + } + } + } + }, + "participant_age": { + "Description": "Age of the participant", + "Annotations": { + "IsAbout": { + "TermURL": "nb:Age", + "Label": "Chronological age" + }, + "Transformation": { + "TermURL": "nb:FromISO8601", + "Label": "A period of time defined according to the ISO8601 standard" + } + } + } +} \ No newline at end of file diff --git a/bagel/tests/data/example19.tsv b/bagel/tests/data/example19.tsv new file mode 100644 index 00000000..51312422 --- /dev/null +++ b/bagel/tests/data/example19.tsv @@ -0,0 +1,4 @@ +participant_id session_id group diagnosis sex participant_age +sub-01 ses-01 PD SPD M P60Y6M +sub-02 ses-01 CTRL F P56Y4M +sub-03 ses-01 CTRL AD F P60Y6M \ No newline at end of file diff --git a/bagel/tests/data/example20.json b/bagel/tests/data/example20.json new file mode 100644 index 00000000..e2b19029 --- /dev/null +++ b/bagel/tests/data/example20.json @@ -0,0 +1,145 @@ +{ + "participant_id": { + "Description": "A participant ID", + "Annotations": { + "IsAbout": { + "TermURL": "nb:ParticipantID", + "Label": "Unique participant identifier" + }, + "Identifies": "participant" + } + }, + "session_id": { + "Description": "A session ID", + "Annotations": { + "IsAbout": { + "TermURL": "nb:SessionID", + "Label": "Unique session identifier" + }, + "Identifies": "session" + } + }, + "group": { + "Description": "Group variable", + "Levels": { + "PD": "Parkinson's disease", + "CTRL": "Control subject" + }, + "Annotations": { + "IsAbout": { + "TermURL": "nb:Diagnosis", + "Label": "Diagnosis" + }, + "Levels": { + "PD": { + "TermURL": "snomed:49049000", + "Label": "Parkinson's disease" + }, + "CTRL": { + "TermURL": "ncit:C94342", + "Label": "Healthy Control" + } + } + } + }, + "diagnosis": { + "Description": "Diagnosis at baseline visit", + "Levels": { + "PD": "Parkinson's disease", + "SPD": "Sporadic Parkinson disease" + }, + "Annotations": { + "IsAbout": { + "TermURL": "nb:Diagnosis", + "Label": "Diagnosis" + }, + "Levels": { + "PD": { + "TermURL": "snomed:49049000", + "Label": "Parkinson's disease" + }, + "SPD": { + "TermURL": "snomed:724761004", + "Label": "Sporadic Parkinson disease" + }, + "AD": { + "TermURL": "snomed:26929004", + "Label": "Alzheimer's disease" + } + }, + "MissingValues": [""] + } + }, + "sex": { + "Description": "Sex variable", + "Levels": { + "M": "Male", + "F": "Female" + }, + "Annotations": { + "IsAbout": { + "TermURL": "nb:Sex", + "Label": "Sex" + }, + "Levels": { + "M": { + "TermURL": "snomed:248153007", + "Label": "Male" + }, + "F": { + "TermURL": "snomed:248152002", + "Label": "Female" + } + } + } + }, + "sex_full": { + "Description": "Sex variable", + "Levels": { + "Male": "Male", + "Female": "Female" + }, + "Annotations": { + "IsAbout": { + "TermURL": "nb:Sex", + "Label": "Sex" + }, + "Levels": { + "Male": { + "TermURL": "snomed:248153007", + "Label": "Male" + }, + "Female": { + "TermURL": "snomed:248152002", + "Label": "Female" + } + } + } + }, + "participant_age": { + "Description": "Age of the participant", + "Annotations": { + "IsAbout": { + "TermURL": "nb:Age", + "Label": "Chronological age" + }, + "Transformation": { + "TermURL": "nb:FromISO8601", + "Label": "A period of time defined according to the ISO8601 standard" + } + } + }, + "participant_age_float": { + "Description": "Age of the participant", + "Annotations": { + "IsAbout": { + "TermURL": "nb:Age", + "Label": "Chronological age" + }, + "Transformation": { + "TermURL": "nb:FromFloat", + "Label": "Age as a floating point number" + } + } + } +} \ No newline at end of file diff --git a/bagel/tests/data/example20.tsv b/bagel/tests/data/example20.tsv new file mode 100644 index 00000000..32028f6c --- /dev/null +++ b/bagel/tests/data/example20.tsv @@ -0,0 +1,4 @@ +participant_id session_id group diagnosis sex sex_full participant_age participant_age_float +sub-01 ses-01 PD SPD M Male P60Y6M 60.5 +sub-02 ses-01 CTRL F Female P56Y4M 56.3 +sub-03 ses-01 CTRL AD F Female P60Y6M 60.5 \ No newline at end of file diff --git a/bagel/tests/test_cli_pheno.py b/bagel/tests/test_cli_pheno.py index c98f633b..315f8cf2 100644 --- a/bagel/tests/test_cli_pheno.py +++ b/bagel/tests/test_cli_pheno.py @@ -22,6 +22,8 @@ def default_pheno_output_path(tmp_path): "example14", "example_synthetic", "example17", + "example19", + "example20", ], ) def test_pheno_valid_inputs_run_successfully( @@ -187,6 +189,38 @@ def test_invalid_portal_uris_produces_error( ) +def test_multiple_age_or_sex_columns_raises_warning( + runner, + test_data, + default_pheno_output_path, +): + """Test that an informative warning is raised when multiple columns in the phenotypic file have been annotated as being about age or sex.""" + with pytest.warns(UserWarning) as w: + runner.invoke( + bagel, + [ + "pheno", + "--pheno", + test_data / "example20.tsv", + "--dictionary", + test_data / "example20.json", + "--output", + default_pheno_output_path, + "--name", + "Multiple age/sex columns dataset", + ], + catch_exceptions=False, + ) + + assert len(w) == 2 + warnings = [warning.message.args[0] for warning in w] + for warn_substring in [ + "more than one column about age", + "more than one column about sex", + ]: + assert [any(warn_substring in warning_str for warning_str in warnings)] + + def test_missing_bids_levels_raises_warning( runner, test_data, @@ -713,7 +747,7 @@ def test_pheno_sessions_have_correct_labels( def test_pheno_session_created_for_missing_session_column( runner, test_data, - tmp_path, + default_pheno_output_path, load_test_json, ): """ @@ -731,12 +765,80 @@ def test_pheno_session_created_for_missing_session_column( "--name", "Missing session column dataset", "--output", - tmp_path / "example_synthetic.jsonld", + default_pheno_output_path, ], ) - pheno = load_test_json(tmp_path / "example_synthetic.jsonld") + pheno = load_test_json(default_pheno_output_path) for sub in pheno["hasSamples"]: assert 1 == len(sub["hasSession"]) assert sub["hasSession"][0]["schemaKey"] == "PhenotypicSession" assert sub["hasSession"][0]["hasLabel"] == "ses-nb01" + + +def test_multicolumn_diagnosis_annot_is_handled( + runner, + test_data, + default_pheno_output_path, + load_test_json, +): + """Test that when a subject has a non-healthy control diagnosis across multiple columns, they are all correctly parsed and stored as part of the subject's data.""" + runner.invoke( + bagel, + [ + "pheno", + "--pheno", + test_data / "example19.tsv", + "--dictionary", + test_data / "example19.json", + "--name", + "Multi-column annotation dataset", + "--output", + default_pheno_output_path, + ], + ) + + pheno = load_test_json(default_pheno_output_path) + # Check the subject with only disease diagnoses + sub_01_diagnoses = [ + diagnosis["identifier"] + for diagnosis in pheno["hasSamples"][0]["hasSession"][0][ + "hasDiagnosis" + ] + ] + assert sub_01_diagnoses == ["snomed:49049000", "snomed:724761004"] + + +@pytest.mark.parametrize("sub_idx", [1, 2]) +def test_multicolumn_diagnosis_annot_with_healthy_control_is_handled( + runner, test_data, default_pheno_output_path, load_test_json, sub_idx +): + """ + Test that when there are multiple columns about diagnosis and a subject has a healthy control status in one column, + the healthy control status is used and any other diagnoses are ignored. + """ + runner.invoke( + bagel, + [ + "pheno", + "--pheno", + test_data / "example19.tsv", + "--dictionary", + test_data / "example19.json", + "--name", + "Multi-column annotation dataset", + "--output", + default_pheno_output_path, + ], + ) + + pheno = load_test_json(default_pheno_output_path) + sub_with_healthy_control_annotation = pheno["hasSamples"][sub_idx][ + "hasSession" + ][0] + + assert "hasDiagnosis" not in sub_with_healthy_control_annotation.keys() + assert ( + sub_with_healthy_control_annotation["isSubjectGroup"]["identifier"] + == "ncit:C94342" + ) diff --git a/bagel/tests/test_utility.py b/bagel/tests/test_utility.py index a1602d38..a3d004ba 100644 --- a/bagel/tests/test_utility.py +++ b/bagel/tests/test_utility.py @@ -145,13 +145,26 @@ def test_map_tools_to_columns(test_data, load_test_json, tool, columns): assert result[tool] == columns -def test_get_transformed_categorical_value(test_data, load_test_json): - """Test that the correct transformed value is returned for a categorical variable""" - data_dict = load_test_json(test_data / "example2.json") - pheno = pd.read_csv(test_data / "example2.tsv", sep="\t") +@pytest.mark.parametrize( + "example, column_list, expected_values", + [ + ("example2", ["sex"], ["snomed:248153007"]), + ( + "example19", + ["group", "diagnosis"], + ["snomed:49049000", "snomed:724761004"], + ), + ], +) +def test_get_transformed_categorical_values( + test_data, load_test_json, example, column_list, expected_values +): + """Test that the correct transformed values are returned for a categorical variable""" + data_dict = load_test_json(test_data / f"{example}.json") + pheno = pd.read_csv(test_data / f"{example}.tsv", sep="\t") - assert "snomed:248153007" == putil.get_transformed_values( - columns=["sex"], + assert expected_values == putil.get_transformed_values( + columns=column_list, row=pheno.iloc[0], data_dict=data_dict, )