Skip to content

Commit

Permalink
[ENH] Handle multi-column attribute annotations (#264)
Browse files Browse the repository at this point in the history
* update docstrings and type hints for clarity

* add example with multiple diagnosis cols

* add new example with two diagnosis columns

* update test of categorical value transform

* update get_transformed_values to return a list

* add multi-diagnosis example subject with healthy control value

* update transformed value storing for attributes that expect a single value

* test subject level output with multicolumn diagnosis handling

* add comment and update test data README

* add example with multiple columns about age and sex

* add multicolumn annotation examples to bagel pheno smoke test

* add example 20 to README

* remove addressed TODOs

* add and test warning for multiple age/sex columns
  • Loading branch information
alyssadai authored Jan 8, 2024
1 parent ccd6f8b commit e187e8e
Show file tree
Hide file tree
Showing 9 changed files with 439 additions and 35 deletions.
22 changes: 14 additions & 8 deletions bagel/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,31 +112,38 @@ def pheno(
_ses_pheno = session_row

if "sex" in column_mapping.keys():
_sex_val = putil.get_transformed_values(
_sex_vals = putil.get_transformed_values(
column_mapping["sex"], _ses_pheno, data_dictionary
)
if _sex_val:
session.hasSex = models.Sex(identifier=_sex_val)
if _sex_vals:
# NOTE: Our data model only allows a single sex value, so we only take the first instance if multiple columns are about sex
session.hasSex = models.Sex(identifier=_sex_vals[0])

if "diagnosis" in column_mapping.keys():
_dx_val = putil.get_transformed_values(
_dx_vals = putil.get_transformed_values(
column_mapping["diagnosis"], _ses_pheno, data_dictionary
)
if _dx_val is None:
if not _dx_vals:
pass
elif _dx_val == mappings.NEUROBAGEL["healthy_control"]:
# NOTE: If the subject has both a diagnosis value and a value of healthy control, we assume the healthy control designation is more important
# and do not assign diagnoses to the subject
elif mappings.NEUROBAGEL["healthy_control"] in _dx_vals:
session.isSubjectGroup = models.SubjectGroup(
identifier=mappings.NEUROBAGEL["healthy_control"],
)
else:
session.hasDiagnosis = [
models.Diagnosis(identifier=_dx_val)
for _dx_val in _dx_vals
]

if "age" in column_mapping.keys():
session.hasAge = putil.get_transformed_values(
_age_vals = putil.get_transformed_values(
column_mapping["age"], _ses_pheno, data_dictionary
)
if _age_vals:
# NOTE: Our data model only allows a single age value, so we only take the first instance if multiple columns are about age
session.hasAge = _age_vals[0]

if tool_mapping:
_assessments = [
Expand Down Expand Up @@ -288,7 +295,6 @@ def bids(
session=session,
)

# TODO: needs refactoring once we also handle phenotypic information at the session level
session_list.append(
# Add back "ses" prefix because pybids stripped it
models.ImagingSession(
Expand Down
55 changes: 37 additions & 18 deletions bagel/pheno_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ def generate_context():

def get_columns_about(data_dict: dict, concept: str) -> list:
"""
Returns column names that have been annotated as "IsAbout" the desired concept.
Returns all column names that have been annotated as "IsAbout" the desired concept.
Parameters
----------
data_dict: dict
Expand All @@ -120,7 +120,12 @@ def get_columns_about(data_dict: dict, concept: str) -> list:
]


def get_annotated_columns(data_dict: dict) -> list:
def get_annotated_columns(data_dict: dict) -> list(tuple[str, dict]):
"""
Return a list of all columns that have Neurobagel 'Annotations' in a data dictionary,
where each column is represented as a tuple of the column name (dictionary key from the data dictionary) and
properties (all dictionary contents from the data dictionary).
"""
return [
(col, content)
for col, content in data_dict.items()
Expand All @@ -130,8 +135,10 @@ def get_annotated_columns(data_dict: dict) -> list:

def map_categories_to_columns(data_dict: dict) -> dict:
"""
Maps all pre-defined Neurobagel categories (e.g. "Sex") to a list of column names (if any) that
Maps all pre-defined Neurobagel categories (e.g. "Sex") to a list containing all column names (if any) that
have been linked to this category.
Returns a dictionary where the keys are the Neurobagel categories and the values are lists of column names.
"""
return {
cat_name: get_columns_about(data_dict, cat_iri)
Expand All @@ -144,6 +151,8 @@ def map_tools_to_columns(data_dict: dict) -> dict:
"""
Return a mapping of all assessment tools described in the data dictionary to the columns that
are mapped to them.
Returns a dictionary where the keys are the assessment tool IRIs and the values are lists of column names.
"""
out_dict = defaultdict(list)
for col, content in get_annotated_columns(data_dict):
Expand Down Expand Up @@ -212,31 +221,23 @@ def transform_age(value: str, heuristic: str) -> float:

def get_transformed_values(
columns: list, row: pd.Series, data_dict: dict
) -> Union[str, None]:
"""Convert a raw phenotypic value to the corresponding controlled term"""
transf_val = []
# TODO: Currently, this function accepts a list of columns + populates a list of transformed values because multiple columns should in theory
# be able to be annotated as being about a single Neurobagel concept/variable. However, we don't yet have a proper way to support multiple transformed values
# so this function returns just a single value or None.
# In future, we need to implement a way to handle cases where more than one column contains information.
for col in columns[:1]:
) -> list:
"""Convert a list of raw phenotypic values to the corresponding controlled terms, from columns that have not been annotated as being about an assessment tool."""
transf_vals = []
for col in columns:
value = row[col]
if is_missing_value(value, col, data_dict):
continue
if is_column_categorical(col, data_dict):
transf_val.append(map_cat_val_to_term(value, col, data_dict))
transf_vals.append(map_cat_val_to_term(value, col, data_dict))
else:
# TODO: replace with more flexible solution when we have more
# continuous variables than just age
transf_val.append(
transf_vals.append(
transform_age(str(value), get_age_heuristic(col, data_dict))
)

# TODO: once we can handle multiple columns, this section should be removed
# and we should just return an empty list if no transform can be generated
if not transf_val:
return None
return transf_val[0]
return transf_vals


# TODO: Check all columns and then return list of offending columns' names
Expand Down Expand Up @@ -394,6 +395,24 @@ def validate_data_dict(data_dict: dict) -> None:
"Please make sure that only one column is annotated for participant and session IDs."
)

if (
len(get_columns_about(data_dict, concept=mappings.NEUROBAGEL["sex"]))
> 1
):
warnings.warn(
"The provided data dictionary indicates more than one column about sex. "
"Neurobagel cannot resolve multiple sex values per subject-session, and so will only consider the first of these columns for sex data."
)

if (
len(get_columns_about(data_dict, concept=mappings.NEUROBAGEL["age"]))
> 1
):
warnings.warn(
"The provided data dictionary indicates more than one column about age. "
"Neurobagel cannot resolve multiple age values per subject-session, so will only consider the first of these columns for age data."
)

if not categorical_cols_have_bids_levels(data_dict):
warnings.warn(
"The data dictionary contains at least one column that looks categorical but lacks a BIDS 'Levels' attribute."
Expand Down
2 changes: 2 additions & 0 deletions bagel/tests/data/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ Example inputs to the CLI
| 16 | Invalid, same as example2.csv, but with a sneaky .tsv file ending | Valid, same as example2 | fail |
| 17 | Valid, contains data for three subjects, but no session column | Same as example 2 JSON, without `session_id` column | pass |
| 18 | Invalid, example2.tsv without `session_id` column, so there are non-unique participant rows | Same as example 2 JSON, without session_id column | fail |
| 19 | Example with two columns about diagnosis | Valid | pass |
| 20 | Valid, based on example 19 but contains multiple annotated columns about age and sex | Valid | pass |

`* this is expected to fail until we enable multiple participant_ID handling`.

Expand Down
109 changes: 109 additions & 0 deletions bagel/tests/data/example19.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
{
"participant_id": {
"Description": "A participant ID",
"Annotations": {
"IsAbout": {
"TermURL": "nb:ParticipantID",
"Label": "Unique participant identifier"
},
"Identifies": "participant"
}
},
"session_id": {
"Description": "A session ID",
"Annotations": {
"IsAbout": {
"TermURL": "nb:SessionID",
"Label": "Unique session identifier"
},
"Identifies": "session"
}
},
"group": {
"Description": "Group variable",
"Levels": {
"PD": "Parkinson's disease",
"CTRL": "Control subject"
},
"Annotations": {
"IsAbout": {
"TermURL": "nb:Diagnosis",
"Label": "Diagnosis"
},
"Levels": {
"PD": {
"TermURL": "snomed:49049000",
"Label": "Parkinson's disease"
},
"CTRL": {
"TermURL": "ncit:C94342",
"Label": "Healthy Control"
}
}
}
},
"diagnosis": {
"Description": "Diagnosis at baseline visit",
"Levels": {
"PD": "Parkinson's disease",
"SPD": "Sporadic Parkinson disease"
},
"Annotations": {
"IsAbout": {
"TermURL": "nb:Diagnosis",
"Label": "Diagnosis"
},
"Levels": {
"PD": {
"TermURL": "snomed:49049000",
"Label": "Parkinson's disease"
},
"SPD": {
"TermURL": "snomed:724761004",
"Label": "Sporadic Parkinson disease"
},
"AD": {
"TermURL": "snomed:26929004",
"Label": "Alzheimer's disease"
}
},
"MissingValues": [""]
}
},
"sex": {
"Description": "Sex variable",
"Levels": {
"M": "Male",
"F": "Female"
},
"Annotations": {
"IsAbout": {
"TermURL": "nb:Sex",
"Label": "Sex"
},
"Levels": {
"M": {
"TermURL": "snomed:248153007",
"Label": "Male"
},
"F": {
"TermURL": "snomed:248152002",
"Label": "Female"
}
}
}
},
"participant_age": {
"Description": "Age of the participant",
"Annotations": {
"IsAbout": {
"TermURL": "nb:Age",
"Label": "Chronological age"
},
"Transformation": {
"TermURL": "nb:FromISO8601",
"Label": "A period of time defined according to the ISO8601 standard"
}
}
}
}
4 changes: 4 additions & 0 deletions bagel/tests/data/example19.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
participant_id session_id group diagnosis sex participant_age
sub-01 ses-01 PD SPD M P60Y6M
sub-02 ses-01 CTRL F P56Y4M
sub-03 ses-01 CTRL AD F P60Y6M
Loading

0 comments on commit e187e8e

Please sign in to comment.