[ENH] Handle multi-column attribute annotations (#264)

* update docstrings and type hints for clarity
* add example with multiple diagnosis cols
* add new example with two diagnosis columns
* update test of categorical value transform
* update get_transformed_values to return a list
* add multi-diagnosis example subject with healthy control value
* update transformed value storing for attributes that expect a single value
* test subject level output with multicolumn diagnosis handling
* add comment and update test data README
* add example with multiple columns about age and sex
* add multicolumn annotation examples to bagel pheno smoke test
* add example 20 to README
* remove addressed TODOs
* add and test warning for multiple age/sex columns
neurobagel · Jan 8, 2024 · e187e8e · e187e8e
1 parent ccd6f8b
commit e187e8e
Show file tree

Hide file tree

Showing 9 changed files with 439 additions and 35 deletions.
diff --git a/bagel/cli.py b/bagel/cli.py
@@ -112,31 +112,38 @@ def pheno(
             _ses_pheno = session_row
 
             if "sex" in column_mapping.keys():
-                _sex_val = putil.get_transformed_values(
+                _sex_vals = putil.get_transformed_values(
                     column_mapping["sex"], _ses_pheno, data_dictionary
                 )
-                if _sex_val:
-                    session.hasSex = models.Sex(identifier=_sex_val)
+                if _sex_vals:
+                    # NOTE: Our data model only allows a single sex value, so we only take the first instance if multiple columns are about sex
+                    session.hasSex = models.Sex(identifier=_sex_vals[0])
 
             if "diagnosis" in column_mapping.keys():
-                _dx_val = putil.get_transformed_values(
+                _dx_vals = putil.get_transformed_values(
                     column_mapping["diagnosis"], _ses_pheno, data_dictionary
                 )
-                if _dx_val is None:
+                if not _dx_vals:
                     pass
-                elif _dx_val == mappings.NEUROBAGEL["healthy_control"]:
+                # NOTE: If the subject has both a diagnosis value and a value of healthy control, we assume the healthy control designation is more important
+                # and do not assign diagnoses to the subject
+                elif mappings.NEUROBAGEL["healthy_control"] in _dx_vals:
                     session.isSubjectGroup = models.SubjectGroup(
                         identifier=mappings.NEUROBAGEL["healthy_control"],
                     )
                 else:
                     session.hasDiagnosis = [
                         models.Diagnosis(identifier=_dx_val)
+                        for _dx_val in _dx_vals
                     ]
 
             if "age" in column_mapping.keys():
-                session.hasAge = putil.get_transformed_values(
+                _age_vals = putil.get_transformed_values(
                     column_mapping["age"], _ses_pheno, data_dictionary
                 )
+                if _age_vals:
+                    # NOTE: Our data model only allows a single age value, so we only take the first instance if multiple columns are about age
+                    session.hasAge = _age_vals[0]
 
             if tool_mapping:
                 _assessments = [
@@ -288,7 +295,6 @@ def bids(
                 session=session,
             )
 
-            # TODO: needs refactoring once we also handle phenotypic information at the session level
             session_list.append(
                 # Add back "ses" prefix because pybids stripped it
                 models.ImagingSession(

diff --git a/bagel/pheno_utils.py b/bagel/pheno_utils.py
@@ -98,7 +98,7 @@ def generate_context():
 
 def get_columns_about(data_dict: dict, concept: str) -> list:
     """
-    Returns column names that have been annotated as "IsAbout" the desired concept.
+    Returns all column names that have been annotated as "IsAbout" the desired concept.
     Parameters
     ----------
     data_dict: dict
@@ -120,7 +120,12 @@ def get_columns_about(data_dict: dict, concept: str) -> list:
     ]
 
 
-def get_annotated_columns(data_dict: dict) -> list:
+def get_annotated_columns(data_dict: dict) -> list[tuple[str, dict]]:
+    """
+    Return a list of all columns that have Neurobagel 'Annotations' in a data dictionary,
+    where each column is represented as a tuple of the column name (dictionary key from the data dictionary) and
+    properties (all dictionary contents from the data dictionary).
+    """
     return [
         (col, content)
         for col, content in data_dict.items()
@@ -130,8 +135,10 @@ def get_annotated_columns(data_dict: dict) -> list:
 
 def map_categories_to_columns(data_dict: dict) -> dict:
     """
-    Maps all pre-defined Neurobagel categories (e.g. "Sex") to a list of column names (if any) that
+    Maps all pre-defined Neurobagel categories (e.g. "Sex") to a list containing all column names (if any) that
     have been linked to this category.
+
+    Returns a dictionary where the keys are the Neurobagel categories and the values are lists of column names.
     """
     return {
         cat_name: get_columns_about(data_dict, cat_iri)
@@ -144,6 +151,8 @@ def map_tools_to_columns(data_dict: dict) -> dict:
     """
     Return a mapping of all assessment tools described in the data dictionary to the columns that
     are mapped to it.
+
+    Returns a dictionary where the keys are the assessment tool IRIs and the values are lists of column names.
     """
     out_dict = defaultdict(list)
     for col, content in get_annotated_columns(data_dict):
@@ -212,31 +221,23 @@ def transform_age(value: str, heuristic: str) -> float:
 
 def get_transformed_values(
     columns: list, row: pd.Series, data_dict: dict
-) -> Union[str, None]:
-    """Convert a raw phenotypic value to the corresponding controlled term"""
-    transf_val = []
-    # TODO: Currently, this function accepts a list of columns + populates a list of transformed values because multiple columns should in theory
-    # be able to be annotated as being about a single Neurobagel concept/variable. However, we don't yet have a proper way to support multiple transformed values
-    # so this function returns just a single value or None.
-    # In future, we need to implement a way to handle cases where more than one column contains information.
-    for col in columns[:1]:
+) -> list:
+    """Convert a list of raw phenotypic values to the corresponding controlled terms, from columns that have not been annotated as being about an assessment tool."""
+    transf_vals = []
+    for col in columns:
         value = row[col]
         if is_missing_value(value, col, data_dict):
             continue
         if is_column_categorical(col, data_dict):
-            transf_val.append(map_cat_val_to_term(value, col, data_dict))
+            transf_vals.append(map_cat_val_to_term(value, col, data_dict))
         else:
             # TODO: replace with more flexible solution when we have more
             # continuous variables than just age
-            transf_val.append(
+            transf_vals.append(
                 transform_age(str(value), get_age_heuristic(col, data_dict))
             )
 
-    # TODO: once we can handle multiple columns, this section should be removed
-    # and we should just return an empty list if no transform can be generated
-    if not transf_val:
-        return None
-    return transf_val[0]
+    return transf_vals
 
 
 # TODO: Check all columns and then return list of offending columns' names
@@ -394,6 +395,24 @@ def validate_data_dict(data_dict: dict) -> None:
             "Please make sure that only one column is annotated for participant and session IDs."
         )
 
+    if (
+        len(get_columns_about(data_dict, concept=mappings.NEUROBAGEL["sex"]))
+        > 1
+    ):
+        warnings.warn(
+            "The provided data dictionary indicates more than one column about sex. "
+            "Neurobagel cannot resolve multiple sex values per subject-session, and so will only consider the first of these columns for sex data."
+        )
+
+    if (
+        len(get_columns_about(data_dict, concept=mappings.NEUROBAGEL["age"]))
+        > 1
+    ):
+        warnings.warn(
+            "The provided data dictionary indicates more than one column about age. "
+            "Neurobagel cannot resolve multiple age values per subject-session, so will only consider the first of these columns for age data."
+        )
+
     if not categorical_cols_have_bids_levels(data_dict):
         warnings.warn(
             "The data dictionary contains at least one column that looks categorical but lacks a BIDS 'Levels' attribute."

diff --git a/bagel/tests/data/README.md b/bagel/tests/data/README.md
@@ -23,6 +23,8 @@ Example inputs to the CLI
 | 16           | Invalid, same as example2.csv, but with a sneaky .tsv file ending               | Valid, same as example2                                                              | fail               |
 | 17 | Valid, contains data for three subjects, but no session column | Same as example 2 JSON, without `session_id` column | pass |
 | 18 | Invalid, example2.tsv without `session_id` column, so there are non-unique participant rows | Same as example 2 JSON, without session_id column | fail |
+| 19 | Example with two columns about diagnosis | Valid | pass |
+| 20 | Valid, based on example 19 but contains multiple annotated columns about age and sex | Valid | pass |
 
 `* this is expected to fail until we enable multiple participant_ID handling`.
 

diff --git a/bagel/tests/data/example19.json b/bagel/tests/data/example19.json
@@ -0,0 +1,109 @@
+{
+  "participant_id": {
+    "Description": "A participant ID",
+    "Annotations": {
+      "IsAbout": {
+        "TermURL": "nb:ParticipantID",
+        "Label": "Unique participant identifier"
+      },
+      "Identifies": "participant"
+    }
+  },
+  "session_id": {
+    "Description": "A session ID",
+    "Annotations": {
+      "IsAbout": {
+        "TermURL": "nb:SessionID",
+        "Label": "Unique session identifier"
+      },
+      "Identifies": "session"
+    }
+  },
+  "group": {
+    "Description": "Group variable",
+    "Levels": {
+      "PD": "Parkinson's disease",
+      "CTRL": "Control subject"
+    },
+    "Annotations": {
+      "IsAbout": {
+        "TermURL": "nb:Diagnosis",
+        "Label": "Diagnosis"
+      },
+      "Levels": {
+        "PD": {
+          "TermURL": "snomed:49049000",
+          "Label": "Parkinson's disease"
+        },
+        "CTRL": {
+          "TermURL": "ncit:C94342",
+          "Label": "Healthy Control"
+        }
+      }
+    }
+  },
+  "diagnosis": {
+    "Description": "Diagnosis at baseline visit",
+    "Levels": {
+      "PD": "Parkinson's disease",
+      "SPD": "Sporadic Parkinson disease"
+    },
+    "Annotations": {
+      "IsAbout": {
+        "TermURL": "nb:Diagnosis",
+        "Label": "Diagnosis"
+      },
+      "Levels": {
+        "PD": {
+          "TermURL": "snomed:49049000",
+          "Label": "Parkinson's disease"
+        },
+        "SPD": {
+          "TermURL": "snomed:724761004",
+          "Label": "Sporadic Parkinson disease"
+        },
+        "AD": {
+          "TermURL": "snomed:26929004",
+          "Label": "Alzheimer's disease"
+        }
+      },
+      "MissingValues": [""]
+    }
+  },
+  "sex": {
+    "Description": "Sex variable",
+    "Levels": {
+      "M": "Male",
+      "F": "Female"
+    },
+    "Annotations": {
+      "IsAbout": {
+        "TermURL": "nb:Sex",
+        "Label": "Sex"
+      },
+      "Levels": {
+        "M": {
+          "TermURL": "snomed:248153007",
+          "Label": "Male"
+        },
+        "F": {
+          "TermURL": "snomed:248152002",
+          "Label": "Female"
+        }
+      }
+    }
+  },
+  "participant_age": {
+    "Description": "Age of the participant",
+    "Annotations": {
+      "IsAbout": {
+        "TermURL": "nb:Age",
+        "Label": "Chronological age"
+      },
+      "Transformation": {
+        "TermURL": "nb:FromISO8601",
+        "Label": "A period of time defined according to the ISO8601 standard"
+      }
+    }
+  }
+}
diff --git a/bagel/tests/data/example19.tsv b/bagel/tests/data/example19.tsv
@@ -0,0 +1,4 @@
+participant_id	session_id	group	diagnosis	sex	participant_age
+sub-01	ses-01	PD	SPD	M	P60Y6M
+sub-02	ses-01	CTRL		F	P56Y4M
+sub-03	ses-01	CTRL	AD	F	P60Y6M