From e187e8e8ce14fa7cb941678d1b3cd87e25985fdb Mon Sep 17 00:00:00 2001
From: Alyssa Dai <alyssa.ydai@gmail.com>
Date: Mon, 8 Jan 2024 11:22:06 -0500
Subject: [PATCH] [ENH] Handle multi-column attribute annotations (#264)

* update docstrings and type hints for clarity

* add example with multiple diagnosis cols

* add new example with two diagnosis columns

* update test of categorical value transform

* update get_transformed_values to return a list

* add multi-diagnosis example subject with healthy control value

* update transformed value storing for attributes that expect a single value

* test subject level output with multicolumn diagnosis handling

* add comment and update test data README

* add example with multiple columns about age and sex

* add multicolumn annotation examples to bagel pheno smoke test

* add example 20 to README

* remove addressed TODOs

* add and test warning for multiple age/sex columns
---
 bagel/cli.py                    |  22 +++--
 bagel/pheno_utils.py            |  55 ++++++++----
 bagel/tests/data/README.md      |   2 +
 bagel/tests/data/example19.json | 109 ++++++++++++++++++++++++
 bagel/tests/data/example19.tsv  |   4 +
 bagel/tests/data/example20.json | 145 ++++++++++++++++++++++++++++++++
 bagel/tests/data/example20.tsv  |   4 +
 bagel/tests/test_cli_pheno.py   | 108 +++++++++++++++++++++++-
 bagel/tests/test_utility.py     |  25 ++++--
 9 files changed, 439 insertions(+), 35 deletions(-)
 create mode 100644 bagel/tests/data/example19.json
 create mode 100644 bagel/tests/data/example19.tsv
 create mode 100644 bagel/tests/data/example20.json
 create mode 100644 bagel/tests/data/example20.tsv

diff --git a/bagel/cli.py b/bagel/cli.py
index 36772b12..23e25fb5 100644
--- a/bagel/cli.py
+++ b/bagel/cli.py
@@ -112,31 +112,38 @@ def pheno(
             _ses_pheno = session_row
 
             if "sex" in column_mapping.keys():
-                _sex_val = putil.get_transformed_values(
+                _sex_vals = putil.get_transformed_values(
                     column_mapping["sex"], _ses_pheno, data_dictionary
                 )
-                if _sex_val:
-                    session.hasSex = models.Sex(identifier=_sex_val)
+                if _sex_vals:
+                    # NOTE: Our data model only allows a single sex value, so we only take the first instance if multiple columns are about sex
+                    session.hasSex = models.Sex(identifier=_sex_vals[0])
 
             if "diagnosis" in column_mapping.keys():
-                _dx_val = putil.get_transformed_values(
+                _dx_vals = putil.get_transformed_values(
                     column_mapping["diagnosis"], _ses_pheno, data_dictionary
                 )
-                if _dx_val is None:
+                if not _dx_vals:
                     pass
-                elif _dx_val == mappings.NEUROBAGEL["healthy_control"]:
+                # NOTE: If the subject has both a diagnosis value and a value of healthy control, we assume the healthy control designation is more important
+                # and do not assign diagnoses to the subject
+                elif mappings.NEUROBAGEL["healthy_control"] in _dx_vals:
                     session.isSubjectGroup = models.SubjectGroup(
                         identifier=mappings.NEUROBAGEL["healthy_control"],
                     )
                 else:
                     session.hasDiagnosis = [
                         models.Diagnosis(identifier=_dx_val)
+                        for _dx_val in _dx_vals
                     ]
 
             if "age" in column_mapping.keys():
-                session.hasAge = putil.get_transformed_values(
+                _age_vals = putil.get_transformed_values(
                     column_mapping["age"], _ses_pheno, data_dictionary
                 )
+                if _age_vals:
+                    # NOTE: Our data model only allows a single age value, so we only take the first instance if multiple columns are about age
+                    session.hasAge = _age_vals[0]
 
             if tool_mapping:
                 _assessments = [
@@ -288,7 +295,6 @@ def bids(
                 session=session,
             )
 
-            # TODO: needs refactoring once we also handle phenotypic information at the session level
             session_list.append(
                 # Add back "ses" prefix because pybids stripped it
                 models.ImagingSession(
diff --git a/bagel/pheno_utils.py b/bagel/pheno_utils.py
index 2e839804..98cd4010 100644
--- a/bagel/pheno_utils.py
+++ b/bagel/pheno_utils.py
@@ -98,7 +98,7 @@ def generate_context():
 
 def get_columns_about(data_dict: dict, concept: str) -> list:
     """
-    Returns column names that have been annotated as "IsAbout" the desired concept.
+    Returns all column names that have been annotated as "IsAbout" the desired concept.
     Parameters
     ----------
     data_dict: dict
@@ -120,7 +120,12 @@ def get_columns_about(data_dict: dict, concept: str) -> list:
     ]
 
 
-def get_annotated_columns(data_dict: dict) -> list:
+def get_annotated_columns(data_dict: dict) -> list(tuple[str, dict]):
+    """
+    Return a list of all columns that have Neurobagel 'Annotations' in a data dictionary,
+    where each column is represented as a tuple of the column name (dictionary key from the data dictionary) and
+    properties (all dictionary contents from the data dictionary).
+    """
     return [
         (col, content)
         for col, content in data_dict.items()
@@ -130,8 +135,10 @@ def get_annotated_columns(data_dict: dict) -> list:
 
 def map_categories_to_columns(data_dict: dict) -> dict:
     """
-    Maps all pre-defined Neurobagel categories (e.g. "Sex") to a list of column names (if any) that
+    Maps all pre-defined Neurobagel categories (e.g. "Sex") to a list containing all column names (if any) that
     have been linked to this category.
+
+    Returns a dictionary where the keys are the Neurobagel categories and the values are lists of column names.
     """
     return {
         cat_name: get_columns_about(data_dict, cat_iri)
@@ -144,6 +151,8 @@ def map_tools_to_columns(data_dict: dict) -> dict:
     """
     Return a mapping of all assessment tools described in the data dictionary to the columns that
     are mapped to it.
+
+    Returns a dictionary where the keys are the assessment tool IRIs and the values are lists of column names.
     """
     out_dict = defaultdict(list)
     for col, content in get_annotated_columns(data_dict):
@@ -212,31 +221,23 @@ def transform_age(value: str, heuristic: str) -> float:
 
 def get_transformed_values(
     columns: list, row: pd.Series, data_dict: dict
-) -> Union[str, None]:
-    """Convert a raw phenotypic value to the corresponding controlled term"""
-    transf_val = []
-    # TODO: Currently, this function accepts a list of columns + populates a list of transformed values because multiple columns should in theory
-    # be able to be annotated as being about a single Neurobagel concept/variable. However, we don't yet have a proper way to support multiple transformed values
-    # so this function returns just a single value or None.
-    # In future, we need to implement a way to handle cases where more than one column contains information.
-    for col in columns[:1]:
+) -> list:
+    """Convert a list of raw phenotypic values to the corresponding controlled terms, from columns that have not been annotated as being about an assessment tool."""
+    transf_vals = []
+    for col in columns:
         value = row[col]
         if is_missing_value(value, col, data_dict):
             continue
         if is_column_categorical(col, data_dict):
-            transf_val.append(map_cat_val_to_term(value, col, data_dict))
+            transf_vals.append(map_cat_val_to_term(value, col, data_dict))
         else:
             # TODO: replace with more flexible solution when we have more
             # continuous variables than just age
-            transf_val.append(
+            transf_vals.append(
                 transform_age(str(value), get_age_heuristic(col, data_dict))
             )
 
-    # TODO: once we can handle multiple columns, this section should be removed
-    # and we should just return an empty list if no transform can be generated
-    if not transf_val:
-        return None
-    return transf_val[0]
+    return transf_vals
 
 
 # TODO: Check all columns and then return list of offending columns' names
@@ -394,6 +395,24 @@ def validate_data_dict(data_dict: dict) -> None:
             "Please make sure that only one column is annotated for participant and session IDs."
         )
 
+    if (
+        len(get_columns_about(data_dict, concept=mappings.NEUROBAGEL["sex"]))
+        > 1
+    ):
+        warnings.warn(
+            "The provided data dictionary indicates more than one column about sex. "
+            "Neurobagel cannot resolve multiple sex values per subject-session, and so will only consider the first of these columns for sex data."
+        )
+
+    if (
+        len(get_columns_about(data_dict, concept=mappings.NEUROBAGEL["age"]))
+        > 1
+    ):
+        warnings.warn(
+            "The provided data dictionary indicates more than one column about age. "
+            "Neurobagel cannot resolve multiple sex values per subject-session, so will only consider the first of these columns for age data."
+        )
+
     if not categorical_cols_have_bids_levels(data_dict):
         warnings.warn(
             "The data dictionary contains at least one column that looks categorical but lacks a BIDS 'Levels' attribute."
diff --git a/bagel/tests/data/README.md b/bagel/tests/data/README.md
index 3c77a0ed..a83ab40f 100644
--- a/bagel/tests/data/README.md
+++ b/bagel/tests/data/README.md
@@ -23,6 +23,8 @@ Example inputs to the CLI
 | 16           | Invalid, same as example2.csv, but with a sneaky .tsv file ending               | Valid, same as example2                                                              | fail               |
 | 17 | Valid, contains data for three subjects, but no session column | Same as example 2 JSON, without `session_id` column | pass |
 | 18 | Invalid, example2.tsv without `session_id` column, so there are non-unique participant rows | Same as example 2 JSON, without session_id column | fail |
+| 19 | Example with two columns about diagnosis | Valid | pass |
+| 20 | Valid, based on example 19 but contains multiple annotated columns about age and sex | Valid | pass |
 
 `* this is expected to fail until we enable multiple participant_ID handling`.
 
diff --git a/bagel/tests/data/example19.json b/bagel/tests/data/example19.json
new file mode 100644
index 00000000..5e038001
--- /dev/null
+++ b/bagel/tests/data/example19.json
@@ -0,0 +1,109 @@
+{
+  "participant_id": {
+    "Description": "A participant ID",
+    "Annotations": {
+      "IsAbout": {
+        "TermURL": "nb:ParticipantID",
+        "Label": "Unique participant identifier"
+      },
+      "Identifies": "participant"
+    }
+  },
+  "session_id": {
+    "Description": "A session ID",
+    "Annotations": {
+      "IsAbout": {
+        "TermURL": "nb:SessionID",
+        "Label": "Unique session identifier"
+      },
+      "Identifies": "session"
+    }
+  },
+  "group": {
+    "Description": "Group variable",
+    "Levels": {
+      "PD": "Parkinson's disease",
+      "CTRL": "Control subject"
+    },
+    "Annotations": {
+      "IsAbout": {
+        "TermURL": "nb:Diagnosis",
+        "Label": "Diagnosis"
+      },
+      "Levels": {
+        "PD": {
+          "TermURL": "snomed:49049000",
+          "Label": "Parkinson's disease"
+        },
+        "CTRL": {
+          "TermURL": "ncit:C94342",
+          "Label": "Healthy Control"
+        }
+      }
+    }
+  },
+  "diagnosis": {
+    "Description": "Diagnosis at baseline visit",
+    "Levels": {
+      "PD": "Parkinson's disease",
+      "SPD": "Sporadic Parkinson disease"
+    },
+    "Annotations": {
+      "IsAbout": {
+        "TermURL": "nb:Diagnosis",
+        "Label": "Diagnosis"
+      },
+      "Levels": {
+        "PD": {
+          "TermURL": "snomed:49049000",
+          "Label": "Parkinson's disease"
+        },
+        "SPD": {
+          "TermURL": "snomed:724761004",
+          "Label": "Sporadic Parkinson disease"
+        },
+        "AD": {
+          "TermURL": "snomed:26929004",
+          "Label": "Alzheimer's disease"
+        }
+      },
+      "MissingValues": [""]
+    }
+  },
+  "sex": {
+    "Description": "Sex variable",
+    "Levels": {
+      "M": "Male",
+      "F": "Female"
+    },
+    "Annotations": {
+      "IsAbout": {
+        "TermURL": "nb:Sex",
+        "Label": "Sex"
+      },
+      "Levels": {
+        "M": {
+          "TermURL": "snomed:248153007",
+          "Label": "Male"
+        },
+        "F": {
+          "TermURL": "snomed:248152002",
+          "Label": "Female"
+        }
+      }
+    }
+  },
+  "participant_age": {
+    "Description": "Age of the participant",
+    "Annotations": {
+      "IsAbout": {
+        "TermURL": "nb:Age",
+        "Label": "Chronological age"
+      },
+      "Transformation": {
+        "TermURL": "nb:FromISO8601",
+        "Label": "A period of time defined according to the ISO8601 standard"
+      }
+    }
+  }
+}
\ No newline at end of file
diff --git a/bagel/tests/data/example19.tsv b/bagel/tests/data/example19.tsv
new file mode 100644
index 00000000..51312422
--- /dev/null
+++ b/bagel/tests/data/example19.tsv
@@ -0,0 +1,4 @@
+participant_id	session_id	group	diagnosis	sex	participant_age
+sub-01	ses-01	PD	SPD	M	P60Y6M
+sub-02	ses-01	CTRL		F	P56Y4M
+sub-03	ses-01	CTRL	AD	F	P60Y6M
\ No newline at end of file
diff --git a/bagel/tests/data/example20.json b/bagel/tests/data/example20.json
new file mode 100644
index 00000000..e2b19029
--- /dev/null
+++ b/bagel/tests/data/example20.json
@@ -0,0 +1,145 @@
+{
+  "participant_id": {
+    "Description": "A participant ID",
+    "Annotations": {
+      "IsAbout": {
+        "TermURL": "nb:ParticipantID",
+        "Label": "Unique participant identifier"
+      },
+      "Identifies": "participant"
+    }
+  },
+  "session_id": {
+    "Description": "A session ID",
+    "Annotations": {
+      "IsAbout": {
+        "TermURL": "nb:SessionID",
+        "Label": "Unique session identifier"
+      },
+      "Identifies": "session"
+    }
+  },
+  "group": {
+    "Description": "Group variable",
+    "Levels": {
+      "PD": "Parkinson's disease",
+      "CTRL": "Control subject"
+    },
+    "Annotations": {
+      "IsAbout": {
+        "TermURL": "nb:Diagnosis",
+        "Label": "Diagnosis"
+      },
+      "Levels": {
+        "PD": {
+          "TermURL": "snomed:49049000",
+          "Label": "Parkinson's disease"
+        },
+        "CTRL": {
+          "TermURL": "ncit:C94342",
+          "Label": "Healthy Control"
+        }
+      }
+    }
+  },
+  "diagnosis": {
+    "Description": "Diagnosis at baseline visit",
+    "Levels": {
+      "PD": "Parkinson's disease",
+      "SPD": "Sporadic Parkinson disease"
+    },
+    "Annotations": {
+      "IsAbout": {
+        "TermURL": "nb:Diagnosis",
+        "Label": "Diagnosis"
+      },
+      "Levels": {
+        "PD": {
+          "TermURL": "snomed:49049000",
+          "Label": "Parkinson's disease"
+        },
+        "SPD": {
+          "TermURL": "snomed:724761004",
+          "Label": "Sporadic Parkinson disease"
+        },
+        "AD": {
+          "TermURL": "snomed:26929004",
+          "Label": "Alzheimer's disease"
+        }
+      },
+      "MissingValues": [""]
+    }
+  },
+  "sex": {
+    "Description": "Sex variable",
+    "Levels": {
+      "M": "Male",
+      "F": "Female"
+    },
+    "Annotations": {
+      "IsAbout": {
+        "TermURL": "nb:Sex",
+        "Label": "Sex"
+      },
+      "Levels": {
+        "M": {
+          "TermURL": "snomed:248153007",
+          "Label": "Male"
+        },
+        "F": {
+          "TermURL": "snomed:248152002",
+          "Label": "Female"
+        }
+      }
+    }
+  },
+  "sex_full": {
+    "Description": "Sex variable",
+    "Levels": {
+      "Male": "Male",
+      "Female": "Female"
+    },
+    "Annotations": {
+      "IsAbout": {
+        "TermURL": "nb:Sex",
+        "Label": "Sex"
+      },
+      "Levels": {
+        "Male": {
+          "TermURL": "snomed:248153007",
+          "Label": "Male"
+        },
+        "Female": {
+          "TermURL": "snomed:248152002",
+          "Label": "Female"
+        }
+      }
+    }
+  },
+  "participant_age": {
+    "Description": "Age of the participant",
+    "Annotations": {
+      "IsAbout": {
+        "TermURL": "nb:Age",
+        "Label": "Chronological age"
+      },
+      "Transformation": {
+        "TermURL": "nb:FromISO8601",
+        "Label": "A period of time defined according to the ISO8601 standard"
+      }
+    }
+  },
+  "participant_age_float": {
+    "Description": "Age of the participant",
+    "Annotations": {
+      "IsAbout": {
+        "TermURL": "nb:Age",
+        "Label": "Chronological age"
+      },
+      "Transformation": {
+        "TermURL": "nb:FromFloat",
+        "Label": "Age as a floating point number"
+      }
+    }
+  }
+}
\ No newline at end of file
diff --git a/bagel/tests/data/example20.tsv b/bagel/tests/data/example20.tsv
new file mode 100644
index 00000000..32028f6c
--- /dev/null
+++ b/bagel/tests/data/example20.tsv
@@ -0,0 +1,4 @@
+participant_id	session_id	group	diagnosis	sex	sex_full	participant_age	participant_age_float
+sub-01	ses-01	PD	SPD	M	Male	P60Y6M	60.5
+sub-02	ses-01	CTRL		F	Female	P56Y4M	56.3
+sub-03	ses-01	CTRL	AD	F	Female	P60Y6M	60.5
\ No newline at end of file
diff --git a/bagel/tests/test_cli_pheno.py b/bagel/tests/test_cli_pheno.py
index c98f633b..315f8cf2 100644
--- a/bagel/tests/test_cli_pheno.py
+++ b/bagel/tests/test_cli_pheno.py
@@ -22,6 +22,8 @@ def default_pheno_output_path(tmp_path):
         "example14",
         "example_synthetic",
         "example17",
+        "example19",
+        "example20",
     ],
 )
 def test_pheno_valid_inputs_run_successfully(
@@ -187,6 +189,38 @@ def test_invalid_portal_uris_produces_error(
     )
 
 
+def test_multiple_age_or_sex_columns_raises_warning(
+    runner,
+    test_data,
+    default_pheno_output_path,
+):
+    """Test that an informative warning is raised when multiple columns in the phenotypic file have been annotated as being about age or sex."""
+    with pytest.warns(UserWarning) as w:
+        runner.invoke(
+            bagel,
+            [
+                "pheno",
+                "--pheno",
+                test_data / "example20.tsv",
+                "--dictionary",
+                test_data / "example20.json",
+                "--output",
+                default_pheno_output_path,
+                "--name",
+                "Multiple age/sex columns dataset",
+            ],
+            catch_exceptions=False,
+        )
+
+    assert len(w) == 2
+    warnings = [warning.message.args[0] for warning in w]
+    for warn_substring in [
+        "more than one column about age",
+        "more than one column about sex",
+    ]:
+        assert [any(warn_substring in warning_str for warning_str in warnings)]
+
+
 def test_missing_bids_levels_raises_warning(
     runner,
     test_data,
@@ -713,7 +747,7 @@ def test_pheno_sessions_have_correct_labels(
 def test_pheno_session_created_for_missing_session_column(
     runner,
     test_data,
-    tmp_path,
+    default_pheno_output_path,
     load_test_json,
 ):
     """
@@ -731,12 +765,80 @@ def test_pheno_session_created_for_missing_session_column(
             "--name",
             "Missing session column dataset",
             "--output",
-            tmp_path / "example_synthetic.jsonld",
+            default_pheno_output_path,
         ],
     )
 
-    pheno = load_test_json(tmp_path / "example_synthetic.jsonld")
+    pheno = load_test_json(default_pheno_output_path)
     for sub in pheno["hasSamples"]:
         assert 1 == len(sub["hasSession"])
         assert sub["hasSession"][0]["schemaKey"] == "PhenotypicSession"
         assert sub["hasSession"][0]["hasLabel"] == "ses-nb01"
+
+
+def test_multicolumn_diagnosis_annot_is_handled(
+    runner,
+    test_data,
+    default_pheno_output_path,
+    load_test_json,
+):
+    """Test that when a subject has a non-healthy control diagnosis across multiple columns, they are all correctly parsed and stored as part of the subject's data."""
+    runner.invoke(
+        bagel,
+        [
+            "pheno",
+            "--pheno",
+            test_data / "example19.tsv",
+            "--dictionary",
+            test_data / "example19.json",
+            "--name",
+            "Multi-column annotation dataset",
+            "--output",
+            default_pheno_output_path,
+        ],
+    )
+
+    pheno = load_test_json(default_pheno_output_path)
+    # Check the subject with only disease diagnoses
+    sub_01_diagnoses = [
+        diagnosis["identifier"]
+        for diagnosis in pheno["hasSamples"][0]["hasSession"][0][
+            "hasDiagnosis"
+        ]
+    ]
+    assert sub_01_diagnoses == ["snomed:49049000", "snomed:724761004"]
+
+
+@pytest.mark.parametrize("sub_idx", [1, 2])
+def test_multicolumn_diagnosis_annot_with_healthy_control_is_handled(
+    runner, test_data, default_pheno_output_path, load_test_json, sub_idx
+):
+    """
+    Test that when there are multiple columns about diagnosis and a subject has a healthy control status in one column,
+    the healthy control status is used and any other diagnoses are ignored.
+    """
+    runner.invoke(
+        bagel,
+        [
+            "pheno",
+            "--pheno",
+            test_data / "example19.tsv",
+            "--dictionary",
+            test_data / "example19.json",
+            "--name",
+            "Multi-column annotation dataset",
+            "--output",
+            default_pheno_output_path,
+        ],
+    )
+
+    pheno = load_test_json(default_pheno_output_path)
+    sub_with_healthy_control_annotation = pheno["hasSamples"][sub_idx][
+        "hasSession"
+    ][0]
+
+    assert "hasDiagnosis" not in sub_with_healthy_control_annotation.keys()
+    assert (
+        sub_with_healthy_control_annotation["isSubjectGroup"]["identifier"]
+        == "ncit:C94342"
+    )
diff --git a/bagel/tests/test_utility.py b/bagel/tests/test_utility.py
index a1602d38..a3d004ba 100644
--- a/bagel/tests/test_utility.py
+++ b/bagel/tests/test_utility.py
@@ -145,13 +145,26 @@ def test_map_tools_to_columns(test_data, load_test_json, tool, columns):
     assert result[tool] == columns
 
 
-def test_get_transformed_categorical_value(test_data, load_test_json):
-    """Test that the correct transformed value is returned for a categorical variable"""
-    data_dict = load_test_json(test_data / "example2.json")
-    pheno = pd.read_csv(test_data / "example2.tsv", sep="\t")
+@pytest.mark.parametrize(
+    "example, column_list, expected_values",
+    [
+        ("example2", ["sex"], ["snomed:248153007"]),
+        (
+            "example19",
+            ["group", "diagnosis"],
+            ["snomed:49049000", "snomed:724761004"],
+        ),
+    ],
+)
+def test_get_transformed_categorical_values(
+    test_data, load_test_json, example, column_list, expected_values
+):
+    """Test that the correct transformed values are returned for a categorical variable"""
+    data_dict = load_test_json(test_data / f"{example}.json")
+    pheno = pd.read_csv(test_data / f"{example}.tsv", sep="\t")
 
-    assert "snomed:248153007" == putil.get_transformed_values(
-        columns=["sex"],
+    assert expected_values == putil.get_transformed_values(
+        columns=column_list,
         row=pheno.iloc[0],
         data_dict=data_dict,
     )