[MNT] Configure mypy in pre-commit for type checking #423

Merged: 6 commits (Jan 25, 2025)

11 changes: 11 additions & 0 deletions .pre-commit-config.yaml
@@ -41,3 +41,14 @@ repos:
       - id: codespell
         additional_dependencies:
           - tomli
+
+  - repo: https://github.com/pre-commit/mirrors-mypy
+    rev: v1.14.1
+    hooks:
+      - id: mypy
+        args:
+          - --ignore-missing-imports
+          - --no-warn-no-return
+          - --warn-redundant-casts
+          - --show-error-codes
+          - --pretty
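
The hook pins mypy 1.14.1 and starts from a deliberately lenient baseline: `--ignore-missing-imports` silences errors from untyped third-party packages, `--no-warn-no-return` allows code paths that implicitly return None, `--warn-redundant-casts` flags casts that change nothing, and `--show-error-codes` plus `--pretty` make the output easier to act on. A minimal sketch (not from this repo) of what two of these flags control:

```python
from typing import Optional, cast

def parse_age(value: str) -> Optional[float]:
    if value.replace(".", "", 1).isdigit():
        return float(value)
    # Falls off the end here and implicitly returns None. By default mypy
    # reports "Missing return statement"; --no-warn-no-return suppresses it.

age = cast(str, "32.5")  # --warn-redundant-casts: "32.5" is already a str [redundant-cast]
```
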
16 changes: 7 additions & 9 deletions bagel/cli.py
@@ -111,7 +111,7 @@ def pheno(
     tool_mapping = pheno_utils.map_tools_to_columns(data_dictionary)

     # TODO: needs refactoring once we handle multiple participant IDs
-    participants = column_mapping.get("participant")[0]
+    participants = column_mapping["participant"][0]

     # Note that `session_column will be None if there is no session column in the pheno.tsv
     session_column = column_mapping.get("session")
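
The `.get()` to `[...]` swaps in this file (here and in the hunks below) are the kind of change mypy forces: `dict.get()` is typed as returning an Optional value, so subscripting its result is an `[index]` error, while `d[key]` keeps the full value type and raises `KeyError` at runtime only if the key is missing, which the CLI's earlier validation presumably rules out. A small sketch of the difference (names hypothetical):

```python
column_mapping: dict[str, list[str]] = {"participant": ["participant_id"]}

maybe_cols = column_mapping.get("participant")  # inferred: list[str] | None
first = maybe_cols[0]  # mypy error [index]: the value may be None

cols = column_mapping["participant"]  # inferred: list[str]
first = cols[0]  # type-checks cleanly
```
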
@@ -273,12 +273,14 @@ def bids(
     print("Initial checks of inputs passed.\n")

     print("Parsing and validating BIDS dataset. This may take a while...")
+    # NOTE: If there are no subjects in the BIDS dataset, the validation should fail.
+    # The rest of this workflow assumes there's at least one subject in the BIDS dataset.
     layout = BIDSLayout(bids_dir, validate=True)
     print("BIDS parsing completed.\n")

     print("Merging BIDS metadata with existing subject annotations...\n")
     for bids_sub_id in layout.get_subjects():
-        existing_subject = existing_subs_dict.get(f"sub-{bids_sub_id}")
+        existing_subject = existing_subs_dict[f"sub-{bids_sub_id}"]
         existing_sessions_dict = model_utils.get_imaging_session_instances(
             existing_subject
         )
@@ -323,9 +325,7 @@ def bids(
             # If a custom Neurobagel-created session already exists (if `bagel derivatives` was run first),
             # we add to that session when there is no session layer in the BIDS directory
             if session_label in existing_sessions_dict:
-                existing_img_session = existing_sessions_dict.get(
-                    session_label
-                )
+                existing_img_session = existing_sessions_dict[session_label]
                 existing_img_session.hasAcquisition = image_list
                 existing_img_session.hasFilePath = session_path
             else:
@@ -434,7 +434,7 @@ def derivatives(
     for subject, sub_proc_df in status_df.groupby(
         PROC_STATUS_COLS["participant"]
     ):
-        existing_subject = existing_subs_dict.get(subject)
+        existing_subject = existing_subs_dict[subject]

         # Note: Dictionary of existing imaging sessions can be empty if only bagel pheno was run
         existing_sessions_dict = model_utils.get_imaging_session_instances(
@@ -455,9 +455,7 @@
             CUSTOM_SESSION_LABEL if session_label == "" else session_label
         )
         if session_label in existing_sessions_dict:
-            existing_img_session = existing_sessions_dict.get(
-                session_label
-            )
+            existing_img_session = existing_sessions_dict[session_label]
             existing_img_session.hasCompletedPipeline = completed_pipelines
         else:
             new_img_session = models.ImagingSession(
6 changes: 3 additions & 3 deletions bagel/utilities/file_utils.py
@@ -1,6 +1,6 @@
 import json
 from pathlib import Path
-from typing import Optional
+from typing import Any

 import pandas as pd
 import typer
@@ -18,7 +18,7 @@ def file_encoding_error_message(input_p: Path) -> str:

 def load_tabular(
     input_p: Path, input_type: str = "phenotypic"
-) -> Optional[pd.DataFrame]:
+) -> pd.DataFrame:
     """Load a .tsv pheno file and do some basic validation of the file type."""
     if input_p.suffix == ".tsv":
         try:
@@ -61,7 +61,7 @@ def load_tabular(
         )


-def load_json(input_p: Path) -> dict:
+def load_json(input_p: Path) -> Any:
     """Load a user-specified json type file."""
     try:
         with open(input_p, "r", encoding="utf-8") as f:
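
Loosening `load_json` from `-> dict` to `-> Any` matches what the standard library actually promises: `json.load()` is typed in typeshed as returning `Any`, since a JSON document's top level may be an object, array, string, or number. A simplified sketch of the idea, without this module's error handling:

```python
import json
from pathlib import Path
from typing import Any

def load_json_sketch(input_p: Path) -> Any:
    # The file may legitimately hold a top-level list rather than a dict,
    # so Any is the honest return type here.
    with open(input_p, "r", encoding="utf-8") as f:
        return json.load(f)
```
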
6 changes: 4 additions & 2 deletions bagel/utilities/model_utils.py
@@ -97,7 +97,9 @@ def extract_and_validate_jsonld_dataset(file_path: Path) -> models.Dataset:
     return jsonld_dataset


-def get_subject_instances(dataset: models.Dataset) -> dict:
+def get_subject_instances(
+    dataset: models.Dataset,
+) -> dict[str, models.Subject]:
     """
     Return a dictionary of subjects for a given Neurobagel dataset from JSONLD data,
     where keys are subject labels and values are the subject objects.
@@ -109,7 +111,7 @@ def get_subject_instances(dataset: models.Dataset) -> dict:

 def get_imaging_session_instances(
     jsonld_subject: models.Subject,
-) -> dict:
+) -> dict[str, models.ImagingSession]:
     """
     Return a dictionary of imaging sessions for a given subject from JSONLD data,
     where the keys are the session labels and values are the session objects.
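
Parameterizing these return types is what makes the callers in cli.py checkable: a bare `dict` hands back `Any` values, whereas `dict[str, models.Subject]` lets mypy verify attribute access on whatever is pulled out of the mapping. A toy illustration with a stand-in class (not the real `models` module):

```python
from dataclasses import dataclass

@dataclass
class Subject:
    hasLabel: str

def get_subject_instances_sketch(subjects: list[Subject]) -> dict[str, Subject]:
    # Keys are subject labels, values are the subject objects.
    return {s.hasLabel: s for s in subjects}

subs = get_subject_instances_sketch([Subject(hasLabel="sub-01")])
for label, subject in subs.items():
    print(label.upper(), subject.hasLabel)  # mypy knows: str keys, Subject values
```
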
22 changes: 9 additions & 13 deletions bagel/utilities/pheno_utils.py
@@ -128,7 +128,7 @@ def find_deprecated_namespaces(namespaces: list) -> list:
     return [ns for ns in namespaces if ns in DEPRECATED_NAMESPACE_PREFIXES]


-def map_categories_to_columns(data_dict: dict) -> dict:
+def map_categories_to_columns(data_dict: dict) -> dict[str, list]:
     """
     Maps all pre-defined Neurobagel categories (e.g. "Sex") to a list containing all column names (if any) that
     have been linked to this category.
@@ -142,7 +142,7 @@ def map_categories_to_columns(data_dict: dict) -> dict:
     }


-def map_tools_to_columns(data_dict: dict) -> dict:
+def map_tools_to_columns(data_dict: dict) -> dict[str, list]:
     """
     Return a mapping of all assessment tools described in the data dictionary to the columns that
     are mapped to it.
@@ -184,7 +184,6 @@ def get_age_heuristic(column: str, data_dict: dict) -> str:


 def transform_age(value: str, heuristic: str) -> float:
-    is_recognized_heuristic = True
     try:
         if heuristic in [
             AGE_HEURISTICS["float"],
@@ -200,26 +199,23 @@
             value = "P" + value
             duration = isodate.parse_duration(value)
             return float(duration.years + duration.months / 12)
         else:
-            is_recognized_heuristic = False
+            raise ValueError(
+                f"The provided data dictionary contains an unrecognized age transformation: {heuristic}. "
+                f"Ensure that the transformation TermURL is one of {list(AGE_HEURISTICS.values())}."
+            )
     except (ValueError, isodate.isoerror.ISO8601Error) as e:
         raise ValueError(
             f"There was a problem with applying the age transformation: {heuristic}. Error: {str(e)}\n"
             f"Check that the transformation specified in the data dictionary ({heuristic}) is correct for the age values in your phenotypic file, "
             "and that you correctly annotated any missing values in your age column."
         ) from e
-    if not is_recognized_heuristic:
-        raise ValueError(
-            f"The provided data dictionary contains an unrecognized age transformation: {heuristic}. "
-            f"Ensure that the transformation TermURL is one of {list(AGE_HEURISTICS.values())}."
-        )


 def get_transformed_values(
     columns: list, row: pd.Series, data_dict: dict
 ) -> list:
     """Convert a list of raw phenotypic values to the corresponding controlled terms, from columns that have not been annotated as being about an assessment tool."""
-    transf_vals = []
+    transf_vals: list[float | str] = []
     for col in columns:
         value = row[col]
         if is_missing_value(value, col, data_dict):
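
Two typing patterns appear in this hunk. In `transform_age`, the unrecognized-heuristic error is now raised directly in the `else` branch rather than deferred through a flag variable, so every branch visibly returns or raises and the duplicated `raise` after the `try`/`except` disappears. In `get_transformed_values`, the empty-list accumulator gets an explicit element type, since `[]` alone tells mypy nothing about what the list will hold. A condensed sketch of both (heuristic names shortened for illustration):

```python
def transform_sketch(value: str, heuristic: str) -> float:
    # Every branch returns or raises, so no flag variable is needed
    # and mypy can see the function never falls off the end.
    if heuristic == "float":
        return float(value)
    elif heuristic == "int":
        return float(int(value))
    else:
        raise ValueError(f"unrecognized age transformation: {heuristic}")

# Annotating the empty accumulator up front lets it hold both floats and
# strings; mypy would otherwise pin the list to the first element type it sees.
transf_vals: list[float | str] = []
transf_vals.append(transform_sketch("32", "int"))
transf_vals.append("missing")
```
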
@@ -290,7 +286,7 @@ def are_inputs_compatible(data_dict: dict, pheno_df: pd.DataFrame) -> bool:

 def find_undefined_cat_col_values(
     data_dict: dict, pheno_df: pd.DataFrame
-) -> dict:
+) -> dict[str, list]:
     """
     Checks that all categorical column values have annotations. Returns a dictionary containing
     any categorical column names and specific column values not defined in the corresponding data
@@ -314,7 +310,7 @@

 def find_unused_missing_values(
     data_dict: dict, pheno_df: pd.DataFrame
-) -> dict:
+) -> dict[str, list]:
     """
     Checks if missing values annotated in the data dictionary appear at least once in the phenotypic file.
     Returns a dictionary containing any column names and annotated missing values not found in the phenotypic
14 changes: 9 additions & 5 deletions tests/unit/test_derivative_utils.py
@@ -41,7 +41,7 @@ def test_raises_exception_if_remote_and_local_pipeline_catalog_fails(
     monkeypatch, tmp_path
 ):
     """
-    If I cannot get the pipeline catalog from the remote location and the local backup, I should raise an exception.
+    If I cannot get the pipeline catalog from both the remote location and the local backup, I should raise an exception.
     """
     nonsense_url = "https://does.not.exist.url"

@@ -59,11 +59,15 @@ def mock_httpx_get(*args, **kwargs):

     monkeypatch.setattr(httpx, "get", mock_httpx_get)

-    with pytest.raises(FileNotFoundError) as e:
-        mappings.get_pipeline_catalog(
-            url=nonsense_url, path=tmp_path / "does_not_exist.json"
-        )
+    with pytest.warns(
+        UserWarning, match="Unable to download pipeline catalog"
+    ) as w:
+        with pytest.raises(FileNotFoundError) as e:
+            mappings.get_pipeline_catalog(
+                url=nonsense_url, path=tmp_path / "does_not_exist.json"
+            )
+
+    assert len(w) == 1
     assert "Have you correctly initialized the submodules" in str(e.value)
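
The reworked test now asserts the user-facing warning as well as the exception. `pytest.warns` and `pytest.raises` nest cleanly, and both capture objects remain usable after their `with` blocks exit. A self-contained version of the pattern, with a stand-in for the real `mappings.get_pipeline_catalog`:

```python
import warnings

import pytest

def fetch_catalog() -> None:
    # Stand-in downloader: warn about the failed remote fetch, then fail
    # on the missing local backup.
    warnings.warn("Unable to download pipeline catalog", UserWarning)
    raise FileNotFoundError("Have you correctly initialized the submodules?")

def test_warns_then_raises():
    with pytest.warns(UserWarning, match="Unable to download pipeline catalog") as w:
        with pytest.raises(FileNotFoundError) as e:
            fetch_catalog()

    assert len(w) == 1
    assert "submodules" in str(e.value)
```
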