Change default action for invalid target data check (#4131)

* Change default action for invalid target data check
* update release notes
* update docstring
* test fixes
* expand integration test
* update doc test
* update response
* update tests
* update doc test
alteryx · Apr 10, 2023 · 6189c62
1 parent 401bf9a
commit 6189c62
Show file tree

Hide file tree

Showing 5 changed files with 41 additions and 37 deletions.
diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst
@@ -10,7 +10,7 @@ Release Notes
         * Added calls to ``_handle_nullable_types`` in component fit, transform, and predict methods when needed :pr:`4046`, :pr:`4043`
         * Removed existing nullable type handling across AutoMLSearch to just use new handling :pr:`4085`, :pr:`4043`
         * Handled nullable type incompatibility in ``Decomposer`` :pr:`4105`, :pr:`4043`
-        * Removed nullable type incompatibility handling for ARIMA and ExponentialSmoothingRegressor :pr:`4129`
+        * Changed the default value for ``null_strategy`` in ``InvalidTargetDataCheck`` to ``drop`` :pr:`4131`
     * Documentation Changes
     * Testing Changes
         * Fixed installation of prophet for linux nightly tests :pr:`4114`

diff --git a/evalml/data_checks/invalid_target_data_check.py b/evalml/data_checks/invalid_target_data_check.py
@@ -38,14 +38,14 @@ class InvalidTargetDataCheck(DataCheck):
         n_unique (int): Number of unique target values to store when problem type is binary and target
             incorrectly has more than 2 unique values. Non-negative integer. If None, stores all unique values. Defaults to 100.
         null_strategy (str): The type of action option that should be returned if the target is partially null.
-            The options are `impute` (default) and `drop`.
+            The options are `impute` and `drop` (default).
             `impute` - Will return a `DataCheckActionOption` for imputing the target column.
             `drop` - Will return a `DataCheckActionOption` for dropping the null rows in the target column.
     """
 
     multiclass_continuous_threshold = 0.05
 
-    def __init__(self, problem_type, objective, n_unique=100, null_strategy="impute"):
+    def __init__(self, problem_type, objective, n_unique=100, null_strategy="drop"):
         self.problem_type = handle_problem_types(problem_type)
         self.objective = get_objective(objective)
         if n_unique is not None and n_unique <= 0:
@@ -82,7 +82,7 @@ def validate(self, X, y):
 
             >>> X = pd.DataFrame({"col": [1, 2, 3, 1]})
             >>> y = pd.Series(["cat_1", "cat_2", "cat_1", "cat_2"])
-            >>> target_check = InvalidTargetDataCheck("regression", "R2")
+            >>> target_check = InvalidTargetDataCheck("regression", "R2", null_strategy="impute")
             >>> assert target_check.validate(X, y) == [
             ...     {
             ...         "message": "Target is unsupported Unknown type. Valid Woodwork logical types include: integer, double, boolean, age, age_fractional, integer_nullable, boolean_nullable, age_nullable",
@@ -125,7 +125,7 @@ def validate(self, X, y):
             ...         "level": "error",
             ...         "details": {
             ...             "columns": None,
-            ...             "rows": None,
+            ...             "rows": [1, 3],
             ...             "num_null_rows": 2,
             ...             "pct_null_rows": 50.0
             ...         },
@@ -294,6 +294,7 @@ def _check_target_has_nan(self, y, messages):
                     details={
                         "num_null_rows": num_null_rows,
                         "pct_null_rows": pct_null_rows,
+                        "rows": rows_to_drop,
                     },
                     action_options=action_options,
                 ).to_dict(),

diff --git a/evalml/tests/data_checks_tests/test_data_checks.py b/evalml/tests/data_checks_tests/test_data_checks.py
@@ -16,7 +16,6 @@
     DataChecks,
     DataCheckWarning,
     DateTimeFormatDataCheck,
-    DCAOParameterType,
     DefaultDataChecks,
     InvalidTargetDataCheck,
     TargetDistributionDataCheck,
@@ -27,7 +26,6 @@
 from evalml.problem_types import (
     ProblemTypes,
     is_classification,
-    is_regression,
     is_time_series,
 )
 
@@ -229,24 +227,13 @@ def get_expected_messages(problem_type):
             message="1 row(s) (20.0%) of target values are null",
             data_check_name="InvalidTargetDataCheck",
             message_code=DataCheckMessageCode.TARGET_HAS_NULL,
-            details={"num_null_rows": 1, "pct_null_rows": 20.0},
+            details={"num_null_rows": 1, "pct_null_rows": 20.0, "rows": [2]},
             action_options=[
                 DataCheckActionOption(
-                    DataCheckActionCode.IMPUTE_COL,
+                    DataCheckActionCode.DROP_ROWS,
                     data_check_name="InvalidTargetDataCheck",
-                    parameters={
-                        "impute_strategy": {
-                            "parameter_type": DCAOParameterType.GLOBAL,
-                            "type": "category",
-                            "categories": ["mean", "most_frequent"]
-                            if is_regression(problem_type)
-                            else ["most_frequent"],
-                            "default_value": "mean"
-                            if is_regression(problem_type)
-                            else "most_frequent",
-                        },
-                    },
-                    metadata={"is_target": True},
+                    parameters={},
+                    metadata={"is_target": True, "rows": [2]},
                 ),
             ],
         ).to_dict(),
@@ -475,20 +462,13 @@ def __eq__(self, series_2):
             message="1 row(s) (20.0%) of target values are null",
             data_check_name="InvalidTargetDataCheck",
             message_code=DataCheckMessageCode.TARGET_HAS_NULL,
-            details={"num_null_rows": 1, "pct_null_rows": 20.0},
+            details={"num_null_rows": 1, "pct_null_rows": 20.0, "rows": [2]},
             action_options=[
                 DataCheckActionOption(
-                    DataCheckActionCode.IMPUTE_COL,
+                    DataCheckActionCode.DROP_ROWS,
                     data_check_name="InvalidTargetDataCheck",
-                    parameters={
-                        "impute_strategy": {
-                            "parameter_type": DCAOParameterType.GLOBAL,
-                            "type": "category",
-                            "categories": ["mean", "most_frequent"],
-                            "default_value": "mean",
-                        },
-                    },
-                    metadata={"is_target": True},
+                    parameters={},
+                    metadata={"is_target": True, "rows": [2]},
                 ),
             ],
         ).to_dict(),

diff --git a/evalml/tests/data_checks_tests/test_invalid_target_data_check.py b/evalml/tests/data_checks_tests/test_invalid_target_data_check.py
@@ -198,7 +198,7 @@ def test_invalid_target_data_null_strategies(null_strategy):
             message="2 row(s) (40.0%) of target values are null",
             data_check_name=invalid_targets_data_check_name,
             message_code=DataCheckMessageCode.TARGET_HAS_NULL,
-            details={"num_null_rows": 2, "pct_null_rows": 40.0},
+            details={"num_null_rows": 2, "pct_null_rows": 40.0, "rows": [0, 3]},
             action_options=expected_action_options,
         ).to_dict(),
     ]
@@ -214,6 +214,7 @@ def test_invalid_target_data_input_formats():
     invalid_targets_check = InvalidTargetDataCheck(
         "binary",
         get_default_primary_search_objective("binary"),
+        null_strategy="impute",
     )
 
     # test empty pd.Series
@@ -233,7 +234,7 @@ def test_invalid_target_data_input_formats():
             message="3 row(s) (75.0%) of target values are null",
             data_check_name=invalid_targets_data_check_name,
             message_code=DataCheckMessageCode.TARGET_HAS_NULL,
-            details={"num_null_rows": 3, "pct_null_rows": 75},
+            details={"num_null_rows": 3, "pct_null_rows": 75, "rows": [0, 1, 2]},
             action_options=[
                 DataCheckActionOption(
                     DataCheckActionCode.IMPUTE_COL,
@@ -736,13 +737,14 @@ def test_invalid_target_data_check_action_for_data_with_null(
     invalid_targets_check = InvalidTargetDataCheck(
         problem_type,
         get_default_primary_search_objective(problem_type),
+        null_strategy="impute",
     )
     expected = [
         DataCheckError(
             message="3 row(s) (30.0%) of target values are null",
             data_check_name=invalid_targets_data_check_name,
             message_code=DataCheckMessageCode.TARGET_HAS_NULL,
-            details={"num_null_rows": 3, "pct_null_rows": 30.0},
+            details={"num_null_rows": 3, "pct_null_rows": 30.0, "rows": [0, 1, 2]},
             action_options=[
                 DataCheckActionOption(
                     DataCheckActionCode.IMPUTE_COL,

diff --git a/evalml/tests/integration_tests/test_data_checks_and_actions_integration.py b/evalml/tests/integration_tests/test_data_checks_and_actions_integration.py
@@ -242,7 +242,7 @@ def test_data_checks_impute_cols(problem_type):
             pd.Series([0, 0.1, 0.2, 0.1, 0.1]),
             logical_type="double",
         )
-    data_check = InvalidTargetDataCheck(problem_type, objective)
+    data_check = InvalidTargetDataCheck(problem_type, objective, null_strategy="impute")
     data_checks_output = data_check.validate(None, y)
 
     action_pipeline = make_pipeline_from_data_check_output(
@@ -297,3 +297,24 @@ def test_data_checks_suggests_drop_rows():
     X_t, y_t = action_pipeline.transform(X, y)
     assert_frame_equal(X_expected, X_t)
     assert_series_equal(y_expected, y_t)
+
+    y = pd.Series(np.concatenate([np.tile([0, 1], 49), [np.nan, np.nan]]))
+
+    data_check = InvalidTargetDataCheck("binary", "Log Loss Binary")
+    data_checks_output = data_check.validate(None, y)
+
+    action_pipeline = make_pipeline_from_data_check_output("binary", data_checks_output)
+    assert action_pipeline == BinaryClassificationPipeline(
+        component_graph={"Drop Rows Transformer": [DropRowsTransformer, "X", "y"]},
+        parameters={"Drop Rows Transformer": {"indices_to_drop": [98, 99]}},
+        random_seed=0,
+    )
+
+    X_expected = X.drop([98, 99])
+    X_expected.ww.init()
+    y_expected = y.drop([98, 99]).astype("Int64")
+
+    action_pipeline.fit(X, y)
+    X_t, y_t = action_pipeline.transform(X, y)
+    assert_frame_equal(X_expected, X_t)
+    assert_series_equal(y_expected, y_t)