Skip to content

Commit

Permalink
Change default action for invalid target data check (#4131)
Browse files: browse the repository at this point in the history
* Change default action for invalid target data check

* update release notes

* update docstring

* test fixes

* expand integration test

* update doc test

* update response

* update tests

* update doc test
  • Loading branch information
ParthivNaresh authored Apr 10, 2023
1 parent 401bf9a commit 6189c62
Show file tree
Hide file tree
Showing 5 changed files with 41 additions and 37 deletions.
2 changes: 1 addition & 1 deletion docs/source/release_notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ Release Notes
* Added calls to ``_handle_nullable_types`` in component fit, transform, and predict methods when needed :pr:`4046`, :pr:`4043`
* Removed existing nullable type handling across AutoMLSearch to just use new handling :pr:`4085`, :pr:`4043`
* Handled nullable type incompatibility in ``Decomposer`` :pr:`4105`, :pr:`4043`
* Removed nullable type incompatibility handling for ARIMA and ExponentialSmoothingRegressor :pr:`4129`
* Changed the default value for ``null_strategy`` in ``InvalidTargetDataCheck`` to ``drop`` :pr:`4131`
* Documentation Changes
* Testing Changes
* Fixed installation of prophet for linux nightly tests :pr:`4114`
Expand Down
9 changes: 5 additions & 4 deletions evalml/data_checks/invalid_target_data_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,14 +38,14 @@ class InvalidTargetDataCheck(DataCheck):
n_unique (int): Number of unique target values to store when problem type is binary and target
incorrectly has more than 2 unique values. Non-negative integer. If None, stores all unique values. Defaults to 100.
null_strategy (str): The type of action option that should be returned if the target is partially null.
The options are `impute` (default) and `drop`.
The options are `impute` and `drop` (default).
`impute` - Will return a `DataCheckActionOption` for imputing the target column.
`drop` - Will return a `DataCheckActionOption` for dropping the null rows in the target column.
"""

multiclass_continuous_threshold = 0.05

def __init__(self, problem_type, objective, n_unique=100, null_strategy="impute"):
def __init__(self, problem_type, objective, n_unique=100, null_strategy="drop"):
self.problem_type = handle_problem_types(problem_type)
self.objective = get_objective(objective)
if n_unique is not None and n_unique <= 0:
Expand Down Expand Up @@ -82,7 +82,7 @@ def validate(self, X, y):
>>> X = pd.DataFrame({"col": [1, 2, 3, 1]})
>>> y = pd.Series(["cat_1", "cat_2", "cat_1", "cat_2"])
>>> target_check = InvalidTargetDataCheck("regression", "R2")
>>> target_check = InvalidTargetDataCheck("regression", "R2", null_strategy="impute")
>>> assert target_check.validate(X, y) == [
... {
... "message": "Target is unsupported Unknown type. Valid Woodwork logical types include: integer, double, boolean, age, age_fractional, integer_nullable, boolean_nullable, age_nullable",
Expand Down Expand Up @@ -125,7 +125,7 @@ def validate(self, X, y):
... "level": "error",
... "details": {
... "columns": None,
... "rows": None,
... "rows": [1, 3],
... "num_null_rows": 2,
... "pct_null_rows": 50.0
... },
Expand Down Expand Up @@ -294,6 +294,7 @@ def _check_target_has_nan(self, y, messages):
details={
"num_null_rows": num_null_rows,
"pct_null_rows": pct_null_rows,
"rows": rows_to_drop,
},
action_options=action_options,
).to_dict(),
Expand Down
36 changes: 8 additions & 28 deletions evalml/tests/data_checks_tests/test_data_checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
DataChecks,
DataCheckWarning,
DateTimeFormatDataCheck,
DCAOParameterType,
DefaultDataChecks,
InvalidTargetDataCheck,
TargetDistributionDataCheck,
Expand All @@ -27,7 +26,6 @@
from evalml.problem_types import (
ProblemTypes,
is_classification,
is_regression,
is_time_series,
)

Expand Down Expand Up @@ -229,24 +227,13 @@ def get_expected_messages(problem_type):
message="1 row(s) (20.0%) of target values are null",
data_check_name="InvalidTargetDataCheck",
message_code=DataCheckMessageCode.TARGET_HAS_NULL,
details={"num_null_rows": 1, "pct_null_rows": 20.0},
details={"num_null_rows": 1, "pct_null_rows": 20.0, "rows": [2]},
action_options=[
DataCheckActionOption(
DataCheckActionCode.IMPUTE_COL,
DataCheckActionCode.DROP_ROWS,
data_check_name="InvalidTargetDataCheck",
parameters={
"impute_strategy": {
"parameter_type": DCAOParameterType.GLOBAL,
"type": "category",
"categories": ["mean", "most_frequent"]
if is_regression(problem_type)
else ["most_frequent"],
"default_value": "mean"
if is_regression(problem_type)
else "most_frequent",
},
},
metadata={"is_target": True},
parameters={},
metadata={"is_target": True, "rows": [2]},
),
],
).to_dict(),
Expand Down Expand Up @@ -475,20 +462,13 @@ def __eq__(self, series_2):
message="1 row(s) (20.0%) of target values are null",
data_check_name="InvalidTargetDataCheck",
message_code=DataCheckMessageCode.TARGET_HAS_NULL,
details={"num_null_rows": 1, "pct_null_rows": 20.0},
details={"num_null_rows": 1, "pct_null_rows": 20.0, "rows": [2]},
action_options=[
DataCheckActionOption(
DataCheckActionCode.IMPUTE_COL,
DataCheckActionCode.DROP_ROWS,
data_check_name="InvalidTargetDataCheck",
parameters={
"impute_strategy": {
"parameter_type": DCAOParameterType.GLOBAL,
"type": "category",
"categories": ["mean", "most_frequent"],
"default_value": "mean",
},
},
metadata={"is_target": True},
parameters={},
metadata={"is_target": True, "rows": [2]},
),
],
).to_dict(),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,7 @@ def test_invalid_target_data_null_strategies(null_strategy):
message="2 row(s) (40.0%) of target values are null",
data_check_name=invalid_targets_data_check_name,
message_code=DataCheckMessageCode.TARGET_HAS_NULL,
details={"num_null_rows": 2, "pct_null_rows": 40.0},
details={"num_null_rows": 2, "pct_null_rows": 40.0, "rows": [0, 3]},
action_options=expected_action_options,
).to_dict(),
]
Expand All @@ -214,6 +214,7 @@ def test_invalid_target_data_input_formats():
invalid_targets_check = InvalidTargetDataCheck(
"binary",
get_default_primary_search_objective("binary"),
null_strategy="impute",
)

# test empty pd.Series
Expand All @@ -233,7 +234,7 @@ def test_invalid_target_data_input_formats():
message="3 row(s) (75.0%) of target values are null",
data_check_name=invalid_targets_data_check_name,
message_code=DataCheckMessageCode.TARGET_HAS_NULL,
details={"num_null_rows": 3, "pct_null_rows": 75},
details={"num_null_rows": 3, "pct_null_rows": 75, "rows": [0, 1, 2]},
action_options=[
DataCheckActionOption(
DataCheckActionCode.IMPUTE_COL,
Expand Down Expand Up @@ -736,13 +737,14 @@ def test_invalid_target_data_check_action_for_data_with_null(
invalid_targets_check = InvalidTargetDataCheck(
problem_type,
get_default_primary_search_objective(problem_type),
null_strategy="impute",
)
expected = [
DataCheckError(
message="3 row(s) (30.0%) of target values are null",
data_check_name=invalid_targets_data_check_name,
message_code=DataCheckMessageCode.TARGET_HAS_NULL,
details={"num_null_rows": 3, "pct_null_rows": 30.0},
details={"num_null_rows": 3, "pct_null_rows": 30.0, "rows": [0, 1, 2]},
action_options=[
DataCheckActionOption(
DataCheckActionCode.IMPUTE_COL,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -242,7 +242,7 @@ def test_data_checks_impute_cols(problem_type):
pd.Series([0, 0.1, 0.2, 0.1, 0.1]),
logical_type="double",
)
data_check = InvalidTargetDataCheck(problem_type, objective)
data_check = InvalidTargetDataCheck(problem_type, objective, null_strategy="impute")
data_checks_output = data_check.validate(None, y)

action_pipeline = make_pipeline_from_data_check_output(
Expand Down Expand Up @@ -297,3 +297,24 @@ def test_data_checks_suggests_drop_rows():
X_t, y_t = action_pipeline.transform(X, y)
assert_frame_equal(X_expected, X_t)
assert_series_equal(y_expected, y_t)

y = pd.Series(np.concatenate([np.tile([0, 1], 49), [np.nan, np.nan]]))

data_check = InvalidTargetDataCheck("binary", "Log Loss Binary")
data_checks_output = data_check.validate(None, y)

action_pipeline = make_pipeline_from_data_check_output("binary", data_checks_output)
assert action_pipeline == BinaryClassificationPipeline(
component_graph={"Drop Rows Transformer": [DropRowsTransformer, "X", "y"]},
parameters={"Drop Rows Transformer": {"indices_to_drop": [98, 99]}},
random_seed=0,
)

X_expected = X.drop([98, 99])
X_expected.ww.init()
y_expected = y.drop([98, 99]).astype("Int64")

action_pipeline.fit(X, y)
X_t, y_t = action_pipeline.transform(X, y)
assert_frame_equal(X_expected, X_t)
assert_series_equal(y_expected, y_t)

0 comments on commit 6189c62

Please sign in to comment.