Change default action for invalid target data check #4131

Merged
merged 10 commits on Apr 10, 2023
2 changes: 1 addition & 1 deletion docs/source/release_notes.rst
@@ -10,7 +10,7 @@ Release Notes
* Added calls to ``_handle_nullable_types`` in component fit, transform, and predict methods when needed :pr:`4046`, :pr:`4043`
* Removed existing nullable type handling across AutoMLSearch to just use new handling :pr:`4085`, :pr:`4043`
* Handled nullable type incompatibility in ``Decomposer`` :pr:`4105`, :pr:`4043`
* Removed nullable type incompatibility handling for ARIMA and ExponentialSmoothingRegressor :pr:`4129`
+ * Changed the default value for ``null_strategy`` in ``InvalidTargetDataCheck`` to ``drop`` :pr:`4131`
* Documentation Changes
* Testing Changes
* Fixed installation of prophet for linux nightly tests :pr:`4114`
9 changes: 5 additions & 4 deletions evalml/data_checks/invalid_target_data_check.py
@@ -38,14 +38,14 @@ class InvalidTargetDataCheck(DataCheck):
n_unique (int): Number of unique target values to store when problem type is binary and target
incorrectly has more than 2 unique values. Non-negative integer. If None, stores all unique values. Defaults to 100.
null_strategy (str): The type of action option that should be returned if the target is partially null.
- The options are `impute` (default) and `drop`.
+ The options are `impute` and `drop` (default).
`impute` - Will return a `DataCheckActionOption` for imputing the target column.
`drop` - Will return a `DataCheckActionOption` for dropping the null rows in the target column.
"""

multiclass_continuous_threshold = 0.05

- def __init__(self, problem_type, objective, n_unique=100, null_strategy="impute"):
+ def __init__(self, problem_type, objective, n_unique=100, null_strategy="drop"):
self.problem_type = handle_problem_types(problem_type)
self.objective = get_objective(objective)
if n_unique is not None and n_unique <= 0:
@@ -82,7 +82,7 @@ def validate(self, X, y):

>>> X = pd.DataFrame({"col": [1, 2, 3, 1]})
>>> y = pd.Series(["cat_1", "cat_2", "cat_1", "cat_2"])
- >>> target_check = InvalidTargetDataCheck("regression", "R2")
+ >>> target_check = InvalidTargetDataCheck("regression", "R2", null_strategy="impute")
Collaborator: it would be nice to keep the default behavior in the docstring. Can you file an issue for it?

Contributor (author): For sure!

Contributor (author): Filed here

Collaborator: cheers!

>>> assert target_check.validate(X, y) == [
... {
... "message": "Target is unsupported Unknown type. Valid Woodwork logical types include: integer, double, boolean, age, age_fractional, integer_nullable, boolean_nullable, age_nullable",
@@ -125,7 +125,7 @@ def validate(self, X, y):
... "level": "error",
... "details": {
... "columns": None,
- ... "rows": None,
+ ... "rows": [1, 3],
... "num_null_rows": 2,
... "pct_null_rows": 50.0
... },
@@ -294,6 +294,7 @@ def _check_target_has_nan(self, y, messages):
details={
"num_null_rows": num_null_rows,
"pct_null_rows": pct_null_rows,
"rows": rows_to_drop,
},
action_options=action_options,
).to_dict(),
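
For context, a minimal sketch of the new default behavior from the caller's side. The message-dict keys ("code", "details", "action_options") follow this PR's doctests, but treat the exact shapes as assumptions rather than a definitive API reference:

import pandas as pd
from evalml.data_checks import DataCheckActionCode, InvalidTargetDataCheck

# Target with nulls at rows 1 and 3, mirroring the doctest above.
X = pd.DataFrame({"col": [1, 2, 3, 4]})
y = pd.Series([0.5, None, 2.5, None])

# null_strategy now defaults to "drop", so no argument is needed.
check = InvalidTargetDataCheck("regression", "R2")
messages = check.validate(X, y)

# The TARGET_HAS_NULL error now records the offending row indices in its
# details, and its action option suggests dropping those rows rather than
# imputing the column.
null_error = next(m for m in messages if m["code"] == "TARGET_HAS_NULL")
assert null_error["details"]["rows"] == [1, 3]
assert null_error["action_options"][0]["code"] == DataCheckActionCode.DROP_ROWS.name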
36 changes: 8 additions & 28 deletions evalml/tests/data_checks_tests/test_data_checks.py
@@ -16,7 +16,6 @@
DataChecks,
DataCheckWarning,
DateTimeFormatDataCheck,
- DCAOParameterType,
DefaultDataChecks,
InvalidTargetDataCheck,
TargetDistributionDataCheck,
@@ -27,7 +26,6 @@
from evalml.problem_types import (
ProblemTypes,
is_classification,
- is_regression,
is_time_series,
)

@@ -229,24 +227,13 @@ def get_expected_messages(problem_type):
message="1 row(s) (20.0%) of target values are null",
data_check_name="InvalidTargetDataCheck",
message_code=DataCheckMessageCode.TARGET_HAS_NULL,
details={"num_null_rows": 1, "pct_null_rows": 20.0},
details={"num_null_rows": 1, "pct_null_rows": 20.0, "rows": [2]},
action_options=[
DataCheckActionOption(
DataCheckActionCode.IMPUTE_COL,
DataCheckActionCode.DROP_ROWS,
data_check_name="InvalidTargetDataCheck",
parameters={

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why are we removing all the params?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is testing the default action which is now drop rows, and the drop rows action doesn't have any of the parameters required by the impute column action

"impute_strategy": {
"parameter_type": DCAOParameterType.GLOBAL,
"type": "category",
"categories": ["mean", "most_frequent"]
if is_regression(problem_type)
else ["most_frequent"],
"default_value": "mean"
if is_regression(problem_type)
else "most_frequent",
},
},
metadata={"is_target": True},
parameters={},
metadata={"is_target": True, "rows": [2]},
),
],
).to_dict(),
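
To make the thread above concrete, here is a sketch contrasting the two action options as this test file constructs them (constructor signatures taken from the diff; the category values shown are the binary-classification case):

from evalml.data_checks import (
    DataCheckActionCode,
    DataCheckActionOption,
    DCAOParameterType,
)

# The old expected action: impute the target column, with a tunable
# impute_strategy parameter.
impute_option = DataCheckActionOption(
    DataCheckActionCode.IMPUTE_COL,
    data_check_name="InvalidTargetDataCheck",
    parameters={
        "impute_strategy": {
            "parameter_type": DCAOParameterType.GLOBAL,
            "type": "category",
            "categories": ["most_frequent"],
            "default_value": "most_frequent",
        },
    },
    metadata={"is_target": True},
)

# The new default action: drop the null rows. It takes no parameters; the
# rows to drop travel in the metadata instead.
drop_option = DataCheckActionOption(
    DataCheckActionCode.DROP_ROWS,
    data_check_name="InvalidTargetDataCheck",
    parameters={},
    metadata={"is_target": True, "rows": [2]},
)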
@@ -475,20 +462,13 @@ def __eq__(self, series_2):
message="1 row(s) (20.0%) of target values are null",
data_check_name="InvalidTargetDataCheck",
message_code=DataCheckMessageCode.TARGET_HAS_NULL,
details={"num_null_rows": 1, "pct_null_rows": 20.0},
details={"num_null_rows": 1, "pct_null_rows": 20.0, "rows": [2]},
action_options=[
DataCheckActionOption(
DataCheckActionCode.IMPUTE_COL,
DataCheckActionCode.DROP_ROWS,
data_check_name="InvalidTargetDataCheck",
parameters={
"impute_strategy": {
"parameter_type": DCAOParameterType.GLOBAL,
"type": "category",
"categories": ["mean", "most_frequent"],
"default_value": "mean",
},
},
metadata={"is_target": True},
parameters={},
metadata={"is_target": True, "rows": [2]},
),
],
).to_dict(),
@@ -198,7 +198,7 @@ def test_invalid_target_data_null_strategies(null_strategy):
message="2 row(s) (40.0%) of target values are null",
data_check_name=invalid_targets_data_check_name,
message_code=DataCheckMessageCode.TARGET_HAS_NULL,
details={"num_null_rows": 2, "pct_null_rows": 40.0},
details={"num_null_rows": 2, "pct_null_rows": 40.0, "rows": [0, 3]},
action_options=expected_action_options,
).to_dict(),
]
@@ -214,6 +214,7 @@ def test_invalid_target_data_input_formats():
invalid_targets_check = InvalidTargetDataCheck(
"binary",
get_default_primary_search_objective("binary"),
+ null_strategy="impute",
)

# test empty pd.Series
@@ -233,7 +234,7 @@ def test_invalid_target_data_input_formats():
message="3 row(s) (75.0%) of target values are null",
data_check_name=invalid_targets_data_check_name,
message_code=DataCheckMessageCode.TARGET_HAS_NULL,
details={"num_null_rows": 3, "pct_null_rows": 75},
details={"num_null_rows": 3, "pct_null_rows": 75, "rows": [0, 1, 2]},
action_options=[
DataCheckActionOption(
DataCheckActionCode.IMPUTE_COL,
@@ -736,13 +737,14 @@ def test_invalid_target_data_check_action_for_data_with_null(
invalid_targets_check = InvalidTargetDataCheck(
problem_type,
get_default_primary_search_objective(problem_type),
+ null_strategy="impute",
)
expected = [
DataCheckError(
message="3 row(s) (30.0%) of target values are null",
data_check_name=invalid_targets_data_check_name,
message_code=DataCheckMessageCode.TARGET_HAS_NULL,
details={"num_null_rows": 3, "pct_null_rows": 30.0},
details={"num_null_rows": 3, "pct_null_rows": 30.0, "rows": [0, 1, 2]},
action_options=[
DataCheckActionOption(
DataCheckActionCode.IMPUTE_COL,
@@ -242,7 +242,7 @@ def test_data_checks_impute_cols(problem_type):
pd.Series([0, 0.1, 0.2, 0.1, 0.1]),
logical_type="double",
)
- data_check = InvalidTargetDataCheck(problem_type, objective)
+ data_check = InvalidTargetDataCheck(problem_type, objective, null_strategy="impute")
data_checks_output = data_check.validate(None, y)

action_pipeline = make_pipeline_from_data_check_output(
@@ -297,3 +297,24 @@ def test_data_checks_suggests_drop_rows():
X_t, y_t = action_pipeline.transform(X, y)
assert_frame_equal(X_expected, X_t)
assert_series_equal(y_expected, y_t)

+ y = pd.Series(np.concatenate([np.tile([0, 1], 49), [np.nan, np.nan]]))
+
+ data_check = InvalidTargetDataCheck("binary", "Log Loss Binary")
+ data_checks_output = data_check.validate(None, y)
+
+ action_pipeline = make_pipeline_from_data_check_output("binary", data_checks_output)
Collaborator: will be interesting to see how we fit these components or "datacheck pipelines" into our search pipelines!
+ assert action_pipeline == BinaryClassificationPipeline(
+ component_graph={"Drop Rows Transformer": [DropRowsTransformer, "X", "y"]},
+ parameters={"Drop Rows Transformer": {"indices_to_drop": [98, 99]}},
+ random_seed=0,
+ )
+
+ X_expected = X.drop([98, 99])
+ X_expected.ww.init()
+ y_expected = y.drop([98, 99]).astype("Int64")
+
+ action_pipeline.fit(X, y)
+ X_t, y_t = action_pipeline.transform(X, y)
+ assert_frame_equal(X_expected, X_t)
+ assert_series_equal(y_expected, y_t)
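
Because this PR flips the default, code that relied on the old imputation suggestion must now opt in explicitly, as several tests above were updated to do. A short sketch of the opt-in, reusing the objective from the test above:

from evalml.data_checks import InvalidTargetDataCheck

# Default behavior after this PR: suggest dropping the null target rows.
drop_check = InvalidTargetDataCheck("binary", "Log Loss Binary")

# Pre-PR behavior must now be requested explicitly.
impute_check = InvalidTargetDataCheck(
    "binary",
    "Log Loss Binary",
    null_strategy="impute",
)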