diff --git a/CHANGELOG.md b/CHANGELOG.md
index fa97ef948..25f8e741c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -42,7 +42,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Optimize memory usage in `TFTNativeModel` by eliminating copying during making samples ([#494](https://github.com/etna-team/etna/pull/494))
 - Optimize memory usage in `DeepStateModel` and `DeepARNativeModel` by eliminating copying during making samples ([#499](https://github.com/etna-team/etna/pull/499))
 - Fix working with NaN target in `MeanEncoderTransform` ([#492](https://github.com/etna-team/etna/pull/492))
--
+- Fix `target` leakage in `MeanSegmentEncoderTransform` ([#503](https://github.com/etna-team/etna/pull/503))
 -
 -
 -
diff --git a/etna/transforms/encoders/mean_segment_encoder.py b/etna/transforms/encoders/mean_segment_encoder.py
index e1ed2bf76..c5037e077 100644
--- a/etna/transforms/encoders/mean_segment_encoder.py
+++ b/etna/transforms/encoders/mean_segment_encoder.py
@@ -1,24 +1,34 @@
-import reprlib
-from typing import Dict
 from typing import List
-from typing import Optional
 
 import numpy as np
 import pandas as pd
 
 from etna.transforms import IrreversibleTransform
-from etna.transforms.math.statistics import MeanTransform
+from etna.transforms.encoders.mean_encoder import MeanEncoderTransform
 
 
 class MeanSegmentEncoderTransform(IrreversibleTransform):
     """Makes expanding mean target encoding of the segment. Creates column 'segment_mean'."""
 
-    idx = pd.IndexSlice
+    _segment_column = "segment_column"
+    out_column = "segment_mean"
 
     def __init__(self):
         super().__init__(required_features=["target"])
-        self.mean_encoder = MeanTransform(in_column="target", window=-1, out_column="segment_mean")
-        self.global_means: Optional[Dict[str, float]] = None
+        self._mean_encoder = MeanEncoderTransform(
+            in_column=self._segment_column, mode="per-segment", out_column=self.out_column, smoothing=0
+        )
+
+    def _add_segment_column(self, df):
+        segments = df.columns.get_level_values("segment").unique()
+        flatten_segments = np.repeat(segments.values[np.newaxis, :], len(df), axis=0)
+        segment_values = pd.DataFrame(
+            data=flatten_segments,
+            columns=pd.MultiIndex.from_product([segments, [self._segment_column]]),
+            index=df.index,
+        )
+        df = pd.concat([df, segment_values], axis=1).sort_index(axis=1)
+        return df
 
     def _fit(self, df: pd.DataFrame) -> "MeanSegmentEncoderTransform":
         """
@@ -34,10 +44,8 @@ def _fit(self, df: pd.DataFrame) -> "MeanSegmentEncoderTransform":
         :
             Fitted transform
         """
-        self.mean_encoder._fit(df)
-        mean_values = df.loc[:, self.idx[:, "target"]].mean().to_dict()
-        mean_values = {key[0]: value for key, value in mean_values.items()}
-        self.global_means = mean_values
+        df = self._add_segment_column(df)
+        self._mean_encoder._fit(df)
         return self
 
     def _transform(self, df: pd.DataFrame) -> pd.DataFrame:
@@ -61,25 +69,11 @@ def _transform(self, df: pd.DataFrame) -> pd.DataFrame:
         NotImplementedError:
             If there are segments that weren't present during training.
         """
-        if self.global_means is None:
-            raise ValueError("The transform isn't fitted!")
-
-        segments = df.columns.get_level_values("segment").unique().tolist()
-        new_segments = set(segments) - self.global_means.keys()
-        if len(new_segments) > 0:
-            raise NotImplementedError(
-                f"This transform can't process segments that weren't present on train data: {reprlib.repr(new_segments)}"
-            )
-
-        df = self.mean_encoder._transform(df)
-        segment = segments[0]
-        nan_timestamps = df[df.loc[:, self.idx[segment, "target"]].isna()].index
-        values_to_set = np.array([self.global_means[x] for x in segments])
-        # repetition isn't necessary for pandas >= 1.2
-        values_to_set = np.repeat(values_to_set[np.newaxis, :], len(nan_timestamps), axis=0)
-        df.loc[nan_timestamps, self.idx[:, "segment_mean"]] = values_to_set
-        return df
+        df = self._add_segment_column(df)
+        df_transformed = self._mean_encoder._transform(df)
+        df_transformed = df_transformed.drop(columns=[self._segment_column], level="feature")
+        return df_transformed
 
     def get_regressors_info(self) -> List[str]:
         """Return the list with regressors created by the transform."""
-        return ["segment_mean"]
+        return [self.out_column]
diff --git a/tests/test_transforms/test_encoders/conftest.py b/tests/test_transforms/test_encoders/conftest.py
index ec4a95e67..4b3b0c8d3 100644
--- a/tests/test_transforms/test_encoders/conftest.py
+++ b/tests/test_transforms/test_encoders/conftest.py
@@ -1,38 +1,34 @@
 import numpy as np
-import pandas as pd
 import pytest
 
 from etna.datasets import TSDataset
+from etna.datasets import generate_ar_df
 
 
 @pytest.fixture
-def simple_ts() -> TSDataset:
-    df_1 = pd.DataFrame.from_dict({"timestamp": pd.date_range("2021-06-01", "2021-06-07", freq="D")})
-    df_2 = pd.DataFrame.from_dict({"timestamp": pd.date_range("2021-06-01", "2021-06-07", freq="D")})
-    df_1["segment"] = "Moscow"
-    df_1["target"] = [1.0, 2.0, 3.0, 4.0, 5.0, np.NAN, np.NAN]
-    df_1["exog"] = [6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0]
-    df_2["segment"] = "Omsk"
-    df_2["target"] = [10.0, 20.0, 30.0, 40.0, 50.0, np.NAN, np.NAN]
-    df_2["exog"] = [60.0, 70.0, 80.0, 90.0, 100.0, 110.0, 120.0]
-    classic_df = pd.concat([df_1, df_2], ignore_index=True)
-    df = TSDataset.to_dataset(classic_df)
-    ts = TSDataset(df, freq="D")
+def mean_segment_encoder_ts() -> TSDataset:
+    df = generate_ar_df(n_segments=2, start_time="2001-01-01", periods=5)
+    df["target"] = [0.0, 1.0, np.NaN, 3.0, 4.0] + [np.NaN, 1.0, 2.0, 3.0, 4.0]
+
+    ts = TSDataset(df=df, freq="D")
+    return ts
+
+
+@pytest.fixture
+def expected_mean_segment_encoder_ts() -> TSDataset:
+    df = generate_ar_df(n_segments=2, start_time="2001-01-01", periods=5)
+    df["target"] = [0.0, 1.0, np.NaN, 3.0, 4.0] + [np.NaN, 1.0, 2.0, 3.0, 4.0]
+    df["segment_mean"] = [np.NaN, 0, 0.5, 0.5, 1.33] + [np.NaN, np.NaN, 1, 1.5, 2.0]
+
+    ts = TSDataset(df=df, freq="D")
     return ts
 
 
 @pytest.fixture
-def transformed_simple_df() -> pd.DataFrame:
-    df_1 = pd.DataFrame.from_dict({"timestamp": pd.date_range("2021-06-01", "2021-06-07", freq="D")})
-    df_2 = pd.DataFrame.from_dict({"timestamp": pd.date_range("2021-06-01", "2021-06-07", freq="D")})
-    df_1["segment"] = "Moscow"
-    df_1["target"] = [1.0, 2.0, 3.0, 4.0, 5.0, np.NAN, np.NAN]
-    df_1["exog"] = [6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0]
-    df_1["segment_mean"] = [1, 1.5, 2, 2.5, 3, 3, 3]
-    df_2["segment"] = "Omsk"
-    df_2["target"] = [10.0, 20.0, 30.0, 40.0, 50.0, np.NAN, np.NAN]
-    df_2["exog"] = [60.0, 70.0, 80.0, 90.0, 100.0, 110.0, 120.0]
-    df_2["segment_mean"] = [10.0, 15.0, 20.0, 25.0, 30, 30, 30]
-    classic_df = pd.concat([df_1, df_2], ignore_index=True)
-    df = TSDataset.to_dataset(classic_df)
-    return df
+def expected_make_future_mean_segment_encoder_ts() -> TSDataset:
+    df = generate_ar_df(start_time="2001-01-06", periods=2, n_segments=2)
+    df["target"] = [np.NaN, np.NaN] + [np.NaN, np.NaN]
+    df["segment_mean"] = [2.0, 2.0] + [2.5, 2.5]
+
+    ts = TSDataset(df=df, freq="D")
+    return ts
diff --git a/tests/test_transforms/test_encoders/test_mean_encoder_transform.py b/tests/test_transforms/test_encoders/test_mean_encoder_transform.py
index 973bcae05..98b731d10 100644
--- a/tests/test_transforms/test_encoders/test_mean_encoder_transform.py
+++ b/tests/test_transforms/test_encoders/test_mean_encoder_transform.py
@@ -32,7 +32,6 @@ def expected_micro_category_ts() -> TSDataset:
     df = generate_ar_df(start_time="2001-01-01", periods=6, n_segments=2)
     df.rename(columns={"target": "mean_encoded_regressor"}, inplace=True)
     df["mean_encoded_regressor"] = [np.NaN, np.NaN, np.NaN, 1.5, 2.75, 2.25] + [np.NaN, np.NaN, 6.25, 7, 7.625, np.NaN]
-
     ts = TSDataset(df, freq="D")
     return ts
 
@@ -151,28 +150,14 @@ def expected_multiple_nan_target_category_ts() -> TSDataset:
 
 
 @pytest.fixture
-def mean_segment_encoder_ts() -> TSDataset:
-    df = generate_ar_df(n_segments=1, start_time="2001-01-01", periods=5)
-    df["target"] = [0, 1, np.NaN, 3, 4]
-
-    df_exog = generate_ar_df(n_segments=1, start_time="2001-01-01", periods=10)
-    df_exog.rename(columns={"target": "segment_feature"}, inplace=True)
-    df_exog["segment_feature"] = "segment_0"
-
-    ts = TSDataset(df=df, df_exog=df_exog, freq="D", known_future="all")
-
-    return ts
-
-
-@pytest.fixture
-def expected_mean_segment_encoder_ts() -> TSDataset:
-    df = generate_ar_df(n_segments=1, start_time="2001-01-01", periods=5)
-    df.rename(columns={"target": "segment_mean"}, inplace=True)
-    df["segment_mean"] = [np.NaN, 0, 0.5, 0.5, 1.33]
+def mean_segment_encoder_ts(mean_segment_encoder_ts) -> TSDataset:
+    df = generate_ar_df(n_segments=2, start_time="2001-01-01", periods=7)
+    df = df.drop(columns=["target"])
+    df["segment_feature"] = ["segment_0"] * 7 + ["segment_1"] * 7
+    df_wide = TSDataset.to_dataset(df)
+    mean_segment_encoder_ts.add_columns_from_pandas(df_wide, update_exog=True, regressors=["segment_feature"])
 
-    ts = TSDataset(df=df, freq="D")
-
-    return ts
+    return mean_segment_encoder_ts
 
 
 @pytest.fixture
@@ -407,7 +392,7 @@ def test_mean_segment_encoder(mean_segment_encoder_ts, expected_mean_segment_enc
     mean_encoder.fit_transform(mean_segment_encoder_ts)
     assert_frame_equal(
         mean_segment_encoder_ts.df.loc[:, pd.IndexSlice[:, "segment_mean"]],
-        expected_mean_segment_encoder_ts.df,
+        expected_mean_segment_encoder_ts.df.loc[:, pd.IndexSlice[:, "segment_mean"]],
         atol=0.01,
     )
 
diff --git a/tests/test_transforms/test_encoders/test_mean_segment_encoder_transform.py b/tests/test_transforms/test_encoders/test_mean_segment_encoder_transform.py
index 9b546ba62..7eefaa46f 100644
--- a/tests/test_transforms/test_encoders/test_mean_segment_encoder_transform.py
+++ b/tests/test_transforms/test_encoders/test_mean_segment_encoder_transform.py
@@ -1,6 +1,7 @@
 import numpy as np
 import pandas as pd
 import pytest
+from pandas.testing import assert_frame_equal
 
 from etna.datasets import TSDataset
 from etna.metrics import R2
@@ -10,44 +11,31 @@
 from tests.utils import select_segments_subset
 
 
-@pytest.mark.parametrize("expected_global_means", [{"Moscow": 3, "Omsk": 30}])
-def test_mean_segment_encoder_fit(simple_ts, expected_global_means):
+def test_mean_segment_encoder_transform(mean_segment_encoder_ts, expected_mean_segment_encoder_ts):
     encoder = MeanSegmentEncoderTransform()
-    encoder.fit(simple_ts)
-    assert encoder.global_means == expected_global_means
+    transformed_df = encoder.fit_transform(mean_segment_encoder_ts).to_pandas()
+    assert_frame_equal(transformed_df, expected_mean_segment_encoder_ts.to_pandas(), atol=0.01)
 
 
-def test_mean_segment_encoder_transform(simple_ts, transformed_simple_df):
-    encoder = MeanSegmentEncoderTransform()
-    transformed_df = encoder.fit_transform(simple_ts).to_pandas()
-    transformed_simple_df.index.freq = "D"
-    pd.testing.assert_frame_equal(transformed_simple_df, transformed_df)
-
-
-def test_subset_segments(simple_ts):
-    train_ts = simple_ts
-    test_df = simple_ts.loc[:, pd.IndexSlice["Omsk", :]]
-    test_ts = TSDataset(df=test_df, freq=simple_ts.freq)
-    transform = MeanSegmentEncoderTransform()
-
-    transform.fit(train_ts)
-    transformed_test_df = transform.transform(test_ts).to_pandas()
+def test_make_future_mean_segment_encoder_transform(
+    mean_segment_encoder_ts, expected_make_future_mean_segment_encoder_ts
+):
+    mean_segment_encoder = MeanSegmentEncoderTransform()
+    mean_segment_encoder.fit_transform(mean_segment_encoder_ts)
+    future_ts = mean_segment_encoder_ts.make_future(future_steps=2, transforms=[mean_segment_encoder])
 
-    segments = sorted(transformed_test_df.columns.get_level_values("segment").unique())
-    features = sorted(transformed_test_df.columns.get_level_values("feature").unique())
-    assert segments == ["Omsk"]
-    assert features == ["exog", "segment_mean", "target"]
+    assert_frame_equal(future_ts.to_pandas(), expected_make_future_mean_segment_encoder_ts.to_pandas())
 
 
-def test_not_fitted_error(simple_ts):
+def test_not_fitted_error(mean_segment_encoder_ts):
     encoder = MeanSegmentEncoderTransform()
     with pytest.raises(ValueError, match="The transform isn't fitted"):
-        encoder.transform(simple_ts)
+        encoder.transform(mean_segment_encoder_ts)
 
 
-def test_new_segments_error(simple_ts):
-    train_ts = select_segments_subset(ts=simple_ts, segments=["Moscow"])
-    test_ts = select_segments_subset(ts=simple_ts, segments=["Omsk"])
+def test_new_segments_error(mean_segment_encoder_ts):
+    train_ts = select_segments_subset(ts=mean_segment_encoder_ts, segments=["segment_0"])
+    test_ts = select_segments_subset(ts=mean_segment_encoder_ts, segments=["segment_1"])
     transform = MeanSegmentEncoderTransform()
 
     transform.fit(train_ts)
diff --git a/tests/test_transforms/test_encoders/test_segment_encoder_transform.py b/tests/test_transforms/test_encoders/test_segment_encoder_transform.py
index 60574b904..fa869d9d8 100644
--- a/tests/test_transforms/test_encoders/test_segment_encoder_transform.py
+++ b/tests/test_transforms/test_encoders/test_segment_encoder_transform.py
@@ -2,21 +2,20 @@
 import pandas as pd
 import pytest
 
-from etna.datasets import TSDataset
 from etna.transforms import SegmentEncoderTransform
 from tests.test_transforms.utils import assert_transformation_equals_loaded_original
 from tests.utils import select_segments_subset
 
 
-def test_segment_encoder_transform(simple_ts):
+def test_segment_encoder_transform(mean_segment_encoder_ts):
     transform = SegmentEncoderTransform()
-    transformed_df = transform.fit_transform(simple_ts).to_pandas()
+    transformed_df = transform.fit_transform(mean_segment_encoder_ts).to_pandas()
     assert (
         len(transformed_df.loc[:, pd.IndexSlice[:, "segment_code"]].columns) == 2
     ), "Number of columns not the same as segments"
-    assert len(simple_ts.to_pandas()) == len(transformed_df), "Row missing"
+    assert len(mean_segment_encoder_ts.to_pandas()) == len(transformed_df), "Row missing"
     codes = set()
-    for segment in simple_ts.segments:
+    for segment in mean_segment_encoder_ts.segments:
         column = transformed_df.loc[:, pd.IndexSlice[segment, "segment_code"]]
         assert column.dtype == "category", "Column type is not category"
         assert np.all(column == column.iloc[0]), "Values are not the same for the whole column"
@@ -24,32 +23,15 @@ def test_segment_encoder_transform(simple_ts):
     assert codes == {0, 1}, "Codes are not 0 and 1"
 
 
-def test_subset_segments(simple_ts):
-    train_ts = simple_ts
-    test_df = simple_ts.loc[:, pd.IndexSlice["Omsk", :]]
-    test_ts = TSDataset(df=test_df, freq=simple_ts.freq)
-    transform = SegmentEncoderTransform()
-
-    transform.fit(train_ts)
-    transformed_test_df = transform.transform(test_ts).to_pandas()
-
-    segments = sorted(transformed_test_df.columns.get_level_values("segment").unique())
-    features = sorted(transformed_test_df.columns.get_level_values("feature").unique())
-    assert segments == ["Omsk"]
-    assert features == ["exog", "segment_code", "target"]
-    values = transformed_test_df.loc[:, pd.IndexSlice[:, "segment_code"]]
-    assert np.all(values == values.iloc[0])
-
-
-def test_not_fitted_error(simple_ts):
+def test_not_fitted_error(mean_segment_encoder_ts):
     encoder = SegmentEncoderTransform()
     with pytest.raises(ValueError, match="The transform isn't fitted"):
-        encoder.transform(simple_ts)
+        encoder.transform(mean_segment_encoder_ts)
 
 
-def test_new_segments_error(simple_ts):
-    train_ts = select_segments_subset(ts=simple_ts, segments=["Moscow"])
-    test_ts = select_segments_subset(ts=simple_ts, segments=["Omsk"])
+def test_new_segments_error(mean_segment_encoder_ts):
+    train_ts = select_segments_subset(ts=mean_segment_encoder_ts, segments=["segment_0"])
+    test_ts = select_segments_subset(ts=mean_segment_encoder_ts, segments=["segment_1"])
     transform = SegmentEncoderTransform()
 
     transform.fit(train_ts)
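A minimal usage sketch of the reworked transform, mirroring the fixtures and tests added in this patch; the dataset values and the two-step horizon are illustrative, and the top-level import paths are assumed to follow etna's public API rather than taken from the patch itself:

import numpy as np
from etna.datasets import TSDataset
from etna.datasets import generate_ar_df
from etna.transforms import MeanSegmentEncoderTransform

# Two-segment toy dataset, as in the new conftest fixture.
df = generate_ar_df(n_segments=2, start_time="2001-01-01", periods=5)
df["target"] = [0.0, 1.0, np.NaN, 3.0, 4.0] + [np.NaN, 1.0, 2.0, 3.0, 4.0]
ts = TSDataset(df=df, freq="D")

# After this patch the encoder delegates to MeanEncoderTransform, so the
# "segment_mean" value at each train timestamp is an expanding mean of strictly
# past target values; the current target no longer leaks into its own feature.
encoder = MeanSegmentEncoderTransform()
encoder.fit_transform(ts)

# On the forecast horizon the feature equals the per-segment train mean
# (2.0 and 2.5 here, matching the expected make_future fixture above).
future_ts = ts.make_future(future_steps=2, transforms=[encoder])
print(future_ts.to_pandas())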