From 7cc68eb7fda6c94cabf21e8f6301b2128c863cf9 Mon Sep 17 00:00:00 2001 From: brsnw250 Date: Mon, 24 Feb 2025 15:50:45 +0300 Subject: [PATCH 1/6] added test --- tests/test_datasets/test_dataset.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tests/test_datasets/test_dataset.py b/tests/test_datasets/test_dataset.py index 4378fc7fa..06522cdda 100644 --- a/tests/test_datasets/test_dataset.py +++ b/tests/test_datasets/test_dataset.py @@ -1513,6 +1513,16 @@ def test_tsdataset_idx_slice_pass_prediction_intervals_to_output(ts_with_predict ) +def test_tsdataset_idx_slice_pass_hierarchical_structure_to_output(product_level_constant_forecast_with_quantiles): + ts = product_level_constant_forecast_with_quantiles + initial_hs = ts.hierarchical_structure + slice_hs = ts.tsdataset_idx_slice(start_idx=1, end_idx=2).hierarchical_structure + + assert slice_hs is not None + assert slice_hs.level_names == initial_hs.level_names + assert slice_hs.level_structure == initial_hs.level_structure + + def test_to_torch_dataset_without_drop(tsdf_with_exog): def make_samples(df): return [{"target": df.target.values, "segment": df["segment"].values[0]}] From 9a4cae3a8ff9fabfeab71061b3ba1016c1b91093 Mon Sep 17 00:00:00 2001 From: brsnw250 Date: Mon, 24 Feb 2025 15:51:42 +0300 Subject: [PATCH 2/6] reworked `tsdataset_idx_slice` --- etna/datasets/tsdataset.py | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/etna/datasets/tsdataset.py b/etna/datasets/tsdataset.py index ba30b6580..db1c2b4a5 100644 --- a/etna/datasets/tsdataset.py +++ b/etna/datasets/tsdataset.py @@ -520,16 +520,11 @@ def tsdataset_idx_slice(self, start_idx: Optional[int] = None, end_idx: Optional : TSDataset based on indexing slice. """ - df_slice = self.df.iloc[start_idx:end_idx].copy(deep=True) - tsdataset_slice = TSDataset(df=df_slice, freq=self.freq) - # can't put known_future into constructor, _check_known_future fails with df_exog=None - tsdataset_slice.known_future = deepcopy(self.known_future) - tsdataset_slice._regressors = deepcopy(self.regressors) - if self.df_exog is not None: - tsdataset_slice.df_exog = self.df_exog.copy(deep=True) - tsdataset_slice._target_components_names = deepcopy(self._target_components_names) - tsdataset_slice._prediction_intervals_names = deepcopy(self._prediction_intervals_names) - return tsdataset_slice + ts_slice = deepcopy(self) + ts_slice.df = ts_slice.df.iloc[start_idx:end_idx].copy(deep=None) + ts_slice.raw_df = ts_slice.raw_df.iloc[start_idx:end_idx].copy(deep=None) + + return ts_slice @staticmethod def _check_known_future( From 199451e90390c0159db6b6208a4a5a5eb6bc4dac Mon Sep 17 00:00:00 2001 From: brsnw250 Date: Mon, 24 Feb 2025 17:10:27 +0300 Subject: [PATCH 3/6] updated changelog --- CHANGELOG.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a506a6c30..f6cd1e140 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -46,7 +46,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - **Breaking:** Bump minimum `optuna` version to 4.0 ([#599](https://github.com/etna-team/etna/pull/599)) - **Breaking:** Bump minimum `statsforecast` version to 2.0 ([#599](https://github.com/etna-team/etna/pull/599)) - Optimize performance of exogenous variables addition to the dataset ([#596](https://github.com/etna-team/etna/pull/596)) -- +- Update `TSDataset.tsdataset_idx_slice` method ([#618](https://github.com/etna-team/etna/pull/618)) +- ### Fixed - Fix possibility of silent handling of 
duplicate features when updating dataset with `TSDataset.update_columns_from_pandas` ([#522](https://github.com/etna-team/etna/pull/552)) @@ -59,6 +60,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - **Breaking:** rename `DaylySeasonalitySSM` to `DailySeasonalitySSM` ([#615](https://github.com/etna-team/etna/pull/615)) - Fix `TSDataset.train_test_split` to pass all features to train and test parts ([#545](https://github.com/etna-team/etna/pull/545)) - Fix `ConfigSampler` to handle trials without hash ([#616](https://github.com/etna-team/etna/pull/616)) +- Fix method `TSDataset.tsdataset_idx_slice` loses hierarchical structure ([#618](https://github.com/etna-team/etna/pull/618)) +- ### Removed - **Breaking:** Remove `FutureMixin`, `OutliersTransform.outliers_timestamps` and `OutliersTransform.original_values` ([#577](https://github.com/etna-team/etna/pull/577)) From 443d9d43849a11c2fa58561b1cbfe49302a56286 Mon Sep 17 00:00:00 2001 From: brsnw250 Date: Mon, 24 Feb 2025 18:41:02 +0300 Subject: [PATCH 4/6] updated implementation --- etna/datasets/tsdataset.py | 55 ++++++++++++++----------------- etna/datasets/utils.py | 17 ++++++++++ tests/test_datasets/test_utils.py | 15 +++++++++ 3 files changed, 57 insertions(+), 30 deletions(-) diff --git a/etna/datasets/tsdataset.py b/etna/datasets/tsdataset.py index db1c2b4a5..8dbab0835 100644 --- a/etna/datasets/tsdataset.py +++ b/etna/datasets/tsdataset.py @@ -26,6 +26,7 @@ from etna.datasets.utils import DataFrameFormat from etna.datasets.utils import _check_features_in_segments from etna.datasets.utils import _check_timestamp_param +from etna.datasets.utils import _slice_index_wide_dataframe from etna.datasets.utils import _TorchDataset from etna.datasets.utils import apply_alignment from etna.datasets.utils import get_level_dataframe @@ -520,9 +521,23 @@ def tsdataset_idx_slice(self, start_idx: Optional[int] = None, end_idx: Optional : TSDataset based on indexing slice. 
""" - ts_slice = deepcopy(self) - ts_slice.df = ts_slice.df.iloc[start_idx:end_idx].copy(deep=None) - ts_slice.raw_df = ts_slice.raw_df.iloc[start_idx:end_idx].copy(deep=None) + self_df = self.df + self_raw_df = self.raw_df + + try: + # we do this to avoid redundant copying of data + self.df = None + self.raw_df = None + + ts_slice = deepcopy(self) + ts_slice.df = _slice_index_wide_dataframe(df=self_df, start=start_idx, stop=end_idx, label_indexing=False) + ts_slice.raw_df = _slice_index_wide_dataframe( + df=self_raw_df, start=start_idx, stop=end_idx, label_indexing=False + ) + + finally: + self.df = self_df + self.raw_df = self_raw_df return ts_slice @@ -1255,36 +1270,16 @@ def train_test_split( # we do this to avoid redundant copying of data self.df = None self.raw_df = None - train = deepcopy(self) - # we want to make sure it makes only one copy - train_df = self_df.loc[train_start_defined:train_end_defined] - if train_df._is_view or train_df._is_copy is not None: - train.df = train_df.copy() - else: - train.df = train_df - - # we want to make sure it makes only one copy - train_raw_df = self_raw_df.loc[train_start_defined:train_end_defined] - if train_raw_df._is_view or train_raw_df._is_copy is not None: - train.raw_df = train_raw_df.copy() - else: - train.raw_df = train_raw_df + train = deepcopy(self) + train.df = _slice_index_wide_dataframe(df=self_df, start=train_start_defined, stop=train_end_defined) + train.raw_df = _slice_index_wide_dataframe( + df=self_raw_df, start=train_start_defined, stop=train_end_defined + ) - # we want to make sure it makes only one copy test = deepcopy(self) - test_df = self_df.loc[test_start_defined:test_end_defined] - if test_df._is_view or test_df._is_copy is not None: - test.df = test_df.copy() - else: - test.df = test_df - - # we want to make sure it makes only one copy - test_raw_df = self_raw_df.loc[train_start_defined:test_end_defined] - if test_raw_df._is_view or test_raw_df._is_copy is not None: - test.raw_df = test_raw_df.copy() - else: - test.raw_df = test_raw_df + test.df = _slice_index_wide_dataframe(df=self_df, start=test_start_defined, stop=test_end_defined) + test.raw_df = _slice_index_wide_dataframe(df=self_raw_df, start=train_start_defined, stop=test_end_defined) finally: self.df = self_df diff --git a/etna/datasets/utils.py b/etna/datasets/utils.py index e1bb8937c..ad6f4ed9b 100644 --- a/etna/datasets/utils.py +++ b/etna/datasets/utils.py @@ -756,3 +756,20 @@ def _check_features_in_segments(columns: pd.MultiIndex, segments: Optional[List[ raise ValueError( f"There is a mismatch in feature sets between segments '{compare_segment}' and '{segment}'!" 
) + + +def _slice_index_wide_dataframe( + df: pd.DataFrame, + start: Optional[Union[int, str, pd.Timestamp]] = None, + stop: Optional[Union[int, str, pd.Timestamp]] = None, + label_indexing: bool = True, +) -> pd.DataFrame: + """Slice index of the dataframe in the wide format with copy.""" + indexer = df.loc if label_indexing else df.iloc + + # we want to make sure it makes only one copy + df = indexer[start:stop] # type: ignore + if df._is_view or df._is_copy is not None: + df = df.copy(deep=None) + + return df diff --git a/tests/test_datasets/test_utils.py b/tests/test_datasets/test_utils.py index d7081f76e..c156c8515 100644 --- a/tests/test_datasets/test_utils.py +++ b/tests/test_datasets/test_utils.py @@ -9,6 +9,7 @@ from etna.datasets import generate_ar_df from etna.datasets.utils import DataFrameFormat from etna.datasets.utils import _check_features_in_segments +from etna.datasets.utils import _slice_index_wide_dataframe from etna.datasets.utils import _TorchDataset from etna.datasets.utils import apply_alignment from etna.datasets.utils import determine_freq @@ -1013,3 +1014,17 @@ def test_check_features_in_segments_ok(columns): ) def test_check_features_in_segments_ok_with_expected_segments(columns): _check_features_in_segments(columns=columns, segments=[1, 2]) + + +@pytest.mark.parametrize("start, stop", ((0, 4), (4, -1), (-5, -1), (None, 6), (5, None), (None, None))) +def test_slice_index_wide_dataframe_int_idx(df_aligned_datetime, start, stop): + res = _slice_index_wide_dataframe(df=df_aligned_datetime, start=start, stop=stop, label_indexing=False) + pd.testing.assert_frame_equal(res, df_aligned_datetime.iloc[start:stop]) + + +@pytest.mark.parametrize( + "start, stop", (("2020-01-01", "2020-01-04"), (None, "2020-01-10"), ("2020-01-09", None), (None, None)) +) +def test_slice_index_wide_dataframe_label_idx(df_aligned_datetime, start, stop): + res = _slice_index_wide_dataframe(df=df_aligned_datetime, start=start, stop=stop, label_indexing=True) + pd.testing.assert_frame_equal(res, df_aligned_datetime.loc[start:stop]) From ca0da019de55d6c20ba760c2e44f96fd288abafc Mon Sep 17 00:00:00 2001 From: brsnw250 Date: Mon, 24 Feb 2025 18:42:59 +0300 Subject: [PATCH 5/6] fixed changelog --- CHANGELOG.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f6cd1e140..c553d611c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -46,7 +46,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - **Breaking:** Bump minimum `optuna` version to 4.0 ([#599](https://github.com/etna-team/etna/pull/599)) - **Breaking:** Bump minimum `statsforecast` version to 2.0 ([#599](https://github.com/etna-team/etna/pull/599)) - Optimize performance of exogenous variables addition to the dataset ([#596](https://github.com/etna-team/etna/pull/596)) -- Update `TSDataset.tsdataset_idx_slice` method ([#618](https://github.com/etna-team/etna/pull/618)) - ### Fixed @@ -60,7 +59,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - **Breaking:** rename `DaylySeasonalitySSM` to `DailySeasonalitySSM` ([#615](https://github.com/etna-team/etna/pull/615)) - Fix `TSDataset.train_test_split` to pass all features to train and test parts ([#545](https://github.com/etna-team/etna/pull/545)) - Fix `ConfigSampler` to handle trials without hash ([#616](https://github.com/etna-team/etna/pull/616)) -- Fix method `TSDataset.tsdataset_idx_slice` loses hierarchical structure ([#618](https://github.com/etna-team/etna/pull/618)) +- 
Fix method `TSDataset.tsdataset_idx_slice` to not lose hierarchical structure ([#618](https://github.com/etna-team/etna/pull/618)) - ### Removed From e36e09b2d130cb003e5c667ab5922fb7cf64306d Mon Sep 17 00:00:00 2001 From: brsnw250 Date: Tue, 25 Feb 2025 11:00:38 +0300 Subject: [PATCH 6/6] explicitly make deep copy --- etna/datasets/tsdataset.py | 4 ++-- etna/datasets/utils.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/etna/datasets/tsdataset.py b/etna/datasets/tsdataset.py index 8dbab0835..223e45092 100644 --- a/etna/datasets/tsdataset.py +++ b/etna/datasets/tsdataset.py @@ -144,7 +144,7 @@ def __init__( self.freq = freq self.df_exog = None self.raw_df = self._prepare_df(df=df, freq=freq) - self.df = self.raw_df.copy(deep=None) + self.df = self.raw_df.copy(deep=True) self.hierarchical_structure = hierarchical_structure self.current_df_level: Optional[str] = self._get_dataframe_level(df=self.df) @@ -1026,7 +1026,7 @@ def to_dataset(df: pd.DataFrame) -> pd.DataFrame: df.sort_index(axis=1, level=(0, 1), inplace=True) if df._is_view or df._is_copy is None: - df = df.copy(deep=None) + df = df.copy(deep=True) return df diff --git a/etna/datasets/utils.py b/etna/datasets/utils.py index ad6f4ed9b..64d7e7b70 100644 --- a/etna/datasets/utils.py +++ b/etna/datasets/utils.py @@ -770,6 +770,6 @@ def _slice_index_wide_dataframe( # we want to make sure it makes only one copy df = indexer[start:stop] # type: ignore if df._is_view or df._is_copy is not None: - df = df.copy(deep=None) + df = df.copy(deep=True) return df
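
Note on the pattern above (not part of the patch series): patches 2, 4 and 6 converge on a single idea: `deepcopy` the dataset only after detaching the wide dataframes, slice them with at most one explicit deep copy, and restore the originals in a `finally` block, so that metadata such as `hierarchical_structure` survives while the heavy frames are never copied twice. The sketch below illustrates that pattern in isolation; `Holder` and `_slice_wide` are hypothetical stand-ins for `TSDataset` and `_slice_index_wide_dataframe`, not the etna API.

```python
from copy import deepcopy
from typing import Optional

import pandas as pd


def _slice_wide(df: pd.DataFrame, start: Optional[int], stop: Optional[int]) -> pd.DataFrame:
    """Positional slice that is guaranteed to own its data, making at most one copy."""
    sliced = df.iloc[start:stop]
    if sliced._is_view or sliced._is_copy is not None:
        sliced = sliced.copy(deep=True)
    return sliced


class Holder:
    """Toy stand-in for TSDataset: one heavy frame plus cheap metadata (e.g. hierarchy)."""

    def __init__(self, df: pd.DataFrame, meta: dict):
        self.df = df
        self.meta = meta

    def idx_slice(self, start: Optional[int] = None, stop: Optional[int] = None) -> "Holder":
        original = self.df
        try:
            # detach the heavy frame so deepcopy duplicates only the cheap metadata
            self.df = None
            out = deepcopy(self)
            out.df = _slice_wide(original, start, stop)
        finally:
            # restore the original frame even if slicing raises
            self.df = original
        return out


holder = Holder(pd.DataFrame({"target": range(10)}), meta={"levels": ["total", "product"]})
part = holder.idx_slice(1, 4)
assert part.meta == holder.meta  # metadata survives the slice
assert len(part.df) == 3 and len(holder.df) == 10  # slice owns its data, source untouched
```

The `_is_view` / `_is_copy` check mirrors the "we want to make sure it makes only one copy" comment in the patches: the sliced frame is copied only when pandas returned a view or a frame that still references its parent, and patch 6 makes that copy explicit with `deep=True`.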