From c30430f0e8c144948b81d82bdeddeb71b8b876a6 Mon Sep 17 00:00:00 2001
From: Dmitry Bunin
Date: Tue, 10 Dec 2024 16:00:37 +0300
Subject: [PATCH 1/9] feature: add task files

---
 etna/auto/auto.py                         |  12 +-
 etna/metrics/utils.py                     |  99 +++++++++++---
 etna/pipeline/base.py                     |   7 +-
 tests/test_auto/conftest.py               |  63 +++++++++
 tests/test_auto/test_auto.py              |  76 +++++++++--
 tests/test_auto/test_tune.py              |  51 ++++++-
 tests/test_metrics/test_metrics_utils.py  | 162 +++++++++++++++++++++++
 7 files changed, 433 insertions(+), 37 deletions(-)

diff --git a/etna/auto/auto.py b/etna/auto/auto.py
index 47101bc6d..525987f3e 100644
--- a/etna/auto/auto.py
+++ b/etna/auto/auto.py
@@ -484,7 +484,11 @@ def _objective(trial: Trial) -> float:
             for metric in aggregated_metrics:
                 trial.set_user_attr(metric, aggregated_metrics[metric])
 
-            return aggregated_metrics[f"{target_metric.name}_{metric_aggregation}"]
+            result_value = aggregated_metrics[f"{target_metric.name}_{metric_aggregation}"]
+            if result_value is None:
+                raise ValueError("Metric value is None! It should be a float for optimization.")
+
+            return result_value
 
         return _objective
 
@@ -809,7 +813,11 @@ def _objective(trial: Trial) -> float:
             for metric in aggregated_metrics:
                 trial.set_user_attr(metric, aggregated_metrics[metric])
 
-            return aggregated_metrics[f"{target_metric.name}_{metric_aggregation}"]
+            result_value = aggregated_metrics[f"{target_metric.name}_{metric_aggregation}"]
+            if result_value is None:
+                raise ValueError("Metric value is None! It should be a float for optimization.")
+
+            return result_value
 
         return _objective
 
diff --git a/etna/metrics/utils.py b/etna/metrics/utils.py
index 5e31c5d78..0d424b0cb 100644
--- a/etna/metrics/utils.py
+++ b/etna/metrics/utils.py
@@ -1,3 +1,4 @@
+import warnings
 from typing import Callable
 from typing import Dict
 from typing import List
@@ -37,24 +38,89 @@ def compute_metrics(
     return metrics_values
 
 
+def mean_agg():
+    """Mean for pandas agg."""
+
+    def func(x):
+        with warnings.catch_warnings():
+            # this helps to prevent warning in case of all nans
+            warnings.filterwarnings(
+                message="Mean of empty slice",
+                action="ignore",
+            )
+            return np.nanmean(a=x.values)
+
+    func.__name__ = "mean"
+    return func
+
+
+def median_agg():
+    """Median for pandas agg."""
+
+    def func(x):
+        with warnings.catch_warnings():
+            # this helps to prevent warning in case of all nans
+            warnings.filterwarnings(
+                message="All-NaN slice encountered",
+                action="ignore",
+            )
+            return np.nanmedian(a=x.values)
+
+    func.__name__ = "median"
+    return func
+
+
+def std_agg():
+    """Std for pandas agg."""
+
+    def func(x):
+        with warnings.catch_warnings():
+            # this helps to prevent warning in case of all nans
+            warnings.filterwarnings(
+                message="Degrees of freedom <=",
+                action="ignore",
+            )
+            return np.nanstd(a=x.values)
+
+    func.__name__ = "std"
+    return func
+
+
+def size_agg():
+    """Size for pandas agg."""
+
+    def func(x):
+        return len(x) - pd.isna(x.values).sum()
+
+    func.__name__ = "size"
+    return func
+
+
 def percentile(n: int):
     """Percentile for pandas agg."""
 
-    def percentile_(x):
-        return np.nanpercentile(a=x.values, q=n)
+    def func(x):
+        with warnings.catch_warnings():
+            # this helps to prevent warning in case of all nans
+            warnings.filterwarnings(
+                message="All-NaN slice encountered",
+                action="ignore",
+            )
+            return np.nanpercentile(a=x.values, q=n)
 
-    percentile_.__name__ = f"percentile_{n}"
-    return percentile_
+    func.__name__ = f"percentile_{n}"
+    return func
 
 
 MetricAggregationStatistics = Literal[
-    "median", "mean", "std", "percentile_5", "percentile_25", "percentile_75", "percentile_95"
"percentile_25", "percentile_75", "percentile_95" + "median", "mean", "std", "size", "percentile_5", "percentile_25", "percentile_75", "percentile_95" ] METRICS_AGGREGATION_MAP: Dict[MetricAggregationStatistics, Union[str, Callable]] = { - "median": "median", - "mean": "mean", - "std": "std", + "median": mean_agg(), + "mean": median_agg(), + "std": std_agg(), + "size": size_agg(), "percentile_5": percentile(5), "percentile_25": percentile(25), "percentile_75": percentile(75), @@ -62,7 +128,7 @@ def percentile_(x): } -def aggregate_metrics_df(metrics_df: pd.DataFrame) -> Dict[str, float]: +def aggregate_metrics_df(metrics_df: pd.DataFrame) -> Dict[str, Optional[float]]: """Aggregate metrics in :py:meth:`log_backtest_metrics` method. Parameters @@ -74,7 +140,7 @@ def aggregate_metrics_df(metrics_df: pd.DataFrame) -> Dict[str, float]: if "fold_number" in metrics_df.columns: metrics_dict = ( metrics_df.groupby("segment") - .mean() + .apply(lambda x: x.mean(skipna=False, numeric_only=False)) .reset_index() .drop(["segment", "fold_number"], axis=1) .apply(list(METRICS_AGGREGATION_MAP.values())) @@ -85,10 +151,11 @@ def aggregate_metrics_df(metrics_df: pd.DataFrame) -> Dict[str, float]: else: metrics_dict = metrics_df.drop(["segment"], axis=1).apply(list(METRICS_AGGREGATION_MAP.values())).to_dict() - metrics_dict_wide = { - f"{metrics_key}_{statistics_key}": value - for metrics_key, values in metrics_dict.items() - for statistics_key, value in values.items() - } + cur_dict = {} + for metrics_key, values in metrics_dict.items(): + for statistics_key, value in values.items(): + new_key = f"{metrics_key}_{statistics_key}" + new_value = value if not pd.isna(value) else None + cur_dict[new_key] = new_value - return metrics_dict_wide + return cur_dict diff --git a/etna/pipeline/base.py b/etna/pipeline/base.py index d6728ee6c..3166a77ac 100644 --- a/etna/pipeline/base.py +++ b/etna/pipeline/base.py @@ -856,7 +856,12 @@ def _get_backtest_metrics(self, aggregate_metrics: bool = False) -> pd.DataFrame metrics_df.sort_values(["segment", self._fold_column], inplace=True) if aggregate_metrics: - metrics_df = metrics_df.groupby("segment").mean().reset_index().drop(self._fold_column, axis=1) + metrics_df = ( + metrics_df.groupby("segment") + .apply(lambda x: x.mean(skipna=False, numeric_only=False)) + .reset_index() + .drop(self._fold_column, axis=1) + ) return metrics_df diff --git a/tests/test_auto/conftest.py b/tests/test_auto/conftest.py index 18168e8c3..fe6c03433 100644 --- a/tests/test_auto/conftest.py +++ b/tests/test_auto/conftest.py @@ -1,11 +1,14 @@ from os import unlink +import numpy as np +import pandas as pd import pytest from optuna.storages import RDBStorage from optuna.trial import TrialState from typing_extensions import NamedTuple from etna.auto.utils import config_hash +from etna.datasets import TSDataset from etna.models import NaiveModel from etna.pipeline import Pipeline @@ -35,3 +38,63 @@ class Trial(NamedTuple): fail_trials = [Trial(user_attrs={}, state=TrialState.FAIL)] return complete_trials + complete_trials[:3] + fail_trials + + +@pytest.fixture +def ts_with_fold_missing_tail(random_seed) -> TSDataset: + periods = 100 + df1 = pd.DataFrame({"timestamp": pd.date_range("2020-01-01", periods=periods)}) + df1["segment"] = "segment_1" + df1["target"] = np.random.uniform(10, 20, size=periods) + df1.loc[df1.index[-7:], "target"] = np.NaN + + df2 = pd.DataFrame({"timestamp": pd.date_range("2020-01-01", periods=periods)}) + df2["segment"] = "segment_2" + df2["target"] = np.random.uniform(-15, 
+    df2.loc[df2.index[-7:], "target"] = np.NaN
+
+    df = pd.concat([df1, df2]).reset_index(drop=True)
+    df = TSDataset.to_dataset(df)
+    tsds = TSDataset(df, freq="D")
+
+    return tsds
+
+
+@pytest.fixture
+def ts_with_fold_missing_middle(random_seed) -> TSDataset:
+    periods = 100
+    df1 = pd.DataFrame({"timestamp": pd.date_range("2020-01-01", periods=periods)})
+    df1["segment"] = "segment_1"
+    df1["target"] = np.random.uniform(10, 20, size=periods)
+    df1.loc[df1.index[-14:-7], "target"] = np.NaN
+
+    df2 = pd.DataFrame({"timestamp": pd.date_range("2020-01-01", periods=periods)})
+    df2["segment"] = "segment_2"
+    df2["target"] = np.random.uniform(-15, 5, size=periods)
+    df2.loc[df2.index[-14:-7], "target"] = np.NaN
+
+    df = pd.concat([df1, df2]).reset_index(drop=True)
+    df = TSDataset.to_dataset(df)
+    tsds = TSDataset(df, freq="D")
+
+    return tsds
+
+
+@pytest.fixture
+def ts_with_few_missing(random_seed) -> TSDataset:
+    periods = 100
+    df1 = pd.DataFrame({"timestamp": pd.date_range("2020-01-01", periods=periods)})
+    df1["segment"] = "segment_1"
+    df1["target"] = np.random.uniform(10, 20, size=periods)
+    df1.loc[df1.index[-4:-2], "target"] = np.NaN
+
+    df2 = pd.DataFrame({"timestamp": pd.date_range("2020-01-01", periods=periods)})
+    df2["segment"] = "segment_2"
+    df2["target"] = np.random.uniform(-15, 5, size=periods)
+    df2.loc[df2.index[-12:-10], "target"] = np.NaN
+
+    df = pd.concat([df1, df2]).reset_index(drop=True)
+    df = TSDataset.to_dataset(df)
+    tsds = TSDataset(df, freq="D")
+
+    return tsds
diff --git a/tests/test_auto/test_auto.py b/tests/test_auto/test_auto.py
index e93b56581..12251e8fd 100644
--- a/tests/test_auto/test_auto.py
+++ b/tests/test_auto/test_auto.py
@@ -11,11 +11,13 @@
 from etna.auto.auto import _Callback
 from etna.auto.auto import _Initializer
 from etna.metrics import MAE
+from etna.metrics import MSE
 from etna.models import LinearPerSegmentModel
 from etna.models import MovingAverageModel
 from etna.models import NaiveModel
 from etna.pipeline import Pipeline
 from etna.transforms import LagTransform
+from etna.transforms import TimeSeriesImputerTransform
 
 
 @pytest.fixture()
@@ -23,13 +25,15 @@ def pool_generator():
     pool = [
         {
             "_target_": "etna.pipeline.Pipeline",
-            "horizon": "${__aux__.horizon}",
             "model": {"_target_": "etna.models.MovingAverageModel", "window": "${mult:${horizon},1}"},
+            "transforms": [{"_target_": "etna.transforms.TimeSeriesImputerTransform"}],
+            "horizon": "${__aux__.horizon}",
         },
         {
             "_target_": "etna.pipeline.Pipeline",
-            "horizon": "${__aux__.horizon}",
             "model": {"_target_": "etna.models.NaiveModel", "lag": 1},
+            "transforms": [{"_target_": "etna.transforms.TimeSeriesImputerTransform"}],
+            "horizon": "${__aux__.horizon}",
         },
     ]
     pool_generator = PoolGenerator(pool)
@@ -38,7 +42,10 @@ def pool_generator():
 
 @pytest.fixture()
 def pool_list():
-    return [Pipeline(MovingAverageModel(7), horizon=7), Pipeline(NaiveModel(1), horizon=7)]
+    return [
+        Pipeline(MovingAverageModel(7), transforms=[TimeSeriesImputerTransform()], horizon=7),
+        Pipeline(NaiveModel(1), transforms=[TimeSeriesImputerTransform()], horizon=7),
+    ]
 
 
 def test_objective(
@@ -72,6 +79,39 @@ def test_objective(
     callback.assert_called_once()
 
 
+@pytest.mark.parametrize("ts_name", ["ts_with_fold_missing_tail", "ts_with_fold_missing_middle"])
+def test_objective_fail_none(
+    ts_name,
+    request,
+    target_metric=MSE(missing_mode="ignore"),
+    metric_aggregation: Literal["mean"] = "mean",
+    metrics=[MSE(missing_mode="ignore")],
+    backtest_params={},
+    initializer=MagicMock(spec=_Initializer),
+    callback=MagicMock(spec=_Callback),
+    relative_params={
+        "_target_": "etna.pipeline.Pipeline",
+        "horizon": 7,
+        "model": {"_target_": "etna.models.NaiveModel", "lag": 1},
+        "transforms": [{"_target_": "etna.transforms.TimeSeriesImputerTransform"}],
+    },
+):
+    ts = request.getfixturevalue(ts_name)
+    trial = MagicMock(relative_params=relative_params)
+    _objective = Auto.objective(
+        ts=ts,
+        target_metric=target_metric,
+        metric_aggregation=metric_aggregation,
+        metrics=metrics,
+        backtest_params=backtest_params,
+        initializer=initializer,
+        callback=callback,
+    )
+
+    with pytest.raises(ValueError, match="Metric value is None"):
+        _ = _objective(trial)
+
+
 @pytest.mark.parametrize("tune_size", [0, 2])
 def test_fit_called_tuning_pool(
     tune_size,
@@ -142,17 +182,20 @@ def test_init_optuna(
     )
 
 
+@pytest.mark.parametrize("ts_name", ["example_tsds", "ts_with_few_missing"])
 @pytest.mark.parametrize("pool", ["pool_list", "pool_generator"])
-def test_fit_without_tuning_list(example_tsds, optuna_storage, pool, request):
+def test_fit_without_tuning_list(ts_name, optuna_storage, pool, request):
+    ts = request.getfixturevalue(ts_name)
     pool = request.getfixturevalue(pool)
     auto = Auto(
-        MAE(),
+        MSE(missing_mode="ignore"),
+        metrics=[MSE(missing_mode="ignore")],
         pool=pool,
         metric_aggregation="median",
         horizon=7,
         storage=optuna_storage,
     )
-    auto.fit(ts=example_tsds, n_trials=2)
+    auto.fit(ts=ts, n_trials=2)
 
     assert len(auto._pool_optuna.study.trials) == 2
     assert len(auto.summary()) == 2
@@ -163,27 +206,36 @@ def test_fit_without_tuning_list(example_tsds, optuna_storage, pool, request):
     assert auto.top_k(k=1)[0].to_dict() == pool[0].to_dict()
 
 
+@pytest.mark.parametrize("ts_name", ["example_tsds", "ts_with_few_missing"])
 @pytest.mark.parametrize("tune_size", [1, 2])
 def test_fit_with_tuning(
+    ts_name,
     tune_size,
-    example_tsds,
+    request,
     optuna_storage,
     pool=(
-        Pipeline(MovingAverageModel(5), horizon=7),
-        Pipeline(NaiveModel(1), horizon=7),
+        Pipeline(MovingAverageModel(5), transforms=[TimeSeriesImputerTransform(strategy="forward_fill")], horizon=7),
+        Pipeline(NaiveModel(1), transforms=[TimeSeriesImputerTransform(strategy="forward_fill")], horizon=7),
         Pipeline(
-            LinearPerSegmentModel(), transforms=[LagTransform(in_column="target", lags=list(range(7, 21)))], horizon=7
+            LinearPerSegmentModel(),
+            transforms=[
+                TimeSeriesImputerTransform(strategy="forward_fill"),
+                LagTransform(in_column="target", lags=list(range(7, 21))),
+            ],
+            horizon=7,
         ),
     ),
 ):
+    ts = request.getfixturevalue(ts_name)
     auto = Auto(
-        MAE(),
+        MSE(missing_mode="ignore"),
+        metrics=[MSE(missing_mode="ignore")],
         pool=pool,
         metric_aggregation="median",
         horizon=7,
         storage=optuna_storage,
     )
-    auto.fit(ts=example_tsds, n_trials=11, tune_size=tune_size)
+    auto.fit(ts=ts, n_trials=11, tune_size=tune_size)
 
     assert len(auto._pool_optuna.study.trials) == 3
     assert len(auto.summary()) == 11
diff --git a/tests/test_auto/test_tune.py b/tests/test_auto/test_tune.py
index 0a1b972a0..175d05c13 100644
--- a/tests/test_auto/test_tune.py
+++ b/tests/test_auto/test_tune.py
@@ -13,6 +13,7 @@
 from etna.distributions import FloatDistribution
 from etna.distributions import IntDistribution
 from etna.metrics import MAE
+from etna.metrics import MSE
 from etna.models import NaiveModel
 from etna.models import SimpleExpSmoothingModel
 from etna.pipeline import AutoRegressivePipeline
@@ -21,6 +22,7 @@
 from etna.reconciliation import BottomUpReconciliator
 from etna.transforms import AddConstTransform
 from etna.transforms import DateFlagsTransform
+from etna.transforms import TimeSeriesImputerTransform
 
 
 def test_objective(
@@ -53,6 +55,36 @@ def test_objective(
     callback.assert_called_once()
 
 
+@pytest.mark.parametrize("ts_name", ["ts_with_fold_missing_tail", "ts_with_fold_missing_middle"])
+def test_objective_fail_none(
+    ts_name,
+    request,
+    target_metric=MSE(missing_mode="ignore"),
+    metric_aggregation: Literal["mean"] = "mean",
+    metrics=[MSE(missing_mode="ignore")],
+    backtest_params={},
+    initializer=MagicMock(spec=_Initializer),
+    callback=MagicMock(spec=_Callback),
+    pipeline=Pipeline(model=NaiveModel(), transforms=[TimeSeriesImputerTransform()], horizon=7),
+    params_to_tune={},
+):
+    ts = request.getfixturevalue(ts_name)
+    trial = MagicMock()
+    _objective = Tune.objective(
+        ts=ts,
+        pipeline=pipeline,
+        params_to_tune=params_to_tune,
+        target_metric=target_metric,
+        metric_aggregation=metric_aggregation,
+        metrics=metrics,
+        backtest_params=backtest_params,
+        initializer=initializer,
+        callback=callback,
+    )
+    with pytest.raises(ValueError, match="Metric value is None"):
+        _ = _objective(trial)
+
+
 def test_fit_called_tune(
     ts=MagicMock(),
     tune=MagicMock(),
@@ -165,23 +197,30 @@ def test_top_k(
     assert [pipeline.model.lag for pipeline in top_k] == [i for i in range(expected_k)]  # noqa C416
 
 
+@pytest.mark.parametrize("ts_name", ["example_tsds", "ts_with_few_missing"])
 @pytest.mark.parametrize(
     "pipeline",
     [
-        (Pipeline(NaiveModel(1), horizon=7)),
-        (AutoRegressivePipeline(model=NaiveModel(1), horizon=7, transforms=[])),
-        (AutoRegressivePipeline(model=NaiveModel(1), horizon=7, transforms=[DateFlagsTransform()])),
+        (Pipeline(NaiveModel(1), transforms=[TimeSeriesImputerTransform()], horizon=7)),
+        (AutoRegressivePipeline(model=NaiveModel(1), transforms=[TimeSeriesImputerTransform()], horizon=7)),
+        (
+            AutoRegressivePipeline(
+                model=NaiveModel(1), transforms=[DateFlagsTransform(), TimeSeriesImputerTransform()], horizon=7
+            )
+        ),
     ],
 )
-def test_tune_run(example_tsds, optuna_storage, pipeline):
+def test_tune_run(ts_name, optuna_storage, pipeline, request):
+    ts = request.getfixturevalue(ts_name)
     tune = Tune(
         pipeline=pipeline,
-        target_metric=MAE(),
+        target_metric=MSE(missing_mode="ignore"),
+        metrics=[MSE(missing_mode="ignore")],
         metric_aggregation="median",
         horizon=7,
         storage=optuna_storage,
     )
-    tune.fit(ts=example_tsds, n_trials=2)
+    tune.fit(ts=ts, n_trials=2)
 
     assert len(tune._optuna.study.trials) == 2
     assert len(tune.summary()) == 2
diff --git a/tests/test_metrics/test_metrics_utils.py b/tests/test_metrics/test_metrics_utils.py
index 8872ad7af..0bceec965 100644
--- a/tests/test_metrics/test_metrics_utils.py
+++ b/tests/test_metrics/test_metrics_utils.py
@@ -1,11 +1,16 @@
+from typing import Any
+from typing import Dict
 from typing import Tuple
 
 import numpy as np
+import pandas as pd
+import pytest
 
 from etna.datasets import TSDataset
 from etna.metrics import MAE
 from etna.metrics import MAPE
 from etna.metrics import MSE
+from etna.metrics.utils import aggregate_metrics_df
 from etna.metrics.utils import compute_metrics
 
 
@@ -21,3 +26,160 @@ def test_compute_metrics(train_test_dfs: Tuple[TSDataset, TSDataset]):
     ]
     result = compute_metrics(metrics=metrics, y_true=true_df, y_pred=forecast_df)
     np.testing.assert_array_equal(sorted(expected_keys), sorted(result.keys()))
+
+
+@pytest.fixture
+def metrics_df_with_folds() -> pd.DataFrame:
+    df = pd.DataFrame(
+        {
+            "segment": ["segment_0"] * 3 + ["segment_1"] * 3 + ["segment_2"] * 3,
+            "MAE": [1.0, 2.0, 3.0, 2.0, 3.0, 4.0, 3.0, 4.0, 5.0],
+            "MSE": [2.0, 3.0, 4.0, 3.0, 4.0, 5.0, 5.0, 6.0, 7.0],
+            "fold_number": [0, 1, 2, 0, 1, 2, 0, 1, 2],
"fold_number": [0, 1, 2, 0, 1, 2, 0, 1, 2], + } + ) + return df + + +@pytest.fixture +def metrics_df_no_folds(metrics_df_with_folds) -> pd.DataFrame: + df = metrics_df_with_folds + df = df.groupby("segment").mean().reset_index().drop("fold_number", axis=1) + return df + + +@pytest.fixture +def aggregated_metrics_df() -> Dict[str, Any]: + result = { + "MAE_median": 3.0, + "MAE_mean": 3.0, + "MAE_std": 0.816496580927726, + "MAE_size": 3.0, + "MAE_percentile_5": 2.1, + "MAE_percentile_25": 2.5, + "MAE_percentile_75": 3.5, + "MAE_percentile_95": 3.9, + "MSE_median": 4.0, + "MSE_mean": 4.333333333333333, + "MSE_std": 1.247219128924647, + "MSE_size": 3.0, + "MSE_percentile_5": 3.1, + "MSE_percentile_25": 3.5, + "MSE_percentile_75": 5.0, + "MSE_percentile_95": 5.8, + } + return result + + +@pytest.fixture +def metrics_df_with_folds_with_missing() -> pd.DataFrame: + df = pd.DataFrame( + { + "segment": ["segment_0"] * 3 + ["segment_1"] * 3 + ["segment_2"] * 3, + "MAE": [None, 2.0, 3.0, 2.0, 3.0, 4.0, 3.0, 4.0, 5.0], + "MSE": [2.0, 3.0, 4.0, 3.0, 4.0, 5.0, 5.0, 6.0, 7.0], + "fold_number": [0, 1, 2, 0, 1, 2, 0, 1, 2], + } + ) + return df + + +@pytest.fixture +def metrics_df_no_folds_with_missing(metrics_df_with_folds_with_missing) -> pd.DataFrame: + df = metrics_df_with_folds_with_missing + df = ( + df.groupby("segment") + .apply(lambda x: x.mean(skipna=False, numeric_only=False)) + .reset_index() + .drop("fold_number", axis=1) + ) + return df + + +@pytest.fixture +def aggregated_metrics_df_with_missing() -> Dict[str, Any]: + result = { + "MAE_mean": 3.5, + "MAE_median": 3.5, + "MAE_std": 0.5, + "MAE_size": 2.0, + "MAE_percentile_5": 3.05, + "MAE_percentile_25": 3.25, + "MAE_percentile_75": 3.75, + "MAE_percentile_95": 3.95, + "MSE_mean": 4.333333333333333, + "MSE_median": 4.0, + "MSE_std": 1.247219128924647, + "MSE_size": 3.0, + "MSE_percentile_5": 3.1, + "MSE_percentile_25": 3.5, + "MSE_percentile_75": 5.0, + "MSE_percentile_95": 5.8, + } + return result + + +@pytest.fixture +def metrics_df_with_folds_with_full_missing() -> pd.DataFrame: + df = pd.DataFrame( + { + "segment": ["segment_0"] * 3 + ["segment_1"] * 3 + ["segment_2"] * 3, + "MAE": [None, 2.0, 3.0, 2.0, None, 4.0, 3.0, 4.0, None], + "MSE": [2.0, 3.0, 4.0, 3.0, 4.0, 5.0, 5.0, 6.0, 7.0], + "fold_number": [0, 1, 2, 0, 1, 2, 0, 1, 2], + } + ) + return df + + +@pytest.fixture +def metrics_df_no_folds_with_full_missing(metrics_df_with_folds_with_full_missing) -> pd.DataFrame: + df = metrics_df_with_folds_with_full_missing + df = ( + df.groupby("segment") + .apply(lambda x: x.mean(skipna=False, numeric_only=False)) + .reset_index() + .drop("fold_number", axis=1) + ) + return df + + +@pytest.fixture +def aggregated_metrics_df_with_full_missing() -> Dict[str, Any]: + result = { + "MAE_mean": None, + "MAE_median": None, + "MAE_std": None, + "MAE_size": 0.0, + "MAE_percentile_5": None, + "MAE_percentile_25": None, + "MAE_percentile_75": None, + "MAE_percentile_95": None, + "MSE_mean": 4.333333333333333, + "MSE_median": 4.0, + "MSE_std": 1.247219128924647, + "MSE_size": 3.0, + "MSE_percentile_5": 3.1, + "MSE_percentile_25": 3.5, + "MSE_percentile_75": 5.0, + "MSE_percentile_95": 5.8, + } + return result + + +@pytest.mark.parametrize( + "df_name, answer_name", + [ + ("metrics_df_with_folds", "aggregated_metrics_df"), + ("metrics_df_no_folds", "aggregated_metrics_df"), + ("metrics_df_with_folds_with_missing", "aggregated_metrics_df_with_missing"), + ("metrics_df_no_folds_with_missing", "aggregated_metrics_df_with_missing"), + 
("metrics_df_with_folds_with_full_missing", "aggregated_metrics_df_with_full_missing"), + ("metrics_df_no_folds_with_full_missing", "aggregated_metrics_df_with_full_missing"), + ], +) +def test_aggregate_metrics_df(df_name, answer_name, request): + metrics_df = request.getfixturevalue(df_name) + answer = request.getfixturevalue(answer_name) + result = aggregate_metrics_df(metrics_df) + assert result == answer From 3a4d08fb474e0779da534c7a03219bb1f1462de1 Mon Sep 17 00:00:00 2001 From: Dmitry Bunin Date: Wed, 11 Dec 2024 16:07:47 +0300 Subject: [PATCH 2/9] fix: set numeric_only=True --- etna/metrics/utils.py | 2 +- etna/pipeline/base.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/etna/metrics/utils.py b/etna/metrics/utils.py index 0d424b0cb..ca69152dd 100644 --- a/etna/metrics/utils.py +++ b/etna/metrics/utils.py @@ -140,7 +140,7 @@ def aggregate_metrics_df(metrics_df: pd.DataFrame) -> Dict[str, Optional[float]] if "fold_number" in metrics_df.columns: metrics_dict = ( metrics_df.groupby("segment") - .apply(lambda x: x.mean(skipna=False, numeric_only=False)) + .apply(lambda x: x.mean(skipna=False, numeric_only=True)) .reset_index() .drop(["segment", "fold_number"], axis=1) .apply(list(METRICS_AGGREGATION_MAP.values())) diff --git a/etna/pipeline/base.py b/etna/pipeline/base.py index 3166a77ac..46b5a7d70 100644 --- a/etna/pipeline/base.py +++ b/etna/pipeline/base.py @@ -858,7 +858,7 @@ def _get_backtest_metrics(self, aggregate_metrics: bool = False) -> pd.DataFrame if aggregate_metrics: metrics_df = ( metrics_df.groupby("segment") - .apply(lambda x: x.mean(skipna=False, numeric_only=False)) + .apply(lambda x: x.mean(skipna=False, numeric_only=True)) .reset_index() .drop(self._fold_column, axis=1) ) From 2ba6c87761eb194a7fbedc5c32ba77fb6a0184ce Mon Sep 17 00:00:00 2001 From: Dmitry Bunin Date: Wed, 11 Dec 2024 17:30:45 +0300 Subject: [PATCH 3/9] fix: fix file logger tests --- tests/test_loggers/test_file_logger.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/tests/test_loggers/test_file_logger.py b/tests/test_loggers/test_file_logger.py index 8f3385435..1e120534f 100644 --- a/tests/test_loggers/test_file_logger.py +++ b/tests/test_loggers/test_file_logger.py @@ -154,6 +154,7 @@ def test_base_file_logger_log_backtest_run(example_tsds: TSDataset): "median", "mean", "std", + "size", "percentile_5", "percentile_25", "percentile_75", @@ -213,7 +214,16 @@ def test_base_file_logger_log_backtest_metrics(example_tsds: TSDataset, aggregat with open(crossval_results_folder.joinpath("metrics_summary.json"), "r") as inf: metrics_summary = json.load(inf) - statistic_keys = ["median", "mean", "std", "percentile_5", "percentile_25", "percentile_75", "percentile_95"] + statistic_keys = [ + "median", + "mean", + "std", + "size", + "percentile_5", + "percentile_25", + "percentile_75", + "percentile_95", + ] assert len(metrics_summary.keys()) == len(metrics) * len(statistic_keys) tslogger.remove(idx) From 594d15e306b0764714519c3b526133c8af6a2e42 Mon Sep 17 00:00:00 2001 From: Dmitry Bunin Date: Wed, 11 Dec 2024 18:45:00 +0300 Subject: [PATCH 4/9] fix: add ignoring overflow warning in test --- etna/metrics/utils.py | 2 +- tests/test_auto/test_tune.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/etna/metrics/utils.py b/etna/metrics/utils.py index ca69152dd..8cc17a63c 100644 --- a/etna/metrics/utils.py +++ b/etna/metrics/utils.py @@ -77,7 +77,7 @@ def func(x): with warnings.catch_warnings(): # this helps to prevent warning in case of 
             warnings.filterwarnings(
-                message="Degrees of freedom <=",
+                message="Degrees of freedom <= 0",
                 action="ignore",
             )
             return np.nanstd(a=x.values)
diff --git a/tests/test_auto/test_tune.py b/tests/test_auto/test_tune.py
index 175d05c13..52dc48db9 100644
--- a/tests/test_auto/test_tune.py
+++ b/tests/test_auto/test_tune.py
@@ -124,6 +124,7 @@ def test_init_optuna(
     )
 
 
+@pytest.mark.filterwarnings("ignore: overflow encountered in multiply")
 @pytest.mark.parametrize(
     "params, model",
     [

From 594d15e306b0764714519c3b526133c8af6a2e42 Mon Sep 17 00:00:00 2001
From: Dmitry Bunin
Date: Fri, 13 Dec 2024 14:44:50 +0300
Subject: [PATCH 5/9] fix: rework handling missing values by folds, rework tests

---
 etna/metrics/utils.py                     |  12 +-
 etna/pipeline/base.py                     |   5 +-
 tests/test_auto/conftest.py               |  19 +++
 tests/test_auto/test_auto.py              |  41 ++++---
 tests/test_auto/test_tune.py              |  39 ++++--
 tests/test_metrics/test_metrics_utils.py  | 145 ++++++-----------------
 6 files changed, 115 insertions(+), 146 deletions(-)

diff --git a/etna/metrics/utils.py b/etna/metrics/utils.py
index 8cc17a63c..1830a8ba9 100644
--- a/etna/metrics/utils.py
+++ b/etna/metrics/utils.py
@@ -86,13 +86,13 @@ def func(x):
     return func
 
 
-def size_agg():
-    """Size for pandas agg."""
+def notna_size_agg():
+    """Size of not-na elements for pandas agg."""
 
     def func(x):
         return len(x) - pd.isna(x.values).sum()
 
-    func.__name__ = "size"
+    func.__name__ = "notna_size"
     return func
 
 
@@ -113,14 +113,14 @@ def func(x):
 
 
 MetricAggregationStatistics = Literal[
-    "median", "mean", "std", "size", "percentile_5", "percentile_25", "percentile_75", "percentile_95"
+    "median", "mean", "std", "notna_size", "percentile_5", "percentile_25", "percentile_75", "percentile_95"
 ]
 
 METRICS_AGGREGATION_MAP: Dict[MetricAggregationStatistics, Union[str, Callable]] = {
     "median": median_agg(),
     "mean": mean_agg(),
     "std": std_agg(),
-    "size": size_agg(),
+    "notna_size": notna_size_agg(),
     "percentile_5": percentile(5),
     "percentile_25": percentile(25),
     "percentile_75": percentile(75),
@@ -140,7 +140,7 @@ def aggregate_metrics_df(metrics_df: pd.DataFrame) -> Dict[str, Optional[float]]
     if "fold_number" in metrics_df.columns:
         metrics_dict = (
             metrics_df.groupby("segment")
-            .apply(lambda x: x.mean(skipna=False, numeric_only=True))
+            .mean(numeric_only=False)
             .reset_index()
             .drop(["segment", "fold_number"], axis=1)
             .apply(list(METRICS_AGGREGATION_MAP.values()))
diff --git a/etna/pipeline/base.py b/etna/pipeline/base.py
index 46b5a7d70..c5b5dba3f 100644
--- a/etna/pipeline/base.py
+++ b/etna/pipeline/base.py
@@ -857,10 +857,7 @@ def _get_backtest_metrics(self, aggregate_metrics: bool = False) -> pd.DataFrame
 
         if aggregate_metrics:
             metrics_df = (
-                metrics_df.groupby("segment")
-                .apply(lambda x: x.mean(skipna=False, numeric_only=True))
-                .reset_index()
-                .drop(self._fold_column, axis=1)
+                metrics_df.groupby("segment").mean(numeric_only=False).reset_index().drop(self._fold_column, axis=1)
             )
 
         return metrics_df
diff --git a/tests/test_auto/conftest.py b/tests/test_auto/conftest.py
index fe6c03433..8688468e1 100644
--- a/tests/test_auto/conftest.py
+++ b/tests/test_auto/conftest.py
@@ -80,6 +80,25 @@ def ts_with_fold_missing_middle(random_seed) -> TSDataset:
     return tsds
 
 
+@pytest.fixture
+def ts_with_all_folds_missing_one_segment(random_seed) -> TSDataset:
+    periods = 100
+    df1 = pd.DataFrame({"timestamp": pd.date_range("2020-01-01", periods=periods)})
+    df1["segment"] = "segment_1"
+    df1["target"] = np.random.uniform(10, 20, size=periods)
+    df1.loc[df1.index[-21:], "target"] = np.NaN
+
+    df2 = pd.DataFrame({"timestamp": pd.date_range("2020-01-01", periods=periods)})
pd.DataFrame({"timestamp": pd.date_range("2020-01-01", periods=periods)}) + df2["segment"] = "segment_2" + df2["target"] = np.random.uniform(-15, 5, size=periods) + + df = pd.concat([df1, df2]).reset_index(drop=True) + df = TSDataset.to_dataset(df) + tsds = TSDataset(df, freq="D") + + return tsds + + @pytest.fixture def ts_with_few_missing(random_seed) -> TSDataset: periods = 100 diff --git a/tests/test_auto/test_auto.py b/tests/test_auto/test_auto.py index 12251e8fd..371741f57 100644 --- a/tests/test_auto/test_auto.py +++ b/tests/test_auto/test_auto.py @@ -48,23 +48,35 @@ def pool_list(): ] +@pytest.mark.parametrize( + "ts_name", + [ + "example_tsds", + "ts_with_few_missing", + "ts_with_fold_missing_tail", + "ts_with_fold_missing_middle", + ], +) def test_objective( - example_tsds, - target_metric=MAE(), + ts_name, + request, + target_metric=MAE(missing_mode="ignore"), metric_aggregation: Literal["mean"] = "mean", - metrics=[MAE()], + metrics=[MAE(missing_mode="ignore")], backtest_params={}, - initializer=MagicMock(spec=_Initializer), - callback=MagicMock(spec=_Callback), relative_params={ "_target_": "etna.pipeline.Pipeline", "horizon": 7, "model": {"_target_": "etna.models.NaiveModel", "lag": 1}, + "transforms": [{"_target_": "etna.transforms.TimeSeriesImputerTransform"}], }, ): + ts = request.getfixturevalue(ts_name) + initializer = MagicMock(spec=_Initializer) + callback = MagicMock(spec=_Callback) trial = MagicMock(relative_params=relative_params) _objective = Auto.objective( - ts=example_tsds, + ts=ts, target_metric=target_metric, metric_aggregation=metric_aggregation, metrics=metrics, @@ -79,13 +91,13 @@ def test_objective( callback.assert_called_once() -@pytest.mark.parametrize("ts_name", ["ts_with_fold_missing_tail", "ts_with_fold_missing_middle"]) +@pytest.mark.parametrize("ts_name", ["ts_with_all_folds_missing_one_segment"]) def test_objective_fail_none( ts_name, request, - target_metric=MSE(missing_mode="ignore"), + target_metric=MAE(missing_mode="ignore"), metric_aggregation: Literal["mean"] = "mean", - metrics=[MSE(missing_mode="ignore")], + metrics=[MAE(missing_mode="ignore")], backtest_params={}, initializer=MagicMock(spec=_Initializer), callback=MagicMock(spec=_Callback), @@ -108,7 +120,8 @@ def test_objective_fail_none( callback=callback, ) - with pytest.raises(ValueError, match="Metric value is None"): + # TODO: discuss the error here + with pytest.raises(ValueError, match="Last train timestamp should be not later"): _ = _objective(trial) @@ -188,8 +201,8 @@ def test_fit_without_tuning_list(ts_name, optuna_storage, pool, request): ts = request.getfixturevalue(ts_name) pool = request.getfixturevalue(pool) auto = Auto( - MSE(missing_mode="ignore"), - metrics=[MSE(missing_mode="ignore")], + MAE(missing_mode="ignore"), + metrics=[MAE(missing_mode="ignore")], pool=pool, metric_aggregation="median", horizon=7, @@ -228,8 +241,8 @@ def test_fit_with_tuning( ): ts = request.getfixturevalue(ts_name) auto = Auto( - MSE(missing_mode="ignore"), - metrics=[MSE(missing_mode="ignore")], + MAE(missing_mode="ignore"), + metrics=[MAE(missing_mode="ignore")], pool=pool, metric_aggregation="median", horizon=7, diff --git a/tests/test_auto/test_tune.py b/tests/test_auto/test_tune.py index 52dc48db9..ac857b3d3 100644 --- a/tests/test_auto/test_tune.py +++ b/tests/test_auto/test_tune.py @@ -25,20 +25,31 @@ from etna.transforms import TimeSeriesImputerTransform +@pytest.mark.parametrize( + "ts_name", + [ + "example_tsds", + "ts_with_few_missing", + "ts_with_fold_missing_tail", + 
"ts_with_fold_missing_middle", + ], +) def test_objective( - example_tsds, - target_metric=MAE(), + ts_name, + request, + target_metric=MAE(missing_mode="ignore"), metric_aggregation: Literal["mean"] = "mean", - metrics=[MAE()], + metrics=[MAE(missing_mode="ignore")], backtest_params={}, - initializer=MagicMock(spec=_Initializer), - callback=MagicMock(spec=_Callback), - pipeline=Pipeline(NaiveModel()), + pipeline=Pipeline(model=NaiveModel(), transforms=[TimeSeriesImputerTransform()], horizon=7), params_to_tune={}, ): + ts = request.getfixturevalue(ts_name) + initializer = MagicMock(spec=_Initializer) + callback = MagicMock(spec=_Callback) trial = MagicMock() _objective = Tune.objective( - ts=example_tsds, + ts=ts, pipeline=pipeline, params_to_tune=params_to_tune, target_metric=target_metric, @@ -55,13 +66,13 @@ def test_objective( callback.assert_called_once() -@pytest.mark.parametrize("ts_name", ["ts_with_fold_missing_tail", "ts_with_fold_missing_middle"]) +@pytest.mark.parametrize("ts_name", ["ts_with_all_folds_missing_one_segment"]) def test_objective_fail_none( ts_name, request, - target_metric=MSE(missing_mode="ignore"), + target_metric=MAE(missing_mode="ignore"), metric_aggregation: Literal["mean"] = "mean", - metrics=[MSE(missing_mode="ignore")], + metrics=[MAE(missing_mode="ignore")], backtest_params={}, initializer=MagicMock(spec=_Initializer), callback=MagicMock(spec=_Callback), @@ -81,7 +92,9 @@ def test_objective_fail_none( initializer=initializer, callback=callback, ) - with pytest.raises(ValueError, match="Metric value is None"): + + # TODO: discuss the error here + with pytest.raises(ValueError, match="Last train timestamp should be not later"): _ = _objective(trial) @@ -215,8 +228,8 @@ def test_tune_run(ts_name, optuna_storage, pipeline, request): ts = request.getfixturevalue(ts_name) tune = Tune( pipeline=pipeline, - target_metric=MSE(missing_mode="ignore"), - metrics=[MSE(missing_mode="ignore")], + target_metric=MAE(missing_mode="ignore"), + metrics=[MAE(missing_mode="ignore")], metric_aggregation="median", horizon=7, storage=optuna_storage, diff --git a/tests/test_metrics/test_metrics_utils.py b/tests/test_metrics/test_metrics_utils.py index 9bf259599..38b74baef 100644 --- a/tests/test_metrics/test_metrics_utils.py +++ b/tests/test_metrics/test_metrics_utils.py @@ -34,7 +34,10 @@ def metrics_df_with_folds() -> pd.DataFrame: { "segment": ["segment_0"] * 3 + ["segment_1"] * 3 + ["segment_2"] * 3, "MAE": [1.0, 2.0, 3.0, 2.0, 3.0, 4.0, 3.0, 4.0, 5.0], - "MSE": [2.0, 3.0, 4.0, 3.0, 4.0, 5.0, 5.0, 6.0, 7.0], + "MSE": [None, 3.0, 4.0, 3.0, 4.0, 5.0, 5.0, 6.0, 7.0], + "MAPE": [None, None, None, 20.0, 30.0, 40.0, 30.0, 40.0, 50.0], + "SMAPE": [None, None, None, None, None, None, 50.0, 60.0, 70.0], + "RMSE": [None, None, None, None, None, None, None, None, None], "fold_number": [0, 1, 2, 0, 1, 2, 0, 1, 2], } ) @@ -44,125 +47,53 @@ def metrics_df_with_folds() -> pd.DataFrame: @pytest.fixture def metrics_df_no_folds(metrics_df_with_folds) -> pd.DataFrame: df = metrics_df_with_folds - df = df.groupby("segment").mean().reset_index().drop("fold_number", axis=1) + df = df.groupby("segment").mean(numeric_only=False).reset_index().drop("fold_number", axis=1) return df @pytest.fixture def aggregated_metrics_df() -> Dict[str, Any]: result = { - "MAE_median": 3.0, "MAE_mean": 3.0, + "MAE_median": 3.0, "MAE_std": 0.816496580927726, - "MAE_size": 3.0, + "MAE_notna_size": 3.0, "MAE_percentile_5": 2.1, "MAE_percentile_25": 2.5, "MAE_percentile_75": 3.5, "MAE_percentile_95": 3.9, + "MSE_mean": 
         "MSE_median": 4.0,
-        "MSE_mean": 4.333333333333333,
-        "MSE_std": 1.247219128924647,
-        "MSE_size": 3.0,
-        "MSE_percentile_5": 3.1,
-        "MSE_percentile_25": 3.5,
+        "MSE_std": 1.0801234497346435,
+        "MSE_notna_size": 3.0,
+        "MSE_percentile_5": 3.55,
+        "MSE_percentile_25": 3.75,
         "MSE_percentile_75": 5.0,
         "MSE_percentile_95": 5.8,
+        "MAPE_mean": 35.0,
+        "MAPE_median": 35.0,
+        "MAPE_std": 5.0,
+        "MAPE_notna_size": 2.0,
+        "MAPE_percentile_5": 30.5,
+        "MAPE_percentile_25": 32.5,
+        "MAPE_percentile_75": 37.5,
+        "MAPE_percentile_95": 39.5,
+        "SMAPE_mean": 60.0,
+        "SMAPE_median": 60.0,
+        "SMAPE_std": 0.0,
+        "SMAPE_notna_size": 1.0,
+        "SMAPE_percentile_5": 60.0,
+        "SMAPE_percentile_25": 60.0,
+        "SMAPE_percentile_75": 60.0,
+        "SMAPE_percentile_95": 60.0,
+        "RMSE_mean": None,
+        "RMSE_median": None,
+        "RMSE_std": None,
+        "RMSE_notna_size": 0.0,
+        "RMSE_percentile_5": None,
+        "RMSE_percentile_25": None,
+        "RMSE_percentile_75": None,
+        "RMSE_percentile_95": None,
     }
     return result
-
-
-@pytest.fixture
-def metrics_df_with_folds_with_missing() -> pd.DataFrame:
-    df = pd.DataFrame(
-        {
-            "segment": ["segment_0"] * 3 + ["segment_1"] * 3 + ["segment_2"] * 3,
-            "MAE": [None, 2.0, 3.0, 2.0, 3.0, 4.0, 3.0, 4.0, 5.0],
-            "MSE": [2.0, 3.0, 4.0, 3.0, 4.0, 5.0, 5.0, 6.0, 7.0],
-            "fold_number": [0, 1, 2, 0, 1, 2, 0, 1, 2],
-        }
-    )
-    return df
-
-
-@pytest.fixture
-def metrics_df_no_folds_with_missing(metrics_df_with_folds_with_missing) -> pd.DataFrame:
-    df = metrics_df_with_folds_with_missing
-    df = (
-        df.groupby("segment")
-        .apply(lambda x: x.mean(skipna=False, numeric_only=False))
-        .reset_index()
-        .drop("fold_number", axis=1)
-    )
-    return df
-
-
-@pytest.fixture
-def aggregated_metrics_df_with_missing() -> Dict[str, Any]:
-    result = {
-        "MAE_mean": 3.5,
-        "MAE_median": 3.5,
-        "MAE_std": 0.5,
-        "MAE_size": 2.0,
-        "MAE_percentile_5": 3.05,
-        "MAE_percentile_25": 3.25,
-        "MAE_percentile_75": 3.75,
-        "MAE_percentile_95": 3.95,
-        "MSE_mean": 4.333333333333333,
-        "MSE_median": 4.0,
-        "MSE_std": 1.247219128924647,
-        "MSE_size": 3.0,
-        "MSE_percentile_5": 3.1,
-        "MSE_percentile_25": 3.5,
-        "MSE_percentile_75": 5.0,
-        "MSE_percentile_95": 5.8,
-    }
-    return result
-
-
-@pytest.fixture
-def metrics_df_with_folds_with_full_missing() -> pd.DataFrame:
-    df = pd.DataFrame(
-        {
-            "segment": ["segment_0"] * 3 + ["segment_1"] * 3 + ["segment_2"] * 3,
-            "MAE": [None, 2.0, 3.0, 2.0, None, 4.0, 3.0, 4.0, None],
-            "MSE": [2.0, 3.0, 4.0, 3.0, 4.0, 5.0, 5.0, 6.0, 7.0],
-            "fold_number": [0, 1, 2, 0, 1, 2, 0, 1, 2],
-        }
-    )
-    return df
-
-
-@pytest.fixture
-def metrics_df_no_folds_with_full_missing(metrics_df_with_folds_with_full_missing) -> pd.DataFrame:
-    df = metrics_df_with_folds_with_full_missing
-    df = (
-        df.groupby("segment")
-        .apply(lambda x: x.mean(skipna=False, numeric_only=False))
-        .reset_index()
-        .drop("fold_number", axis=1)
-    )
-    return df
-
-
-@pytest.fixture
-def aggregated_metrics_df_with_full_missing() -> Dict[str, Any]:
-    result = {
-        "MAE_mean": None,
-        "MAE_median": None,
-        "MAE_std": None,
-        "MAE_size": 0.0,
-        "MAE_percentile_5": None,
-        "MAE_percentile_25": None,
-        "MAE_percentile_75": None,
-        "MAE_percentile_95": None,
-        "MSE_mean": 4.333333333333333,
-        "MSE_median": 4.0,
-        "MSE_std": 1.247219128924647,
-        "MSE_size": 3.0,
-        "MSE_percentile_5": 3.1,
-        "MSE_percentile_25": 3.5,
-        "MSE_percentile_75": 5.0,
-        "MSE_percentile_95": 5.8,
-    }
-    return result
 
 
 @pytest.mark.parametrize(
@@ -172,10 +103,6 @@ def aggregated_metrics_df_with_full_missing() -> Dict[str, Any]:
     "df_name, answer_name",
     [
         ("metrics_df_with_folds", "aggregated_metrics_df"),
         ("metrics_df_no_folds", "aggregated_metrics_df"),
"aggregated_metrics_df"), - ("metrics_df_with_folds_with_missing", "aggregated_metrics_df_with_missing"), - ("metrics_df_no_folds_with_missing", "aggregated_metrics_df_with_missing"), - ("metrics_df_with_folds_with_full_missing", "aggregated_metrics_df_with_full_missing"), - ("metrics_df_no_folds_with_full_missing", "aggregated_metrics_df_with_full_missing"), ], ) def test_aggregate_metrics_df(df_name, answer_name, request): From 1cf6d7b9f6a03c5887a6d2b0d419a3ee698ec2c0 Mon Sep 17 00:00:00 2001 From: Dmitry Bunin Date: Fri, 13 Dec 2024 14:47:32 +0300 Subject: [PATCH 6/9] style: fix styling --- tests/test_auto/test_auto.py | 1 - tests/test_auto/test_tune.py | 1 - 2 files changed, 2 deletions(-) diff --git a/tests/test_auto/test_auto.py b/tests/test_auto/test_auto.py index 371741f57..8d180ebb4 100644 --- a/tests/test_auto/test_auto.py +++ b/tests/test_auto/test_auto.py @@ -11,7 +11,6 @@ from etna.auto.auto import _Callback from etna.auto.auto import _Initializer from etna.metrics import MAE -from etna.metrics import MSE from etna.models import LinearPerSegmentModel from etna.models import MovingAverageModel from etna.models import NaiveModel diff --git a/tests/test_auto/test_tune.py b/tests/test_auto/test_tune.py index ac857b3d3..efbf443b7 100644 --- a/tests/test_auto/test_tune.py +++ b/tests/test_auto/test_tune.py @@ -13,7 +13,6 @@ from etna.distributions import FloatDistribution from etna.distributions import IntDistribution from etna.metrics import MAE -from etna.metrics import MSE from etna.models import NaiveModel from etna.models import SimpleExpSmoothingModel from etna.pipeline import AutoRegressivePipeline From 04aa3013ab73cab61ab38f1d36ebab23f5fa2f39 Mon Sep 17 00:00:00 2001 From: Dmitry Bunin Date: Mon, 16 Dec 2024 14:17:06 +0300 Subject: [PATCH 7/9] fix: add signature --- etna/metrics/utils.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/etna/metrics/utils.py b/etna/metrics/utils.py index 1830a8ba9..e563010a7 100644 --- a/etna/metrics/utils.py +++ b/etna/metrics/utils.py @@ -41,7 +41,7 @@ def compute_metrics( def mean_agg(): """Mean for pandas agg.""" - def func(x): + def func(x: pd.Series): with warnings.catch_warnings(): # this helps to prevent warning in case of all nans warnings.filterwarnings( @@ -57,7 +57,7 @@ def func(x): def median_agg(): """Median for pandas agg.""" - def func(x): + def func(x: pd.Series): with warnings.catch_warnings(): # this helps to prevent warning in case of all nans warnings.filterwarnings( @@ -73,7 +73,7 @@ def func(x): def std_agg(): """Std for pandas agg.""" - def func(x): + def func(x: pd.Series): with warnings.catch_warnings(): # this helps to prevent warning in case of all nans warnings.filterwarnings( @@ -89,7 +89,7 @@ def func(x): def notna_size_agg(): """Size of not-na elements for pandas agg.""" - def func(x): + def func(x: pd.Series): return len(x) - pd.isna(x.values).sum() func.__name__ = "notna_size" @@ -99,7 +99,7 @@ def func(x): def percentile(n: int): """Percentile for pandas agg.""" - def func(x): + def func(x: pd.Series): with warnings.catch_warnings(): # this helps to prevent warning in case of all nans warnings.filterwarnings( From 3c92b14fc25bf020ee439f52009185e4403394ba Mon Sep 17 00:00:00 2001 From: Dmitry Bunin Date: Mon, 16 Dec 2024 15:24:26 +0300 Subject: [PATCH 8/9] fix: rework tests to raise valid error --- tests/test_auto/conftest.py | 22 +++++++++++++++++++++- tests/test_auto/test_auto.py | 10 +++++++--- tests/test_auto/test_tune.py | 10 +++++++--- 3 files changed, 35 insertions(+), 7 

diff --git a/tests/test_auto/conftest.py b/tests/test_auto/conftest.py
index 8688468e1..66584e926 100644
--- a/tests/test_auto/conftest.py
+++ b/tests/test_auto/conftest.py
@@ -86,7 +86,7 @@ def ts_with_all_folds_missing_one_segment(random_seed) -> TSDataset:
     df1 = pd.DataFrame({"timestamp": pd.date_range("2020-01-01", periods=periods)})
     df1["segment"] = "segment_1"
     df1["target"] = np.random.uniform(10, 20, size=periods)
-    df1.loc[df1.index[-21:], "target"] = np.NaN
+    df1.loc[df1.index[-40:], "target"] = np.NaN
 
     df2 = pd.DataFrame({"timestamp": pd.date_range("2020-01-01", periods=periods)})
     df2["segment"] = "segment_2"
@@ -99,6 +99,26 @@ def ts_with_all_folds_missing_one_segment(random_seed) -> TSDataset:
     return tsds
 
 
+@pytest.fixture
+def ts_with_all_folds_missing_all_segments(random_seed) -> TSDataset:
+    periods = 100
+    df1 = pd.DataFrame({"timestamp": pd.date_range("2020-01-01", periods=periods)})
+    df1["segment"] = "segment_1"
+    df1["target"] = np.random.uniform(10, 20, size=periods)
+    df1.loc[df1.index[-40:], "target"] = np.NaN
+
+    df2 = pd.DataFrame({"timestamp": pd.date_range("2020-01-01", periods=periods)})
+    df2["segment"] = "segment_2"
+    df2["target"] = np.random.uniform(-15, 5, size=periods)
+    df2.loc[df2.index[-40:], "target"] = np.NaN
+
+    df = pd.concat([df1, df2]).reset_index(drop=True)
+    df = TSDataset.to_dataset(df)
+    tsds = TSDataset(df, freq="D")
+
+    return tsds
+
+
 @pytest.fixture
 def ts_with_few_missing(random_seed) -> TSDataset:
     periods = 100
diff --git a/tests/test_auto/test_auto.py b/tests/test_auto/test_auto.py
index 8d180ebb4..76f8e44e4 100644
--- a/tests/test_auto/test_auto.py
+++ b/tests/test_auto/test_auto.py
@@ -47,6 +47,7 @@ def pool_list():
     ]
 
 
+@patch("etna.pipeline.FoldMask.validate_on_dataset", return_value=MagicMock())  # TODO: remove after fix
 @pytest.mark.parametrize(
     "ts_name",
     [
@@ -54,9 +55,11 @@ def pool_list():
         "ts_with_few_missing",
         "ts_with_fold_missing_tail",
         "ts_with_fold_missing_middle",
+        "ts_with_all_folds_missing_one_segment",
     ],
 )
 def test_objective(
+    validate_on_dataset_mock,
     ts_name,
     request,
     target_metric=MAE(missing_mode="ignore"),
@@ -90,8 +93,10 @@ def test_objective(
     callback.assert_called_once()
 
 
-@pytest.mark.parametrize("ts_name", ["ts_with_all_folds_missing_one_segment"])
+@patch("etna.pipeline.FoldMask.validate_on_dataset", return_value=MagicMock())  # TODO: remove after fix
+@pytest.mark.parametrize("ts_name", ["ts_with_all_folds_missing_all_segments"])
 def test_objective_fail_none(
+    validate_on_dataset_mock,
     ts_name,
     request,
     target_metric=MAE(missing_mode="ignore"),
@@ -119,8 +124,7 @@ def test_objective_fail_none(
         callback=callback,
     )
 
-    # TODO: discuss the error here
-    with pytest.raises(ValueError, match="Last train timestamp should be not later"):
+    with pytest.raises(ValueError, match="Metric value is None"):
         _ = _objective(trial)
 
 
diff --git a/tests/test_auto/test_tune.py b/tests/test_auto/test_tune.py
index efbf443b7..41562994d 100644
--- a/tests/test_auto/test_tune.py
+++ b/tests/test_auto/test_tune.py
@@ -24,6 +24,7 @@
 from etna.transforms import TimeSeriesImputerTransform
 
 
+@patch("etna.pipeline.FoldMask.validate_on_dataset", return_value=MagicMock())  # TODO: remove after fix
 @pytest.mark.parametrize(
     "ts_name",
     [
@@ -31,9 +32,11 @@
         "ts_with_few_missing",
         "ts_with_fold_missing_tail",
         "ts_with_fold_missing_middle",
+        "ts_with_all_folds_missing_one_segment",
     ],
 )
 def test_objective(
+    validate_on_dataset_mock,
     ts_name,
     request,
     target_metric=MAE(missing_mode="ignore"),
@@ -65,8 +68,10 @@
callback.assert_called_once() -@pytest.mark.parametrize("ts_name", ["ts_with_all_folds_missing_one_segment"]) +@patch("etna.pipeline.FoldMask.validate_on_dataset", return_value=MagicMock()) # TODO: remove after fix +@pytest.mark.parametrize("ts_name", ["ts_with_all_folds_missing_all_segments"]) def test_objective_fail_none( + validate_on_dataset_mock, ts_name, request, target_metric=MAE(missing_mode="ignore"), @@ -92,8 +97,7 @@ def test_objective_fail_none( callback=callback, ) - # TODO: discuss the error here - with pytest.raises(ValueError, match="Last train timestamp should be not later"): + with pytest.raises(ValueError, match="Metric value is None"): _ = _objective(trial) From 69b782a33ee3cab8e6c5861e7b902ee22b8f87af Mon Sep 17 00:00:00 2001 From: Dmitry Bunin Date: Tue, 17 Dec 2024 11:32:00 +0300 Subject: [PATCH 9/9] chore: update changelog --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1d505bdeb..6e3d5c379 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -33,7 +33,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Add parameter `missing_mode` into `MAE` metric ([#523](https://github.com/etna-team/etna/pull/523)) - Add parameter `missing_mode` into `MAPE` and `SMAPE` metrics ([#524](https://github.com/etna-team/etna/pull/524)) - -- +- Update `aggregate_metrics_df` to work with `None` values ([#522](https://github.com/etna-team/etna/pull/522)) - - -
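
Note: a minimal usage sketch of the behaviour this series gives `aggregate_metrics_df` (the input values below are illustrative, not taken from the patches). Per-fold values are averaged per segment with pandas' default skipna, so a segment with no defined values contributes NaN; the NaN-aware statistics then skip it, `notna_size` counts the segments that do have a value, and statistics over all-missing columns come back as `None`.

    import pandas as pd

    from etna.metrics.utils import aggregate_metrics_df

    # segment_0 has no defined MAE in any fold; segment_1 has values in both folds
    metrics_df = pd.DataFrame(
        {
            "segment": ["segment_0", "segment_0", "segment_1", "segment_1"],
            "MAE": [None, None, 2.0, 4.0],
            "fold_number": [0, 1, 0, 1],
        }
    )

    result = aggregate_metrics_df(metrics_df)
    # Per-segment means are [NaN, 3.0]; NaN-aware statistics ignore segment_0.
    print(result["MAE_mean"])        # 3.0 -- nanmean over the one non-missing segment
    print(result["MAE_notna_size"])  # 1   -- number of segments with a defined value
    print(result["MAE_std"])         # 0.0 -- nanstd over a single value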