From e8242a93a4df38207f63728da461c750b53c6b3f Mon Sep 17 00:00:00 2001
From: Dmitry Chigarev <dmitry.chigarev@intel.com>
Date: Fri, 17 Nov 2023 16:57:50 +0100
Subject: [PATCH 1/3] PERF-#6749: Preserve partial dtype for the result of
 'reset_index()'

Signed-off-by: Dmitry Chigarev <dmitry.chigarev@intel.com>
---
 .../dataframe/pandas/dataframe/dataframe.py   | 12 ++-
 .../storage_formats/pandas/query_compiler.py  | 21 +++--
 .../storage_formats/pandas/test_internals.py  | 91 ++++++++++++++++++-
 3 files changed, 111 insertions(+), 13 deletions(-)

diff --git a/modin/core/dataframe/pandas/dataframe/dataframe.py b/modin/core/dataframe/pandas/dataframe/dataframe.py
index 7031b269554..57945dc4a48 100644
--- a/modin/core/dataframe/pandas/dataframe/dataframe.py
+++ b/modin/core/dataframe/pandas/dataframe/dataframe.py
@@ -1323,11 +1323,13 @@ def from_labels(self) -> "PandasDataframe":
                 if "index" not in self.columns
                 else "level_{}".format(0)
             ]
-        new_dtypes = None
-        if self.has_materialized_dtypes:
-            names = tuple(level_names) if len(level_names) > 1 else level_names[0]
-            new_dtypes = self.index.to_frame(name=names).dtypes
-            new_dtypes = pandas.concat([new_dtypes, self.dtypes])
+        names = tuple(level_names) if len(level_names) > 1 else level_names[0]
+        new_dtypes = self.index.to_frame(name=names).dtypes
+        try:
+            new_dtypes = ModinDtypes.concat([new_dtypes, self._dtypes])
+        except NotImplementedError:
+            # can raise on duplicated labels
+            new_dtypes = None
 
         # We will also use the `new_column_names` in the calculation of the internal metadata, so this is a
         # lightweight way of ensuring the metadata matches.
diff --git a/modin/core/storage_formats/pandas/query_compiler.py b/modin/core/storage_formats/pandas/query_compiler.py
index ba907a8b036..0565a7d19f4 100644
--- a/modin/core/storage_formats/pandas/query_compiler.py
+++ b/modin/core/storage_formats/pandas/query_compiler.py
@@ -58,6 +58,7 @@
     SeriesGroupByDefault,
 )
 from modin.core.dataframe.base.dataframe.utils import join_columns
+from modin.core.dataframe.pandas.metadata import ModinDtypes
 from modin.core.storage_formats.base.query_compiler import BaseQueryCompiler
 from modin.error_message import ErrorMessage
 from modin.utils import (
@@ -719,10 +720,20 @@ def _reset(df, *axis_lengths, partition_idx):  # pragma: no cover
                     df.index = pandas.RangeIndex(start, stop)
                 return df
 
-            if self._modin_frame.has_columns_cache and kwargs["drop"]:
-                new_columns = self._modin_frame.copy_columns_cache(copy_lengths=True)
+            new_columns = None
+            if kwargs["drop"]:
+                dtypes = self._modin_frame.copy_dtypes_cache()
+                if self._modin_frame.has_columns_cache:
+                    new_columns = self._modin_frame.copy_columns_cache(
+                        copy_lengths=True
+                    )
             else:
-                new_columns = None
+                # concat index dtypes (None, since they're unknown) with column dtypes
+                try:
+                    dtypes = ModinDtypes.concat([None, self._modin_frame._dtypes])
+                except NotImplementedError:
+                    # may raise on duplicated names in materialized 'self.dtypes'
+                    dtypes = None
 
             return self.__constructor__(
                 self._modin_frame.apply_full_axis(
@@ -730,9 +741,7 @@ def _reset(df, *axis_lengths, partition_idx):  # pragma: no cover
                     func=_reset,
                     enumerate_partitions=True,
                     new_columns=new_columns,
-                    dtypes=(
-                        self._modin_frame._dtypes if kwargs.get("drop", False) else None
-                    ),
+                    dtypes=dtypes,
                     sync_labels=False,
                     pass_axis_lengths_to_partitions=True,
                 )
diff --git a/modin/test/storage_formats/pandas/test_internals.py b/modin/test/storage_formats/pandas/test_internals.py
index b459f050263..c4832238c71 100644
--- a/modin/test/storage_formats/pandas/test_internals.py
+++ b/modin/test/storage_formats/pandas/test_internals.py
@@ -21,6 +21,7 @@
 
 import modin.pandas as pd
 from modin.config import Engine, ExperimentalGroupbyImpl, MinPartitionSize, NPartitions
+from modin.core.dataframe.pandas.dataframe.dataframe import PandasDataframe
 from modin.core.dataframe.pandas.dataframe.utils import ColumnInfo, ShuffleSortFunctions
 from modin.core.dataframe.pandas.metadata import (
     DtypesDescriptor,
@@ -1947,8 +1948,6 @@ class TestZeroComputationDtypes:
     """
 
     def test_get_dummies_case(self):
-        from modin.core.dataframe.pandas.dataframe.dataframe import PandasDataframe
-
         with mock.patch.object(PandasDataframe, "_compute_dtypes") as patch:
             df = pd.DataFrame(
                 {"items": [1, 2, 3, 4], "b": [3, 3, 4, 4], "c": [1, 0, 0, 1]}
@@ -1960,3 +1959,91 @@ def test_get_dummies_case(self):
             assert res._query_compiler._modin_frame.has_materialized_dtypes
 
         patch.assert_not_called()
+
+    @pytest.mark.parametrize("has_materialized_index", [True, False])
+    @pytest.mark.parametrize("drop", [True, False])
+    def test_preserve_dtypes_reset_index(self, drop, has_materialized_index):
+        with mock.patch.object(PandasDataframe, "_compute_dtypes") as patch:
+            # case 1: 'df' has complete dtype by default
+            df = pd.DataFrame({"a": [1, 2, 3]})
+            if has_materialized_index:
+                assert df._query_compiler._modin_frame.has_materialized_index
+            else:
+                df._query_compiler._modin_frame.set_index_cache(None)
+                assert not df._query_compiler._modin_frame.has_materialized_index
+            assert df._query_compiler._modin_frame.has_materialized_dtypes
+
+            res = df.reset_index(drop=drop)
+            if drop:
+                # we droped the index, so columns and dtypes shouldn't change
+                assert res._query_compiler._modin_frame.has_materialized_dtypes
+                assert res.dtypes.equals(df.dtypes)
+            else:
+                if has_materialized_index:
+                    # we should have inserted index dtype into the descriptor,
+                    # and since both of them are materialized, the result should be
+                    # materialized too
+                    assert res._query_compiler._modin_frame.has_materialized_dtypes
+                    assert res.dtypes.equals(
+                        pandas.Series(
+                            [np.dtype(int), np.dtype(int)], index=["index", "a"]
+                        )
+                    )
+                else:
+                    # we now know that there are cols with unknown name and dtype in our dataframe,
+                    # so the resulting dtypes should contain information only about original column
+                    expected_dtypes = DtypesDescriptor(
+                        {"a": np.dtype(int)},
+                        know_all_names=False,
+                    )
+                    assert res._query_compiler._modin_frame._dtypes._value.equals(
+                        expected_dtypes
+                    )
+
+            # case 2: 'df' has partial dtype by default
+            df = pd.DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]})
+            df._query_compiler._modin_frame.set_dtypes_cache(
+                ModinDtypes(
+                    DtypesDescriptor(
+                        {"a": np.dtype(int)}, cols_with_unknown_dtypes=["b"]
+                    )
+                )
+            )
+            if has_materialized_index:
+                assert df._query_compiler._modin_frame.has_materialized_index
+            else:
+                df._query_compiler._modin_frame.set_index_cache(None)
+                assert not df._query_compiler._modin_frame.has_materialized_index
+
+            res = df.reset_index(drop=drop)
+            if drop:
+                # we droped the index, so columns and dtypes shouldn't change
+                assert res._query_compiler._modin_frame._dtypes._value.equals(
+                    df._query_compiler._modin_frame._dtypes._value
+                )
+            else:
+                if has_materialized_index:
+                    # we should have inserted index dtype into the descriptor,
+                    # the resulted dtype should have information about 'index' and 'a' columns,
+                    # and miss dtype info for 'b' column
+                    expected_dtypes = DtypesDescriptor(
+                        {"index": np.dtype(int), "a": np.dtype(int)},
+                        cols_with_unknown_dtypes=["b"],
+                        columns_order={0: "index", 1: "a", 2: "b"},
+                    )
+                    assert res._query_compiler._modin_frame._dtypes._value.equals(
+                        expected_dtypes
+                    )
+                else:
+                    # we miss info about the 'index' column since it wasn't materialized at
+                    # the time of 'reset_index()' and we're still missing dtype info for 'b' column
+                    expected_dtypes = DtypesDescriptor(
+                        {"a": np.dtype(int)},
+                        cols_with_unknown_dtypes=["b"],
+                        know_all_names=False,
+                    )
+                    assert res._query_compiler._modin_frame._dtypes._value.equals(
+                        expected_dtypes
+                    )
+
+        patch.assert_not_called()

From ffb32cf38452d591da1ff349a6e28fbdb7b1f2bf Mon Sep 17 00:00:00 2001
From: Dmitry Chigarev <dmitry.chigarev@intel.com>
Date: Mon, 20 Nov 2023 14:28:34 +0100
Subject: [PATCH 2/3] fix groupby tests

Signed-off-by: Dmitry Chigarev <dmitry.chigarev@intel.com>
---
 modin/core/dataframe/pandas/metadata/dtypes.py | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/modin/core/dataframe/pandas/metadata/dtypes.py b/modin/core/dataframe/pandas/metadata/dtypes.py
index fc6cc211fac..7d991bc6ec7 100644
--- a/modin/core/dataframe/pandas/metadata/dtypes.py
+++ b/modin/core/dataframe/pandas/metadata/dtypes.py
@@ -260,13 +260,15 @@ def copy(self) -> "DtypesDescriptor":
         DtypesDescriptor
         """
         return type(self)(
-            self._known_dtypes.copy(),
-            self._cols_with_unknown_dtypes.copy(),
-            self._remaining_dtype,
-            self._parent_df,
-            columns_order=None
-            if self.columns_order is None
-            else self.columns_order.copy(),
+            # should access '.columns_order' first, as it may compute columns order
+            # and complete the metadata for 'self'
+            columns_order=(
+                None if self.columns_order is None else self.columns_order.copy()
+            ),
+            known_dtypes=self._known_dtypes.copy(),
+            cols_with_unknown_dtypes=self._cols_with_unknown_dtypes.copy(),
+            remaining_dtype=self._remaining_dtype,
+            parent_df=self._parent_df,
             know_all_names=self._know_all_names,
             _schema_is_known=self._schema_is_known,
         )

From c17dacd74906dbd8a94354f4d301f2ff903a2821 Mon Sep 17 00:00:00 2001
From: Dmitry Chigarev <dmitry.chigarev@intel.com>
Date: Mon, 20 Nov 2023 18:47:59 +0100
Subject: [PATCH 3/3] use old logic on duplicates in '.concat()'

Signed-off-by: Dmitry Chigarev <dmitry.chigarev@intel.com>
---
 modin/core/dataframe/pandas/metadata/dtypes.py | 11 ++++++++++-
 .../storage_formats/pandas/test_internals.py   | 18 ++++++++++++++++++
 2 files changed, 28 insertions(+), 1 deletion(-)

diff --git a/modin/core/dataframe/pandas/metadata/dtypes.py b/modin/core/dataframe/pandas/metadata/dtypes.py
index 7d991bc6ec7..ffd81eb74b2 100644
--- a/modin/core/dataframe/pandas/metadata/dtypes.py
+++ b/modin/core/dataframe/pandas/metadata/dtypes.py
@@ -648,7 +648,16 @@ def concat(cls, values: list) -> "ModinDtypes":
             else:
                 raise NotImplementedError(type(val))
 
-        desc = DtypesDescriptor.concat(preprocessed_vals)
+        try:
+            desc = DtypesDescriptor.concat(preprocessed_vals)
+        except NotImplementedError as e:
+            # 'DtypesDescriptor' doesn't support duplicated labels, however, if all values are pandas Serieses,
+            # we still can perform concatenation using pure pandas
+            if "duplicated" not in e.args[0].lower() or not all(
+                isinstance(val, pandas.Series) for val in values
+            ):
+                raise e
+            desc = pandas.concat(values)
         return ModinDtypes(desc)
 
     def set_index(self, new_index: Union[pandas.Index, "ModinIndex"]) -> "ModinDtypes":
diff --git a/modin/test/storage_formats/pandas/test_internals.py b/modin/test/storage_formats/pandas/test_internals.py
index c4832238c71..5279614b510 100644
--- a/modin/test/storage_formats/pandas/test_internals.py
+++ b/modin/test/storage_formats/pandas/test_internals.py
@@ -1815,6 +1815,24 @@ def test_concat(self):
         )
         assert res.equals(exp)
 
+    def test_ModinDtypes_duplicated_concat(self):
+        # test that 'ModinDtypes' is able to perform dtypes concatenation on duplicated labels
+        # if all of them are Serieses
+        res = ModinDtypes.concat([pandas.Series([np.dtype(int)], index=["a"])] * 2)
+        assert isinstance(res._value, pandas.Series)
+        assert res._value.equals(
+            pandas.Series([np.dtype(int), np.dtype(int)], index=["a", "a"])
+        )
+
+        # test that 'ModinDtypes.concat' with duplicated labels raises when not all dtypes are materialized
+        with pytest.raises(NotImplementedError):
+            res = ModinDtypes.concat(
+                [
+                    pandas.Series([np.dtype(int)], index=["a"]),
+                    DtypesDescriptor(cols_with_unknown_dtypes=["a"]),
+                ]
+            )
+
     def test_update_parent(self):
         """
         Test that updating parents in ``DtypesDescriptor`` also propagates to stored lazy categoricals.