Merge branch 'master' into issue_6753

modin-project · Nov 20, 2023 · efc6bbe · efc6bbe
2 parents 4ea385c + 0ba2a46
commit efc6bbe
Show file tree

Hide file tree

Showing 4 changed files with 147 additions and 20 deletions.
diff --git a/modin/core/dataframe/pandas/dataframe/dataframe.py b/modin/core/dataframe/pandas/dataframe/dataframe.py
@@ -1323,11 +1323,13 @@ def from_labels(self) -> "PandasDataframe":
                 if "index" not in self.columns
                 else "level_{}".format(0)
             ]
-        new_dtypes = None
-        if self.has_materialized_dtypes:
-            names = tuple(level_names) if len(level_names) > 1 else level_names[0]
-            new_dtypes = self.index.to_frame(name=names).dtypes
-            new_dtypes = pandas.concat([new_dtypes, self.dtypes])
+        names = tuple(level_names) if len(level_names) > 1 else level_names[0]
+        new_dtypes = self.index.to_frame(name=names).dtypes
+        try:
+            new_dtypes = ModinDtypes.concat([new_dtypes, self._dtypes])
+        except NotImplementedError:
+            # can raise on duplicated labels
+            new_dtypes = None
 
         # We will also use the `new_column_names` in the calculation of the internal metadata, so this is a
         # lightweight way of ensuring the metadata matches.
@@ -2587,7 +2589,7 @@ def sort_function(df):  # pragma: no cover
             return df
 
         # If this df is empty, we don't want to try and shuffle or sort.
-        if len(self.get_axis(0)) == 0 or len(self.get_axis(1)) == 0:
+        if len(self.get_axis(1)) == 0 or len(self) == 0:
             return self.copy()
 
         axis = Axis(axis)

diff --git a/modin/core/dataframe/pandas/metadata/dtypes.py b/modin/core/dataframe/pandas/metadata/dtypes.py
@@ -260,13 +260,15 @@ def copy(self) -> "DtypesDescriptor":
         DtypesDescriptor
         """
         return type(self)(
-            self._known_dtypes.copy(),
-            self._cols_with_unknown_dtypes.copy(),
-            self._remaining_dtype,
-            self._parent_df,
-            columns_order=None
-            if self.columns_order is None
-            else self.columns_order.copy(),
+            # should access '.columns_order' first, as it may compute columns order
+            # and complete the metadata for 'self'
+            columns_order=(
+                None if self.columns_order is None else self.columns_order.copy()
+            ),
+            known_dtypes=self._known_dtypes.copy(),
+            cols_with_unknown_dtypes=self._cols_with_unknown_dtypes.copy(),
+            remaining_dtype=self._remaining_dtype,
+            parent_df=self._parent_df,
             know_all_names=self._know_all_names,
             _schema_is_known=self._schema_is_known,
         )
@@ -655,7 +657,16 @@ def concat(cls, values: list) -> "ModinDtypes":
             else:
                 raise NotImplementedError(type(val))
 
-        desc = DtypesDescriptor.concat(preprocessed_vals)
+        try:
+            desc = DtypesDescriptor.concat(preprocessed_vals)
+        except NotImplementedError as e:
+            # 'DtypesDescriptor' doesn't support duplicated labels, however, if all values are pandas Serieses,
+            # we still can perform concatenation using pure pandas
+            if "duplicated" not in e.args[0].lower() or not all(
+                isinstance(val, pandas.Series) for val in values
+            ):
+                raise e
+            desc = pandas.concat(values)
         return ModinDtypes(desc)
 
     def set_index(self, new_index: Union[pandas.Index, "ModinIndex"]) -> "ModinDtypes":

diff --git a/modin/core/storage_formats/pandas/query_compiler.py b/modin/core/storage_formats/pandas/query_compiler.py
@@ -720,20 +720,28 @@ def _reset(df, *axis_lengths, partition_idx):  # pragma: no cover
                     df.index = pandas.RangeIndex(start, stop)
                 return df
 
-            if self._modin_frame.has_columns_cache and kwargs["drop"]:
-                new_columns = self._modin_frame.copy_columns_cache(copy_lengths=True)
+            new_columns = None
+            if kwargs["drop"]:
+                dtypes = self._modin_frame.copy_dtypes_cache()
+                if self._modin_frame.has_columns_cache:
+                    new_columns = self._modin_frame.copy_columns_cache(
+                        copy_lengths=True
+                    )
             else:
-                new_columns = None
+                # concat index dtypes (None, since they're unknown) with column dtypes
+                try:
+                    dtypes = ModinDtypes.concat([None, self._modin_frame._dtypes])
+                except NotImplementedError:
+                    # may raise on duplicated names in materialized 'self.dtypes'
+                    dtypes = None
 
             return self.__constructor__(
                 self._modin_frame.apply_full_axis(
                     axis=1,
                     func=_reset,
                     enumerate_partitions=True,
                     new_columns=new_columns,
-                    dtypes=(
-                        self._modin_frame._dtypes if kwargs.get("drop", False) else None
-                    ),
+                    dtypes=dtypes,
                     sync_labels=False,
                     pass_axis_lengths_to_partitions=True,
                 )

diff --git a/modin/test/storage_formats/pandas/test_internals.py b/modin/test/storage_formats/pandas/test_internals.py
@@ -1815,6 +1815,24 @@ def test_concat(self):
         )
         assert res.equals(exp)
 
+    def test_ModinDtypes_duplicated_concat(self):
+        # test that 'ModinDtypes' is able to perform dtypes concatenation on duplicated labels
+        # if all of them are Serieses
+        res = ModinDtypes.concat([pandas.Series([np.dtype(int)], index=["a"])] * 2)
+        assert isinstance(res._value, pandas.Series)
+        assert res._value.equals(
+            pandas.Series([np.dtype(int), np.dtype(int)], index=["a", "a"])
+        )
+
+        # test that 'ModinDtypes.concat' with duplicated labels raises when not all dtypes are materialized
+        with pytest.raises(NotImplementedError):
+            res = ModinDtypes.concat(
+                [
+                    pandas.Series([np.dtype(int)], index=["a"]),
+                    DtypesDescriptor(cols_with_unknown_dtypes=["a"]),
+                ]
+            )
+
     def test_update_parent(self):
         """
         Test that updating parents in ``DtypesDescriptor`` also propagates to stored lazy categoricals.
@@ -2017,3 +2035,91 @@ def test_get_dummies_case(self):
             assert res._query_compiler._modin_frame.has_materialized_dtypes
 
         patch.assert_not_called()
+
+    @pytest.mark.parametrize("has_materialized_index", [True, False])
+    @pytest.mark.parametrize("drop", [True, False])
+    def test_preserve_dtypes_reset_index(self, drop, has_materialized_index):
+        with mock.patch.object(PandasDataframe, "_compute_dtypes") as patch:
+            # case 1: 'df' has complete dtype by default
+            df = pd.DataFrame({"a": [1, 2, 3]})
+            if has_materialized_index:
+                assert df._query_compiler._modin_frame.has_materialized_index
+            else:
+                df._query_compiler._modin_frame.set_index_cache(None)
+                assert not df._query_compiler._modin_frame.has_materialized_index
+            assert df._query_compiler._modin_frame.has_materialized_dtypes
+
+            res = df.reset_index(drop=drop)
+            if drop:
+                # we droped the index, so columns and dtypes shouldn't change
+                assert res._query_compiler._modin_frame.has_materialized_dtypes
+                assert res.dtypes.equals(df.dtypes)
+            else:
+                if has_materialized_index:
+                    # we should have inserted index dtype into the descriptor,
+                    # and since both of them are materialized, the result should be
+                    # materialized too
+                    assert res._query_compiler._modin_frame.has_materialized_dtypes
+                    assert res.dtypes.equals(
+                        pandas.Series(
+                            [np.dtype(int), np.dtype(int)], index=["index", "a"]
+                        )
+                    )
+                else:
+                    # we now know that there are cols with unknown name and dtype in our dataframe,
+                    # so the resulting dtypes should contain information only about original column
+                    expected_dtypes = DtypesDescriptor(
+                        {"a": np.dtype(int)},
+                        know_all_names=False,
+                    )
+                    assert res._query_compiler._modin_frame._dtypes._value.equals(
+                        expected_dtypes
+                    )
+
+            # case 2: 'df' has partial dtype by default
+            df = pd.DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]})
+            df._query_compiler._modin_frame.set_dtypes_cache(
+                ModinDtypes(
+                    DtypesDescriptor(
+                        {"a": np.dtype(int)}, cols_with_unknown_dtypes=["b"]
+                    )
+                )
+            )
+            if has_materialized_index:
+                assert df._query_compiler._modin_frame.has_materialized_index
+            else:
+                df._query_compiler._modin_frame.set_index_cache(None)
+                assert not df._query_compiler._modin_frame.has_materialized_index
+
+            res = df.reset_index(drop=drop)
+            if drop:
+                # we droped the index, so columns and dtypes shouldn't change
+                assert res._query_compiler._modin_frame._dtypes._value.equals(
+                    df._query_compiler._modin_frame._dtypes._value
+                )
+            else:
+                if has_materialized_index:
+                    # we should have inserted index dtype into the descriptor,
+                    # the resulted dtype should have information about 'index' and 'a' columns,
+                    # and miss dtype info for 'b' column
+                    expected_dtypes = DtypesDescriptor(
+                        {"index": np.dtype(int), "a": np.dtype(int)},
+                        cols_with_unknown_dtypes=["b"],
+                        columns_order={0: "index", 1: "a", 2: "b"},
+                    )
+                    assert res._query_compiler._modin_frame._dtypes._value.equals(
+                        expected_dtypes
+                    )
+                else:
+                    # we miss info about the 'index' column since it wasn't materialized at
+                    # the time of 'reset_index()' and we're still missing dtype info for 'b' column
+                    expected_dtypes = DtypesDescriptor(
+                        {"a": np.dtype(int)},
+                        cols_with_unknown_dtypes=["b"],
+                        know_all_names=False,
+                    )
+                    assert res._query_compiler._modin_frame._dtypes._value.equals(
+                        expected_dtypes
+                    )
+
+        patch.assert_not_called()