From e8242a93a4df38207f63728da461c750b53c6b3f Mon Sep 17 00:00:00 2001 From: Dmitry Chigarev Date: Fri, 17 Nov 2023 16:57:50 +0100 Subject: [PATCH 1/3] PERF-#6749: Preserve partial dtype for the result of 'reset_index()' Signed-off-by: Dmitry Chigarev --- .../dataframe/pandas/dataframe/dataframe.py | 12 ++- .../storage_formats/pandas/query_compiler.py | 21 +++-- .../storage_formats/pandas/test_internals.py | 91 ++++++++++++++++++- 3 files changed, 111 insertions(+), 13 deletions(-) diff --git a/modin/core/dataframe/pandas/dataframe/dataframe.py b/modin/core/dataframe/pandas/dataframe/dataframe.py index 7031b269554..57945dc4a48 100644 --- a/modin/core/dataframe/pandas/dataframe/dataframe.py +++ b/modin/core/dataframe/pandas/dataframe/dataframe.py @@ -1323,11 +1323,13 @@ def from_labels(self) -> "PandasDataframe": if "index" not in self.columns else "level_{}".format(0) ] - new_dtypes = None - if self.has_materialized_dtypes: - names = tuple(level_names) if len(level_names) > 1 else level_names[0] - new_dtypes = self.index.to_frame(name=names).dtypes - new_dtypes = pandas.concat([new_dtypes, self.dtypes]) + names = tuple(level_names) if len(level_names) > 1 else level_names[0] + new_dtypes = self.index.to_frame(name=names).dtypes + try: + new_dtypes = ModinDtypes.concat([new_dtypes, self._dtypes]) + except NotImplementedError: + # can raise on duplicated labels + new_dtypes = None # We will also use the `new_column_names` in the calculation of the internal metadata, so this is a # lightweight way of ensuring the metadata matches. diff --git a/modin/core/storage_formats/pandas/query_compiler.py b/modin/core/storage_formats/pandas/query_compiler.py index ba907a8b036..0565a7d19f4 100644 --- a/modin/core/storage_formats/pandas/query_compiler.py +++ b/modin/core/storage_formats/pandas/query_compiler.py @@ -58,6 +58,7 @@ SeriesGroupByDefault, ) from modin.core.dataframe.base.dataframe.utils import join_columns +from modin.core.dataframe.pandas.metadata import ModinDtypes from modin.core.storage_formats.base.query_compiler import BaseQueryCompiler from modin.error_message import ErrorMessage from modin.utils import ( @@ -719,10 +720,20 @@ def _reset(df, *axis_lengths, partition_idx): # pragma: no cover df.index = pandas.RangeIndex(start, stop) return df - if self._modin_frame.has_columns_cache and kwargs["drop"]: - new_columns = self._modin_frame.copy_columns_cache(copy_lengths=True) + new_columns = None + if kwargs["drop"]: + dtypes = self._modin_frame.copy_dtypes_cache() + if self._modin_frame.has_columns_cache: + new_columns = self._modin_frame.copy_columns_cache( + copy_lengths=True + ) else: - new_columns = None + # concat index dtypes (None, since they're unknown) with column dtypes + try: + dtypes = ModinDtypes.concat([None, self._modin_frame._dtypes]) + except NotImplementedError: + # may raise on duplicated names in materialized 'self.dtypes' + dtypes = None return self.__constructor__( self._modin_frame.apply_full_axis( @@ -730,9 +741,7 @@ def _reset(df, *axis_lengths, partition_idx): # pragma: no cover func=_reset, enumerate_partitions=True, new_columns=new_columns, - dtypes=( - self._modin_frame._dtypes if kwargs.get("drop", False) else None - ), + dtypes=dtypes, sync_labels=False, pass_axis_lengths_to_partitions=True, ) diff --git a/modin/test/storage_formats/pandas/test_internals.py b/modin/test/storage_formats/pandas/test_internals.py index b459f050263..c4832238c71 100644 --- a/modin/test/storage_formats/pandas/test_internals.py +++ b/modin/test/storage_formats/pandas/test_internals.py @@ -21,6 +21,7 @@ import modin.pandas as pd from modin.config import Engine, ExperimentalGroupbyImpl, MinPartitionSize, NPartitions +from modin.core.dataframe.pandas.dataframe.dataframe import PandasDataframe from modin.core.dataframe.pandas.dataframe.utils import ColumnInfo, ShuffleSortFunctions from modin.core.dataframe.pandas.metadata import ( DtypesDescriptor, @@ -1947,8 +1948,6 @@ class TestZeroComputationDtypes: """ def test_get_dummies_case(self): - from modin.core.dataframe.pandas.dataframe.dataframe import PandasDataframe - with mock.patch.object(PandasDataframe, "_compute_dtypes") as patch: df = pd.DataFrame( {"items": [1, 2, 3, 4], "b": [3, 3, 4, 4], "c": [1, 0, 0, 1]} @@ -1960,3 +1959,91 @@ def test_get_dummies_case(self): assert res._query_compiler._modin_frame.has_materialized_dtypes patch.assert_not_called() + + @pytest.mark.parametrize("has_materialized_index", [True, False]) + @pytest.mark.parametrize("drop", [True, False]) + def test_preserve_dtypes_reset_index(self, drop, has_materialized_index): + with mock.patch.object(PandasDataframe, "_compute_dtypes") as patch: + # case 1: 'df' has complete dtype by default + df = pd.DataFrame({"a": [1, 2, 3]}) + if has_materialized_index: + assert df._query_compiler._modin_frame.has_materialized_index + else: + df._query_compiler._modin_frame.set_index_cache(None) + assert not df._query_compiler._modin_frame.has_materialized_index + assert df._query_compiler._modin_frame.has_materialized_dtypes + + res = df.reset_index(drop=drop) + if drop: + # we droped the index, so columns and dtypes shouldn't change + assert res._query_compiler._modin_frame.has_materialized_dtypes + assert res.dtypes.equals(df.dtypes) + else: + if has_materialized_index: + # we should have inserted index dtype into the descriptor, + # and since both of them are materialized, the result should be + # materialized too + assert res._query_compiler._modin_frame.has_materialized_dtypes + assert res.dtypes.equals( + pandas.Series( + [np.dtype(int), np.dtype(int)], index=["index", "a"] + ) + ) + else: + # we now know that there are cols with unknown name and dtype in our dataframe, + # so the resulting dtypes should contain information only about original column + expected_dtypes = DtypesDescriptor( + {"a": np.dtype(int)}, + know_all_names=False, + ) + assert res._query_compiler._modin_frame._dtypes._value.equals( + expected_dtypes + ) + + # case 2: 'df' has partial dtype by default + df = pd.DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]}) + df._query_compiler._modin_frame.set_dtypes_cache( + ModinDtypes( + DtypesDescriptor( + {"a": np.dtype(int)}, cols_with_unknown_dtypes=["b"] + ) + ) + ) + if has_materialized_index: + assert df._query_compiler._modin_frame.has_materialized_index + else: + df._query_compiler._modin_frame.set_index_cache(None) + assert not df._query_compiler._modin_frame.has_materialized_index + + res = df.reset_index(drop=drop) + if drop: + # we droped the index, so columns and dtypes shouldn't change + assert res._query_compiler._modin_frame._dtypes._value.equals( + df._query_compiler._modin_frame._dtypes._value + ) + else: + if has_materialized_index: + # we should have inserted index dtype into the descriptor, + # the resulted dtype should have information about 'index' and 'a' columns, + # and miss dtype info for 'b' column + expected_dtypes = DtypesDescriptor( + {"index": np.dtype(int), "a": np.dtype(int)}, + cols_with_unknown_dtypes=["b"], + columns_order={0: "index", 1: "a", 2: "b"}, + ) + assert res._query_compiler._modin_frame._dtypes._value.equals( + expected_dtypes + ) + else: + # we miss info about the 'index' column since it wasn't materialized at + # the time of 'reset_index()' and we're still missing dtype info for 'b' column + expected_dtypes = DtypesDescriptor( + {"a": np.dtype(int)}, + cols_with_unknown_dtypes=["b"], + know_all_names=False, + ) + assert res._query_compiler._modin_frame._dtypes._value.equals( + expected_dtypes + ) + + patch.assert_not_called() From ffb32cf38452d591da1ff349a6e28fbdb7b1f2bf Mon Sep 17 00:00:00 2001 From: Dmitry Chigarev Date: Mon, 20 Nov 2023 14:28:34 +0100 Subject: [PATCH 2/3] fix groupby tests Signed-off-by: Dmitry Chigarev --- modin/core/dataframe/pandas/metadata/dtypes.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/modin/core/dataframe/pandas/metadata/dtypes.py b/modin/core/dataframe/pandas/metadata/dtypes.py index fc6cc211fac..7d991bc6ec7 100644 --- a/modin/core/dataframe/pandas/metadata/dtypes.py +++ b/modin/core/dataframe/pandas/metadata/dtypes.py @@ -260,13 +260,15 @@ def copy(self) -> "DtypesDescriptor": DtypesDescriptor """ return type(self)( - self._known_dtypes.copy(), - self._cols_with_unknown_dtypes.copy(), - self._remaining_dtype, - self._parent_df, - columns_order=None - if self.columns_order is None - else self.columns_order.copy(), + # should access '.columns_order' first, as it may compute columns order + # and complete the metadata for 'self' + columns_order=( + None if self.columns_order is None else self.columns_order.copy() + ), + known_dtypes=self._known_dtypes.copy(), + cols_with_unknown_dtypes=self._cols_with_unknown_dtypes.copy(), + remaining_dtype=self._remaining_dtype, + parent_df=self._parent_df, know_all_names=self._know_all_names, _schema_is_known=self._schema_is_known, ) From c17dacd74906dbd8a94354f4d301f2ff903a2821 Mon Sep 17 00:00:00 2001 From: Dmitry Chigarev Date: Mon, 20 Nov 2023 18:47:59 +0100 Subject: [PATCH 3/3] use old logic on duplicates in '.concat()' Signed-off-by: Dmitry Chigarev --- modin/core/dataframe/pandas/metadata/dtypes.py | 11 ++++++++++- .../storage_formats/pandas/test_internals.py | 18 ++++++++++++++++++ 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/modin/core/dataframe/pandas/metadata/dtypes.py b/modin/core/dataframe/pandas/metadata/dtypes.py index 7d991bc6ec7..ffd81eb74b2 100644 --- a/modin/core/dataframe/pandas/metadata/dtypes.py +++ b/modin/core/dataframe/pandas/metadata/dtypes.py @@ -648,7 +648,16 @@ def concat(cls, values: list) -> "ModinDtypes": else: raise NotImplementedError(type(val)) - desc = DtypesDescriptor.concat(preprocessed_vals) + try: + desc = DtypesDescriptor.concat(preprocessed_vals) + except NotImplementedError as e: + # 'DtypesDescriptor' doesn't support duplicated labels, however, if all values are pandas Serieses, + # we still can perform concatenation using pure pandas + if "duplicated" not in e.args[0].lower() or not all( + isinstance(val, pandas.Series) for val in values + ): + raise e + desc = pandas.concat(values) return ModinDtypes(desc) def set_index(self, new_index: Union[pandas.Index, "ModinIndex"]) -> "ModinDtypes": diff --git a/modin/test/storage_formats/pandas/test_internals.py b/modin/test/storage_formats/pandas/test_internals.py index c4832238c71..5279614b510 100644 --- a/modin/test/storage_formats/pandas/test_internals.py +++ b/modin/test/storage_formats/pandas/test_internals.py @@ -1815,6 +1815,24 @@ def test_concat(self): ) assert res.equals(exp) + def test_ModinDtypes_duplicated_concat(self): + # test that 'ModinDtypes' is able to perform dtypes concatenation on duplicated labels + # if all of them are Serieses + res = ModinDtypes.concat([pandas.Series([np.dtype(int)], index=["a"])] * 2) + assert isinstance(res._value, pandas.Series) + assert res._value.equals( + pandas.Series([np.dtype(int), np.dtype(int)], index=["a", "a"]) + ) + + # test that 'ModinDtypes.concat' with duplicated labels raises when not all dtypes are materialized + with pytest.raises(NotImplementedError): + res = ModinDtypes.concat( + [ + pandas.Series([np.dtype(int)], index=["a"]), + DtypesDescriptor(cols_with_unknown_dtypes=["a"]), + ] + ) + def test_update_parent(self): """ Test that updating parents in ``DtypesDescriptor`` also propagates to stored lazy categoricals.