Skip to content

Commit

Permalink
Merge branch 'master' into issue_6753
Browse files Browse the repository at this point in the history
  • Loading branch information
dchigarev authored Nov 20, 2023
2 parents 4ea385c + 0ba2a46 commit efc6bbe
Show file tree
Hide file tree
Showing 4 changed files with 147 additions and 20 deletions.
14 changes: 8 additions & 6 deletions modin/core/dataframe/pandas/dataframe/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1323,11 +1323,13 @@ def from_labels(self) -> "PandasDataframe":
if "index" not in self.columns
else "level_{}".format(0)
]
new_dtypes = None
if self.has_materialized_dtypes:
names = tuple(level_names) if len(level_names) > 1 else level_names[0]
new_dtypes = self.index.to_frame(name=names).dtypes
new_dtypes = pandas.concat([new_dtypes, self.dtypes])
names = tuple(level_names) if len(level_names) > 1 else level_names[0]
new_dtypes = self.index.to_frame(name=names).dtypes
try:
new_dtypes = ModinDtypes.concat([new_dtypes, self._dtypes])
except NotImplementedError:
# can raise on duplicated labels
new_dtypes = None

# We will also use the `new_column_names` in the calculation of the internal metadata, so this is a
# lightweight way of ensuring the metadata matches.
Expand Down Expand Up @@ -2587,7 +2589,7 @@ def sort_function(df): # pragma: no cover
return df

# If this df is empty, we don't want to try and shuffle or sort.
if len(self.get_axis(0)) == 0 or len(self.get_axis(1)) == 0:
if len(self.get_axis(1)) == 0 or len(self) == 0:
return self.copy()

axis = Axis(axis)
Expand Down
27 changes: 19 additions & 8 deletions modin/core/dataframe/pandas/metadata/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -260,13 +260,15 @@ def copy(self) -> "DtypesDescriptor":
DtypesDescriptor
"""
return type(self)(
self._known_dtypes.copy(),
self._cols_with_unknown_dtypes.copy(),
self._remaining_dtype,
self._parent_df,
columns_order=None
if self.columns_order is None
else self.columns_order.copy(),
# should access '.columns_order' first, as it may compute columns order
# and complete the metadata for 'self'
columns_order=(
None if self.columns_order is None else self.columns_order.copy()
),
known_dtypes=self._known_dtypes.copy(),
cols_with_unknown_dtypes=self._cols_with_unknown_dtypes.copy(),
remaining_dtype=self._remaining_dtype,
parent_df=self._parent_df,
know_all_names=self._know_all_names,
_schema_is_known=self._schema_is_known,
)
Expand Down Expand Up @@ -655,7 +657,16 @@ def concat(cls, values: list) -> "ModinDtypes":
else:
raise NotImplementedError(type(val))

desc = DtypesDescriptor.concat(preprocessed_vals)
try:
desc = DtypesDescriptor.concat(preprocessed_vals)
except NotImplementedError as e:
# 'DtypesDescriptor' doesn't support duplicated labels, however, if all values are pandas Serieses,
# we still can perform concatenation using pure pandas
if "duplicated" not in e.args[0].lower() or not all(
isinstance(val, pandas.Series) for val in values
):
raise e
desc = pandas.concat(values)
return ModinDtypes(desc)

def set_index(self, new_index: Union[pandas.Index, "ModinIndex"]) -> "ModinDtypes":
Expand Down
20 changes: 14 additions & 6 deletions modin/core/storage_formats/pandas/query_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -720,20 +720,28 @@ def _reset(df, *axis_lengths, partition_idx): # pragma: no cover
df.index = pandas.RangeIndex(start, stop)
return df

if self._modin_frame.has_columns_cache and kwargs["drop"]:
new_columns = self._modin_frame.copy_columns_cache(copy_lengths=True)
new_columns = None
if kwargs["drop"]:
dtypes = self._modin_frame.copy_dtypes_cache()
if self._modin_frame.has_columns_cache:
new_columns = self._modin_frame.copy_columns_cache(
copy_lengths=True
)
else:
new_columns = None
# concat index dtypes (None, since they're unknown) with column dtypes
try:
dtypes = ModinDtypes.concat([None, self._modin_frame._dtypes])
except NotImplementedError:
# may raise on duplicated names in materialized 'self.dtypes'
dtypes = None

return self.__constructor__(
self._modin_frame.apply_full_axis(
axis=1,
func=_reset,
enumerate_partitions=True,
new_columns=new_columns,
dtypes=(
self._modin_frame._dtypes if kwargs.get("drop", False) else None
),
dtypes=dtypes,
sync_labels=False,
pass_axis_lengths_to_partitions=True,
)
Expand Down
106 changes: 106 additions & 0 deletions modin/test/storage_formats/pandas/test_internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -1815,6 +1815,24 @@ def test_concat(self):
)
assert res.equals(exp)

def test_ModinDtypes_duplicated_concat(self):
# test that 'ModinDtypes' is able to perform dtypes concatenation on duplicated labels
# if all of them are Serieses
res = ModinDtypes.concat([pandas.Series([np.dtype(int)], index=["a"])] * 2)
assert isinstance(res._value, pandas.Series)
assert res._value.equals(
pandas.Series([np.dtype(int), np.dtype(int)], index=["a", "a"])
)

# test that 'ModinDtypes.concat' with duplicated labels raises when not all dtypes are materialized
with pytest.raises(NotImplementedError):
res = ModinDtypes.concat(
[
pandas.Series([np.dtype(int)], index=["a"]),
DtypesDescriptor(cols_with_unknown_dtypes=["a"]),
]
)

def test_update_parent(self):
"""
Test that updating parents in ``DtypesDescriptor`` also propagates to stored lazy categoricals.
Expand Down Expand Up @@ -2017,3 +2035,91 @@ def test_get_dummies_case(self):
assert res._query_compiler._modin_frame.has_materialized_dtypes

patch.assert_not_called()

@pytest.mark.parametrize("has_materialized_index", [True, False])
@pytest.mark.parametrize("drop", [True, False])
def test_preserve_dtypes_reset_index(self, drop, has_materialized_index):
with mock.patch.object(PandasDataframe, "_compute_dtypes") as patch:
# case 1: 'df' has complete dtype by default
df = pd.DataFrame({"a": [1, 2, 3]})
if has_materialized_index:
assert df._query_compiler._modin_frame.has_materialized_index
else:
df._query_compiler._modin_frame.set_index_cache(None)
assert not df._query_compiler._modin_frame.has_materialized_index
assert df._query_compiler._modin_frame.has_materialized_dtypes

res = df.reset_index(drop=drop)
if drop:
# we droped the index, so columns and dtypes shouldn't change
assert res._query_compiler._modin_frame.has_materialized_dtypes
assert res.dtypes.equals(df.dtypes)
else:
if has_materialized_index:
# we should have inserted index dtype into the descriptor,
# and since both of them are materialized, the result should be
# materialized too
assert res._query_compiler._modin_frame.has_materialized_dtypes
assert res.dtypes.equals(
pandas.Series(
[np.dtype(int), np.dtype(int)], index=["index", "a"]
)
)
else:
# we now know that there are cols with unknown name and dtype in our dataframe,
# so the resulting dtypes should contain information only about original column
expected_dtypes = DtypesDescriptor(
{"a": np.dtype(int)},
know_all_names=False,
)
assert res._query_compiler._modin_frame._dtypes._value.equals(
expected_dtypes
)

# case 2: 'df' has partial dtype by default
df = pd.DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]})
df._query_compiler._modin_frame.set_dtypes_cache(
ModinDtypes(
DtypesDescriptor(
{"a": np.dtype(int)}, cols_with_unknown_dtypes=["b"]
)
)
)
if has_materialized_index:
assert df._query_compiler._modin_frame.has_materialized_index
else:
df._query_compiler._modin_frame.set_index_cache(None)
assert not df._query_compiler._modin_frame.has_materialized_index

res = df.reset_index(drop=drop)
if drop:
# we droped the index, so columns and dtypes shouldn't change
assert res._query_compiler._modin_frame._dtypes._value.equals(
df._query_compiler._modin_frame._dtypes._value
)
else:
if has_materialized_index:
# we should have inserted index dtype into the descriptor,
# the resulted dtype should have information about 'index' and 'a' columns,
# and miss dtype info for 'b' column
expected_dtypes = DtypesDescriptor(
{"index": np.dtype(int), "a": np.dtype(int)},
cols_with_unknown_dtypes=["b"],
columns_order={0: "index", 1: "a", 2: "b"},
)
assert res._query_compiler._modin_frame._dtypes._value.equals(
expected_dtypes
)
else:
# we miss info about the 'index' column since it wasn't materialized at
# the time of 'reset_index()' and we're still missing dtype info for 'b' column
expected_dtypes = DtypesDescriptor(
{"a": np.dtype(int)},
cols_with_unknown_dtypes=["b"],
know_all_names=False,
)
assert res._query_compiler._modin_frame._dtypes._value.equals(
expected_dtypes
)

patch.assert_not_called()

0 comments on commit efc6bbe

Please sign in to comment.