Skip to content

Commit

Permalink
FEAT-modin-project#6803: Enable range-partitioning impl for 'groupby.…
Browse files Browse the repository at this point in the history
…apply()' by default (modin-project#6804)

Signed-off-by: Dmitry Chigarev <[email protected]>
  • Loading branch information
dchigarev authored Dec 6, 2023
1 parent 6843954 commit a405217
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 16 deletions.
8 changes: 0 additions & 8 deletions modin/core/dataframe/pandas/dataframe/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -3735,14 +3735,6 @@ def groupby(
skip_on_aligning_flag = "__skip_me_on_aligning__"

def apply_func(df): # pragma: no cover
if any(
isinstance(dtype, pandas.CategoricalDtype)
for dtype in df.dtypes[by].values
):
raise NotImplementedError(
"Reshuffling groupby is not yet supported when grouping on a categorical column. "
+ "https://github.com/modin-project/modin/issues/5925"
)
result = operator(df.groupby(by, **kwargs))
if (
align_result_columns
Expand Down
25 changes: 17 additions & 8 deletions modin/core/storage_formats/pandas/query_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@
)
from modin.core.storage_formats.base.query_compiler import BaseQueryCompiler
from modin.error_message import ErrorMessage
from modin.logging import get_logger
from modin.utils import (
MODIN_UNNAMED_SERIES_LABEL,
_inherit_docstrings,
Expand Down Expand Up @@ -3782,12 +3783,12 @@ def _groupby_shuffle(
+ "https://github.com/modin-project/modin/issues/5926"
)

# So this check works only if we have dtypes cache materialized, otherwise the exception will be thrown
# inside the kernel and so it will be uncatchable. TODO: figure out a better way to handle this.
if self._modin_frame._dtypes is not None and any(
isinstance(dtype, pandas.CategoricalDtype)
for dtype in self.dtypes[by].values
):
# This check materializes dtypes for 'by' columns
if isinstance(self._modin_frame._dtypes, ModinDtypes):
by_dtypes = self._modin_frame._dtypes.lazy_get(by).get()
else:
by_dtypes = self.dtypes[by]
if any(isinstance(dtype, pandas.CategoricalDtype) for dtype in by_dtypes):
raise NotImplementedError(
"Reshuffling groupby is not yet supported when grouping on a categorical column. "
+ "https://github.com/modin-project/modin/issues/5925"
Expand Down Expand Up @@ -3960,7 +3961,10 @@ def groupby_agg(
by, agg_func, axis, groupby_kwargs, agg_args, agg_kwargs, how, drop
)

if ExperimentalGroupbyImpl.get():
# 'group_wise' means 'groupby.apply()'. We're certain that range-partitioning groupby
# always works better for '.apply()', so we're using it regardless of the 'ExperimentalGroupbyImpl'
# value
if how == "group_wise" or ExperimentalGroupbyImpl.get():
try:
return self._groupby_shuffle(
by=by,
Expand All @@ -3973,10 +3977,15 @@ def groupby_agg(
how=how,
)
except NotImplementedError as e:
ErrorMessage.warn(
# if a user wants to use range-partitioning groupby explicitly, then we should print a visible
# warning to them on a failure, otherwise we're only logging it
message = (
f"Can't use experimental reshuffling groupby implementation because of: {e}"
+ "\nFalling back to a full-axis implementation."
)
get_logger().info(message)
if ExperimentalGroupbyImpl.get():
ErrorMessage.warn(message)

if isinstance(agg_func, dict) and GroupbyReduceImpl.has_impl_for(agg_func):
return self._groupby_dict_reduce(
Expand Down

0 comments on commit a405217

Please sign in to comment.