FEAT-modin-project#6803: Enable range-partitioning impl for 'groupby.…

…apply()' by default (modin-project#6804) Signed-off-by: Dmitry Chigarev <[email protected]>
dchigarev · Dec 6, 2023 · a405217 · a405217
1 parent 6843954
commit a405217
Show file tree

Hide file tree

Showing 2 changed files with 17 additions and 16 deletions.
diff --git a/modin/core/dataframe/pandas/dataframe/dataframe.py b/modin/core/dataframe/pandas/dataframe/dataframe.py
@@ -3735,14 +3735,6 @@ def groupby(
         skip_on_aligning_flag = "__skip_me_on_aligning__"
 
         def apply_func(df):  # pragma: no cover
-            if any(
-                isinstance(dtype, pandas.CategoricalDtype)
-                for dtype in df.dtypes[by].values
-            ):
-                raise NotImplementedError(
-                    "Reshuffling groupby is not yet supported when grouping on a categorical column. "
-                    + "https://github.com/modin-project/modin/issues/5925"
-                )
             result = operator(df.groupby(by, **kwargs))
             if (
                 align_result_columns

diff --git a/modin/core/storage_formats/pandas/query_compiler.py b/modin/core/storage_formats/pandas/query_compiler.py
@@ -66,6 +66,7 @@
 )
 from modin.core.storage_formats.base.query_compiler import BaseQueryCompiler
 from modin.error_message import ErrorMessage
+from modin.logging import get_logger
 from modin.utils import (
     MODIN_UNNAMED_SERIES_LABEL,
     _inherit_docstrings,
@@ -3782,12 +3783,12 @@ def _groupby_shuffle(
                 + "https://github.com/modin-project/modin/issues/5926"
             )
 
-        # So this check works only if we have dtypes cache materialized, otherwise the exception will be thrown
-        # inside the kernel and so it will be uncatchable. TODO: figure out a better way to handle this.
-        if self._modin_frame._dtypes is not None and any(
-            isinstance(dtype, pandas.CategoricalDtype)
-            for dtype in self.dtypes[by].values
-        ):
+        # This check materializes dtypes for 'by' columns
+        if isinstance(self._modin_frame._dtypes, ModinDtypes):
+            by_dtypes = self._modin_frame._dtypes.lazy_get(by).get()
+        else:
+            by_dtypes = self.dtypes[by]
+        if any(isinstance(dtype, pandas.CategoricalDtype) for dtype in by_dtypes):
             raise NotImplementedError(
                 "Reshuffling groupby is not yet supported when grouping on a categorical column. "
                 + "https://github.com/modin-project/modin/issues/5925"
@@ -3960,7 +3961,10 @@ def groupby_agg(
                 by, agg_func, axis, groupby_kwargs, agg_args, agg_kwargs, how, drop
             )
 
-        if ExperimentalGroupbyImpl.get():
+        # 'group_wise' means 'groupby.apply()'. We're certain that range-partitioning groupby
+        # always works better for '.apply()', so we're using it regardless of the 'ExperimentalGroupbyImpl'
+        # value
+        if how == "group_wise" or ExperimentalGroupbyImpl.get():
             try:
                 return self._groupby_shuffle(
                     by=by,
@@ -3973,10 +3977,15 @@ def groupby_agg(
                     how=how,
                 )
             except NotImplementedError as e:
-                ErrorMessage.warn(
+                # if a user wants to use range-partitioning groupby explicitly, then we should print a visible
+                # warning to them on a failure, otherwise we're only logging it
+                message = (
                     f"Can't use experimental reshuffling groupby implementation because of: {e}"
                     + "\nFalling back to a full-axis implementation."
                 )
+                get_logger().info(message)
+                if ExperimentalGroupbyImpl.get():
+                    ErrorMessage.warn(message)
 
         if isinstance(agg_func, dict) and GroupbyReduceImpl.has_impl_for(agg_func):
             return self._groupby_dict_reduce(