diff --git a/.github/workflows/asv.yaml b/.github/workflows/asv.yaml
index 165912b38..1e1053965 100644
--- a/.github/workflows/asv.yaml
+++ b/.github/workflows/asv.yaml
@@ -37,8 +37,7 @@ jobs:
         id: build
         shell: bash -el {0}
         run: |
-          pip install -e "git+https://github.com/xorbitsai/xoscar.git@main#subdirectory=python&egg=xoscar"
-          pip install numpy scipy cython asv coverage
+          pip install numpy scipy cython asv==0.5.1 coverage
           cd python && pip install -e ".[dev,extra]"

      - name: Run ASV benchmarks
diff --git a/.github/workflows/python.yaml b/.github/workflows/python.yaml
index b1c3cb8f6..a508a6f9a 100644
--- a/.github/workflows/python.yaml
+++ b/.github/workflows/python.yaml
@@ -172,7 +172,7 @@ jobs:
             ../CI/install-hadoop.sh
             echo "import coverage; coverage.process_startup()" > \
               $(python -c "import site; print(site.getsitepackages()[-1])")/coverage.pth
-            conda install --quiet --yes -c conda-forge skein libffi conda-pack
+            conda install --quiet --yes -c conda-forge skein libffi conda-pack grpcio=1.42.0
           fi
           if [[ "$MODULE" == "vineyard" ]]; then
             pip install "vineyard<0.16.1" -i https://pypi.org/simple
@@ -250,7 +250,7 @@ jobs:
       - name: Install on GPU
         if: ${{ matrix.module == 'gpu' }}
         run: |
-          pip install -e "git+https://github.com/xorbitsai/xoscar.git@main#subdirectory=python&egg=xoscar"
+          pip install -U xoscar
           python setup.py build_ext -i
         working-directory: ./python
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index a7239dfcb..c1ed9e8c5 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -32,7 +32,7 @@ repos:
       - id: prettier
         types_or: [html, javascript]
   - repo: https://github.com/codespell-project/codespell
-    rev: v2.2.5
+    rev: v2.2.6
     hooks:
       - id: codespell
         exclude: _mars/lib
diff --git a/asv/asv.conf.json b/asv/asv.conf.json
index f751dc5c7..5e240b4f1 100644
--- a/asv/asv.conf.json
+++ b/asv/asv.conf.json
@@ -88,7 +88,7 @@
         "numpy": [],
         "Cython": ["0.29.24"],
         "pandas": [],
-        "scipy": [],
+        "scipy": ["1.10.0"],
         "scikit-learn": [],
         "numexpr": [],
         "cloudpickle": [],
diff --git a/python/setup.cfg b/python/setup.cfg
index 949a19ef8..ec6576c65 100644
--- a/python/setup.cfg
+++ b/python/setup.cfg
@@ -41,6 +41,7 @@ install_requires =
     tqdm>=4.1.0
     uvloop>=0.14.0; sys_platform!="win32"
     pyarrow>=5.0.0
+    fsspec>=2022.7.1,!=2022.8.0

[options.packages.find]
exclude =
@@ -84,7 +85,6 @@ doc =
extra =
    pillow>=7.0.0
    lz4>=1.0.0
-    fsspec>=2022.7.1,!=2022.8.0
    numexpr>=2.6.4
jax =
    jax>=0.4.0; sys.platform != "win32"
@@ -96,14 +96,11 @@ ray =
vineyard =
    vineyard>=0.3; sys.platform != "win32"
aws =
-    fsspec>=2022.7.1,!=2022.8.0
    s3fs
azure =
-    fsspec>=2022.7.1,!=2022.8.0
    adlfs
datasets =
    datasets
-    fsspec>=2022.7.1,!=2022.8.0

[coverage:run]
branch = True
diff --git a/python/xorbits/__init__.py b/python/xorbits/__init__.py
index ff1566ba1..49df2c017 100644
--- a/python/xorbits/__init__.py
+++ b/python/xorbits/__init__.py
@@ -24,6 +24,7 @@ def _install():
     from .lightgbm import _install as _install_lightgbm
     from .numpy import _install as _install_numpy
     from .pandas import _install as _install_pandas
+    from .sklearn import _install as _install_sklearn
     from .web import _install as _install_web
     from .xgboost import _install as _install_xgboost

@@ -34,6 +35,7 @@ def _install():
     _install_xgboost()
     _install_datasets()
     _install_experimental()
+    _install_sklearn()


_install()
diff --git a/python/xorbits/_mars/config.py b/python/xorbits/_mars/config.py
index 505f7bed2..3dabfbe5c 100644
--- a/python/xorbits/_mars/config.py
+++ b/python/xorbits/_mars/config.py
@@ -342,9 +342,6 @@ def validate(x):
 default_options.register_option("serialize_method", "pickle")

 # dataframe-related options
-default_options.register_option(
-    "dataframe.mode.use_inf_as_na", False, validator=is_bool
-)
 default_options.register_option(
     "dataframe.use_arrow_dtype", None, validator=any_validator(is_null, is_bool)
 )
diff --git a/python/xorbits/_mars/core/base.py b/python/xorbits/_mars/core/base.py
index 1513a4cc8..dbc71959c 100644
--- a/python/xorbits/_mars/core/base.py
+++ b/python/xorbits/_mars/core/base.py
@@ -94,7 +94,7 @@ def __copy__(self):
         return self.copy()

     def copy(self):
-        return self.copy_to(type(self)(_key=self.key))
+        return self.copy_to(type(self)())

     def copy_to(self, target: "Base"):
         target_fields = target._FIELDS
diff --git a/python/xorbits/_mars/core/entity/tileables.py b/python/xorbits/_mars/core/entity/tileables.py
index 0a4feda6d..b43ade6ca 100644
--- a/python/xorbits/_mars/core/entity/tileables.py
+++ b/python/xorbits/_mars/core/entity/tileables.py
@@ -364,7 +364,14 @@ def __copy__(self):
     def _view(self):
         return super().copy()

-    def copy(self: TileableType) -> TileableType:
+    def copy(self: TileableType, **kw) -> TileableType:
+        from ...dataframe import Index
+        from ...deploy.oscar.session import SyncSession
+
+        new_name = None
+        if isinstance(self, Index):
+            new_name = kw.pop("name", None)
+
         new_op = self.op.copy()
         if new_op.create_view:
             # if the operand is a view, make it a copy
@@ -378,6 +385,24 @@ def copy(self: TileableType) -> TileableType:
         new_outs = new_op.new_tileables(
             self.op.inputs, kws=params, output_limit=len(params)
         )
+
+        sess = self._executed_sessions[-1] if self._executed_sessions else None
+        to_incref_keys = []
+        for _out in new_outs:
+            if sess:
+                _out._attach_session(sess)
+                to_incref_keys.append(_out.key)
+                if self.data in sess._tileable_to_fetch:
+                    sess._tileable_to_fetch[_out.data] = sess._tileable_to_fetch[
+                        self.data
+                    ]
+            if new_name:
+                _out.name = new_name
+
+        if to_incref_keys:
+            assert sess is not None
+            SyncSession.from_isolated_session(sess).incref(*to_incref_keys)
+
         pos = -1
         for i, out in enumerate(self.op.outputs):
             # create a ref to copied one
diff --git a/python/xorbits/_mars/dataframe/base/cartesian_chunk.py b/python/xorbits/_mars/dataframe/base/cartesian_chunk.py
index 6da9acd21..774f6747f 100644
--- a/python/xorbits/_mars/dataframe/base/cartesian_chunk.py
+++ b/python/xorbits/_mars/dataframe/base/cartesian_chunk.py
@@ -13,6 +13,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import logging
+
 import numpy as np
 import pandas as pd

@@ -22,11 +24,13 @@
 from ...serialization.serializables import (
     DictField,
     FunctionField,
+    Int32Field,
     KeyField,
+    StringField,
     TupleField,
 )
 from ...utils import enter_current_session, has_unknown_shape, quiet_stdio
-from ..operands import DataFrameOperand, DataFrameOperandMixin, OutputType
+from ..operands import DataFrameOperand, OutputType
 from ..utils import (
     build_df,
     build_empty_df,
@@ -34,63 +38,31 @@
     parse_index,
     validate_output_types,
 )
+from .core import DataFrameAutoMergeMixin
+
+logger = logging.getLogger(__name__)


-class DataFrameCartesianChunk(DataFrameOperand, DataFrameOperandMixin):
+class DataFrameCartesianChunk(DataFrameOperand, DataFrameAutoMergeMixin):
     _op_type_ = opcodes.CARTESIAN_CHUNK

-    _left = KeyField("left")
-    _right = KeyField("right")
-    _func = FunctionField("func")
-    _args = TupleField("args")
-    _kwargs = DictField("kwargs")
+    left = KeyField("left")
+    right = KeyField("right")
+    func = FunctionField("func")
+    args = TupleField("args")
+    kwargs = DictField("kwargs")
+    auto_merge = StringField("auto_merge")
+    auto_merge_threshold = Int32Field("auto_merge_threshold")

-    def __init__(
-        self,
-        left=None,
-        right=None,
-        func=None,
-        args=None,
-        kwargs=None,
-        output_types=None,
-        **kw
-    ):
-        super().__init__(
-            _left=left,
-            _right=right,
-            _func=func,
-            _args=args,
-            _kwargs=kwargs,
-            _output_types=output_types,
-            **kw
-        )
+    def __init__(self, output_types=None, **kw):
+        super().__init__(_output_types=output_types, **kw)
         if self.memory_scale is None:
             self.memory_scale = 2.0

-    @property
-    def left(self):
-        return self._left
-
-    @property
-    def right(self):
-        return self._right
-
-    @property
-    def func(self):
-        return self._func
-
-    @property
-    def args(self):
-        return self._args
-
-    @property
-    def kwargs(self):
-        return self._kwargs
-
     def _set_inputs(self, inputs):
         super()._set_inputs(inputs)
-        self._left = self._inputs[0]
-        self._right = self._inputs[1]
+        self.left = self.inputs[0]
+        self.right = self.inputs[1]

     @staticmethod
     def _build_test_obj(obj):
@@ -103,7 +75,7 @@ def _build_test_obj(obj):
     def __call__(self, left, right, index=None, dtypes=None):
         test_left = self._build_test_obj(left)
         test_right = self._build_test_obj(right)
-        output_type = self._output_types[0] if self._output_types else None
+        output_type = self.output_types[0] if self.output_types else None

         if output_type == OutputType.df_or_series:
             return self.new_df_or_series([left, right])
@@ -111,7 +83,7 @@ def __call__(self, left, right, index=None, dtypes=None):
         # try run to infer meta
         try:
             with np.errstate(all="ignore"), quiet_stdio():
-                obj = self._func(test_left, test_right, *self._args, **self._kwargs)
+                obj = self.func(test_left, test_right, *self.args, **self.kwargs)
         except:  # noqa: E722  # nosec  # pylint: disable=bare-except
             if output_type == OutputType.series:
                 obj = pd.Series([], dtype=np.dtype(object))
@@ -126,11 +98,11 @@ def __call__(self, left, right, index=None, dtypes=None):
                 )

         if getattr(obj, "ndim", 0) == 1 or output_type == OutputType.series:
-            shape = self._kwargs.pop("shape", (np.nan,))
+            shape = self.kwargs.pop("shape", (np.nan,))
             if index is None:
                 index = obj.index
             index_value = parse_index(
-                index, left, right, self._func, self._args, self._kwargs
+                index, left, right, self.func, self.args, self.kwargs
             )
             return self.new_series(
                 [left, right],
@@ -147,7 +119,7 @@ def __call__(self, left, right, index=None, dtypes=None):
             if index is None:
                 index = obj.index
             index_value = parse_index(
-                index, left, right, self._func, self._args, self._kwargs
+                index, left, right, self.func, self.args, self.kwargs
             )
             return self.new_dataframe(
                 [left, right],
@@ -164,6 +136,14 @@ def tile(cls, op: "DataFrameCartesianChunk"):
         out = op.outputs[0]
         out_type = op.output_types[0]

+        auto_merge_threshold = op.auto_merge_threshold
+        auto_merge_before, auto_merge_after = cls._get_auto_merge_options(op.auto_merge)
+
+        merge_before_res = yield from cls._merge_before(
+            op, auto_merge_before, auto_merge_threshold, left, right, logger
+        )
+        left, right = merge_before_res[0], merge_before_res[1]
+
         if left.ndim == 2 and left.chunk_shape[1] > 1:
             if has_unknown_shape(left):
                 yield
@@ -240,7 +220,12 @@ def tile(cls, op: "DataFrameCartesianChunk"):
         params["nsplits"] = tuple(tuple(ns) for ns in nsplits) if nsplits else nsplits
         params["chunks"] = out_chunks
         new_op = op.copy()
-        return new_op.new_tileables(op.inputs, kws=[params])
+        ret = new_op.new_tileables(op.inputs, kws=[params])
+
+        ret = yield from cls._merge_after(
+            op, auto_merge_after, auto_merge_threshold, ret, logger
+        )
+        return ret

     @classmethod
     @redirect_custom_log
@@ -250,7 +235,16 @@ def execute(cls, ctx, op: "DataFrameCartesianChunk"):
         ctx[op.outputs[0].key] = op.func(left, right, *op.args, **(op.kwargs or dict()))


-def cartesian_chunk(left, right, func, skip_infer=False, args=(), **kwargs):
+def cartesian_chunk(
+    left,
+    right,
+    func,
+    skip_infer=False,
+    args=(),
+    auto_merge: str = "both",
+    auto_merge_threshold: int = 8,
+    **kwargs,
+):
     output_type = kwargs.pop("output_type", None)
     output_types = kwargs.pop("output_types", None)
     object_type = kwargs.pop("object_type", None)
@@ -265,6 +259,10 @@ def cartesian_chunk(left, right, func, skip_infer=False, args=(), **kwargs):
     index = kwargs.pop("index", None)
     dtypes = kwargs.pop("dtypes", None)
     memory_scale = kwargs.pop("memory_scale", None)
+    if auto_merge not in ["both", "none", "before", "after"]:  # pragma: no cover
+        raise ValueError(
+            f"auto_merge can only be `both`, `none`, `before` or `after`, got {auto_merge}"
+        )

     op = DataFrameCartesianChunk(
         left=left,
@@ -274,5 +272,7 @@ def cartesian_chunk(left, right, func, skip_infer=False, args=(), **kwargs):
         kwargs=kwargs,
         output_types=output_types,
         memory_scale=memory_scale,
+        auto_merge=auto_merge,
+        auto_merge_threshold=auto_merge_threshold,
     )
     return op(left, right, index=index, dtypes=dtypes)
diff --git a/python/xorbits/_mars/dataframe/base/core.py b/python/xorbits/_mars/dataframe/base/core.py
index b7f529dc1..57796babc 100644
--- a/python/xorbits/_mars/dataframe/base/core.py
+++ b/python/xorbits/_mars/dataframe/base/core.py
@@ -13,9 +13,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+from __future__ import annotations
+
+import logging
+
+from ...core import TileStatus
+from ...core.context import get_context
 from ...serialization.serializables import KeyField
+from ...typing import OperandType, TileableType
 from ..core import DATAFRAME_TYPE, SERIES_TYPE
 from ..operands import DataFrameOperand, DataFrameOperandMixin
+from ..utils import auto_merge_chunks


 class DataFrameDeviceConversionBase(DataFrameOperand, DataFrameOperandMixin):
@@ -63,3 +71,94 @@ def tile(cls, op):
         return new_op.new_tileables(
             op.inputs, chunks=out_chunks, nsplits=op.inputs[0].nsplits, **out.params
         )
+
+
+class DataFrameAutoMergeMixin(DataFrameOperandMixin):
+    @classmethod
+    def _get_auto_merge_options(cls, auto_merge: str) -> tuple[bool, bool]:
+        if auto_merge == "both":
+            return True, True
+        elif auto_merge == "none":
+            return False, False
+        elif auto_merge == "before":
+            return True, False
+        else:
+            assert auto_merge == "after"
+            return False, True
+
+    @classmethod
+    def _merge_before(
+        cls,
+        op: OperandType,
+        auto_merge_before: bool,
+        auto_merge_threshold: int,
+        left: TileableType,
+        right: TileableType,
+        logger: logging.Logger,
+    ):
+        ctx = get_context()
+
+        if (
+            auto_merge_before
+            and len(left.chunks) + len(right.chunks) > auto_merge_threshold
+        ):
+            yield TileStatus([left, right] + left.chunks + right.chunks, progress=0.2)
+            left_chunk_size = len(left.chunks)
+            right_chunk_size = len(right.chunks)
+            left = auto_merge_chunks(ctx, left)
+            right = auto_merge_chunks(ctx, right)
+            logger.info(
+                "Auto merge before %s, left data shape: %s, chunk count: %s -> %s, "
+                "right data shape: %s, chunk count: %s -> %s.",
+                op,
+                left.shape,
+                left_chunk_size,
+                len(left.chunks),
+                right.shape,
+                right_chunk_size,
+                len(right.chunks),
+            )
+        else:
+            logger.info(
+                "Skip auto merge before %s, left data shape: %s, chunk count: %d, "
+                "right data shape: %s, chunk count: %d.",
+                op,
+                left.shape,
+                len(left.chunks),
+                right.shape,
+                len(right.chunks),
+            )
+        return [left, right]
+
+    @classmethod
+    def _merge_after(
+        cls,
+        op: OperandType,
+        auto_merge_after: bool,
+        auto_merge_threshold: int,
+        ret: TileableType,
+        logger: logging.Logger,
+    ):
+        if auto_merge_after and len(ret[0].chunks) > auto_merge_threshold:
+            # if how=="inner", output data size will reduce greatly with high probability,
+            # use auto_merge_chunks to combine small chunks.
+            yield TileStatus(
+                ret[0].chunks, progress=0.8
+            )  # trigger execution for chunks
+            merged = auto_merge_chunks(get_context(), ret[0])
+            logger.info(
+                "Auto merge after %s, data shape: %s, chunk count: %s -> %s.",
+                op,
+                merged.shape,
+                len(ret[0].chunks),
+                len(merged.chunks),
+            )
+            return [merged]
+        else:
+            logger.info(
+                "Skip auto merge after %s, data shape: %s, chunk count: %d.",
+                op,
+                ret[0].shape,
+                len(ret[0].chunks),
+            )
+            return ret
diff --git a/python/xorbits/_mars/dataframe/base/tests/test_base_execution.py b/python/xorbits/_mars/dataframe/base/tests/test_base_execution.py
index 23c19e122..529812ea5 100644
--- a/python/xorbits/_mars/dataframe/base/tests/test_base_execution.py
+++ b/python/xorbits/_mars/dataframe/base/tests/test_base_execution.py
@@ -1729,12 +1729,10 @@ def test_value_counts_execution(setup):
     r = series.value_counts()
     pd.testing.assert_series_equal(r.execute().fetch(), s.value_counts())

-    # pandas issue: https://github.com/pandas-dev/pandas/issues/54857
-    if pd.__version__ != "2.1.0":
-        r = series.value_counts(bins=5, normalize=True)
-        pd.testing.assert_series_equal(
-            r.execute().fetch(), s.value_counts(bins=5, normalize=True)
-        )
+    r = series.value_counts(bins=5, normalize=True)
+    pd.testing.assert_series_equal(
+        r.execute().fetch(), s.value_counts(bins=5, normalize=True)
+    )

     # test multi chunks
     series = from_pandas_series(s, chunk_size=30)
@@ -1746,11 +1744,10 @@ def test_value_counts_execution(setup):
     pd.testing.assert_series_equal(r.execute().fetch(), s.value_counts(normalize=True))

     # test bins and normalize
-    if pd.__version__ != "2.1.0":
-        r = series.value_counts(method="tree", bins=5, normalize=True)
-        pd.testing.assert_series_equal(
-            r.execute().fetch(), s.value_counts(bins=5, normalize=True)
-        )
+    r = series.value_counts(method="tree", bins=5, normalize=True)
+    pd.testing.assert_series_equal(
+        r.execute().fetch(), s.value_counts(bins=5, normalize=True)
+    )


 def test_astype(setup):
@@ -3185,3 +3182,53 @@ def test_nunique(setup, method, chunked, axis):
         raw_df.nunique(axis=axis),
         mdf.nunique(axis=axis, method=method).execute().fetch(),
     )
+
+
+@pytest.mark.parametrize("chunk_size", [None, 10])
+def test_copy_deep(setup, chunk_size):
+    ns = np.random.RandomState(0)
+    df = pd.DataFrame(ns.rand(100, 10), columns=["a" + str(i) for i in range(10)])
+    mdf = from_pandas_df(df, chunk_size=chunk_size)
+
+    # test case that there is no other result between copy and origin data
+    res = mdf.copy()
+    res["a0"] = res["a0"] + 1
+    dfc = df.copy(deep=True)
+    dfc["a0"] = dfc["a0"] + 1
+    pd.testing.assert_frame_equal(res.execute().fetch(), dfc)
+    pd.testing.assert_frame_equal(mdf.execute().fetch(), df)
+
+    s = pd.Series(ns.randint(0, 100, size=(100,)))
+    ms = from_pandas_series(s, chunk_size=chunk_size)
+
+    res = ms.copy()
+    res.iloc[0] = 111.0
+    sc = s.copy(deep=True)
+    sc.iloc[0] = 111.0
+    pd.testing.assert_series_equal(res.execute().fetch(), sc)
+    pd.testing.assert_series_equal(ms.execute().fetch(), s)
+
+    index = pd.Index([i for i in range(100)], name="test")
+    m_index = from_pandas_index(index, chunk_size=chunk_size)
+
+    res = m_index.copy()
+    assert res is not m_index
+    pd.testing.assert_index_equal(res.execute().fetch(), index.copy())
+    pd.testing.assert_index_equal(m_index.execute().fetch(), index)
+
+    res = m_index.copy(name="abc")
+    pd.testing.assert_index_equal(res.execute().fetch(), index.copy(name="abc"))
+    pd.testing.assert_index_equal(m_index.execute().fetch(), index)
+
+    # test case that there is other ops between copy and origin data
+    xdf = (mdf + 1) * 2 / 7
+    expected = (df + 1) * 2 / 7
+    pd.testing.assert_frame_equal(xdf.execute().fetch(), expected)
+
+    xdf_c = xdf.copy()
+    expected_c = expected.copy(deep=True)
+    pd.testing.assert_frame_equal(xdf_c.execute().fetch(), expected)
+    xdf_c["a1"] = xdf_c["a1"] + 0.8
+    expected_c["a1"] = expected_c["a1"] + 0.8
+    pd.testing.assert_frame_equal(xdf_c.execute().fetch(), expected_c)
+    pd.testing.assert_frame_equal(xdf.execute().fetch(), expected)
diff --git a/python/xorbits/_mars/dataframe/base/value_counts.py b/python/xorbits/_mars/dataframe/base/value_counts.py
index 457250014..ae80f3db3 100644
--- a/python/xorbits/_mars/dataframe/base/value_counts.py
+++ b/python/xorbits/_mars/dataframe/base/value_counts.py
@@ -193,6 +193,9 @@ def execute(cls, ctx, op: "DataFrameValueCounts"):
             # convert CategoricalDtype which generated in `cut`
             # to IntervalDtype
             result.index = result.index.astype("interval")
+            # index name changed since pandas 2.1.1
+            if pd_release_version >= (2, 1, 1):
+                result.index.name = None
         if op.nrows:
             result = result.head(op.nrows)
         result.name = op.outputs[0].name
diff --git a/python/xorbits/_mars/dataframe/core.py b/python/xorbits/_mars/dataframe/core.py
index c4facdcd7..ff0b6d9fc 100644
--- a/python/xorbits/_mars/dataframe/core.py
+++ b/python/xorbits/_mars/dataframe/core.py
@@ -954,6 +954,16 @@ def __str__(self):
     def __repr__(self):
         return self._to_str(representation=True)

+    def _to_arr(self):
+        if len(self._executed_sessions) == 0:  # pragma: no cover
+            raise NotImplementedError
+
+        data = self.fetch(session=self._executed_sessions[-1])
+        return np.asarray(data)
+
+    def __array__(self):
+        return self._to_arr()
+
     def _to_mars_tensor(self, dtype=None, order="K", extract_multi_index=False):
         tensor = self.to_tensor(extract_multi_index=extract_multi_index)
         dtype = dtype if dtype is not None else tensor.dtype
@@ -1157,6 +1167,37 @@ def to_series(self, index=None, name=None):

         return series_from_index(self, index=index, name=name)

+    def copy(self, name=None, deep=False):
+        """
+        Make a copy of this object.
+
+        Name is set on the new object.
+
+        Parameters
+        ----------
+        name : Label, optional
+            Set name for new object.
+        deep : bool, default False
+
+        Returns
+        -------
+        Index
+            Index refers to new object which is a copy of this object.
+
+        Notes
+        -----
+        In most cases, there should be no functional difference from using
+        ``deep``, but if ``deep`` is passed it will attempt to deepcopy.
+
+        Examples
+        --------
+        >>> idx = pd.Index(['a', 'b', 'c'])
+        >>> new_idx = idx.copy()
+        >>> idx is new_idx
+        False
+        """
+        return super().copy(name=name)
+

 class RangeIndex(Index):
     __slots__ = ()
@@ -1414,6 +1455,16 @@ def __str__(self):
     def __repr__(self):
         return self._to_str(representation=False)

+    def _to_arr(self):
+        if len(self._executed_sessions) == 0:  # pragma: no cover
+            raise NotImplementedError
+
+        data = self.fetch(session=self._executed_sessions[-1])
+        return np.asarray(data)
+
+    def __array__(self):
+        return self._to_arr()
+
     @property
     def dtype(self):
         return getattr(self, "_dtype", None) or getattr(self.op, "dtype", None)
@@ -1571,10 +1622,9 @@ def copy(self, deep=True):  # pylint: disable=arguments-differ
         copy : Series or DataFrame
             Object type matches caller.
""" - if deep: - return super().copy() - else: - return super()._view() + if deep is False: + raise NotImplementedError("Not support `deep=False` for now") + return super().copy() def __len__(self): return len(self._data) @@ -2598,6 +2648,11 @@ def apply_if_callable(maybe_callable, obj, **kwargs): data[k] = apply_if_callable(v, data) return data + def copy(self, deep=True): + if deep is False: + raise NotImplementedError("Not support `deep=False` for now") + return super().copy() + class DataFrameGroupByChunkData(BaseDataFrameChunkData): type_name = "DataFrameGroupBy" diff --git a/python/xorbits/_mars/dataframe/datasource/core.py b/python/xorbits/_mars/dataframe/datasource/core.py index dad9f2049..2d9b0e828 100644 --- a/python/xorbits/_mars/dataframe/datasource/core.py +++ b/python/xorbits/_mars/dataframe/datasource/core.py @@ -134,7 +134,10 @@ def post_tile(cls, op: OperandType, results: List[TileableType]): if ( op.incremental_index and results is not None - and isinstance(results[0].index_value.value, IndexValue.RangeIndex) + and ( + results[0].index_value is None + or isinstance(results[0].index_value.value, IndexValue.RangeIndex) + ) ): result = results[0] chunks = [] @@ -159,7 +162,10 @@ def pre_execute(cls, ctx: Union[dict, Context], op: OperandType): out = op.outputs[0] if ( op.incremental_index - and isinstance(out.index_value.value, IndexValue.RangeIndex) + and ( + out.index_value is None + or isinstance(out.index_value.value, IndexValue.RangeIndex) + ) and getattr(op, "incremental_index_recorder_name", None) ): index = out.index[0] @@ -173,7 +179,10 @@ def post_execute(cls, ctx: Union[dict, Context], op: OperandType): result = ctx[out.key] if ( op.incremental_index - and isinstance(out.index_value.value, IndexValue.RangeIndex) + and ( + out.index_value is None + or isinstance(out.index_value.value, IndexValue.RangeIndex) + ) and getattr(op, "incremental_index_recorder_name", None) ): recorder_name = op.incremental_index_recorder_name diff --git a/python/xorbits/_mars/dataframe/datasource/read_csv.py b/python/xorbits/_mars/dataframe/datasource/read_csv.py index b0ea4666e..43a29c47e 100644 --- a/python/xorbits/_mars/dataframe/datasource/read_csv.py +++ b/python/xorbits/_mars/dataframe/datasource/read_csv.py @@ -103,6 +103,7 @@ class DataFrameReadCSV( sep = StringField("sep") header = AnyField("header") index_col = Int32Field("index_col") + index_names = ListField("index_names") skiprows = Int32Field("skiprows") compression = StringField("compression") usecols = AnyField("usecols") @@ -114,6 +115,7 @@ class DataFrameReadCSV( storage_options = DictField("storage_options") merge_small_files = BoolField("merge_small_files") merge_small_file_options = DictField("merge_small_file_options") + is_http_url = BoolField("is_http_url", None) def get_columns(self): return self.usecols @@ -150,8 +152,32 @@ def _tile_compressed(cls, op): nsplits=nsplits, ) + @classmethod + def _tile_http_url(cls, op: "DataFrameReadCSV"): + out_chunks = [] + out_df = op.outputs[0] + for i, url in enumerate(op.path): + chunk_op = op.copy().reset_key() + chunk_op.path = url + out_chunks.append( + chunk_op.new_chunk(None, index=(i, 0), shape=(np.nan, np.nan)) + ) + new_op = op.copy() + nsplits = ((np.nan,) * len(out_chunks), (np.nan,)) + return new_op.new_dataframes( + None, + out_df.shape, + dtypes=out_df.dtypes, + index_value=out_df.index_value, + columns_value=out_df.columns_value, + chunks=out_chunks, + nsplits=nsplits, + ) + @classmethod def _tile(cls, op: "DataFrameReadCSV"): + if op.is_http_url: + return 
cls._tile_http_url(op) if op.compression: return cls._tile_compressed(op) @@ -266,6 +292,7 @@ def _pandas_read_csv(cls, f, op): nrows=op.nrows, **csv_kwargs, ) + df.index.names = op.index_names if op.keep_usecols_order: df = df[op.usecols] return df @@ -300,8 +327,34 @@ def _cudf_read_csv(cls, op): # pragma: no cover df = df[op.usecols] return df + @classmethod + def _execute_http_url(cls, ctx, op): + xdf = cudf if op.gpu else pd + out_df = op.outputs[0] + csv_kwargs = op.extra_params.copy() + if xdf is pd and op.use_arrow_dtype: + csv_kwargs.update(arrow_dtype_kwargs()) + df = xdf.read_csv( + op.path, + sep=op.sep, + names=op.names, + header=op.header, + index_col=op.index_col, + usecols=op.usecols, + nrows=op.nrows, + compression=op.compression, + **csv_kwargs, + ) + if op.keep_usecols_order: + df = df[op.usecols] + ctx[out_df.key] = df + @classmethod def execute(cls, ctx, op): + if op.is_http_url: + cls._execute_http_url(ctx, op) + return + xdf = cudf if op.gpu else pd out_df = op.outputs[0] csv_kwargs = op.extra_params.copy() @@ -330,6 +383,8 @@ def execute(cls, ctx, op): ctx[out_df.key] = df def estimate_size(cls, ctx, op): + if op.is_http_url: + return super().estimate_size(ctx, op) phy_size = op.size * (op.memory_scale or 1) ctx[op.outputs[0].key] = (phy_size, phy_size * 2) @@ -337,7 +392,10 @@ def __call__( self, index_value=None, columns_value=None, dtypes=None, chunk_bytes=None ): self._output_types = [OutputType.dataframe] - shape = (np.nan, len(dtypes)) + if dtypes is not None: + shape = (np.nan, len(dtypes)) + else: + shape = (np.nan, np.nan) return self.new_dataframe( None, shape, @@ -352,7 +410,7 @@ def read_csv( path: str, names: Union[List, Tuple] = None, sep: str = ",", - index_col: int = None, + index_col: Union[int, str, List[int], List[str]] = None, compression: str = None, header: Union[str, List] = "infer", dtype: Union[str, Dict] = None, @@ -657,6 +715,33 @@ def read_csv( """ if use_arrow_dtype is None: use_arrow_dtype = options.dataframe.use_arrow_dtype + + single_path = path[0] if isinstance(path, (list, tuple)) else path + if isinstance(single_path, str) and ( + single_path.startswith("http://") or single_path.startswith("https://") + ): + urls = path if isinstance(path, (list, tuple)) else [path] + op = DataFrameReadCSV( + path=urls, + names=names, + sep=sep, + header=header, + index_col=index_col, + usecols=usecols, + skiprows=skiprows, + compression=compression, + gpu=gpu, + incremental_index=incremental_index, + use_arrow_dtype=use_arrow_dtype, + storage_options=storage_options, + memory_scale=memory_scale, + merge_small_files=merge_small_files, + merge_small_file_options=merge_small_file_options, + is_http_url=True, + **kwargs, + ) + return op() + # infer dtypes and columns if isinstance(path, (list, tuple)): file_path = path[0] @@ -709,8 +794,8 @@ def read_csv( else: index_value = parse_index(mini_df.index) columns_value = parse_index(mini_df.columns, store_data=True) - if index_col and not isinstance(index_col, int): - index_col = list(mini_df.columns).index(index_col) + # Set names and index_col may lose multiindex names, so we have to fix it. 
+ index_names = mini_df.index.names # convert path to abs_path abs_path = convert_to_abspath(path, storage_options) @@ -721,6 +806,7 @@ def read_csv( sep=sep, header=header, index_col=index_col, + index_names=index_names, usecols=usecols, skiprows=skiprows, compression=compression, diff --git a/python/xorbits/_mars/dataframe/datasource/read_parquet.py b/python/xorbits/_mars/dataframe/datasource/read_parquet.py index bd4e65b39..a7d317095 100644 --- a/python/xorbits/_mars/dataframe/datasource/read_parquet.py +++ b/python/xorbits/_mars/dataframe/datasource/read_parquet.py @@ -51,7 +51,7 @@ ) from ...utils import is_object_dtype, lazy_import from ..operands import OutputType -from ..utils import arrow_dtype_kwargs, parse_index +from ..utils import PD_VERSION_GREATER_THAN_2_10, arrow_dtype_kwargs, parse_index from .core import ( ColumnPruneSupportedDataSourceMixin, IncrementalIndexDatasource, @@ -391,8 +391,7 @@ def _tile_no_partitioned(cls, op: "DataFrameReadParquet"): paths = sorted(paths) if not isinstance(fs, fsspec.implementations.local.LocalFileSystem): parsed_path = urlparse(op.path) - path_prefix = f"{parsed_path.scheme}://{parsed_path.netloc}" - paths = [path_prefix + path for path in paths] + paths = [f"{parsed_path.scheme}://{path}" for path in paths] elif isinstance(op.path, str) and op.path.endswith(".zip"): file = fs.open(op.path, storage_options=op.storage_options) z = zipfile.ZipFile(file) @@ -406,8 +405,7 @@ def _tile_no_partitioned(cls, op: "DataFrameReadParquet"): paths = fs.glob(op.path, storage_options=op.storage_options) if not isinstance(fs, fsspec.implementations.local.LocalFileSystem): parsed_path = urlparse(op.path) - path_prefix = f"{parsed_path.scheme}://{parsed_path.netloc}" - paths = [path_prefix + path for path in paths] + paths = [f"{parsed_path.scheme}://{path}" for path in paths] first_chunk_row_num, first_chunk_raw_bytes = None, None for i, pth in enumerate(paths): if i == 0: @@ -416,7 +414,7 @@ def _tile_no_partitioned(cls, op: "DataFrameReadParquet"): first_chunk_row_num = get_engine(op.engine).get_row_num(f) first_chunk_raw_bytes = sys.getsizeof(f) else: - of = fsspec.open(pth, storage_options=op.storage_options) + of = fsspec.open(pth) with of as f: first_chunk_row_num = get_engine(op.engine).get_row_num(f) first_chunk_raw_bytes = fsspec.get_fs_token_paths( @@ -778,7 +776,7 @@ def read_parquet( If index_col not specified, ensure range index incremental, gain a slightly better performance if setting False. use_arrow_dtype: bool, default None - If True, use arrow dtype to store columns. + If True, use arrow dtype to store columns. Default enabled if pandas >= 2.1 storage_options: dict, optional Options for storage connection. 
memory_scale: int, optional @@ -798,6 +796,10 @@ def read_parquet( engine_type = check_engine(engine) engine = get_engine(engine_type) + # We enable arrow dtype by default if pandas >= 2.1 + if use_arrow_dtype is None and engine_type == "pyarrow": + use_arrow_dtype = PD_VERSION_GREATER_THAN_2_10 + single_path = path[0] if isinstance(path, list) else path is_partitioned = False if isinstance(single_path, str) and ( @@ -830,6 +832,10 @@ def read_parquet( raise ValueError( f"The 'use_arrow_dtype' argument is not supported for the {engine_type} engine" ) + # We enable arrow dtype by default if pandas >= 2.1 + if use_arrow_dtype is None: + use_arrow_dtype = PD_VERSION_GREATER_THAN_2_10 + types_mapper = pd.ArrowDtype if use_arrow_dtype else None if fs.isdir(single_path): @@ -849,11 +855,6 @@ def read_parquet( else: if not isinstance(path, list): file_path = fs.glob(path, storage_options=storage_options)[0] - if not isinstance(fs, fsspec.implementations.local.LocalFileSystem): - parsed_path = urlparse(path) - path_prefix = f"{parsed_path.scheme}://{parsed_path.netloc}" - file_path = path_prefix + file_path - else: file_path = path[0] with fs.open(file_path, storage_options=storage_options) as f: diff --git a/python/xorbits/_mars/dataframe/datasource/tests/test_datasource.py b/python/xorbits/_mars/dataframe/datasource/tests/test_datasource.py index 074f5f1cb..7bb9358ae 100644 --- a/python/xorbits/_mars/dataframe/datasource/tests/test_datasource.py +++ b/python/xorbits/_mars/dataframe/datasource/tests/test_datasource.py @@ -674,14 +674,14 @@ def test_read_parquet_estimate_size(): chunk = tiled.chunks[0] estimate_size(sizes, chunk.op) estimated_size = sizes[chunk.key][0] - assert estimated_size > test_df.memory_usage(deep=True).sum() * 1.5 + assert estimated_size >= test_df.memory_usage(deep=True).sum() * 1.5 df = read_parquet(file_path, columns=["a", "c"]) tiled = tile(df) sizes = dict() chunk = tiled.chunks[0] estimate_size(sizes, chunk.op) - assert sizes[chunk.key][0] < estimated_size * (2 / 3) + assert sizes[chunk.key][0] <= estimated_size * (2 / 3) df = read_parquet(file_path, use_arrow_dtype=True) tiled = tile(df) @@ -690,7 +690,7 @@ def test_read_parquet_estimate_size(): estimate_size(sizes, chunk.op) estimated_size_arrow = sizes[chunk.key][0] estimated_size_arrow < estimated_size - assert estimated_size_arrow > test_df.memory_usage(deep=True).sum() * 1.5 + assert estimated_size_arrow >= test_df.memory_usage(deep=True).sum() * 1.5 df = read_parquet(file_path, use_arrow_dtype=True, columns=["a", "c"]) tiled = tile(df) @@ -698,4 +698,4 @@ def test_read_parquet_estimate_size(): chunk = tiled.chunks[0] estimate_size(sizes, chunk.op) estimated_size_arrow = sizes[chunk.key][0] - assert sizes[chunk.key][0] < estimated_size * 2 / 3 + assert sizes[chunk.key][0] <= estimated_size * 2 / 3 diff --git a/python/xorbits/_mars/dataframe/datasource/tests/test_datasource_execution.py b/python/xorbits/_mars/dataframe/datasource/tests/test_datasource_execution.py index f7d6ff18d..1e7148934 100644 --- a/python/xorbits/_mars/dataframe/datasource/tests/test_datasource_execution.py +++ b/python/xorbits/_mars/dataframe/datasource/tests/test_datasource_execution.py @@ -48,7 +48,7 @@ from ....config import option_context from ....tests.core import require_cudf, require_cupy from ....utils import get_next_port, pd_release_version -from ...utils import is_pandas_2 +from ...utils import PD_VERSION_GREATER_THAN_2_10, is_pandas_2 from ..dataframe import from_pandas as from_pandas_df from ..from_records import from_records 
 from ..from_tensor import dataframe_from_1d_tileables, dataframe_from_tensor
@@ -606,6 +606,10 @@ def test_read_csv_execution(setup):
         mdf2 = md.read_csv(file_path, index_col=0, chunk_bytes=100).execute().fetch()
         pd.testing.assert_frame_equal(pdf, mdf2)

+        mdf3 = md.read_csv(file_path, index_col=[0, 1]).execute().fetch()
+        pdf3 = pd.read_csv(file_path, index_col=[0, 1])
+        pd.testing.assert_frame_equal(pdf3, mdf3)
+
     # test nan
     with tempfile.TemporaryDirectory() as tempdir:
         file_path = os.path.join(tempdir, "test.csv")
@@ -1295,6 +1299,8 @@ def test_read_parquet_arrow(setup, engine):
             "c": np.random.rand(10),
         }
     )
+    if PD_VERSION_GREATER_THAN_2_10 and engine != "fastparquet":
+        test_df = test_df.convert_dtypes(dtype_backend="pyarrow")

     with tempfile.TemporaryDirectory() as tempdir:
         file_path = os.path.join(tempdir, "test.parquet")
@@ -1352,6 +1358,9 @@ def test_read_parquet_arrow(setup, engine):
             }
         )

+        if PD_VERSION_GREATER_THAN_2_10 and engine != "fastparquet":
+            df = df.convert_dtypes(dtype_backend="pyarrow")
+
         file_paths = [os.path.join(tempdir, f"test{i}.parquet") for i in range(3)]
         df[:100].to_parquet(file_paths[0], row_group_size=50)
         df[100:200].to_parquet(file_paths[1], row_group_size=30)
@@ -1447,6 +1456,8 @@ def test_read_parquet_zip(setup, engine):
             "c": np.random.rand(300),
         }
     )
+    if PD_VERSION_GREATER_THAN_2_10 and engine != "fastparquet":
+        df = df.convert_dtypes(dtype_backend="pyarrow")

     file_paths = [os.path.join(tempdir, f"test{i}.parquet") for i in range(3)]
     df[:100].to_parquet(file_paths[0], row_group_size=50)
@@ -1545,7 +1556,9 @@ def test_read_parquet_fast_parquet(setup):
     # assert sum(s[0] for s in size_res) > test_df.memory_usage(deep=True).sum()


-def _start_tornado(port: int, file_path0: str, file_path1: str, zip_path: str):
+def _start_tornado(
+    port: int, file_path0: str, file_path1: str, csv_path: str, zip_path: str
+):
     import tornado.ioloop
     import tornado.web

@@ -1559,6 +1572,11 @@ def get(self):
             with open(file_path1, "rb") as f:
                 self.write(f.read())

+    class CSVHandler(tornado.web.RequestHandler):
+        def get(self):
+            with open(csv_path, "rb") as f:
+                self.write(f.read())
+
     class RangeZipFileHandler(tornado.web.RequestHandler):
         def get(self):
             file_path = zip_path
@@ -1596,6 +1614,7 @@ def parse_range_header(self, range_header):
             (r"/read-parquet0", Parquet0Handler),
             (r"/read-parquet1", Parquet1Handler),
             (r"/test.zip", RangeZipFileHandler),
+            (r"/read-csv", CSVHandler),
         ]
     )
     app.listen(port)
@@ -1607,6 +1626,7 @@ def start_http_server():
     with tempfile.TemporaryDirectory() as tempdir:
         file_path0 = os.path.join(tempdir, "test0.parquet")
         file_path1 = os.path.join(tempdir, "test1.parquet")
+        csv_path = os.path.join(tempdir, "test.csv")

         df = pd.DataFrame(
             {
@@ -1617,6 +1637,7 @@ def start_http_server():
         )
         df.iloc[:50].to_parquet(file_path0)
         df.iloc[50:].to_parquet(file_path1)
+        df.to_csv(csv_path)

         import zipfile

         zip_path = os.path.join(tempdir, "test.zip")
@@ -1627,7 +1648,8 @@ def start_http_server():

         port = get_next_port()
         proc = multiprocessing.Process(
-            target=_start_tornado, args=(port, file_path0, file_path1, zip_path)
+            target=_start_tornado,
+            args=(port, file_path0, file_path1, csv_path, zip_path),
         )
         proc.daemon = True
         proc.start()
@@ -1635,13 +1657,15 @@ def start_http_server():
         yield df, [
             f"http://127.0.0.1:{port}/read-parquet0",
             f"http://127.0.0.1:{port}/read-parquet1",
-        ], f"http://127.0.0.1:{port}/test.zip"
+        ], f"http://127.0.0.1:{port}/test.zip", f"http://127.0.0.1:{port}/read-csv"

         # Terminate the process
         proc.terminate()


 def test_read_parquet_with_http_url(setup, start_http_server):
-    df, urls, zip_url = start_http_server
+    df, urls, zip_url, _ = start_http_server
+    if PD_VERSION_GREATER_THAN_2_10:
+        df = df.convert_dtypes(dtype_backend="pyarrow")
     mdf = md.read_parquet(urls).execute().fetch()
     pd.testing.assert_frame_equal(df, mdf)
     if is_pandas_2():
@@ -1766,6 +1790,8 @@ def test_read_parquet_ftp(ftp_writable, setup):
     host, port, user, pw = ftp_writable
     data = {"Column1": [1, 2, 3], "Column2": ["A", "B", "C"]}
     df = pd.DataFrame(data)
+    if PD_VERSION_GREATER_THAN_2_10:
+        df = df.convert_dtypes(dtype_backend="pyarrow")
     with tempfile.TemporaryDirectory() as tempdir:
         local_file_path = os.path.join(tempdir, "test.parquet")
         df.to_parquet("ftp://{}:{}@{}:{}/test.parquet".format(user, pw, host, port))
@@ -1788,3 +1814,30 @@ def test_read_parquet_ftp(ftp_writable, setup):
         "ftp://{}:{}@{}:{}/test.zip".format(user, pw, host, port)
     )
     pd.testing.assert_frame_equal(df, mdf_zip.to_pandas())
+
+
+def test_read_csv_http_url(setup, start_http_server):
+    df, _, _, csv_url = start_http_server
+    mdf = md.read_csv(csv_url)
+    pd.testing.assert_frame_equal(pd.read_csv(csv_url), mdf.execute().fetch())
+
+    mdf = md.read_csv(csv_url, names=["col1", "col2", "col3"])
+    pd.testing.assert_frame_equal(
+        pd.read_csv(csv_url, names=["col1", "col2", "col3"]), mdf.execute().fetch()
+    )
+
+    mdf = md.read_csv(csv_url, header=0)
+    pd.testing.assert_frame_equal(pd.read_csv(csv_url, header=0), mdf.execute().fetch())
+
+    mdf = md.read_csv(csv_url, header=None)
+    pd.testing.assert_frame_equal(
+        pd.read_csv(csv_url, header=None), mdf.execute().fetch()
+    )
+
+    if is_pandas_2():
+        df = df.convert_dtypes(dtype_backend="pyarrow")
+        mdf = md.read_csv(csv_url, use_arrow_dtype=True).execute().fetch()
+        pd.testing.assert_frame_equal(
+            pd.read_csv(csv_url, dtype_backend="pyarrow"), mdf
+        )
+        assert isinstance(mdf.dtypes.iloc[1], pd.ArrowDtype)
diff --git a/python/xorbits/_mars/dataframe/datasource/tests/test_datasource_hdfs.py b/python/xorbits/_mars/dataframe/datasource/tests/test_datasource_hdfs.py
index 29afdc4be..8bbbf1478 100644
--- a/python/xorbits/_mars/dataframe/datasource/tests/test_datasource_hdfs.py
+++ b/python/xorbits/_mars/dataframe/datasource/tests/test_datasource_hdfs.py
@@ -21,6 +21,7 @@
 import pytest

 from .... import dataframe as md
+from ....dataframe.utils import PD_VERSION_GREATER_THAN_2_10
 from ....tests.core import require_hadoop

 TEST_DIR = "/tmp/test"
@@ -124,7 +125,9 @@ def test_read_parquet_execution(setup, setup_hdfs):

     df = md.read_parquet(f"hdfs://localhost:8020{TEST_DIR}/test.parquet")
     res = df.to_pandas()
-    pd.testing.assert_frame_equal(res, test_df)
+    if PD_VERSION_GREATER_THAN_2_10:
+        expected = test_df.convert_dtypes(dtype_backend="pyarrow")
+    pd.testing.assert_frame_equal(res, expected)

     hdfs.mkdir(f"{TEST_DIR}/test_partitioned")

@@ -139,4 +142,7 @@ def test_read_parquet_execution(setup, setup_hdfs):

     df = md.read_parquet(f"hdfs://localhost:8020{TEST_DIR}/test_partitioned")
     res = df.to_pandas()
+    if PD_VERSION_GREATER_THAN_2_10:
+        test_df = test_df.convert_dtypes(dtype_backend="pyarrow")
+        test_df2 = test_df2.convert_dtypes(dtype_backend="pyarrow")
     pd.testing.assert_frame_equal(res, pd.concat([test_df, test_df2]))
diff --git a/python/xorbits/_mars/dataframe/datastore/tests/test_datastore_execution.py b/python/xorbits/_mars/dataframe/datastore/tests/test_datastore_execution.py
index 1ffdeaaac..485fa252b 100644
--- a/python/xorbits/_mars/dataframe/datastore/tests/test_datastore_execution.py
+++ b/python/xorbits/_mars/dataframe/datastore/tests/test_datastore_execution.py
@@ -40,6 +40,7 @@
 from .... import dataframe as md
 from ....tests.core import flaky
 from ... import DataFrame
+from ...utils import PD_VERSION_GREATER_THAN_2_10


 def test_to_csv_execution(setup):
@@ -182,7 +183,12 @@ def test_to_parquet_arrow_execution(setup):
     read_df = md.read_parquet(path)
     result = read_df.execute().fetch()
     result = result.sort_index()
-    pd.testing.assert_frame_equal(result, raw)
+    if PD_VERSION_GREATER_THAN_2_10:
+        expected = raw.convert_dtypes(dtype_backend="pyarrow")
+    else:
+        expected = raw
+
+    pd.testing.assert_frame_equal(result, expected)

     # test read_parquet then to_parquet
     read_df = md.read_parquet(path)
@@ -195,9 +201,11 @@ def test_to_parquet_arrow_execution(setup):
     read_df = md.read_parquet(path)
     result = read_df.execute().fetch()
     result["col3"] = result["col3"].astype("object")
+    if PD_VERSION_GREATER_THAN_2_10:
+        expected["col3"] = expected["col3"].astype("object")
     pd.testing.assert_frame_equal(
         result.sort_values("col1").reset_index(drop=True),
-        raw.sort_values("col1").reset_index(drop=True),
+        expected.sort_values("col1").reset_index(drop=True),
     )
diff --git a/python/xorbits/_mars/dataframe/datastore/tests/test_datastore_hdfs.py b/python/xorbits/_mars/dataframe/datastore/tests/test_datastore_hdfs.py
index 0341aab3d..78c5b75ae 100644
--- a/python/xorbits/_mars/dataframe/datastore/tests/test_datastore_hdfs.py
+++ b/python/xorbits/_mars/dataframe/datastore/tests/test_datastore_hdfs.py
@@ -19,6 +19,7 @@
 from .... import dataframe as md
 from ....tests.core import require_hadoop
+from ...utils import PD_VERSION_GREATER_THAN_2_10

 TEST_DIR = "/tmp/test"

@@ -55,6 +56,9 @@ def test_to_parquet_execution(setup, setup_hdfs):
     hdfs.mkdir(dir_name)
     df.to_parquet(dir_name).execute()

+    if PD_VERSION_GREATER_THAN_2_10:
+        test_df = test_df.convert_dtypes(dtype_backend="pyarrow")
+
     result = md.read_parquet(dir_name).to_pandas()
     pd.testing.assert_frame_equal(result.reset_index(drop=True), test_df)
diff --git a/python/xorbits/_mars/dataframe/groupby/__init__.py b/python/xorbits/_mars/dataframe/groupby/__init__.py
index 4b76ca6c4..c4369be4e 100644
--- a/python/xorbits/_mars/dataframe/groupby/__init__.py
+++ b/python/xorbits/_mars/dataframe/groupby/__init__.py
@@ -26,6 +26,7 @@ def _install():
     from .fill import bfill, ffill, fillna
     from .getitem import df_groupby_getitem
     from .head import head
+    from .nth import nth

     # Just for enabling custom agg function registration.
     # Therefore, del this immediately after import.
@@ -73,6 +74,7 @@ def _install():
         setattr(cls, "cumsum", cumsum)

         setattr(cls, "head", head)
+        setattr(cls, "nth", nth)
         setattr(cls, "rolling", rolling)
diff --git a/python/xorbits/_mars/dataframe/groupby/aggregation.py b/python/xorbits/_mars/dataframe/groupby/aggregation.py
index 1d67e0305..1d038944b 100644
--- a/python/xorbits/_mars/dataframe/groupby/aggregation.py
+++ b/python/xorbits/_mars/dataframe/groupby/aggregation.py
@@ -27,10 +27,10 @@
 from ...core import ENTITY_TYPE, OutputType
 from ...core.context import get_context
 from ...core.custom_log import redirect_custom_log
+from ...core.entity.utils import recursive_tile
 from ...core.operand import OperandStage
 from ...serialization.serializables import (
     AnyField,
-    BoolField,
     DictField,
     Int32Field,
     Int64Field,
@@ -170,7 +170,6 @@ class DataFrameGroupByAgg(DataFrameOperand, DataFrameOperandMixin):
     groupby_params = DictField("groupby_params")

     method = StringField("method")
-    use_inf_as_na = BoolField("use_inf_as_na")

     # for chunk
     combine_size = Int32Field("combine_size")
@@ -482,6 +481,7 @@ def _gen_map_chunks(
             # force as_index=True for map phase
             map_op.output_types = op.output_types
             map_op.groupby_params = map_op.groupby_params.copy()
+            map_op.raw_groupby_params = map_op.raw_groupby_params.copy()
             map_op.groupby_params["as_index"] = True
             if isinstance(map_op.groupby_params["by"], list):
                 by = []
@@ -493,6 +493,7 @@ def _gen_map_chunks(
                     else:
                         by.append(v)
                 map_op.groupby_params["by"] = by
+                map_op.raw_groupby_params["by"] = by
             map_op.stage = OperandStage.map
             map_op.pre_funcs = func_infos.pre_funcs
             map_op.agg_funcs = func_infos.agg_funcs
@@ -928,6 +929,20 @@ def tile(cls, op: "DataFrameGroupByAgg"):
             in_df = build_concatenated_rows_frame(in_df)
         out_df = op.outputs[0]

+        by = op.groupby_params["by"]
+        in_df_nsplits_settled: bool = all([not np.isnan(v) for v in in_df.nsplits[0]])
+        if isinstance(by, list):
+            for i, _by in enumerate(by):
+                if (
+                    isinstance(_by, ENTITY_TYPE)
+                    and all([not np.isnan(v) for v in _by.nsplits[0]])
+                    and in_df_nsplits_settled
+                ):
+                    by[i] = yield from recursive_tile(
+                        _by.rechunk({0: in_df.nsplits[0]})
+                    )
+                    yield by[i].chunks
+
         func_infos = cls._compile_funcs(op, in_df)

         if op.method == "auto":
@@ -945,6 +960,10 @@ def tile(cls, op: "DataFrameGroupByAgg"):
         else:  # pragma: no cover
             raise NotImplementedError

+    @classmethod
+    def _get_new_by_data(cls, by: List, ctx: Dict):
+        return [ctx[v.key] if isinstance(v, ENTITY_TYPE) else v for v in by]
+
     @classmethod
     def _get_grouped(cls, op: "DataFrameGroupByAgg", df, ctx, copy=False, grouper=None):
         if copy:
@@ -958,13 +977,7 @@ def _get_grouped(cls, op: "DataFrameGroupByAgg", df, ctx, copy=False, grouper=None):
             params["by"] = grouper
             params.pop("level", None)
         elif isinstance(params.get("by"), list):
-            new_by = []
-            for v in params["by"]:
-                if isinstance(v, ENTITY_TYPE):
-                    new_by.append(ctx[v.key])
-                else:
-                    new_by.append(v)
-            params["by"] = new_by
+            params["by"] = cls._get_new_by_data(params["by"], ctx)

         grouped = df.groupby(**params)

@@ -986,10 +999,23 @@ def _pack_inputs(agg_funcs: List[ReductionAggStep], in_data):
             pos += step.output_limit
         return out_dict

-    @staticmethod
+    @classmethod
     def _do_custom_agg(
-        func_name: str, op: "DataFrameGroupByAgg", in_data: pd.DataFrame
+        cls, func_name: str, op: "DataFrameGroupByAgg", in_data: pd.DataFrame, ctx: Dict
     ) -> Union[pd.Series, pd.DataFrame]:
+        # The aggregation must be given in tuple form, e.g. x=('col', 'agg_func_name');
+        # see the `is_funcs_aggregate` function. Otherwise the code never reaches
+        # here, or execution switches to transform instead.
+        if op.raw_func is None:
+            func_name = list(op.raw_func_kw.values())[0][1]
+        if (
+            func_name == "nunique"
+            and "by" in op.groupby_params
+            and isinstance(op.groupby_params["by"], list)
+        ):
+            op.raw_groupby_params["by"] = cls._get_new_by_data(
+                op.groupby_params["by"], ctx
+            )
         if op.stage == OperandStage.map:
             return custom_agg_functions[func_name].execute_map(op, in_data)
         elif op.stage == OperandStage.combine:
@@ -1107,7 +1133,7 @@ def _wrapped_func(col):
         ) in op.agg_funcs:
             input_obj = ret_map_groupbys[input_key]
             if map_func_name == "custom_reduction":
-                agg_dfs.append(cls._do_custom_agg(raw_func_name, op, in_data))
+                agg_dfs.append(cls._do_custom_agg(raw_func_name, op, in_data, ctx))
             else:
                 single_func = map_func_name == op.raw_func
                 agg_dfs.append(
@@ -1155,7 +1181,7 @@ def _execute_combine(cls, ctx, op: "DataFrameGroupByAgg"):
         ) in zip(ctx[op.inputs[0].key], op.agg_funcs):
             input_obj = in_data_dict[output_key]
             if agg_func_name == "custom_reduction":
-                combines.append(cls._do_custom_agg(raw_func_name, op, raw_input))
+                combines.append(cls._do_custom_agg(raw_func_name, op, raw_input, ctx))
             else:
                 combines.append(
                     cls._do_predefined_agg(input_obj, agg_func_name, gpu=op.gpu, **kwds)
@@ -1196,7 +1222,7 @@ def _execute_agg(cls, ctx, op: "DataFrameGroupByAgg"):
         ) in op.agg_funcs:
             if agg_func_name == "custom_reduction":
                 in_data_dict[output_key] = cls._do_custom_agg(
-                    raw_func_name, op, in_data_dict[output_key]
+                    raw_func_name, op, in_data_dict[output_key], ctx
                 )
             else:
                 input_obj = cls._get_grouped(op, in_data_dict[output_key], ctx)
@@ -1286,18 +1312,14 @@ def _execute_agg(cls, ctx, op: "DataFrameGroupByAgg"):
     @redirect_custom_log
     @enter_current_session
     def execute(cls, ctx, op: "DataFrameGroupByAgg"):
-        try:
-            pd.set_option("mode.use_inf_as_na", op.use_inf_as_na)
-            if op.stage == OperandStage.map:
-                cls._execute_map(ctx, op)
-            elif op.stage == OperandStage.combine:
-                cls._execute_combine(ctx, op)
-            elif op.stage == OperandStage.agg:
-                cls._execute_agg(ctx, op)
-            else:  # pragma: no cover
-                raise ValueError("Aggregation operand not executable")
-        finally:
-            pd.reset_option("mode.use_inf_as_na")
+        if op.stage == OperandStage.map:
+            cls._execute_map(ctx, op)
+        elif op.stage == OperandStage.combine:
+            cls._execute_combine(ctx, op)
+        elif op.stage == OperandStage.agg:
+            cls._execute_agg(ctx, op)
+        else:  # pragma: no cover
+            raise ValueError("Aggregation operand not executable")


 def agg(groupby, func=None, method="auto", combine_size=None, *args, **kwargs):
@@ -1355,8 +1377,6 @@ def agg(groupby, func=None, method="auto", combine_size=None, *args, **kwargs):
             func, *args, _call_agg=True, index=index_value, **kwargs
         )

-    use_inf_as_na = kwargs.pop("_use_inf_as_na", options.dataframe.mode.use_inf_as_na)
-
     agg_op = DataFrameGroupByAgg(
         raw_func=func,
         raw_func_kw=kwargs,
@@ -1365,6 +1385,5 @@ def agg(groupby, func=None, method="auto", combine_size=None, *args, **kwargs):
         groupby_params=groupby.op.groupby_params,
         combine_size=combine_size or options.combine_size,
         chunk_store_limit=options.chunk_store_limit,
-        use_inf_as_na=use_inf_as_na,
     )
     return agg_op(groupby)
diff --git a/python/xorbits/_mars/dataframe/groupby/fill.py b/python/xorbits/_mars/dataframe/groupby/fill.py
index 7d71a89de..4baa08ea1 100644
--- a/python/xorbits/_mars/dataframe/groupby/fill.py
+++ b/python/xorbits/_mars/dataframe/groupby/fill.py
@@ -18,7 +18,7 @@

 from ... import opcodes
 from ...core import OutputType
-from ...serialization.serializables import AnyField, DictField, Int64Field, StringField
+from ...serialization.serializables import AnyField, Int64Field, StringField
 from ..operands import DataFrameOperand, DataFrameOperandMixin
 from ..utils import build_empty_df, build_empty_series, parse_index

@@ -29,7 +29,6 @@ class GroupByFillOperand(DataFrameOperand, DataFrameOperandMixin):
     value = AnyField("value", default=None)
     method = StringField("method", default=None)
     limit = Int64Field("limit", default=None)
-    downcast = DictField("downcast", default=None)

     def _calc_out_dtypes(self, in_groupby):
         mock_groupby = in_groupby.op.build_mock_groupby()
@@ -40,7 +39,6 @@ def _calc_out_dtypes(self, in_groupby):
                 value=self.value,
                 method=self.method,
                 limit=self.limit,
-                downcast=self.downcast,
             )
         else:
             result_df = getattr(mock_groupby, func_name)(limit=self.limit)
@@ -133,7 +131,6 @@ def execute(cls, ctx, op: "GroupByFillOperand"):
                 value=op.value,
                 method=op.method,
                 limit=op.limit,
-                downcast=op.downcast,
             )
         else:
             result = getattr(in_data, func_name)(limit=op.limit)
@@ -184,7 +181,7 @@ def bfill(groupby, limit=None):
     return op(groupby)


-def fillna(groupby, value=None, method=None, limit=None, downcast=None):
+def fillna(groupby, value=None, method=None, limit=None):
     """
     Fill NA/NaN values using the specified method

@@ -197,11 +194,8 @@ def fillna(groupby, value=None, method=None, limit=None, downcast=None):
     limit: int, default None
         If method is specified, this is the maximum number of consecutive
         NaN values to forward/backward fill
-    downcast: dict, default None
-        A dict of item->dtype of what to downcast if possible,
-        or the string ‘infer’ which will try to downcast to an appropriate equal type

     return: DataFrame or None
     """
-    op = GroupByFillNa(value=value, method=method, limit=limit, downcast=downcast)
+    op = GroupByFillNa(value=value, method=method, limit=limit)
     return op(groupby)
diff --git a/python/xorbits/_mars/dataframe/groupby/nth.py b/python/xorbits/_mars/dataframe/groupby/nth.py
new file mode 100644
index 000000000..da2cb14a0
--- /dev/null
+++ b/python/xorbits/_mars/dataframe/groupby/nth.py
@@ -0,0 +1,232 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd + +from ...core import OutputType, get_output_types, recursive_tile +from ...serialization.serializables import DictField, IndexField, StringField +from ..core import IndexValue +from ..operands import DataFrameOperand, DataFrameOperandMixin +from ..utils import build_concatenated_rows_frame, parse_index + + +class GroupByNthSelector(DataFrameOperand, DataFrameOperandMixin): + _op_module_ = "dataframe.groupby" + + groupby_params = DictField("groupby_params") + n = IndexField("n") + dropna = StringField("dropna", default=None) + + def __call__(self, groupby): + df = groupby + while df.op.output_types[0] not in (OutputType.dataframe, OutputType.series): + df = df.inputs[0] + selection = groupby.op.groupby_params.pop("selection", None) + if df.ndim > 1 and selection: + if isinstance(selection, tuple) and selection not in df.dtypes: + selection = list(selection) + + result_df = df[selection] + else: + result_df = df + + self._output_types = ( + [OutputType.dataframe] if result_df.ndim == 2 else [OutputType.series] + ) + params = result_df.params + params["shape"] = (np.nan,) + result_df.shape[1:] + if isinstance(df.index_value.value, IndexValue.RangeIndex): + params["index_value"] = parse_index(pd.RangeIndex(-1), df.key) + + return self.new_tileable([df], **params) + + @classmethod + def tile(cls, op: "GroupByNthSelector"): + in_df = op.inputs[0] + groupby_params = op.groupby_params.copy() + selection = groupby_params.pop("selection", None) + if len(in_df.shape) > 1: + in_df = build_concatenated_rows_frame(in_df) + out_df = op.outputs[0] + # if there is only one chunk, tile with a single chunk + if len(in_df.chunks) <= 1: + new_shape = (np.nan,) + new_nsplits = ((np.nan,),) + if out_df.ndim > 1: + new_shape += (out_df.shape[1],) + new_nsplits += ((out_df.shape[1],),) + c = in_df.chunks[0] + chunk_op = op.copy().reset_key() + params = out_df.params + params["shape"] = new_shape + params["index"] = (0,) * out_df.ndim + out_chunk = chunk_op.new_chunk([c], **params) + + tileable_op = op.copy().reset_key() + return tileable_op.new_tileables( + [in_df], nsplits=new_nsplits, chunks=[out_chunk], **params + ) + + if in_df.ndim > 1 and selection: + if isinstance(selection, tuple) and selection not in in_df.dtypes: + selection = list(selection) + + if not isinstance(selection, list): + pre_selection = [selection] + else: + pre_selection = list(selection) + + if isinstance(groupby_params.get("by"), list): + pre_selection += [ + el for el in groupby_params["by"] if el not in pre_selection + ] + + if len(pre_selection) != in_df.shape[1]: + in_df = yield from recursive_tile(in_df[pre_selection]) + + # pre chunks + pre_chunks = [] + for c in in_df.chunks: + pre_op = op.copy().reset_key() + pre_op._output_types = get_output_types(c) + pre_op.groupby_params = op.groupby_params.copy() + pre_op.groupby_params.pop("selection", None) + params = c.params + params["shape"] = (np.nan,) + c.shape[1:] + pre_chunks.append(pre_op.new_chunk([c], **params)) + + new_op = op.copy().reset_key() + new_op._output_types = get_output_types(in_df) + new_nsplits = ((np.nan,) * len(in_df.nsplits[0]),) + in_df.nsplits[1:] + pre_tiled = new_op.new_tileable( + [in_df], chunks=pre_chunks, nsplits=new_nsplits, **in_df.params + ) + # generate groupby + grouped = yield from recursive_tile(pre_tiled.groupby(**groupby_params)) + if selection: + grouped = yield from 
recursive_tile(grouped[selection])
+
+        # generate post chunks
+        post_chunks = []
+        for c in grouped.chunks:
+            post_op = op.copy().reset_key()
+            post_op.groupby_params = op.groupby_params.copy()
+            post_op.groupby_params.pop("selection", None)
+            if op.output_types[0] == OutputType.dataframe:
+                index = c.index
+            else:
+                index = (c.index[0],)
+            params = out_df.params
+            params["index"] = index
+            post_chunks.append(post_op.new_chunk([c], **params))
+
+        new_op = op.copy().reset_key()
+        new_nsplits = ((np.nan,) * len(in_df.nsplits[0]),)
+        if out_df.ndim > 1:
+            new_nsplits += ((out_df.shape[1],),)
+        return new_op.new_tileables(
+            [in_df], chunks=post_chunks, nsplits=new_nsplits, **out_df.params
+        )
+
+    @classmethod
+    def execute(cls, ctx, op: "GroupByNthSelector"):
+        in_data = ctx[op.inputs[0].key]
+        params = op.groupby_params.copy()
+        selection = params.pop("selection", None)
+
+        if hasattr(in_data, "groupby"):
+            grouped = in_data.groupby(**params)
+        else:
+            grouped = in_data
+        if selection:
+            grouped = grouped[selection]
+        result = grouped.nth(op.n, op.dropna)
+        ctx[op.outputs[0].key] = result
+
+
+def nth(groupby, n, dropna=None):
+    """
+    Take the nth row from each group if n is an int, or a subset of rows
+    if n is a list of ints.
+
+    If ``dropna`` is given, take the nth non-null row; ``dropna`` is either
+    truthy (for a Series) or one of 'any'/'all' (for a DataFrame).
+    This is equivalent to calling ``dropna(how=dropna)`` before the
+    groupby.
+
+    Parameters
+    ----------
+    n : int, slice or list of ints
+        A single nth value for the row, a slice, or a list of nth values.
+    dropna : None or str, optional
+        Apply the specified dropna operation before counting which row is
+        the nth row. Needs to be None, 'any' or 'all'.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> import mars.dataframe as md
+    >>> df = md.DataFrame({'A': [1, 1, 2, 1, 2],
+    ...                    'B': [np.nan, 2, 3, 4, 5]}, columns=['A', 'B'])
+    >>> g = df.groupby('A')
+    >>> g.nth(0).execute()
+         B
+    A
+    1  NaN
+    2  3.0
+    >>> g.nth(1).execute()
+         B
+    A
+    1  2.0
+    2  5.0
+    >>> g.nth(-1).execute()
+         B
+    A
+    1  4.0
+    2  5.0
+    >>> g.nth([0, 1]).execute()
+         B
+    A
+    1  NaN
+    1  2.0
+    2  3.0
+    2  5.0
+
+    Specifying ``dropna`` allows NaN values to be ignored when counting:
+
+    >>> g.nth(0, dropna='any').execute()
+         B
+    A
+    1  2.0
+    2  3.0
+
+    When using ``dropna``, NaN marks groups that have been exhausted:
+
+    >>> g.nth(3, dropna='any').execute()
+        B
+    A
+    1 NaN
+    2 NaN
+
+    Specifying ``as_index=False`` in ``groupby`` keeps the original index.
+
+    >>> df.groupby('A', as_index=False).nth(1).execute()
+       A    B
+    1  1  2.0
+    4  2  5.0
+    """
+    groupby_params = groupby.op.groupby_params.copy()
+    groupby_params.pop("as_index", None)
+    op = GroupByNthSelector(n=n, dropna=dropna, groupby_params=groupby_params)
+    return op(groupby)
diff --git a/python/xorbits/_mars/dataframe/groupby/nunique.py b/python/xorbits/_mars/dataframe/groupby/nunique.py
index 424b9797e..c07577ba9 100644
--- a/python/xorbits/_mars/dataframe/groupby/nunique.py
+++ b/python/xorbits/_mars/dataframe/groupby/nunique.py
@@ -15,8 +15,9 @@
 
 import pandas as pd
 
-from ...core import OutputType
+from ...core import ENTITY_TYPE, OutputType
 from ...utils import implements
+from ..utils import is_dataframe
 from .aggregation import DataFrameGroupByAgg
 from .custom_aggregation import (
     DataFrameCustomGroupByAggMixin,
@@ -79,6 +80,22 @@ def _get_selection_columns(cls, op: DataFrameGroupByAgg) -> Union[None, List]:
             selection = [selection]
         return selection
 
+    @classmethod
+    def _drop_duplicates_by_series(cls, in_data: pd.DataFrame, origin_cols: List):
+        if isinstance(in_data.index, pd.MultiIndex):
+            origin_index_name = in_data.index.names
+        else:
+            origin_index_name = in_data.index.name
+        res = in_data.reset_index()
+        new_cols = list(res.columns)
+        index_cols = [v for v in new_cols if v not in origin_cols]
+        res = res.drop_duplicates().set_index(index_cols)
+        if isinstance(res.index, pd.MultiIndex):
+            res.index.names = origin_index_name
+        else:
+            res.index.name = origin_index_name
+        return res
+
     @classmethod
     def _get_execute_map_result(
         cls, op: DataFrameGroupByAgg, in_data: pd.DataFrame
@@ -86,10 +103,23 @@ def _get_execute_map_result(
         selections = cls._get_selection_columns(op)
         by_cols = op.raw_groupby_params["by"]
         if by_cols is not None:
-            cols = (
-                [*selections, *by_cols] if selections is not None else in_data.columns
-            )
-            res = in_data[cols].drop_duplicates(subset=cols).set_index(by_cols)
+            # When grouping by series, the series determine the groups.
+            # We first set the index of the data to these series,
+            # then `reset_index` so that the series become ordinary data columns.
+            # These columns take part in `drop_duplicates` and are finally restored as the index.
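A minimal pandas sketch of the round-trip described in the comment above (an editor's illustration, not part of the patch; column and key names are made up). The branch just below implements this on the map side via `_drop_duplicates_by_series`:

    import pandas as pd

    df = pd.DataFrame({"a": [1, 1, 2, 2], "c": [3, 3, 4, 5]})
    by = pd.Series([0, 0, 1, 1], name="key")  # hypothetical grouping series

    tmp = df.set_index(by)       # the series now determines the groups
    tmp = tmp.reset_index()      # "key" becomes an ordinary data column
    tmp = tmp.drop_duplicates()  # dedup rows, grouping key included
    tmp = tmp.set_index("key")   # restore the grouping series as the index
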
+            if isinstance(by_cols, list) and any(
+                [isinstance(v, pd.Series) for v in by_cols]
+            ):
+                origin_cols = list(in_data.columns)
+                res = in_data.set_index(by_cols)
+                res = cls._drop_duplicates_by_series(res, origin_cols)
+            else:
+                cols = (
+                    [*selections, *by_cols]
+                    if selections is not None
+                    else in_data.columns
+                )
+                res = in_data[cols].drop_duplicates(subset=cols).set_index(by_cols)
         else:  # group by level
             selections = selections if selections is not None else in_data.columns
             level_indexes = cls._get_level_indexes(op, in_data)
@@ -111,9 +141,17 @@ def _get_execute_map_result(
     def _get_execute_combine_result(
         cls, op: DataFrameGroupByAgg, in_data: pd.DataFrame
     ) -> Union[pd.DataFrame, pd.Series]:
-        # in_data.index.names means MultiIndex (groupby on multi cols)
-        index_col = in_data.index.name or in_data.index.names
-        res = in_data.reset_index().drop_duplicates().set_index(index_col)
+        by = op.raw_groupby_params["by"]
+        if isinstance(by, list) and any([isinstance(v, ENTITY_TYPE) for v in by]):
+            # `in_data` may be a series when an indexing op follows the groupby
+            origin_cols = (
+                list(in_data.columns) if is_dataframe(in_data) else [in_data.name]
+            )
+            res = cls._drop_duplicates_by_series(in_data, origin_cols)
+        else:
+            # in_data.index.names means MultiIndex (groupby on multi cols)
+            index_col = in_data.index.name or in_data.index.names
+            res = in_data.reset_index().drop_duplicates().set_index(index_col)
         if op.output_types[0] == OutputType.series:
             res = res.squeeze()
         return res
@@ -127,7 +165,12 @@ def _get_execute_agg_result(
 
         by = op.raw_groupby_params["by"]
         if by is not None:
-            if op.output_types[0] == OutputType.dataframe:
+            if isinstance(by, list) and any(
+                [isinstance(_by, ENTITY_TYPE) for _by in by]
+            ):
+                # nothing to do here; grouping by level is already correct
+                pass
+            elif op.output_types[0] == OutputType.dataframe:
                 groupby_params.pop("level", None)
                 groupby_params["by"] = cols
                 in_data = in_data.reset_index()
@@ -136,6 +179,11 @@ def _get_execute_agg_result(
                 # since level field in op.groupby_params is not correct.
                 groupby_params["level"] = op.raw_groupby_params["level"]
 
+        # For named aggregation such as .agg(x=('a', 'nunique')), first set `as_index=True`.
+        # Otherwise, subsequent processing loses track of the grouped columns.
+        # TODO: This is due to `reduction` functions, but for now, let's keep it simple.
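Why forcing `as_index=True` matters for the branch that follows, shown on plain pandas (an editor's illustration, not part of the patch; column names are made up):

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2, 1, 3], "b": ["x", "x", "y", "y"]})

    kept = df.groupby("b", as_index=True).agg(e=("a", "nunique"))
    lost = df.groupby("b", as_index=False).agg(e=("a", "nunique"))
    print(kept.index.name)     # 'b' -- group keys stay in the index
    print(list(lost.columns))  # ['b', 'e'] -- keys mixed into the data columns
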
+ if op.raw_func is None: + groupby_params["as_index"] = True res = in_data.groupby(**groupby_params).nunique() return res diff --git a/python/xorbits/_mars/dataframe/groupby/tests/test_groupby_execution.py b/python/xorbits/_mars/dataframe/groupby/tests/test_groupby_execution.py index 003870aed..f3ebbc56d 100644 --- a/python/xorbits/_mars/dataframe/groupby/tests/test_groupby_execution.py +++ b/python/xorbits/_mars/dataframe/groupby/tests/test_groupby_execution.py @@ -1878,3 +1878,101 @@ def test_series_groupby_rolling_agg(setup, window, min_periods, center, closed, mresult = mresult.execute().fetch() pd.testing.assert_series_equal(presult, mresult.sort_index()) + + +@pytest.mark.skipif(pd.__version__ <= "1.5.3", reason="pandas version is too low") +@pytest.mark.parametrize( + "chunk_size, dropna", list(product([None, 3], [None, "any", "all"])) +) +def test_groupby_nth(setup, chunk_size, dropna): + df1 = pd.DataFrame( + { + "a": np.random.randint(0, 5, size=20), + "b": np.random.randint(0, 5, size=20), + "c": np.random.randint(0, 5, size=20), + "d": np.random.randint(0, 5, size=20), + } + ) + mdf = md.DataFrame(df1, chunk_size=chunk_size) + + r = mdf.groupby("b").nth(0) + pd.testing.assert_frame_equal( + r.execute().fetch().sort_index(), df1.groupby("b").nth(0) + ) + r = mdf.groupby("b").nth(-1) + pd.testing.assert_frame_equal( + r.execute().fetch().sort_index(), df1.groupby("b").nth(-1) + ) + r = mdf.groupby("b")[["a", "c"]].nth(0) + pd.testing.assert_frame_equal( + r.execute().fetch().sort_index(), df1.groupby("b")[["a", "c"]].nth(0) + ) + + # test nth with list index + r = mdf.groupby("b").nth([0, 1]) + pd.testing.assert_frame_equal( + r.execute().fetch().sort_index(), df1.groupby("b").nth([0, 1]) + ) + + # test nth with slice + r = mdf.groupby("b").nth(slice(None, 1)) + pd.testing.assert_frame_equal( + r.execute().fetch().sort_index(), df1.groupby("b").nth(slice(None, 1)) + ) + + # test nth with selection + r = mdf.groupby("b")[["a", "d"]].nth(0) + pd.testing.assert_frame_equal( + r.execute().fetch().sort_index(), df1.groupby("b")[["a", "d"]].nth(0) + ) + r = mdf.groupby("b")[["c", "a", "d"]].nth(0) + pd.testing.assert_frame_equal( + r.execute().fetch().sort_index(), df1.groupby("b")[["c", "a", "d"]].nth(0) + ) + r = mdf.groupby("b")["c"].nth(0) + pd.testing.assert_series_equal( + r.execute().fetch().sort_index(), df1.groupby("b")["c"].nth(0) + ) + + series1 = pd.Series([3, 4, 5, 3, 5, 4, 1, 2, 3]) + ms = md.Series(series1, chunk_size=chunk_size) + + r = ms.groupby(lambda x: x % 2).nth(0) + pd.testing.assert_series_equal( + r.execute().fetch().sort_index(), series1.groupby(lambda x: x % 2).nth(0) + ) + + # test with special index + series1 = pd.Series([3, 4, 5, 3, 5, 4, 1, 2, 3], index=[4, 1, 2, 3, 5, 8, 6, 7, 9]) + ms = md.Series(series1, chunk_size=chunk_size) + + r = ms.groupby(lambda x: x % 2).nth(0) + pd.testing.assert_series_equal( + r.execute().fetch().sort_index(), + series1.groupby(lambda x: x % 2).nth(0).sort_index(), + ) + + df2 = pd.DataFrame( + { + "a": [3, 5, 2, np.nan, 1, 2, 4, 6, 2, 4], + "b": [8, 3, 4, 1, 8, np.nan, 2, 2, 2, 3], + "c": [1, 8, 8, np.nan, 3, 5, 0, 0, 5, 4], + "d": [np.nan, 7, 6, 3, 6, 3, 2, 1, 5, 8], + } + ) + + mdf = md.DataFrame(df2) + + r = mdf.groupby("b").nth(0, dropna=dropna) + pd.testing.assert_frame_equal( + r.execute().fetch().sort_index(), df2.groupby("b").nth(0, dropna=dropna) + ) + r = mdf.groupby("b").nth(-1, dropna=dropna) + pd.testing.assert_frame_equal( + r.execute().fetch().sort_index(), df2.groupby("b").nth(-1, dropna=dropna) + ) + r = 
mdf.groupby("b")[["a", "c"]].nth(0, dropna=dropna) + pd.testing.assert_frame_equal( + r.execute().fetch().sort_index(), + df2.groupby("b")[["a", "c"]].nth(0, dropna=dropna), + ) diff --git a/python/xorbits/_mars/dataframe/groupby/tests/test_groupby_nunique_execution.py b/python/xorbits/_mars/dataframe/groupby/tests/test_groupby_nunique_execution.py index 08bb49706..68e678230 100644 --- a/python/xorbits/_mars/dataframe/groupby/tests/test_groupby_nunique_execution.py +++ b/python/xorbits/_mars/dataframe/groupby/tests/test_groupby_nunique_execution.py @@ -11,6 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import itertools + import numpy as np import pandas as pd import pytest @@ -332,3 +334,151 @@ def test_groupby_agg_nunique(setup, gen_data1): ) expected = df.groupby("b", sort=sort).agg(["sum", "nunique"]) pd.testing.assert_frame_equal(r.sort_index(), expected.sort_index()) + + +@pytest.mark.parametrize( + "chunk_size, as_index, sort", + itertools.product([None, 13], [True, False], [True, False]), +) +def test_groupby_agg_nunique_with_tuple_kwargs( + setup, gen_data2, chunk_size, as_index, sort +): + df = gen_data2 + mdf = md.DataFrame(df, chunk_size=chunk_size) + + res = mdf.groupby("b", as_index=as_index, sort=sort).agg(e=("a", "nunique")) + expected = df.groupby("b", as_index=as_index, sort=sort).agg(e=("a", "nunique")) + pd.testing.assert_frame_equal(res.execute().fetch(), expected) + + res = mdf.groupby("b", as_index=as_index, sort=sort).agg( + e=("a", "nunique"), f=("c", "nunique") + ) + expected = df.groupby("b", as_index=as_index, sort=sort).agg( + e=("a", "nunique"), f=("c", "nunique") + ) + pd.testing.assert_frame_equal(res.execute().fetch(), expected) + + res = mdf.groupby("b", as_index=as_index, sort=sort).agg( + e=("a", "nunique"), f=("d", "sum") + ) + expected = df.groupby("b", as_index=as_index, sort=sort).agg( + e=("a", "nunique"), f=("d", "sum") + ) + pd.testing.assert_frame_equal(res.execute().fetch(), expected) + + res = mdf.groupby("b", as_index=as_index, sort=sort).agg( + e=("a", "nunique"), f=("d", "sum"), g=("c", "nunique") + ) + expected = df.groupby("b", as_index=as_index, sort=sort).agg( + e=("a", "nunique"), f=("d", "sum"), g=("c", "nunique") + ) + pd.testing.assert_frame_equal(res.execute().fetch(), expected) + + # by multi columns + res = mdf.groupby(["b", "c"], as_index=as_index, sort=sort).agg(e=("a", "nunique")) + expected = df.groupby(["b", "c"], as_index=as_index, sort=sort).agg( + e=("a", "nunique") + ) + pd.testing.assert_frame_equal(res.execute().fetch(), expected) + + res = mdf.groupby(["b", "c"], as_index=as_index, sort=sort).agg( + e=("a", "nunique"), f=("d", "mean") + ) + expected = df.groupby(["b", "c"], as_index=as_index, sort=sort).agg( + e=("a", "nunique"), f=("d", "mean") + ) + pd.testing.assert_frame_equal(res.execute().fetch(), expected) + + # index after groupby + res = mdf.groupby(["b"], as_index=as_index, sort=sort)[["a"]].agg( + e=("a", "nunique") + ) + expected = df.groupby(["b"], as_index=as_index, sort=sort)[["a"]].agg( + e=("a", "nunique") + ) + pd.testing.assert_frame_equal(res.execute().fetch(), expected) + + res = mdf.groupby(["b"], as_index=as_index, sort=sort)[["a", "c"]].agg( + e=("a", "nunique"), f=("c", "nunique") + ) + expected = df.groupby(["b"], as_index=as_index, sort=sort)[["a", "c"]].agg( + e=("a", "nunique"), f=("c", "nunique") + ) + 
pd.testing.assert_frame_equal(res.execute().fetch(), expected) + + +@pytest.mark.parametrize( + "chunk_size, as_index, sort", + itertools.product([None, 13], [True, False], [True, False]), +) +def test_groupby_nunique_by_series(setup, gen_data2, chunk_size, as_index, sort): + df = gen_data2 + mdf = md.DataFrame(df, chunk_size=chunk_size) + + by1 = pd.Series([i + 100 for i in range(100)]) + mby1 = md.Series(by1) + + by2 = pd.Series([i + 200 for i in range(100)]) + mby2 = md.Series(by2) + + res = mdf.groupby(mby1, as_index=as_index, sort=sort).nunique() + expected = df.groupby(by1, as_index=as_index, sort=sort).nunique() + pd.testing.assert_frame_equal(res.execute().fetch(), expected) + + res = mdf.groupby([mby1, mby2], as_index=as_index, sort=sort).nunique() + expected = df.groupby([by1, by2], as_index=as_index, sort=sort).nunique() + pd.testing.assert_frame_equal(res.execute().fetch(), expected) + + res = mdf.groupby([mby1, mby2], as_index=as_index, sort=sort).agg( + e=("a", "nunique"), f=("c", "nunique") + ) + expected = df.groupby([by1, by2], as_index=as_index, sort=sort).agg( + e=("a", "nunique"), f=("c", "nunique") + ) + pd.testing.assert_frame_equal(res.execute().fetch(), expected) + + # test by with duplicates + rs = np.random.RandomState(0) + by3 = pd.Series(rs.choice([i for i in range(1, 6)], size=(100,))) + mby3 = md.Series(by3) + + res = mdf.groupby(mby3, as_index=as_index, sort=sort).nunique() + expected = df.groupby(by3, as_index=as_index, sort=sort).nunique() + pd.testing.assert_frame_equal(res.execute().fetch(), expected) + + # test by other chunk size + by4 = pd.Series(rs.choice([i for i in range(10)], size=(100,))) + mby4 = md.Series(by4, chunk_size=21) + + res = mdf.groupby(mby4, as_index=as_index, sort=sort).nunique() + expected = df.groupby(by4, as_index=as_index, sort=sort).nunique() + pd.testing.assert_frame_equal(res.execute().fetch(), expected) + + # test index after groupby + res = mdf.groupby(mby3, as_index=as_index, sort=sort)[["a", "b"]].nunique() + expected = df.groupby(by3, as_index=as_index, sort=sort)[["a", "b"]].nunique() + pd.testing.assert_frame_equal(res.execute().fetch(), expected) + + res = mdf.groupby(mby3, as_index=as_index, sort=sort)[["a"]].nunique() + expected = df.groupby(by3, as_index=as_index, sort=sort)[["a"]].nunique() + pd.testing.assert_frame_equal(res.execute().fetch(), expected) + + res = mdf.groupby(mby3, as_index=as_index, sort=sort)["a"].nunique() + expected = df.groupby(by3, as_index=as_index, sort=sort)["a"].nunique() + if as_index: + pd.testing.assert_series_equal(res.execute().fetch(), expected) + else: + pd.testing.assert_frame_equal(res.execute().fetch(), expected) + + # test different methods + for method in ["auto", "tree", "shuffle"]: + res = mdf.groupby(mby3, as_index=as_index, sort=sort).nunique(method=method) + expected = df.groupby(by3, as_index=as_index, sort=sort).nunique() + real = res.execute().fetch() + if method == "shuffle": + pd.testing.assert_frame_equal( + real.sort_values(["a", "b", "c", "d"]).reset_index(drop=True), + expected.sort_values(["a", "b", "c", "d"]).reset_index(drop=True), + ) + else: + pd.testing.assert_frame_equal(real, expected) diff --git a/python/xorbits/_mars/dataframe/hash_utils.py b/python/xorbits/_mars/dataframe/hash_utils.py index 348c4b16e..0d4211e2d 100644 --- a/python/xorbits/_mars/dataframe/hash_utils.py +++ b/python/xorbits/_mars/dataframe/hash_utils.py @@ -9,10 +9,11 @@ from typing import TYPE_CHECKING, Hashable, Iterable, Iterator, cast import numpy as np +import pandas as pd from 
pandas._libs import lib from pandas._libs.hashing import hash_object_array from pandas._typing import ArrayLike, npt -from pandas.core.dtypes.common import is_categorical_dtype, is_list_like +from pandas.core.dtypes.common import is_list_like from pandas.core.dtypes.generic import ( ABCDataFrame, ABCExtensionArray, @@ -272,7 +273,7 @@ def hash_array( # For categoricals, we hash the categories, then remap the codes to the # hash values. (This check is above the complex check so that we don't ask # numpy if categorical is a subdtype of complex, as it will choke). - if is_categorical_dtype(dtype): + if isinstance(dtype, pd.CategoricalDtype): vals = cast("Categorical", vals) return _hash_categorical(vals, encoding, hash_key) diff --git a/python/xorbits/_mars/dataframe/indexing/__init__.py b/python/xorbits/_mars/dataframe/indexing/__init__.py index 45aabf4b2..7d30d7aaf 100644 --- a/python/xorbits/_mars/dataframe/indexing/__init__.py +++ b/python/xorbits/_mars/dataframe/indexing/__init__.py @@ -38,7 +38,7 @@ def _install(): from .sample import sample from .set_axis import df_set_axis, series_set_axis from .set_index import set_index - from .setitem import dataframe_setitem + from .setitem import dataframe_setitem, series_setitem from .where import mask, where for cls in DATAFRAME_TYPE + SERIES_TYPE: @@ -69,6 +69,7 @@ def _install(): for cls in SERIES_TYPE: setattr(cls, "__getitem__", series_getitem) + setattr(cls, "__setitem__", series_setitem) setattr(cls, "reset_index", series_reset_index) setattr(cls, "rename", series_rename) setattr(cls, "set_axis", series_set_axis) diff --git a/python/xorbits/_mars/dataframe/indexing/setitem.py b/python/xorbits/_mars/dataframe/indexing/setitem.py index 41190475d..103165137 100644 --- a/python/xorbits/_mars/dataframe/indexing/setitem.py +++ b/python/xorbits/_mars/dataframe/indexing/setitem.py @@ -14,6 +14,7 @@ # limitations under the License. 
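The setitem changes below route `Series.__setitem__` through `DataFrameSetitem`, and at execution time each chunk assigns only when it actually holds the label. A per-chunk sketch of that probe-then-assign logic (an editor's illustration, not part of the patch; the real operand copies only when in-place assignment raises ValueError, while this sketch copies defensively):

    import pandas as pd

    def setitem_chunk(chunk: pd.Series, label, value) -> pd.Series:
        try:
            chunk[label]  # probe: does this chunk's index contain the label?
        except KeyError:
            return chunk  # not here -- pass the chunk through unchanged
        chunk = chunk.copy()  # defensive copy for the sketch
        chunk[label] = value
        return chunk

    part1 = pd.Series([1, 2], index=[0, 1])
    part2 = pd.Series([3, 4], index=[2, 3])
    setitem_chunk(part1, 2, 99)  # unchanged: label 2 lives in the other chunk
    setitem_chunk(part2, 2, 99)  # index 2 now holds 99
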
import collections +from typing import Union import numpy as np import pandas as pd @@ -24,7 +25,7 @@ from ...serialization.serializables import AnyField, KeyField from ...tensor.core import TENSOR_TYPE from ...utils import pd_release_version -from ..core import DATAFRAME_TYPE, SERIES_TYPE, DataFrame +from ..core import DATAFRAME_TYPE, SERIES_TYPE, DataFrame, Series from ..initializer import DataFrame as asframe from ..initializer import Series as asseries from ..operands import DataFrameOperand, DataFrameOperandMixin @@ -49,8 +50,6 @@ def __init__(self, target=None, indexes=None, value=None, output_types=None, **k _output_types=output_types, **kw, ) - if self.output_types is None: - self.output_types = [OutputType.dataframe] @property def target(self): @@ -74,7 +73,7 @@ def _set_inputs(self, inputs): def _is_scalar_tensor(t): return isinstance(t, TENSOR_TYPE) and t.ndim == 0 - def __call__(self, target: DataFrame, value): + def _call_dataframe(self, target: DataFrame, value): raw_target = target inputs = [target] @@ -145,8 +144,54 @@ def __call__(self, target: DataFrame, value): ) raw_target.data = ret.data + def _call_series(self, target: Series, value): + inputs = [target] + dtype = target.dtype + shape = target.shape + index_value = target.index_value + + target.data = self.new_series( + inputs, shape=shape, dtype=dtype, index_value=index_value, name=target.name + ).data + + def __call__(self, target: Union[DataFrame, Series], value): + if target.ndim == 2: + self._call_dataframe(target, value) + else: + self._call_series(target, value) + @classmethod def tile(cls, op: "DataFrameSetitem"): + if op.target.ndim == 2: + res = yield from cls._tile_dataframe(op) + return res + else: + return cls._tile_series(op) + + @classmethod + def _tile_series(cls, op: "DataFrameSetitem"): + in_df = op.inputs[0] + result_chunks = [] + + for chk in in_df.chunks: + new_op = op.copy().reset_key() + new_op.output_types = [OutputType.series] + params = dict( + shape=chk.shape, + index=chk.index, + dtype=chk.dtype, + index_value=chk.index_value, + ) + result_chunks.append(new_op.new_chunk([chk], **params)) + + _new_op = op.copy() + params = op.outputs[0].params.copy() + params["nsplits"] = in_df.nsplits + params["chunks"] = result_chunks + return _new_op.new_seriess(op.inputs, **params) + + @classmethod + def _tile_dataframe(cls, op: "DataFrameSetitem"): from ..merge.concat import DataFrameConcat out = op.outputs[0] @@ -307,6 +352,36 @@ def estimate_size(cls, ctx: dict, op: "DataFrameSetitem"): @classmethod def execute(cls, ctx, op: "DataFrameSetitem"): + target = ctx[op.target.key] + if target.ndim == 2: + cls._execute_dataframe(ctx, op) + else: + cls._execute_series(ctx, op) + + @classmethod + def _execute_series(cls, ctx, op: "DataFrameSetitem"): + target = ctx[op.target.key] + + indexes = op.indexes + value = op.value + + try: + _ = target[indexes] + indexed = True + except KeyError: + indexed = False + + if indexed: + try: + target[indexes] = value + except ValueError: + target = target.copy(deep=True) + target[indexes] = value + + ctx[op.outputs[0].key] = target + + @classmethod + def _execute_dataframe(cls, ctx, op: "DataFrameSetitem"): target = ctx[op.target.key] # only deep copy when updating indexes = ( @@ -336,3 +411,11 @@ def execute(cls, ctx, op: "DataFrameSetitem"): def dataframe_setitem(df, col, value): op = DataFrameSetitem(target=df, indexes=col, value=value) return op(df, value) + + +def series_setitem(series, index, value): + """ + Currently only supports series whose indexes contain 
`index` + """ + op = DataFrameSetitem(target=series, indexes=index, value=value) + return op(series, value) diff --git a/python/xorbits/_mars/dataframe/indexing/tests/test_indexing_execution.py b/python/xorbits/_mars/dataframe/indexing/tests/test_indexing_execution.py index 3367ee77c..c63ba124b 100644 --- a/python/xorbits/_mars/dataframe/indexing/tests/test_indexing_execution.py +++ b/python/xorbits/_mars/dataframe/indexing/tests/test_indexing_execution.py @@ -38,6 +38,7 @@ from ...datasource.read_csv import DataFrameReadCSV from ...datasource.read_parquet import DataFrameReadParquet from ...datasource.read_sql import DataFrameReadSQL +from ...utils import PD_VERSION_GREATER_THAN_2_10 _allow_set_missing_list = pd_release_version[:2] >= (1, 1) @@ -193,6 +194,36 @@ def test_iloc_getitem(setup_gpu, gpu): pd.testing.assert_index_equal(index.execute().fetch(), data[selection]) +def test_series_setitem(setup): + data1 = pd.Series(np.arange(10)) + series = md.Series(data1, chunk_size=3) + series[2] = 777 + real = series.execute().fetch() + data1[2] = 777 + pd.testing.assert_series_equal(real, data1) + + arrays = [ + ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"], + ["one", "two", "one", "two", "one", "two", "one", "two"], + ] + tuples = list(zip(*arrays)) + index = pd.MultiIndex.from_tuples(tuples, names=["first", "second"]) + data2 = pd.Series(np.random.randn(8), index=index) + series = md.Series(data2, chunk_size=3) + series["bar", "two"] = 0.888888 + real = series.execute().fetch() + data2["bar", "two"] = 0.888888 + pd.testing.assert_series_equal(real, data2) + + data3 = [9, 99, 999, 9999] + series = md.Series(data3, chunk_size=1) + series[1] = 88 + real = series.execute().fetch() + expected = pd.Series(data3) + expected[1] = 88 + pd.testing.assert_series_equal(real, expected) + + def test_iloc_setitem(setup): df1 = pd.DataFrame( [[1, 3, 3], [4, 2, 6], [7, 8, 9]], @@ -1211,6 +1242,10 @@ def test_optimization(setup): extra_config={"operand_executors": operand_executors} ).fetch() expected = pd_df.head(3) + if PD_VERSION_GREATER_THAN_2_10: + result = result.convert_dtypes(dtype_backend="pyarrow") + expected = expected.convert_dtypes(dtype_backend="pyarrow") + pd.testing.assert_frame_equal(result, expected) dirname = os.path.join(tempdir, "test_parquet2") @@ -1228,6 +1263,10 @@ def test_optimization(setup): extra_config={"operand_executors": operand_executors} ).fetch() expected = pd_df.head(3) + if PD_VERSION_GREATER_THAN_2_10: + result = result.convert_dtypes(dtype_backend="pyarrow") + expected = expected.convert_dtypes(dtype_backend="pyarrow") + pd.testing.assert_frame_equal(result, expected) @@ -1639,6 +1678,9 @@ def test_sample_execution(setup): df = md.read_parquet(file_path) r1 = df.sample(frac=0.05, random_state=0) r2 = pd.read_parquet(file_path).sample(frac=0.05, random_state=0) + if PD_VERSION_GREATER_THAN_2_10: + r2 = r2.convert_dtypes(dtype_backend="pyarrow") + pd.testing.assert_frame_equal(r1.execute().fetch(), r2) # test series diff --git a/python/xorbits/_mars/dataframe/merge/merge.py b/python/xorbits/_mars/dataframe/merge/merge.py index 6882be58c..4db6993aa 100644 --- a/python/xorbits/_mars/dataframe/merge/merge.py +++ b/python/xorbits/_mars/dataframe/merge/merge.py @@ -39,6 +39,7 @@ from ...typing import TileableType from ...utils import has_unknown_shape, lazy_import from ..base.bloom_filter import filter_by_bloom_filter +from ..base.core import DataFrameAutoMergeMixin from ..core import DataFrame, DataFrameChunk, Series from ..operands import DataFrameOperand, 
DataFrameOperandMixin, DataFrameShuffleProxy from ..utils import ( @@ -173,7 +174,7 @@ class MergeMethod(Enum): shuffle = 2 -class DataFrameMerge(DataFrameOperand, DataFrameOperandMixin): +class DataFrameMerge(DataFrameOperand, DataFrameAutoMergeMixin): _op_type_ = OperandDef.DATAFRAME_MERGE how = StringField("how") @@ -668,18 +669,6 @@ def _can_merge_with_broadcast( ) -> bool: return how in [big_side, "inner"] and np.log2(big_chunk_size) > small_chunk_size - @classmethod - def _get_auto_merge_options(cls, auto_merge: str) -> Tuple[bool, bool]: - if auto_merge == "both": - return True, True - elif auto_merge == "none": - return False, False - elif auto_merge == "before": - return True, False - else: - assert auto_merge == "after" - return False, True - @classmethod def _choose_merge_method( cls, op: "DataFrameMerge", left: TileableType, right: TileableType @@ -755,36 +744,10 @@ def tile(cls, op: "DataFrameMerge"): auto_merge_threshold = op.auto_merge_threshold auto_merge_before, auto_merge_after = cls._get_auto_merge_options(op.auto_merge) - if ( - auto_merge_before - and len(left.chunks) + len(right.chunks) > auto_merge_threshold - ): - yield TileStatus([left, right] + left.chunks + right.chunks, progress=0.2) - left_chunk_size = len(left.chunks) - right_chunk_size = len(right.chunks) - left = auto_merge_chunks(ctx, left) - right = auto_merge_chunks(ctx, right) - logger.info( - "Auto merge before %s, left data shape: %s, chunk count: %s -> %s, " - "right data shape: %s, chunk count: %s -> %s.", - op, - left.shape, - left_chunk_size, - len(left.chunks), - right.shape, - right_chunk_size, - len(right.chunks), - ) - else: - logger.info( - "Skip auto merge before %s, left data shape: %s, chunk count: %d, " - "right data shape: %s, chunk count: %d.", - op, - left.shape, - len(left.chunks), - right.shape, - len(right.chunks), - ) + merge_before_res = yield from cls._merge_before( + op, auto_merge_before, auto_merge_threshold, left, right, logger + ) + left, right = merge_before_res[0], merge_before_res[1] method = cls._choose_merge_method(op, left, right) if cls._if_apply_bloom_filter(method, op, left, right): @@ -818,33 +781,14 @@ def tile(cls, op: "DataFrameMerge"): assert method == MergeMethod.shuffle ret = cls._tile_shuffle(op, left, right) - if ( - op.how == "inner" - and auto_merge_after - and len(ret[0].chunks) > auto_merge_threshold - ): + if op.how == "inner": # if how=="inner", output data size will reduce greatly with high probability, # use auto_merge_chunks to combine small chunks. - yield TileStatus( - ret[0].chunks, progress=0.8 - ) # trigger execution for chunks - merged = auto_merge_chunks(get_context(), ret[0]) - logger.info( - "Auto merge after %s, data shape: %s, chunk count: %s -> %s.", - op, - merged.shape, - len(ret[0].chunks), - len(merged.chunks), + ret = yield from cls._merge_after( + op, auto_merge_after, auto_merge_threshold, ret, logger ) - return [merged] - else: - logger.info( - "Skip auto merge after %s, data shape: %s, chunk count: %d.", - op, - ret[0].shape, - len(ret[0].chunks), - ) - return ret + + return ret @classmethod def execute(cls, ctx, op): diff --git a/python/xorbits/_mars/dataframe/missing/checkna.py b/python/xorbits/_mars/dataframe/missing/checkna.py index 530c042d8..3ccc70545 100644 --- a/python/xorbits/_mars/dataframe/missing/checkna.py +++ b/python/xorbits/_mars/dataframe/missing/checkna.py @@ -21,7 +21,6 @@ from ... import dataframe as md from ... import opcodes from ... 
import tensor as mt -from ...config import options from ...core import OutputType from ...serialization.serializables import BoolField from ..operands import ( @@ -39,14 +38,10 @@ class DataFrameCheckNA(DataFrameOperand, DataFrameOperandMixin): _op_type_ = opcodes.CHECK_NA _positive = BoolField("positive") - _use_inf_as_na = BoolField("use_inf_as_na") - def __init__( - self, positive=None, use_inf_as_na=None, sparse=None, output_types=None, **kw - ): + def __init__(self, positive=None, sparse=None, output_types=None, **kw): super().__init__( _positive=positive, - _use_inf_as_na=use_inf_as_na, _output_types=output_types, sparse=sparse, **kw, @@ -56,10 +51,6 @@ def __init__( def positive(self) -> bool: return self._positive - @property - def use_inf_as_na(self) -> bool: - return self._use_inf_as_na - def __call__(self, df): if isinstance(df, DATAFRAME_TYPE): self.output_types = [OutputType.dataframe] @@ -107,15 +98,10 @@ def tile(cls, op: "DataFrameCheckNA"): @classmethod def execute(cls, ctx, op: "DataFrameCheckNA"): in_data = ctx[op.inputs[0].key] - old_use_inf_as_na = pd.get_option("mode.use_inf_as_na") - try: - pd.set_option("mode.use_inf_as_na", op.use_inf_as_na) - if op.positive: - ctx[op.outputs[0].key] = in_data.isna() - else: - ctx[op.outputs[0].key] = in_data.notna() - finally: - pd.set_option("mode.use_inf_as_na", old_use_inf_as_na) + if op.positive: + ctx[op.outputs[0].key] = in_data.isna() + else: + ctx[op.outputs[0].key] = in_data.notna() def _from_pandas(obj: Any): @@ -200,14 +186,9 @@ def isna(obj): raise NotImplementedError("isna is not defined for MultiIndex") elif isinstance(obj, ENTITY_TYPE): if isinstance(obj, TENSOR_TYPE): - if options.dataframe.mode.use_inf_as_na: - return ~mt.isfinite(obj) - else: - return mt.isnan(obj) + return mt.isnan(obj) else: - op = DataFrameCheckNA( - positive=True, use_inf_as_na=options.dataframe.mode.use_inf_as_na - ) + op = DataFrameCheckNA(positive=True) return op(obj) else: return _from_pandas(pd.isna(obj)) @@ -279,14 +260,9 @@ def notna(obj): raise NotImplementedError("isna is not defined for MultiIndex") elif isinstance(obj, ENTITY_TYPE): if isinstance(obj, TENSOR_TYPE): - if options.dataframe.mode.use_inf_as_na: - return mt.isfinite(obj) - else: - return ~mt.isnan(obj) + return ~mt.isnan(obj) else: - op = DataFrameCheckNA( - positive=False, use_inf_as_na=options.dataframe.mode.use_inf_as_na - ) + op = DataFrameCheckNA(positive=False) return op(obj) else: return _from_pandas(pd.notna(obj)) diff --git a/python/xorbits/_mars/dataframe/missing/dropna.py b/python/xorbits/_mars/dataframe/missing/dropna.py index af71e3cd0..a3d50f466 100644 --- a/python/xorbits/_mars/dataframe/missing/dropna.py +++ b/python/xorbits/_mars/dataframe/missing/dropna.py @@ -19,7 +19,6 @@ import pandas as pd from ... 
import opcodes -from ...config import options from ...core import OutputType, recursive_tile from ...serialization.serializables import AnyField, BoolField, Int32Field, StringField from ...utils import no_default, pd_release_version @@ -37,7 +36,6 @@ class DataFrameDropNA(DataFrameOperand, DataFrameOperandMixin): _how = StringField("how") _thresh = Int32Field("thresh") _subset = AnyField("subset") - _use_inf_as_na = BoolField("use_inf_as_na") # when True, dropna will be called on the input, # otherwise non-nan counts will be used @@ -51,7 +49,6 @@ def __init__( how=None, thresh=None, subset=None, - use_inf_as_na=None, drop_directly=None, subset_size=None, sparse=None, @@ -63,7 +60,6 @@ def __init__( _how=how, _thresh=thresh, _subset=subset, - _use_inf_as_na=use_inf_as_na, _drop_directly=drop_directly, _subset_size=subset_size, _output_types=output_types, @@ -87,10 +83,6 @@ def thresh(self) -> int: def subset(self) -> list: return self._subset - @property - def use_inf_as_na(self) -> bool: - return self._use_inf_as_na - @property def drop_directly(self) -> bool: return self._drop_directly @@ -150,9 +142,7 @@ def tile(cls, op: "DataFrameDropNA"): subset_df = in_df if op.subset: subset_df = in_df[op.subset] - count_series = yield from recursive_tile( - subset_df.agg("count", axis=1, _use_inf_as_na=op.use_inf_as_na) - ) + count_series = yield from recursive_tile(subset_df.agg("count", axis=1)) nsplits, out_shape, left_chunks, right_chunks = align_dataframe_series( in_df, count_series, axis=0 @@ -185,35 +175,30 @@ def tile(cls, op: "DataFrameDropNA"): @classmethod def execute(cls, ctx, op: "DataFrameDropNA"): - try: - pd.set_option("mode.use_inf_as_na", op.use_inf_as_na) - - in_data = ctx[op.inputs[0].key] - if op.drop_directly: - if isinstance(in_data, pd.DataFrame): - result = in_data.dropna( - axis=op.axis, how=op.how, thresh=op.thresh, subset=op.subset - ) - elif isinstance(in_data, pd.Series): - result = in_data.dropna(axis=op.axis, how=op.how) - else: - result = in_data.dropna(how=op.how) - ctx[op.outputs[0].key] = result - return - - in_counts = ctx[op.inputs[1].key] - if op.how == "all": - in_counts = in_counts[in_counts > 0] + in_data = ctx[op.inputs[0].key] + if op.drop_directly: + if isinstance(in_data, pd.DataFrame): + result = in_data.dropna( + axis=op.axis, how=op.how, thresh=op.thresh, subset=op.subset + ) + elif isinstance(in_data, pd.Series): + result = in_data.dropna(axis=op.axis, how=op.how) else: - if op.thresh is None or op.thresh is no_default: - thresh = op.subset_size - else: # pragma: no cover - thresh = op.thresh - in_counts = in_counts[in_counts >= thresh] + result = in_data.dropna(how=op.how) + ctx[op.outputs[0].key] = result + return - ctx[op.outputs[0].key] = in_data.reindex(in_counts.index) - finally: - pd.reset_option("mode.use_inf_as_na") + in_counts = ctx[op.inputs[1].key] + if op.how == "all": + in_counts = in_counts[in_counts > 0] + else: + if op.thresh is None or op.thresh is no_default: + thresh = op.subset_size + else: # pragma: no cover + thresh = op.thresh + in_counts = in_counts[in_counts >= thresh] + + ctx[op.outputs[0].key] = in_data.reindex(in_counts.index) def df_dropna( @@ -328,14 +313,12 @@ def df_dropna( if thresh is no_default and how is no_default: how = "any" - use_inf_as_na = options.dataframe.mode.use_inf_as_na op = DataFrameDropNA( axis=axis, how=how, thresh=thresh, subset=subset, output_types=[OutputType.dataframe], - use_inf_as_na=use_inf_as_na, ) out_df = op(df) if inplace: @@ -417,12 +400,10 @@ def series_dropna(series, axis=0, 
inplace=False, how=None): dtype: object """ axis = validate_axis(axis, series) - use_inf_as_na = options.dataframe.mode.use_inf_as_na op = DataFrameDropNA( axis=axis, how=how, output_types=[OutputType.series], - use_inf_as_na=use_inf_as_na, ) out_series = op(series) if inplace: @@ -445,8 +426,5 @@ def index_dropna(index, how="any"): ------- Index """ - use_inf_as_na = options.dataframe.mode.use_inf_as_na - op = DataFrameDropNA( - axis=0, how=how, output_types=[OutputType.index], use_inf_as_na=use_inf_as_na - ) + op = DataFrameDropNA(axis=0, how=how, output_types=[OutputType.index]) return op(index) diff --git a/python/xorbits/_mars/dataframe/missing/fillna.py b/python/xorbits/_mars/dataframe/missing/fillna.py index 15c33226a..65bca253b 100644 --- a/python/xorbits/_mars/dataframe/missing/fillna.py +++ b/python/xorbits/_mars/dataframe/missing/fillna.py @@ -19,10 +19,9 @@ import pandas as pd from ... import opcodes -from ...config import options from ...core import ENTITY_TYPE, Entity, OutputType, get_output_types from ...core.operand import OperandStage -from ...serialization.serializables import AnyField, BoolField, Int64Field, StringField +from ...serialization.serializables import AnyField, Int64Field, StringField from ..align import ( align_dataframe_dataframe, align_dataframe_series, @@ -42,8 +41,6 @@ class FillNA(DataFrameOperand, DataFrameOperandMixin): _method = StringField("method") _axis = AnyField("axis") _limit = Int64Field("limit") - _downcast = AnyField("downcast") - _use_inf_as_na = BoolField("use_inf_as_na") _output_limit = Int64Field("output_limit") @@ -53,8 +50,6 @@ def __init__( method=None, axis=None, limit=None, - downcast=None, - use_inf_as_na=None, output_types=None, output_limit=None, **kw @@ -64,8 +59,6 @@ def __init__( _method=method, _axis=axis, _limit=limit, - _downcast=downcast, - _use_inf_as_na=use_inf_as_na, _output_types=output_types, _output_limit=output_limit, **kw @@ -87,14 +80,6 @@ def axis(self): def limit(self): return self._limit - @property - def downcast(self): - return self._downcast - - @property - def use_inf_as_na(self): - return self._use_inf_as_na - def _set_inputs(self, inputs): super()._set_inputs(inputs) if self._method is None and len(inputs) > 1: @@ -131,7 +116,9 @@ def _execute_map(cls, ctx, op): method = op.method filled = input_data.fillna( - method=method, axis=axis, limit=limit, downcast=op.downcast + method=method, + axis=axis, + limit=limit, ) ctx[op.outputs[0].key] = cls._get_first_slice(op, filled, 1) del filled @@ -151,7 +138,9 @@ def _execute_combine(cls, ctx, op): if not summaries: ctx[op.outputs[0].key] = input_data.fillna( - method=method, axis=axis, limit=limit, downcast=op.downcast + method=method, + axis=axis, + limit=limit, ) return @@ -166,7 +155,9 @@ def _execute_combine(cls, ctx, op): if is_pandas_2(): concat_df = concat_df.fillna( - method=method, axis=axis, limit=limit, downcast=op.downcast + method=method, + axis=axis, + limit=limit, ) else: concat_df.fillna( @@ -174,37 +165,29 @@ def _execute_combine(cls, ctx, op): axis=axis, inplace=True, limit=limit, - downcast=op.downcast, ) ctx[op.outputs[0].key] = cls._get_first_slice(op, concat_df, -1) @classmethod def execute(cls, ctx, op): - try: - pd.set_option("mode.use_inf_as_na", op.use_inf_as_na) - if op.stage == OperandStage.map: - cls._execute_map(ctx, op) - elif op.stage == OperandStage.combine: - cls._execute_combine(ctx, op) + if op.stage == OperandStage.map: + cls._execute_map(ctx, op) + elif op.stage == OperandStage.combine: + cls._execute_combine(ctx, op) + 
else: + input_data = ctx[op.inputs[0].key] + value = getattr(op, "value", None) + if isinstance(op.value, ENTITY_TYPE): + value = ctx[op.value.key] + if not isinstance(input_data, pd.Index): + ctx[op.outputs[0].key] = input_data.fillna( + value=value, + method=op.method, + axis=op.axis, + limit=op.limit, + ) else: - input_data = ctx[op.inputs[0].key] - value = getattr(op, "value", None) - if isinstance(op.value, ENTITY_TYPE): - value = ctx[op.value.key] - if not isinstance(input_data, pd.Index): - ctx[op.outputs[0].key] = input_data.fillna( - value=value, - method=op.method, - axis=op.axis, - limit=op.limit, - downcast=op.downcast, - ) - else: - ctx[op.outputs[0].key] = input_data.fillna( - value=value, downcast=op.downcast - ) - finally: - pd.reset_option("mode.use_inf_as_na") + ctx[op.outputs[0].key] = input_data.fillna(value=value) @classmethod def _tile_one_by_one(cls, op): @@ -490,9 +473,7 @@ def __call__(self, a, value_df=None): ) -def fillna( - df, value=None, method=None, axis=None, inplace=False, limit=None, downcast=None -): +def fillna(df, value=None, method=None, axis=None, inplace=False, limit=None): """ Fill NA/NaN values using the specified method. @@ -521,10 +502,6 @@ def fillna( be partially filled. If method is not specified, this is the maximum number of entries along the entire axis where NaNs will be filled. Must be greater than 0 if not None. - downcast : dict, default is None - A dict of item->dtype of what to downcast if possible, - or the string 'infer' which will try to downcast to an appropriate - equal type (e.g. float64 to int64 if possible). Returns ------- @@ -595,10 +572,6 @@ def fillna( % type(value).__name__ ) - if downcast is not None: - raise NotImplementedError( - 'Currently argument "downcast" is not implemented yet' - ) if limit is not None: raise NotImplementedError('Currently argument "limit" is not implemented yet') @@ -607,14 +580,11 @@ def fillna( else: value_df = None - use_inf_as_na = options.dataframe.mode.use_inf_as_na op = FillNA( value=value, method=method, axis=axis, limit=limit, - downcast=downcast, - use_inf_as_na=use_inf_as_na, output_types=get_output_types(df), ) out_df = op(df, value_df=value_df) @@ -624,7 +594,7 @@ def fillna( return out_df -def ffill(df, axis=None, inplace=False, limit=None, downcast=None): +def ffill(df, axis=None, inplace=False, limit=None): """ Synonym for :meth:`DataFrame.fillna` with ``method='ffill'``. @@ -633,12 +603,10 @@ def ffill(df, axis=None, inplace=False, limit=None, downcast=None): {klass} or None Object with missing values filled or None if ``inplace=True``. """ - return fillna( - df, method="ffill", axis=axis, inplace=inplace, limit=limit, downcast=downcast - ) + return fillna(df, method="ffill", axis=axis, inplace=inplace, limit=limit) -def bfill(df, axis=None, inplace=False, limit=None, downcast=None): +def bfill(df, axis=None, inplace=False, limit=None): """ Synonym for :meth:`DataFrame.fillna` with ``method='bfill'``. @@ -647,12 +615,10 @@ def bfill(df, axis=None, inplace=False, limit=None, downcast=None): {klass} or None Object with missing values filled or None if ``inplace=True``. """ - return fillna( - df, method="bfill", axis=axis, inplace=inplace, limit=limit, downcast=downcast - ) + return fillna(df, method="bfill", axis=axis, inplace=inplace, limit=limit) -def index_fillna(index, value=None, downcast=None): +def index_fillna(index, value=None): """ Fill NA/NaN values with the specified value. 
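Across this file the `downcast` keyword (deprecated in recent pandas) is dropped from fillna/ffill/bfill. Callers that relied on `downcast="infer"` can cast explicitly instead; a brief sketch (an editor's illustration, not part of the patch):

    import pandas as pd

    s = pd.Series([1.0, None, 3.0])
    filled = s.fillna(0)              # result stays float64
    as_int = filled.astype("int64")   # explicit cast replaces downcast="infer"
    print(filled.dtype, as_int.dtype) # float64 int64
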
@@ -661,10 +627,6 @@ def index_fillna(index, value=None, downcast=None): value : scalar Scalar value to use to fill holes (e.g. 0). This value cannot be a list-likes. - downcast : dict, default is None - A dict of item->dtype of what to downcast if possible, - or the string 'infer' which will try to downcast to an appropriate - equal type (e.g. float64 to int64 if possible). Returns ------- @@ -678,11 +640,8 @@ def index_fillna(index, value=None, downcast=None): if isinstance(value, (list, pd.Series, SERIES_TYPE)): raise ValueError("'value' must be a scalar, passed: %s" % type(value)) - use_inf_as_na = options.dataframe.mode.use_inf_as_na op = FillNA( value=value, - downcast=downcast, - use_inf_as_na=use_inf_as_na, output_types=get_output_types(index), ) return op(index) diff --git a/python/xorbits/_mars/dataframe/missing/tests/test_missing.py b/python/xorbits/_mars/dataframe/missing/tests/test_missing.py index cab62c383..50a7a7578 100644 --- a/python/xorbits/_mars/dataframe/missing/tests/test_missing.py +++ b/python/xorbits/_mars/dataframe/missing/tests/test_missing.py @@ -56,8 +56,6 @@ def test_fill_na(): series.fillna(value=df) with pytest.raises(ValueError): series.fillna(value=df_raw) - with pytest.raises(NotImplementedError): - series.fillna(value=series_raw, downcast="infer") with pytest.raises(NotImplementedError): series.ffill(limit=1) @@ -226,19 +224,10 @@ def test_replace(): assert r.chunks[0].op.limit is None -@pytest.mark.parametrize("inf_as_na", [True, False]) -def test_isna(setup, inf_as_na): - from ....config import options +def test_isna(setup): from ..checkna import isna - old_mars_inf_as_na = options.dataframe.mode.use_inf_as_na - options.dataframe.mode.use_inf_as_na = inf_as_na - # this option could be changed by mars execution. 
- old_pd_inf_as_na = pd.get_option("mode.use_inf_as_na") - pd.options.mode.use_inf_as_na = inf_as_na - # scalars - assert pd.get_option("mode.use_inf_as_na") == inf_as_na assert isna("dog") == pd.isna("dog") assert isna(None) == pd.isna(None) assert isna(md.NA) == pd.isna(pd.NA) @@ -247,47 +236,39 @@ def test_isna(setup, inf_as_na): assert isna(type) == pd.isna(type) # multi index - assert pd.get_option("mode.use_inf_as_na") == inf_as_na with pytest.raises(NotImplementedError): midx = md.MultiIndex() isna(midx) # list - assert pd.get_option("mode.use_inf_as_na") == inf_as_na l = [1, 2, 3, np.Inf, np.NaN, pd.NA, pd.NaT] - assert pd.get_option("mode.use_inf_as_na") == inf_as_na actual = isna(l).execute().fetch() expected = pd.isna(l) np.testing.assert_array_equal(expected, actual) # tuple - assert pd.get_option("mode.use_inf_as_na") == inf_as_na t = (1, 2, 3, np.Inf, np.NaN, pd.NA, pd.NaT) assert not isna(t) # numpy ndarray - assert pd.get_option("mode.use_inf_as_na") == inf_as_na narr = np.array((1, 2, 3, np.Inf, np.NaN)) actual = isna(narr).execute().fetch() expected = pd.isna(narr) np.testing.assert_array_equal(expected, actual) # pandas index - assert pd.get_option("mode.use_inf_as_na") == inf_as_na pi = pd.Index((1, 2, 3, np.Inf, np.NaN, pd.NA, pd.NaT)) actual = isna(pi).execute().fetch() expected = pd.isna(pi) np.testing.assert_array_equal(expected, actual) # pandas series - assert pd.get_option("mode.use_inf_as_na") == inf_as_na ps = pd.Series((1, 2, 3, np.Inf, np.NaN, pd.NA, pd.NaT)) actual = isna(ps).execute().fetch() expected = pd.isna(ps) pd.testing.assert_series_equal(expected, actual) # pandas dataframe - assert pd.get_option("mode.use_inf_as_na") == inf_as_na pdf = pd.DataFrame( {"foo": (1, 2, 3, np.Inf, pd.NA), "bar": (4, 5, 6, np.NaN, pd.NaT)} ) @@ -296,7 +277,6 @@ def test_isna(setup, inf_as_na): pd.testing.assert_frame_equal(expected, actual) # mars tensor - assert pd.get_option("mode.use_inf_as_na") == inf_as_na marr = mt.tensor(narr) actual = isna(marr).execute().fetch() expected = pd.isna(narr) @@ -305,7 +285,6 @@ def test_isna(setup, inf_as_na): # mars index from ...datasource.index import from_pandas as from_pandas_index - assert pd.get_option("mode.use_inf_as_na") == inf_as_na mi = from_pandas_index(pi) actual = isna(mi).execute().fetch() expected = pd.isna(pi) @@ -314,7 +293,6 @@ def test_isna(setup, inf_as_na): # mars series from ...datasource.series import from_pandas as from_pandas_series - assert pd.get_option("mode.use_inf_as_na") == inf_as_na ms = from_pandas_series(ps) actual = isna(ms).execute().fetch() expected = pd.isna(ps) @@ -323,29 +301,16 @@ def test_isna(setup, inf_as_na): # mars dataframe from ...datasource.dataframe import from_pandas as from_pandas_df - assert pd.get_option("mode.use_inf_as_na") == inf_as_na mdf = from_pandas_df(pdf) actual = isna(mdf).execute().fetch() expected = pd.isna(pdf) pd.testing.assert_frame_equal(expected, actual) - options.dataframe.mode.use_inf_as_na = old_mars_inf_as_na - pd.options.mode.use_inf_as_na = old_pd_inf_as_na - -@pytest.mark.parametrize("inf_as_na", [True, False]) -def test_notna(setup, inf_as_na): - from ....config import options +def test_notna(setup): from ..checkna import notna - old_mars_inf_as_na = options.dataframe.mode.use_inf_as_na - options.dataframe.mode.use_inf_as_na = inf_as_na - # this option could be changed by mars execution. 
- old_pd_inf_as_na = pd.get_option("mode.use_inf_as_na") - pd.options.mode.use_inf_as_na = inf_as_na - # scalars - assert pd.get_option("mode.use_inf_as_na") == inf_as_na assert notna("dog") == pd.notna("dog") assert notna(None) == pd.notna(None) assert notna(md.NA) == pd.notna(pd.NA) @@ -354,46 +319,39 @@ def test_notna(setup, inf_as_na): assert notna(type) == pd.notna(type) # multi index - assert pd.get_option("mode.use_inf_as_na") == inf_as_na with pytest.raises(NotImplementedError): midx = md.MultiIndex() notna(midx) # list - assert pd.get_option("mode.use_inf_as_na") == inf_as_na l = [1, 2, 3, np.Inf, np.NaN, pd.NA, pd.NaT] actual = notna(l).execute().fetch() expected = pd.notna(l) np.testing.assert_array_equal(expected, actual) # tuple - assert pd.get_option("mode.use_inf_as_na") == inf_as_na t = (1, 2, 3, np.Inf, np.NaN, pd.NA, pd.NaT) assert notna(t) # numpy ndarray - assert pd.get_option("mode.use_inf_as_na") == inf_as_na narr = np.array((1, 2, 3, np.Inf, np.NaN)) actual = notna(narr).execute().fetch() expected = pd.notna(narr) np.testing.assert_array_equal(expected, actual) # pandas index - assert pd.get_option("mode.use_inf_as_na") == inf_as_na pi = pd.Index((1, 2, 3, np.Inf, np.NaN, pd.NA, pd.NaT)) actual = notna(pi).execute().fetch() expected = pd.notna(pi) np.testing.assert_array_equal(expected, actual) # pandas series - assert pd.get_option("mode.use_inf_as_na") == inf_as_na ps = pd.Series((1, 2, 3, np.Inf, np.NaN, pd.NA, pd.NaT)) actual = notna(ps).execute().fetch() expected = pd.notna(ps) pd.testing.assert_series_equal(expected, actual) # pandas dataframe - assert pd.get_option("mode.use_inf_as_na") == inf_as_na pdf = pd.DataFrame( {"foo": (1, 2, 3, np.Inf, pd.NA), "bar": (4, 5, 6, np.NaN, pd.NaT)} ) @@ -402,7 +360,6 @@ def test_notna(setup, inf_as_na): pd.testing.assert_frame_equal(expected, actual) # mars tensor - assert pd.get_option("mode.use_inf_as_na") == inf_as_na marr = mt.tensor(narr) actual = notna(marr).execute().fetch() expected = pd.notna(narr) @@ -411,7 +368,6 @@ def test_notna(setup, inf_as_na): # mars index from ...datasource.index import from_pandas as from_pandas_index - assert pd.get_option("mode.use_inf_as_na") == inf_as_na mi = from_pandas_index(pi) actual = notna(mi).execute().fetch() expected = pd.notna(pi) @@ -420,7 +376,6 @@ def test_notna(setup, inf_as_na): # mars series from ...datasource.series import from_pandas as from_pandas_series - assert pd.get_option("mode.use_inf_as_na") == inf_as_na ms = from_pandas_series(ps) actual = notna(ms).execute().fetch() expected = pd.notna(ps) @@ -429,11 +384,7 @@ def test_notna(setup, inf_as_na): # mars dataframe from ...datasource.dataframe import from_pandas as from_pandas_df - assert pd.get_option("mode.use_inf_as_na") == inf_as_na mdf = from_pandas_df(pdf) actual = notna(mdf).execute().fetch() expected = pd.notna(pdf) pd.testing.assert_frame_equal(expected, actual) - - options.dataframe.mode.use_inf_as_na = old_mars_inf_as_na - pd.options.mode.use_inf_as_na = old_pd_inf_as_na diff --git a/python/xorbits/_mars/dataframe/reduction/aggregation.py b/python/xorbits/_mars/dataframe/reduction/aggregation.py index dc748dfb3..d9aa5768f 100644 --- a/python/xorbits/_mars/dataframe/reduction/aggregation.py +++ b/python/xorbits/_mars/dataframe/reduction/aggregation.py @@ -98,7 +98,6 @@ class DataFrameAggregate(DataFrameOperand, DataFrameOperandMixin): axis = AnyField("axis") numeric_only = BoolField("numeric_only") bool_only = BoolField("bool_only") - use_inf_as_na = BoolField("use_inf_as_na") combine_size = 
Int32Field("combine_size") pre_funcs = ListField("pre_funcs") @@ -925,45 +924,45 @@ def _cudf_agg(cls, op: "DataFrameAggregate", in_data): @redirect_custom_log @enter_current_session def execute(cls, ctx, op: "DataFrameAggregate"): - try: - pd.set_option("mode.use_inf_as_na", op.use_inf_as_na) - if op.stage == OperandStage.map: - cls._execute_map(ctx, op) - elif op.stage == OperandStage.combine: - cls._execute_combine(ctx, op) - elif op.stage == OperandStage.agg: - cls._execute_agg(ctx, op) - elif not _agg_size_as_series and op.raw_func == "size": - xp = cp if op.gpu else np - ctx[op.outputs[0].key] = xp.array( - ctx[op.inputs[0].key].agg(op.raw_func, axis=op.axis) - ).reshape(op.outputs[0].shape) + if op.stage == OperandStage.map: + cls._execute_map(ctx, op) + elif op.stage == OperandStage.combine: + cls._execute_combine(ctx, op) + elif op.stage == OperandStage.agg: + cls._execute_agg(ctx, op) + elif not _agg_size_as_series and op.raw_func == "size": + xp = cp if op.gpu else np + ctx[op.outputs[0].key] = xp.array( + ctx[op.inputs[0].key].agg(op.raw_func, axis=op.axis) + ).reshape(op.outputs[0].shape) + else: + xp = cp if op.gpu else np + in_obj = op.inputs[0] + in_data = ctx[in_obj.key] + in_data = cls._select_dtypes(in_data, op) + if isinstance(in_obj, INDEX_CHUNK_TYPE): + result = op.func[0](in_data) + elif ( + op.output_types[0] == OutputType.scalar + and in_data.shape == (0,) + and callable(op.func[0]) + ): + result = op.func[0](in_data) else: - xp = cp if op.gpu else np - in_obj = op.inputs[0] - in_data = ctx[in_obj.key] - in_data = cls._select_dtypes(in_data, op) - if isinstance(in_obj, INDEX_CHUNK_TYPE): - result = op.func[0](in_data) - elif ( - op.output_types[0] == OutputType.scalar - and in_data.shape == (0,) - and callable(op.func[0]) - ): - result = op.func[0](in_data) + if is_cudf(in_data): + result = cls._cudf_agg(op, in_data) else: - if is_cudf(in_data): - result = cls._cudf_agg(op, in_data) - else: - result = in_data.agg(op.raw_func, axis=op.axis) - if op.outputs[0].ndim == 1: - result = result.astype(op.outputs[0].dtype, copy=False) + result = ( + in_data.agg(op.raw_func, axis=op.axis) + if op.raw_func is not None + else in_data.agg(**op.raw_func_kw, axis=op.axis) + ) + if op.outputs[0].ndim == 1: + result = result.astype(op.outputs[0].dtype, copy=False) - if op.output_types[0] == OutputType.tensor: - result = xp.array(result) - ctx[op.outputs[0].key] = result - finally: - pd.reset_option("mode.use_inf_as_na") + if op.output_types[0] == OutputType.tensor: + result = xp.array(result) + ctx[op.outputs[0].key] = result def is_funcs_aggregate(func, func_kw=None, ndim=2): @@ -1068,7 +1067,6 @@ def normalize_reduction_funcs(op, ndim=None): def aggregate(df, func=None, axis=0, **kw): axis = validate_axis(axis, df) - use_inf_as_na = kw.pop("_use_inf_as_na", options.dataframe.mode.use_inf_as_na) if ( df.ndim == 2 and isinstance(func, dict) @@ -1107,7 +1105,6 @@ def aggregate(df, func=None, axis=0, **kw): combine_size=combine_size, numeric_only=numeric_only, bool_only=bool_only, - use_inf_as_na=use_inf_as_na, ) return op(df, output_type=output_type, dtypes=dtypes, index=index) diff --git a/python/xorbits/_mars/dataframe/reduction/all.py b/python/xorbits/_mars/dataframe/reduction/all.py index e7a3e2569..f8a5d1409 100644 --- a/python/xorbits/_mars/dataframe/reduction/all.py +++ b/python/xorbits/_mars/dataframe/reduction/all.py @@ -17,7 +17,6 @@ import pandas as pd from ... 
import opcodes as OperandDef -from ...config import options from ...core import OutputType from .core import ( DATAFRAME_TYPE, @@ -86,7 +85,6 @@ def all_series( combine_size=None, method=None, ): - use_inf_as_na = options.dataframe.mode.use_inf_as_na op = DataFrameAll( axis=axis, skipna=skipna, @@ -94,7 +92,6 @@ def all_series( bool_only=bool_only, combine_size=combine_size, output_types=[OutputType.scalar], - use_inf_as_na=use_inf_as_na, method=method, ) return op(series) @@ -109,7 +106,6 @@ def all_dataframe( combine_size=None, method=None, ): - use_inf_as_na = options.dataframe.mode.use_inf_as_na output_types = [OutputType.series] if axis is not None else [OutputType.scalar] op = DataFrameAll( axis=axis, @@ -118,13 +114,11 @@ def all_dataframe( bool_only=bool_only, combine_size=combine_size, output_types=output_types, - use_inf_as_na=use_inf_as_na, method=method, ) return op(df) def all_index(idx): - use_inf_as_na = options.dataframe.mode.use_inf_as_na - op = DataFrameAll(output_types=[OutputType.scalar], use_inf_as_na=use_inf_as_na) + op = DataFrameAll(output_types=[OutputType.scalar]) return op(idx) diff --git a/python/xorbits/_mars/dataframe/reduction/any.py b/python/xorbits/_mars/dataframe/reduction/any.py index d0bc85b30..b653a461e 100644 --- a/python/xorbits/_mars/dataframe/reduction/any.py +++ b/python/xorbits/_mars/dataframe/reduction/any.py @@ -17,7 +17,6 @@ import pandas as pd from ... import opcodes as OperandDef -from ...config import options from ...core import OutputType from .core import ( DATAFRAME_TYPE, @@ -86,7 +85,6 @@ def any_series( combine_size=None, method=None, ): - use_inf_as_na = options.dataframe.mode.use_inf_as_na op = DataFrameAny( axis=axis, skipna=skipna, @@ -94,7 +92,6 @@ def any_series( bool_only=bool_only, combine_size=combine_size, output_types=[OutputType.scalar], - use_inf_as_na=use_inf_as_na, method=method, ) return op(series) @@ -109,7 +106,6 @@ def any_dataframe( combine_size=None, method=None, ): - use_inf_as_na = options.dataframe.mode.use_inf_as_na output_types = [OutputType.series] if axis is not None else [OutputType.scalar] op = DataFrameAny( axis=axis, @@ -118,13 +114,11 @@ def any_dataframe( bool_only=bool_only, combine_size=combine_size, output_types=output_types, - use_inf_as_na=use_inf_as_na, method=method, ) return op(df) def any_index(index): - use_inf_as_na = options.dataframe.mode.use_inf_as_na - op = DataFrameAny(output_types=[OutputType.scalar], use_inf_as_na=use_inf_as_na) + op = DataFrameAny(output_types=[OutputType.scalar]) return op(index) diff --git a/python/xorbits/_mars/dataframe/reduction/core.py b/python/xorbits/_mars/dataframe/reduction/core.py index 21e33d662..c82b652cd 100644 --- a/python/xorbits/_mars/dataframe/reduction/core.py +++ b/python/xorbits/_mars/dataframe/reduction/core.py @@ -65,7 +65,6 @@ class DataFrameReductionOperand(DataFrameOperand): _numeric_only = BoolField("numeric_only") _bool_only = BoolField("bool_only") _min_count = Int32Field("min_count") - _use_inf_as_na = BoolField("use_inf_as_na") _method = StringField("method") _dtype = DataTypeField("dtype") @@ -84,7 +83,6 @@ def __init__( gpu=None, sparse=None, output_types=None, - use_inf_as_na=None, method=None, **kw, ): @@ -100,7 +98,6 @@ def __init__( gpu=gpu, sparse=sparse, _output_types=output_types, - _use_inf_as_na=use_inf_as_na, _method=method, **kw, ) @@ -137,10 +134,6 @@ def dtype(self): def combine_size(self): return self._combine_size - @property - def use_inf_as_na(self): - return self._use_inf_as_na - @property def is_atomic(self): return 
False @@ -163,7 +156,6 @@ def get_reduction_args(self, axis=None): class DataFrameCumReductionOperand(DataFrameOperand): _axis = AnyField("axis") _skipna = BoolField("skipna") - _use_inf_as_na = BoolField("use_inf_as_na") _dtype = DataTypeField("dtype") @@ -175,7 +167,6 @@ def __init__( gpu=None, sparse=None, output_types=None, - use_inf_as_na=None, **kw, ): super().__init__( @@ -185,7 +176,6 @@ def __init__( gpu=gpu, sparse=sparse, _output_types=output_types, - _use_inf_as_na=use_inf_as_na, **kw, ) @@ -201,10 +191,6 @@ def skipna(self): def dtype(self): return self._dtype - @property - def use_inf_as_na(self): - return self._use_inf_as_na - def _default_agg_fun(value, func_name=None, **kw): if value.ndim == 1: @@ -612,14 +598,10 @@ def _execute_combine(cls, ctx, op): @classmethod def execute(cls, ctx, op): - try: - pd.set_option("mode.use_inf_as_na", op.use_inf_as_na) - if op.stage == OperandStage.map: - return cls._execute_map(ctx, op) - else: - return cls._execute_combine(ctx, op) - finally: - pd.reset_option("mode.use_inf_as_na") + if op.stage == OperandStage.map: + return cls._execute_map(ctx, op) + else: + return cls._execute_combine(ctx, op) def _call_dataframe(self, df): axis = getattr(self, "axis", None) or 0 diff --git a/python/xorbits/_mars/dataframe/reduction/count.py b/python/xorbits/_mars/dataframe/reduction/count.py index 41b4047b4..71012db21 100644 --- a/python/xorbits/_mars/dataframe/reduction/count.py +++ b/python/xorbits/_mars/dataframe/reduction/count.py @@ -14,7 +14,6 @@ # limitations under the License. from ... import opcodes as OperandDef -from ...config import options from ...core import OutputType from .core import DataFrameReductionMixin, DataFrameReductionOperand @@ -40,13 +39,11 @@ def count(value): def count_series(series, level=None, combine_size=None, **kw): - use_inf_as_na = kw.pop("_use_inf_as_na", options.dataframe.mode.use_inf_as_na) method = kw.pop("method", None) op = DataFrameCount( level=level, combine_size=combine_size, output_types=[OutputType.scalar], - use_inf_as_na=use_inf_as_na, method=method, ) return op(series) @@ -55,7 +52,6 @@ def count_series(series, level=None, combine_size=None, **kw): def count_dataframe( df, axis=0, level=None, numeric_only=False, combine_size=None, **kw ): - use_inf_as_na = kw.pop("_use_inf_as_na", options.dataframe.mode.use_inf_as_na) method = kw.pop("method", None) op = DataFrameCount( axis=axis, @@ -63,7 +59,6 @@ def count_dataframe( numeric_only=numeric_only, combine_size=combine_size, output_types=[OutputType.series], - use_inf_as_na=use_inf_as_na, method=method, ) return op(df) diff --git a/python/xorbits/_mars/dataframe/reduction/cummax.py b/python/xorbits/_mars/dataframe/reduction/cummax.py index 238d76196..18e248572 100644 --- a/python/xorbits/_mars/dataframe/reduction/cummax.py +++ b/python/xorbits/_mars/dataframe/reduction/cummax.py @@ -14,7 +14,6 @@ # limitations under the License. from ... 
import opcodes as OperandDef -from ...config import options from .core import DataFrameCumReductionMixin, DataFrameCumReductionOperand @@ -24,11 +23,9 @@ class DataFrameCummax(DataFrameCumReductionOperand, DataFrameCumReductionMixin): def cummax(df, axis=None, skipna=True): - use_inf_as_na = options.dataframe.mode.use_inf_as_na op = DataFrameCummax( axis=axis, skipna=skipna, output_types=df.op.output_types, - use_inf_as_na=use_inf_as_na, ) return op(df) diff --git a/python/xorbits/_mars/dataframe/reduction/cummin.py b/python/xorbits/_mars/dataframe/reduction/cummin.py index 2ee203770..559b258be 100644 --- a/python/xorbits/_mars/dataframe/reduction/cummin.py +++ b/python/xorbits/_mars/dataframe/reduction/cummin.py @@ -14,7 +14,6 @@ # limitations under the License. from ... import opcodes as OperandDef -from ...config import options from .core import DataFrameCumReductionMixin, DataFrameCumReductionOperand @@ -24,11 +23,9 @@ class DataFrameCummin(DataFrameCumReductionOperand, DataFrameCumReductionMixin): def cummin(df, axis=None, skipna=True): - use_inf_as_na = options.dataframe.mode.use_inf_as_na op = DataFrameCummin( axis=axis, skipna=skipna, output_types=df.op.output_types, - use_inf_as_na=use_inf_as_na, ) return op(df) diff --git a/python/xorbits/_mars/dataframe/reduction/cumprod.py b/python/xorbits/_mars/dataframe/reduction/cumprod.py index d251198ad..9e52f33e4 100644 --- a/python/xorbits/_mars/dataframe/reduction/cumprod.py +++ b/python/xorbits/_mars/dataframe/reduction/cumprod.py @@ -14,7 +14,6 @@ # limitations under the License. from ... import opcodes as OperandDef -from ...config import options from .core import DataFrameCumReductionMixin, DataFrameCumReductionOperand @@ -24,11 +23,9 @@ class DataFrameCumprod(DataFrameCumReductionOperand, DataFrameCumReductionMixin) def cumprod(df, axis=None, skipna=True): - use_inf_as_na = options.dataframe.mode.use_inf_as_na op = DataFrameCumprod( axis=axis, skipna=skipna, output_types=df.op.output_types, - use_inf_as_na=use_inf_as_na, ) return op(df) diff --git a/python/xorbits/_mars/dataframe/reduction/cumsum.py b/python/xorbits/_mars/dataframe/reduction/cumsum.py index 591c85302..f6e74ef4b 100644 --- a/python/xorbits/_mars/dataframe/reduction/cumsum.py +++ b/python/xorbits/_mars/dataframe/reduction/cumsum.py @@ -14,7 +14,6 @@ # limitations under the License. from ... import opcodes as OperandDef -from ...config import options from .core import DataFrameCumReductionMixin, DataFrameCumReductionOperand @@ -24,11 +23,9 @@ class DataFrameCumsum(DataFrameCumReductionOperand, DataFrameCumReductionMixin): def cumsum(df, axis=None, skipna=True): - use_inf_as_na = options.dataframe.mode.use_inf_as_na op = DataFrameCumsum( axis=axis, skipna=skipna, output_types=df.op.output_types, - use_inf_as_na=use_inf_as_na, ) return op(df) diff --git a/python/xorbits/_mars/dataframe/reduction/custom_reduction.py b/python/xorbits/_mars/dataframe/reduction/custom_reduction.py index 59ee88830..a89361df6 100644 --- a/python/xorbits/_mars/dataframe/reduction/custom_reduction.py +++ b/python/xorbits/_mars/dataframe/reduction/custom_reduction.py @@ -14,7 +14,6 @@ # limitations under the License. from ... 
import opcodes as OperandDef -from ...config import options from ...core import OutputType from ...serialization.serializables import AnyField from .core import DataFrameReductionMixin, DataFrameReductionOperand @@ -35,12 +34,10 @@ def get_reduction_args(self, axis=None): def build_custom_reduction_result(df, custom_reduction_obj, method=None): - use_inf_as_na = options.dataframe.mode.use_inf_as_na output_type = OutputType.series if df.ndim == 2 else OutputType.scalar op = DataFrameCustomReduction( custom_reduction=custom_reduction_obj, output_types=[output_type], - use_inf_as_na=use_inf_as_na, method=method, ) return op(df) diff --git a/python/xorbits/_mars/dataframe/reduction/kurtosis.py b/python/xorbits/_mars/dataframe/reduction/kurtosis.py index 5bb2702e9..b32055953 100644 --- a/python/xorbits/_mars/dataframe/reduction/kurtosis.py +++ b/python/xorbits/_mars/dataframe/reduction/kurtosis.py @@ -16,7 +16,6 @@ import numpy as np from ... import opcodes -from ...config import options from ...core import ENTITY_TYPE, OutputType from ...serialization.serializables import BoolField from .core import DataFrameReductionMixin, DataFrameReductionOperand @@ -83,7 +82,6 @@ def kurt_series( fisher=True, method=None, ): - use_inf_as_na = options.dataframe.mode.use_inf_as_na op = DataFrameKurtosis( axis=axis, skipna=skipna, @@ -92,7 +90,6 @@ def kurt_series( bias=bias, fisher=fisher, output_types=[OutputType.scalar], - use_inf_as_na=use_inf_as_na, method=method, ) return op(df) @@ -109,7 +106,6 @@ def kurt_dataframe( fisher=True, method=None, ): - use_inf_as_na = options.dataframe.mode.use_inf_as_na op = DataFrameKurtosis( axis=axis, skipna=skipna, @@ -119,7 +115,6 @@ def kurt_dataframe( fisher=fisher, combine_size=combine_size, output_types=[OutputType.series], - use_inf_as_na=use_inf_as_na, method=method, ) return op(df) diff --git a/python/xorbits/_mars/dataframe/reduction/max.py b/python/xorbits/_mars/dataframe/reduction/max.py index e04fa12cc..5e8d3dd34 100644 --- a/python/xorbits/_mars/dataframe/reduction/max.py +++ b/python/xorbits/_mars/dataframe/reduction/max.py @@ -14,7 +14,6 @@ # limitations under the License. from ... 
import opcodes as OperandDef -from ...config import options from ...core import OutputType from .core import DataFrameReductionMixin, DataFrameReductionOperand @@ -28,15 +27,21 @@ def is_atomic(self): return True -def max_series(df, axis=None, skipna=True, level=None, combine_size=None, method=None): - use_inf_as_na = options.dataframe.mode.use_inf_as_na +def max_series( + df, + axis=None, + skipna=True, + level=None, + combine_size=None, + method=None, + **kwargs, # kwargs for compatibility with numpy reductions +): op = DataFrameMax( axis=axis, skipna=skipna, level=level, combine_size=combine_size, output_types=[OutputType.scalar], - use_inf_as_na=use_inf_as_na, method=method, ) return op(df) @@ -50,8 +55,8 @@ def max_dataframe( numeric_only=None, combine_size=None, method=None, + **kwargs, # kwargs for compatibility with numpy reductions ): - use_inf_as_na = options.dataframe.mode.use_inf_as_na op = DataFrameMax( axis=axis, skipna=skipna, @@ -59,18 +64,15 @@ def max_dataframe( numeric_only=numeric_only, combine_size=combine_size, output_types=[OutputType.series], - use_inf_as_na=use_inf_as_na, method=method, ) return op(df) def max_index(df, axis=None, skipna=True): - use_inf_as_na = options.dataframe.mode.use_inf_as_na op = DataFrameMax( axis=axis, skipna=skipna, output_types=[OutputType.scalar], - use_inf_as_na=use_inf_as_na, ) return op(df) diff --git a/python/xorbits/_mars/dataframe/reduction/mean.py b/python/xorbits/_mars/dataframe/reduction/mean.py index e471e66ae..4d22d608a 100644 --- a/python/xorbits/_mars/dataframe/reduction/mean.py +++ b/python/xorbits/_mars/dataframe/reduction/mean.py @@ -14,7 +14,6 @@ # limitations under the License. from ... import opcodes as OperandDef -from ...config import options from ...core import OutputType from .core import DataFrameReductionMixin, DataFrameReductionOperand @@ -33,15 +32,21 @@ def mean(x): return mean -def mean_series(df, axis=None, skipna=True, level=None, combine_size=None, method=None): - use_inf_as_na = options.dataframe.mode.use_inf_as_na +def mean_series( + df, + axis=None, + skipna=True, + level=None, + combine_size=None, + method=None, + **kwargs, # kwargs for compatibility with numpy reductions +): op = DataFrameMean( axis=axis, skipna=skipna, level=level, combine_size=combine_size, output_types=[OutputType.scalar], - use_inf_as_na=use_inf_as_na, method=method, ) return op(df) @@ -55,8 +60,8 @@ def mean_dataframe( numeric_only=None, combine_size=None, method=None, + **kwargs, # kwargs for compatibility with numpy reductions ): - use_inf_as_na = options.dataframe.mode.use_inf_as_na op = DataFrameMean( axis=axis, skipna=skipna, @@ -64,7 +69,6 @@ def mean_dataframe( numeric_only=numeric_only, combine_size=combine_size, output_types=[OutputType.series], - use_inf_as_na=use_inf_as_na, method=method, ) return op(df) diff --git a/python/xorbits/_mars/dataframe/reduction/min.py b/python/xorbits/_mars/dataframe/reduction/min.py index d514c43b7..0bcb91634 100644 --- a/python/xorbits/_mars/dataframe/reduction/min.py +++ b/python/xorbits/_mars/dataframe/reduction/min.py @@ -14,7 +14,6 @@ # limitations under the License. from ...
import opcodes as OperandDef -from ...config import options from ...core import OutputType from .core import DataFrameReductionMixin, DataFrameReductionOperand @@ -28,15 +27,21 @@ def is_atomic(self): return True -def min_series(df, axis=None, skipna=True, level=None, combine_size=None, method=None): - use_inf_as_na = options.dataframe.mode.use_inf_as_na +def min_series( + df, + axis=None, + skipna=True, + level=None, + combine_size=None, + method=None, + **kwargs, # kwargs for compatibility with numpy reductions +): op = DataFrameMin( axis=axis, skipna=skipna, level=level, combine_size=combine_size, output_types=[OutputType.scalar], - use_inf_as_na=use_inf_as_na, method=method, ) return op(df) @@ -50,8 +55,8 @@ def min_dataframe( numeric_only=None, combine_size=None, method=None, + **kwargs, # kwargs for compatibility with numpy reductions ): - use_inf_as_na = options.dataframe.mode.use_inf_as_na op = DataFrameMin( axis=axis, skipna=skipna, @@ -59,18 +64,15 @@ def min_dataframe( numeric_only=numeric_only, combine_size=combine_size, output_types=[OutputType.series], - use_inf_as_na=use_inf_as_na, method=method, ) return op(df) def min_index(df, axis=None, skipna=True): - use_inf_as_na = options.dataframe.mode.use_inf_as_na op = DataFrameMin( axis=axis, skipna=skipna, output_types=[OutputType.scalar], - use_inf_as_na=use_inf_as_na, ) return op(df) diff --git a/python/xorbits/_mars/dataframe/reduction/prod.py b/python/xorbits/_mars/dataframe/reduction/prod.py index c84b1ce3b..217c34994 100644 --- a/python/xorbits/_mars/dataframe/reduction/prod.py +++ b/python/xorbits/_mars/dataframe/reduction/prod.py @@ -16,7 +16,6 @@ import numpy as np from ... import opcodes -from ...config import options from ...core import OutputType from .aggregation import where_function from .core import DataFrameReductionMixin, DataFrameReductionOperand @@ -48,7 +47,6 @@ def prod(value): def prod_series( df, axis=None, skipna=True, level=None, min_count=0, combine_size=None, method=None ): - use_inf_as_na = options.dataframe.mode.use_inf_as_na op = DataFrameProd( axis=axis, skipna=skipna, @@ -56,7 +54,6 @@ def prod_series( min_count=min_count, combine_size=combine_size, output_types=[OutputType.scalar], - use_inf_as_na=use_inf_as_na, method=method, ) return op(df) @@ -72,7 +69,6 @@ def prod_dataframe( combine_size=None, method=None, ): - use_inf_as_na = options.dataframe.mode.use_inf_as_na op = DataFrameProd( axis=axis, skipna=skipna, @@ -81,7 +77,6 @@ def prod_dataframe( numeric_only=numeric_only, combine_size=combine_size, output_types=[OutputType.series], - use_inf_as_na=use_inf_as_na, method=method, ) return op(df) diff --git a/python/xorbits/_mars/dataframe/reduction/sem.py b/python/xorbits/_mars/dataframe/reduction/sem.py index 5c6d274ed..f434c4e9b 100644 --- a/python/xorbits/_mars/dataframe/reduction/sem.py +++ b/python/xorbits/_mars/dataframe/reduction/sem.py @@ -14,7 +14,6 @@ # limitations under the License. from ...
import opcodes as OperandDef -from ...config import options from ...core import OutputType from ...serialization.serializables import Int32Field from .core import DataFrameReductionMixin, DataFrameReductionOperand @@ -48,7 +47,6 @@ def sem(x): def sem_series( series, axis=None, skipna=True, level=None, ddof=1, combine_size=None, method=None ): - use_inf_as_na = options.dataframe.mode.use_inf_as_na op = DataFrameSem( axis=axis, skipna=skipna, @@ -56,7 +54,6 @@ def sem_series( ddof=ddof, combine_size=combine_size, output_types=[OutputType.scalar], - use_inf_as_na=use_inf_as_na, method=method, ) return op(series) @@ -72,7 +69,6 @@ def sem_dataframe( combine_size=None, method=None, ): - use_inf_as_na = options.dataframe.mode.use_inf_as_na op = DataFrameSem( axis=axis, skipna=skipna, @@ -81,7 +77,6 @@ def sem_dataframe( numeric_only=numeric_only, combine_size=combine_size, output_types=[OutputType.series], - use_inf_as_na=use_inf_as_na, method=method, ) return op(df) diff --git a/python/xorbits/_mars/dataframe/reduction/skew.py b/python/xorbits/_mars/dataframe/reduction/skew.py index c7e16a5a1..7b9ec6eae 100644 --- a/python/xorbits/_mars/dataframe/reduction/skew.py +++ b/python/xorbits/_mars/dataframe/reduction/skew.py @@ -16,7 +16,6 @@ import numpy as np from ... import opcodes -from ...config import options from ...core import ENTITY_TYPE, OutputType from ...serialization.serializables import BoolField from .core import DataFrameReductionMixin, DataFrameReductionOperand @@ -68,7 +67,6 @@ def skew(x): def skew_series( df, axis=None, skipna=True, level=None, combine_size=None, bias=False, method=None ): - use_inf_as_na = options.dataframe.mode.use_inf_as_na op = DataFrameSkew( axis=axis, skipna=skipna, @@ -76,7 +74,6 @@ def skew_series( combine_size=combine_size, bias=bias, output_types=[OutputType.scalar], - use_inf_as_na=use_inf_as_na, method=method, ) return op(df) @@ -92,7 +89,6 @@ def skew_dataframe( bias=False, method=None, ): - use_inf_as_na = options.dataframe.mode.use_inf_as_na op = DataFrameSkew( axis=axis, skipna=skipna, @@ -101,7 +97,6 @@ def skew_dataframe( bias=bias, combine_size=combine_size, output_types=[OutputType.series], - use_inf_as_na=use_inf_as_na, method=method, ) return op(df) diff --git a/python/xorbits/_mars/dataframe/reduction/sum.py b/python/xorbits/_mars/dataframe/reduction/sum.py index 1b3a4dd7b..ff8c81b6c 100644 --- a/python/xorbits/_mars/dataframe/reduction/sum.py +++ b/python/xorbits/_mars/dataframe/reduction/sum.py @@ -16,7 +16,6 @@ import numpy as np from ... 
import opcodes -from ...config import options from ...core import OutputType from .core import DataFrameReductionMixin, DataFrameReductionOperand @@ -47,9 +46,15 @@ def sum_(value): def sum_series( - df, axis=None, skipna=True, level=None, min_count=0, combine_size=None, method=None + df, + axis=None, + skipna=True, + level=None, + min_count=0, + combine_size=None, + method=None, + **kwargs, # kwargs for compatibility with numpy reductions ): - use_inf_as_na = options.dataframe.mode.use_inf_as_na op = DataFrameSum( axis=axis, skipna=skipna, @@ -57,7 +62,6 @@ def sum_series( min_count=min_count, combine_size=combine_size, output_types=[OutputType.scalar], - use_inf_as_na=use_inf_as_na, method=method, ) return op(df) @@ -72,8 +76,8 @@ def sum_dataframe( numeric_only=None, combine_size=None, method=None, + **kwargs, # kwargs for compatibility with numpy reductions ): - use_inf_as_na = options.dataframe.mode.use_inf_as_na op = DataFrameSum( axis=axis, skipna=skipna, @@ -82,7 +86,6 @@ def sum_dataframe( numeric_only=numeric_only, combine_size=combine_size, output_types=[OutputType.series], - use_inf_as_na=use_inf_as_na, method=method, ) return op(df) diff --git a/python/xorbits/_mars/dataframe/reduction/tests/test_reduction_execution.py b/python/xorbits/_mars/dataframe/reduction/tests/test_reduction_execution.py index cfcd70192..53b121805 100644 --- a/python/xorbits/_mars/dataframe/reduction/tests/test_reduction_execution.py +++ b/python/xorbits/_mars/dataframe/reduction/tests/test_reduction_execution.py @@ -771,6 +771,37 @@ def test_dataframe_aggregate(setup, check_ref_counts): ] data = pd.DataFrame(np.random.rand(20, 20)) + def realized_volatility(series): + print(series) + return np.sqrt(np.sum(series**2)) + + df = md.DataFrame(data) + result = df.agg(realized_volatility) + pd.testing.assert_series_equal( + result.execute().fetch(), data.agg(realized_volatility) + ) + + def trip_type(x): + return np.min(x) + + df = md.DataFrame(data) + result = df.agg(trip_type) + pd.testing.assert_series_equal(result.execute().fetch(), data.agg(trip_type)) + + def trip_type_max(x): + return np.max(x) + + df = md.DataFrame(data) + result = df.agg(trip_type_max) + pd.testing.assert_series_equal(result.execute().fetch(), data.agg(trip_type_max)) + + def trip_type_mean(x): + return np.mean(x) + + df = md.DataFrame(data) + result = df.agg(trip_type_mean) + pd.testing.assert_series_equal(result.execute().fetch(), data.agg(trip_type_mean)) + df = md.DataFrame(data) result = df.agg(all_aggs) pd.testing.assert_frame_equal(result.execute().fetch(), data.agg(all_aggs)) @@ -1138,3 +1169,34 @@ def g3(x): s.agg((g1, g2, g3)), ms.agg((g1, g2, g3)).execute().fetch() ) pd.testing.assert_series_equal(s.agg((g1, g1)), ms.agg((g1, g1)).execute().fetch()) + + +@pytest.mark.parametrize("chunk_size", [None, 1, 5, 10]) +def test_agg_with_kwargs(setup, chunk_size): + rs = np.random.RandomState(0) + df = pd.DataFrame( + { + "a": rs.choice([1, 3, 8], size=100), + "b": rs.choice([201.8, 155.7, 95.7], size=100), + "c": rs.choice([1, np.nan, 3], size=100), + }, + ) + mdf = md.DataFrame(df, chunk_size=chunk_size) + res = mdf.agg(a=("a", "sum")) + pd.testing.assert_frame_equal(res.execute().fetch(), df.agg(a=("a", "sum"))) + + res = mdf.agg(x=("a", "sum"), y=("b", "mean")) + pd.testing.assert_frame_equal( + res.execute().fetch(), df.agg(x=("a", "sum"), y=("b", "mean")) + ) + + res = mdf.agg(x=("a", "mean"), y=("c", sum)) + pd.testing.assert_frame_equal( + res.execute().fetch(), df.agg(x=("a", "mean"), y=("c", sum)) + ) + + def g(x): + return
x.sum() - (x * 3).sum() + + res = mdf.agg(g=("b", g)) + pd.testing.assert_frame_equal(res.execute().fetch(), df.agg(g=("b", g))) diff --git a/python/xorbits/_mars/dataframe/reduction/var.py b/python/xorbits/_mars/dataframe/reduction/var.py index e59d5fa23..f7e3acbf4 100644 --- a/python/xorbits/_mars/dataframe/reduction/var.py +++ b/python/xorbits/_mars/dataframe/reduction/var.py @@ -14,7 +14,6 @@ # limitations under the License. from ... import opcodes as OperandDef -from ...config import options from ...core import OutputType from ...serialization.serializables import Int32Field from .core import DataFrameReductionMixin, DataFrameReductionOperand @@ -51,7 +50,6 @@ def var(x): def var_series( series, axis=None, skipna=True, level=None, ddof=1, combine_size=None, method=None ): - use_inf_as_na = options.dataframe.mode.use_inf_as_na op = DataFrameVar( axis=axis, skipna=skipna, @@ -59,7 +57,6 @@ def var_series( ddof=ddof, combine_size=combine_size, output_types=[OutputType.scalar], - use_inf_as_na=use_inf_as_na, method=method, ) return op(series) @@ -75,7 +72,6 @@ def var_dataframe( combine_size=None, method=None, ): - use_inf_as_na = options.dataframe.mode.use_inf_as_na op = DataFrameVar( axis=axis, skipna=skipna, @@ -84,7 +80,6 @@ def var_dataframe( numeric_only=numeric_only, combine_size=combine_size, output_types=[OutputType.series], - use_inf_as_na=use_inf_as_na, method=method, ) return op(df) diff --git a/python/xorbits/_mars/dataframe/tests/test_core.py b/python/xorbits/_mars/dataframe/tests/test_core.py index af061ff4f..2ca4b0777 100644 --- a/python/xorbits/_mars/dataframe/tests/test_core.py +++ b/python/xorbits/_mars/dataframe/tests/test_core.py @@ -442,3 +442,27 @@ def test_mars_tensor_magic(setup): np.testing.assert_array_equal(expected, actual) with pytest.raises(ValueError, match="could not convert string to float"): DataFrame(expected).__mars_tensor__(dtype="float64").execute() + + +def test_series_and_index_array(setup): + data = np.random.rand(10) + series = Series(data).execute() + + array = np.array(series) + np.testing.assert_array_equal(array, data) + + df = pd.DataFrame({"a": [1, 2], "b": ["foo", "bar"]}) + xdf = DataFrame(df) + index = xdf.index.execute() + np.testing.assert_array_equal(np.array(df.index), np.array(index)) + + arrays = [ + ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"], + ["one", "two", "one", "two", "one", "two", "one", "two"], + ] + tuples = list(zip(*arrays)) + index = pd.MultiIndex.from_tuples(tuples, names=["first", "second"]) + s = pd.Series(np.random.randn(8), index=index) + xs = Series(s).index.execute() + + np.testing.assert_array_equal(np.array(s.index), np.array(xs)) diff --git a/python/xorbits/_mars/deploy/oscar/base_config.yml b/python/xorbits/_mars/deploy/oscar/base_config.yml index 99754f1bf..51305f766 100644 --- a/python/xorbits/_mars/deploy/oscar/base_config.yml +++ b/python/xorbits/_mars/deploy/oscar/base_config.yml @@ -56,6 +56,15 @@ scheduling: # Max number of concurrent speculative run for a subtask. 
max_concurrent_run: 3 subtask_cancel_timeout: 5 + stage_monitor: + enable_check: false + refresh_time: 3 + prepare_data_timeout: 300 + request_quota_timeout: 300 + acquire_slot_timeout: 300 + execution_timeout: null + release_slot_timeout: 300 + finish_timeout: 300 metrics: backend: console # If backend is prometheus, then we can add prometheus config as follows: diff --git a/python/xorbits/_mars/deploy/oscar/session.py b/python/xorbits/_mars/deploy/oscar/session.py index 2fe2d10dd..8e1ac5b09 100644 --- a/python/xorbits/_mars/deploy/oscar/session.py +++ b/python/xorbits/_mars/deploy/oscar/session.py @@ -502,6 +502,17 @@ def decref(self, *tileables_keys): Tileables' keys """ + @abstractmethod + def incref(self, *tileables_keys): + """ + Incref tileables. + + Parameters + ---------- + tileables_keys : list + Tileables' keys + """ + @abstractmethod def _get_ref_counts(self) -> Dict[str, int]: """ @@ -960,10 +971,19 @@ async def execute(self, *tileables, **kwargs) -> ExecutionInfo: def _get_to_fetch_tileable( self, tileable: TileableType ) -> Tuple[TileableType, List[Union[slice, Integral]]]: - from ...dataframe.indexing.iloc import DataFrameIlocGetItem, SeriesIlocGetItem + from ...dataframe.indexing.iloc import ( + DataFrameIlocGetItem, + IndexIlocGetItem, + SeriesIlocGetItem, + ) from ...tensor.indexing import TensorIndex - slice_op_types = TensorIndex, DataFrameIlocGetItem, SeriesIlocGetItem + slice_op_types = ( + TensorIndex, + DataFrameIlocGetItem, + SeriesIlocGetItem, + IndexIlocGetItem, + ) if hasattr(tileable, "data"): tileable = tileable.data @@ -1200,6 +1220,10 @@ async def decref(self, *tileable_keys): logger.debug("Decref tileables on client: %s", tileable_keys) return await self._lifecycle_api.decref_tileables(list(tileable_keys)) + async def incref(self, *tileable_keys): + logger.debug("Incref tileables on client: %s", tileable_keys) + return await self._lifecycle_api.incref_tileables(list(tileable_keys)) + async def _get_ref_counts(self) -> Dict[str, int]: return await self._lifecycle_api.get_all_chunk_ref_counts() @@ -1623,6 +1647,11 @@ def fetch_infos(self, *tileables, fields, **kwargs) -> list: def decref(self, *tileables_keys): pass # pragma: no cover + @implements(AbstractSyncSession.incref) + @_delegate_to_isolated_session + def incref(self, *tileables_keys): + pass # pragma: no cover + @implements(AbstractSyncSession._get_ref_counts) @_delegate_to_isolated_session def _get_ref_counts(self) -> Dict[str, int]: diff --git a/python/xorbits/_mars/learn/contrib/lightgbm/tests/test_classifier.py b/python/xorbits/_mars/learn/contrib/lightgbm/tests/test_classifier.py index 7e226b786..d060e5dd7 100644 --- a/python/xorbits/_mars/learn/contrib/lightgbm/tests/test_classifier.py +++ b/python/xorbits/_mars/learn/contrib/lightgbm/tests/test_classifier.py @@ -156,7 +156,7 @@ def test_local_classifier_from_to_parquet(setup): df.iloc[:500].to_parquet(os.path.join(d, "data", "data1.parquet")) df.iloc[500:].to_parquet(os.path.join(d, "data", "data2.parquet")) - df = md.read_parquet(data_dir) + df = md.read_parquet(data_dir, use_arrow_dtype=False) model = LGBMClassifier() model.load_model(classifier) result = model.predict(df, run=False) @@ -164,7 +164,12 @@ def test_local_classifier_from_to_parquet(setup): r.execute() - ret = md.read_parquet(result_dir).to_pandas().iloc[:, 0].to_numpy() + ret = ( + md.read_parquet(result_dir, use_arrow_dtype=False) + .to_pandas() + .iloc[:, 0] + .to_numpy() + ) expected = classifier.predict(X) expected = np.stack([1 - expected, expected]).argmax(axis=0) 
np.testing.assert_array_equal(ret, expected) diff --git a/python/xorbits/_mars/learn/contrib/xgboost/tests/test_classifier.py b/python/xorbits/_mars/learn/contrib/xgboost/tests/test_classifier.py index e386ee106..98f16ef2a 100644 --- a/python/xorbits/_mars/learn/contrib/xgboost/tests/test_classifier.py +++ b/python/xorbits/_mars/learn/contrib/xgboost/tests/test_classifier.py @@ -151,7 +151,7 @@ def test_local_classifier_from_to_parquet(setup): df.iloc[:500].to_parquet(os.path.join(d, "data", "data1.parquet")) df.iloc[500:].to_parquet(os.path.join(d, "data", "data2.parquet")) - df = md.read_parquet(data_dir).set_index("id") + df = md.read_parquet(data_dir, use_arrow_dtype=False).set_index("id") model = XGBClassifier() model.load_model(m_name) result = model.predict(df, run=False) @@ -160,7 +160,12 @@ def test_local_classifier_from_to_parquet(setup): # tiles to ensure no iterative tiling exists r.execute() - ret = md.read_parquet(result_dir).to_pandas().iloc[:, 0].to_numpy() + ret = ( + md.read_parquet(result_dir, use_arrow_dtype=False) + .to_pandas() + .iloc[:, 0] + .to_numpy() + ) model2 = xgboost.XGBClassifier() model2.load_model(m_name) expected = model2.predict(X) diff --git a/python/xorbits/_mars/learn/linear_model/_base.py b/python/xorbits/_mars/learn/linear_model/_base.py index bf1e27b82..0b6b9510e 100644 --- a/python/xorbits/_mars/learn/linear_model/_base.py +++ b/python/xorbits/_mars/learn/linear_model/_base.py @@ -302,7 +302,7 @@ def fit(self, X, y, sample_weight=None): self.coef_.execute() except LinAlgError: # TODO: implement linalg.lstsq first - raise NotImplementedError("Does not support sigular matrix!") + raise NotImplementedError("Does not support singular matrix!") if y.ndim == 1: self.coef_ = mt.ravel(self.coef_) diff --git a/python/xorbits/_mars/learn/linear_model/tests/test_base.py b/python/xorbits/_mars/learn/linear_model/tests/test_base.py index eaf32f0ef..942a5c342 100644 --- a/python/xorbits/_mars/learn/linear_model/tests/test_base.py +++ b/python/xorbits/_mars/learn/linear_model/tests/test_base.py @@ -52,7 +52,7 @@ def test_linear_regression(setup): assert_array_almost_equal(reg.predict(X), model.predict(X)) # Regular model fitting, #samples <= 2, # features < 2 - error_msg = re.escape("Does not support sigular matrix!") + error_msg = re.escape("Does not support singular matrix!") X = [[1], [2]] Y = [1, 2] @@ -68,7 +68,7 @@ def test_linear_regression(setup): assert_array_almost_equal(reg.predict(X), model.predict(X)) # Extra case #1: singular matrix, degenerate input - error_msg = re.escape("Does not support sigular matrix!") + error_msg = re.escape("Does not support singular matrix!") X = [[1]] Y = [0] diff --git a/python/xorbits/_mars/learn/metrics/pairwise/pairwise.py b/python/xorbits/_mars/learn/metrics/pairwise/pairwise.py index 8353cff39..f1e650f70 100644 --- a/python/xorbits/_mars/learn/metrics/pairwise/pairwise.py +++ b/python/xorbits/_mars/learn/metrics/pairwise/pairwise.py @@ -72,7 +72,7 @@ "precomputed": None, # HACK: precomputed is always allowed, never called } -# These distances recquire boolean tensors, when using mars.tensor.spatial.distance +# These distances require boolean tensors, when using mars.tensor.spatial.distance PAIRWISE_BOOLEAN_FUNCTIONS = [ "dice", "jaccard", diff --git a/python/xorbits/_mars/learn/neighbors/base.py b/python/xorbits/_mars/learn/neighbors/base.py index b1f1b21f4..408dd9c8e 100644 --- a/python/xorbits/_mars/learn/neighbors/base.py +++ b/python/xorbits/_mars/learn/neighbors/base.py @@ -24,7 +24,6 @@ from ..metrics import 
pairwise_distances_topk from ..metrics.pairwise import PAIRWISE_DISTANCE_FUNCTIONS from ..utils import check_array -from ..utils.core import sklearn_version from ..utils.validation import check_is_fitted from ._ball_tree import SklearnBallTree, ball_tree_query, create_ball_tree from ._faiss import METRIC_TO_FAISS_METRIC_TYPE, build_faiss_index, faiss_query @@ -33,12 +32,8 @@ from ._proxima import METRIC_TO_PROXIMA_METRIC_TYPE, build_proxima_index, proxima_query VALID_METRICS = dict( - ball_tree=SklearnBallTree.valid_metrics() - if sklearn_version() >= "1.3.0" - else SklearnBallTree.valid_metrics, - kd_tree=SklearnKDTree.valid_metrics() - if sklearn_version() >= "1.3.0" - else SklearnKDTree.valid_metrics, + ball_tree=SklearnBallTree.valid_metrics, + kd_tree=SklearnKDTree.valid_metrics, # The following list comes from the # sklearn.metrics.pairwise doc string brute=( diff --git a/python/xorbits/_mars/optimization/logical/tileable/column_pruning/column_pruning_rule.py b/python/xorbits/_mars/optimization/logical/tileable/column_pruning/column_pruning_rule.py index a90544a61..9b7c77e8e 100644 --- a/python/xorbits/_mars/optimization/logical/tileable/column_pruning/column_pruning_rule.py +++ b/python/xorbits/_mars/optimization/logical/tileable/column_pruning/column_pruning_rule.py @@ -55,9 +55,17 @@ def _get_successor_required_columns(self, data: TileableData) -> Set[Any]: """ successors = self._get_successors(data) if successors: - return set().union( + res = set().union( *[self._context[successor][data] for successor in successors] ) + # When getting the required columns of a DataFrameIndex node, we also need to consider the node's own columns. + if ( + isinstance(data, BaseDataFrameData) + and isinstance(data.op, DataFrameIndex) + and len(data.dtypes) > 0 + ): + res = res.union(set(data.dtypes.index)) + return res else: return self._get_all_columns(data) diff --git a/python/xorbits/_mars/optimization/logical/tileable/column_pruning/input_column_selector.py b/python/xorbits/_mars/optimization/logical/tileable/column_pruning/input_column_selector.py index dda97f7f0..97e866e95 100644 --- a/python/xorbits/_mars/optimization/logical/tileable/column_pruning/input_column_selector.py +++ b/python/xorbits/_mars/optimization/logical/tileable/column_pruning/input_column_selector.py @@ -164,9 +164,10 @@ def df_groupby_agg_select_function( ret = {} # group by a series groupby_series = False - if isinstance(by, list) and len(by) == 1 and isinstance(by[0], BaseSeriesData): + if isinstance(by, list) and all(isinstance(_by, BaseSeriesData) for _by in by): groupby_series = True - ret[by[0]] = {by[0].name} + for _by in by: + ret[_by] = {_by.name} if isinstance(inp, BaseSeriesData): ret[inp] = {inp.name} diff --git a/python/xorbits/_mars/optimization/logical/tileable/column_pruning/tests/test_column_pruning.py b/python/xorbits/_mars/optimization/logical/tileable/column_pruning/tests/test_column_pruning.py index 7158ad6f4..9e869f48b 100644 --- a/python/xorbits/_mars/optimization/logical/tileable/column_pruning/tests/test_column_pruning.py +++ b/python/xorbits/_mars/optimization/logical/tileable/column_pruning/tests/test_column_pruning.py @@ -15,6 +15,7 @@ import os import tempfile +import numpy as np import pandas as pd import pytest @@ -32,6 +33,7 @@ from ......dataframe.indexing.getitem import DataFrameIndex from ......dataframe.indexing.setitem import DataFrameSetitem from ......dataframe.merge import DataFrameMerge +from ......dataframe.utils import PD_VERSION_GREATER_THAN_2_10 from ......optimization.logical.tileable import
optimize from ......tensor.core import TensorData from ......tensor.datasource import ArrayDataSource @@ -331,6 +333,9 @@ def test_merge_then_groupby_apply(setup, gen_data2): raw1 = pd.read_parquet(file_path) raw2 = pd.read_parquet(file_path2) + if PD_VERSION_GREATER_THAN_2_10: + raw1 = raw1.convert_dtypes(dtype_backend="pyarrow") + raw2 = raw2.convert_dtypes(dtype_backend="pyarrow") expected = ( ( ((raw1 + 1) * 2).merge(raw2, left_on=["c1", "c3"], right_on=["cc2", "cc4"])[ @@ -387,6 +392,8 @@ def test_two_merges(setup, gen_data2): ] .merge(raw2, left_on=["cc1"], right_on=["cc3"]) ) + if PD_VERSION_GREATER_THAN_2_10: + expected = expected.convert_dtypes(dtype_backend="pyarrow") pd.testing.assert_frame_equal(r, expected) parquet_nodes = [n for n in graph._nodes if type(n.op) is DataFrameReadParquet] @@ -426,6 +433,8 @@ def test_two_groupby_aggs_with_multi_index(setup, gen_data2): r = c.execute().fetch() raw = pd.read_parquet(file_path) + if PD_VERSION_GREATER_THAN_2_10: + raw = raw.convert_dtypes(dtype_backend="pyarrow") expected = ( (raw * 2) .groupby(["c2", "c3"]) @@ -590,3 +599,42 @@ def test_setitem(setup, gen_data1): raw1["c5"] = raw2["c1"] expected = raw1.groupby(by="c1", as_index=False).sum()["c2"] pd.testing.assert_series_equal(r.execute().fetch(), expected) + + +def test_merge_index_groupby_agg(setup, gen_data1): + file_path, file_path2 = gen_data1 + left = md.read_csv(file_path) + right = md.read_csv(file_path2) + r = left.merge(right, on="c1") + data = r[["c1", "c2_x", "c2_y", "c4_x", "c4_y"]] + + def udf(x): + return np.sum(x) + + res = data.groupby("c1").agg({"c2_x": udf}) + + graph = res.build_graph() + optimize(graph) + + agg_node = graph.result_tileables[0] + assert isinstance(agg_node.op, DataFrameGroupByAgg) + + assert len(graph.predecessors(agg_node)) == 1 + index_node = graph.predecessors(agg_node)[0] + assert type(index_node.op) is DataFrameIndex + assert set(index_node.op.col_names) == {"c1", "c2_x"} + + index_node2 = graph.predecessors(index_node)[0] + assert type(index_node2.op) is DataFrameIndex + assert set(index_node2.op.col_names) == {"c1", "c2_x", "c2_y", "c4_x", "c4_y"} + + merge_node = graph.predecessors(index_node2)[0] + assert type(merge_node.op) is DataFrameMerge + + read_csv_node_left, read_csv_node_right = graph.predecessors(merge_node) + assert type(read_csv_node_left.op) is DataFrameReadCSV + assert type(read_csv_node_right.op) is DataFrameReadCSV + assert len(read_csv_node_left.op.usecols) == 3 + assert len(read_csv_node_right.op.usecols) == 3 + assert set(read_csv_node_left.op.usecols) == {"c1", "c2", "c4"} + assert set(read_csv_node_right.op.usecols) == {"c1", "c2", "c4"} diff --git a/python/xorbits/_mars/optimization/logical/tileable/column_pruning/tests/test_input_column_selector.py b/python/xorbits/_mars/optimization/logical/tileable/column_pruning/tests/test_input_column_selector.py index e351e339e..82cf899e8 100644 --- a/python/xorbits/_mars/optimization/logical/tileable/column_pruning/tests/test_input_column_selector.py +++ b/python/xorbits/_mars/optimization/logical/tileable/column_pruning/tests/test_input_column_selector.py @@ -116,6 +116,18 @@ def test_df_groupby_agg(): assert labels.data in input_columns assert input_columns[labels.data] == {"label"} + label1 = Series([1, 1, 1, 1], name="label1") + label2 = Series([2, 2, 3, 3], name="label2") + s = df.groupby(by=[label1, label2]).sum() + input_columns = InputColumnSelector.select(s.data, {"foo"}) + assert len(input_columns) == 3 + assert df.data in input_columns + assert 
input_columns[df.data] == {"foo"} + assert label1.data in input_columns + assert input_columns[label1.data] == {"label1"} + assert label2.data in input_columns + assert input_columns[label2.data] == {"label2"} + @pytest.mark.skip(reason="group by index is not supported yet") def test_df_groupby_index_agg(): diff --git a/python/xorbits/_mars/optimization/logical/tileable/tests/test_head.py b/python/xorbits/_mars/optimization/logical/tileable/tests/test_head.py index 0e0897c24..f43932ac9 100644 --- a/python/xorbits/_mars/optimization/logical/tileable/tests/test_head.py +++ b/python/xorbits/_mars/optimization/logical/tileable/tests/test_head.py @@ -23,6 +23,7 @@ from ..... import dataframe as md from .....core import TileableGraph, TileableGraphBuilder, enter_mode from .....dataframe.indexing.iloc import DataFrameIlocGetItem, SeriesIlocGetItem +from .....dataframe.utils import PD_VERSION_GREATER_THAN_2_10 from .. import optimize @@ -129,6 +130,8 @@ def test_read_parquet_head(prepare_data, setup): extra_config={"operand_executors": _iloc_operand_executors} ).fetch() expected = pdf.head(5) + if PD_VERSION_GREATER_THAN_2_10: + expected = expected.convert_dtypes(dtype_backend="pyarrow") pd.testing.assert_frame_equal(result, expected) diff --git a/python/xorbits/_mars/services/scheduling/api/oscar.py b/python/xorbits/_mars/services/scheduling/api/oscar.py index cb9541e35..9e2e7cac1 100644 --- a/python/xorbits/_mars/services/scheduling/api/oscar.py +++ b/python/xorbits/_mars/services/scheduling/api/oscar.py @@ -174,11 +174,18 @@ async def create(cls: Type[APIType], session_id: str, address: str) -> APIType: from .... import resource as mars_resource from ..worker import ( + StageMonitorActor, SubtaskExecutionActor, WorkerQuotaManagerActor, WorkerSlotManagerActor, ) + await mo.create_actor( + StageMonitorActor, + uid=StageMonitorActor.default_uid(), + address=address, + ) + await mo.create_actor( SubtaskExecutionActor, subtask_max_retries=0, diff --git a/python/xorbits/_mars/services/scheduling/worker/__init__.py b/python/xorbits/_mars/services/scheduling/worker/__init__.py index f43167c94..ea0785d2e 100644 --- a/python/xorbits/_mars/services/scheduling/worker/__init__.py +++ b/python/xorbits/_mars/services/scheduling/worker/__init__.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from .execution import SubtaskExecutionActor +from .execution import StageMonitorActor, SubtaskExecutionActor from .quota import MemQuotaActor, QuotaActor, WorkerQuotaManagerActor from .service import SchedulingWorkerService from .workerslot import ( diff --git a/python/xorbits/_mars/services/scheduling/worker/execution.py b/python/xorbits/_mars/services/scheduling/worker/execution.py index dfeceb80b..b90da08fc 100644 --- a/python/xorbits/_mars/services/scheduling/worker/execution.py +++ b/python/xorbits/_mars/services/scheduling/worker/execution.py @@ -19,9 +19,10 @@ import operator import pprint import sys +import time from collections import defaultdict from dataclasses import dataclass, field -from typing import Dict, List, Optional +from typing import Dict, List, Optional, Tuple import xoscar as mo from xoscar.errors import ServerClosed, XoscarError @@ -37,13 +38,93 @@ from ...cluster import ClusterAPI from ...meta import MetaAPI from ...storage import StorageAPI -from ...subtask import Subtask, SubtaskAPI, SubtaskResult, SubtaskStatus +from ...subtask import Subtask, SubtaskAPI, SubtaskResult, SubtaskStage, SubtaskStatus from ...task.task_info_collector import TaskInfoCollector from .quota import QuotaActor from .workerslot import BandSlotManagerActor logger = logging.getLogger(__name__) + +class StageMonitorActor(mo.Actor): + def __init__( + self, + monitoring_config: Dict = {}, + ): + self._records = dict() + + self._enable_check = monitoring_config.get("enable_check", False) + self._refresh_time = monitoring_config.get("refresh_time", 3) + self._kill_timeout = { + SubtaskStage.PREPARE_DATA: monitoring_config.get("prepare_data_timeout"), + SubtaskStage.REQUEST_QUOTA: monitoring_config.get("request_quota_timeout"), + SubtaskStage.ACQUIRE_SLOT: monitoring_config.get("acquire_slot_timeout"), + SubtaskStage.EXECUTE: monitoring_config.get("execution_timeout"), + SubtaskStage.RELEASE_SLOT: monitoring_config.get("release_slot_timeout"), + SubtaskStage.FINISH: monitoring_config.get("finish_timeout"), + } + self._check_task = None + + async def __post_create__(self): + await super().__post_create__() + if self._enable_check: + self._check_task = self.ref().check_subtasks.tell_delay( + delay=self._refresh_time + ) + + async def __pre_destroy__(self): + if self._enable_check: + self._check_task.cancel() + await super().__pre_destroy__() + + async def check_subtasks(self): + stale_tasks = await self.get_all_stale_tasks() + for task_key, stage in stale_tasks: + session_id, subtask_id = task_key + try: + logger.warning( + "Subtask[session_id: %s, subtask_id: %s] is timeout at stage %s", + session_id, + subtask_id, + stage, + ) + except Exception as e: + logger.error(e) + + self._check_task = self.ref().check_subtasks.tell_delay( + delay=self._refresh_time + ) + + async def get_all_stale_tasks(self): + cur_timestamp = time.time() + stale_tasks = [] + for k, v in self._records.items(): + pre_timestamp, cur_stage = v["history"][-1][0], v["history"][-1][1] + if ( + self._kill_timeout[cur_stage] is not None + and cur_timestamp - pre_timestamp >= self._kill_timeout[cur_stage] + ): + stale_tasks.append((k, cur_stage)) + return stale_tasks + + async def register_subtask(self, subtask: Subtask, supervisor_address: str): + keys = (subtask.session_id, subtask.subtask_id) + self._records[keys] = { + "subtask": subtask, + "history": [], + "supervisor_address": supervisor_address, + } + + async def report_stage(self, keys: Tuple[str, str], stage: SubtaskStage): + if stage == SubtaskStage.FINISH: + 
self._records.pop(keys) + return + self._records[keys]["history"].append((time.time(), stage)) + + async def get_records(self): + return self._records + + # the default times to run subtask. DEFAULT_SUBTASK_MAX_RETRIES = 0 @@ -168,9 +249,16 @@ def __init__( "The count of finished subtasks of the current band.", ("band",), ) + self._stat_monitor_ref = None async def __post_create__(self): self._cluster_api = await ClusterAPI.create(self.address) + self._stat_monitor_ref = await mo.actor_ref( + uid=StageMonitorActor.default_uid(), address=self.address + ) + + async def _get_stat_monitor_ref(self) -> mo.ActorRefType[StageMonitorActor]: + return await mo.actor_ref(StageMonitorActor.default_uid(), address=self.address) @alru_cache(cache_exceptions=False) async def _get_slot_manager_ref( @@ -366,6 +454,9 @@ async def internal_run_subtask(self, subtask: Subtask, band_name: str): ) try: logger.debug("Preparing data for subtask %s", subtask.subtask_id) + await self._stat_monitor_ref.report_stage( + (subtask.session_id, subtask.subtask_id), SubtaskStage.PREPARE_DATA + ) with Timer() as timer: prepare_data_task = asyncio.create_task( _retry_run( @@ -376,6 +467,7 @@ async def internal_run_subtask(self, subtask: Subtask, band_name: str): band_name, ) ) + await asyncio.wait_for( prepare_data_task, timeout=self._data_prepare_timeout ) @@ -429,19 +521,28 @@ async def _run_subtask_once(): aiotask = None slot_id = None try: + await self._stat_monitor_ref.report_stage( + (subtask.session_id, subtask.subtask_id), SubtaskStage.REQUEST_QUOTA + ) await quota_ref.request_batch_quota(batch_quota_req) self._check_cancelling(subtask_info) - + await self._stat_monitor_ref.report_stage( + (subtask.session_id, subtask.subtask_id), SubtaskStage.ACQUIRE_SLOT + ) slot_id = await slot_manager_ref.acquire_free_slot( (subtask.session_id, subtask.subtask_id) ) subtask_info.slot_id = slot_id self._check_cancelling(subtask_info) + await self._stat_monitor_ref.report_stage( + (subtask.session_id, subtask.subtask_id), SubtaskStage.EXECUTE + ) subtask_info.result.status = SubtaskStatus.running aiotask = asyncio.create_task( subtask_api.run_subtask_in_slot(band_name, slot_id, subtask) ) + return await asyncio.shield(aiotask) except asyncio.CancelledError as ex: try: @@ -502,6 +603,10 @@ async def _run_subtask_once(): await slot_manager_ref.release_free_slot( slot_id, (subtask.session_id, subtask.subtask_id) ) + await self._stat_monitor_ref.report_stage( + (subtask.session_id, subtask.subtask_id), + SubtaskStage.RELEASE_SLOT, + ) logger.debug( "Released slot %d for subtask %s", slot_id, subtask.subtask_id ) @@ -541,6 +646,9 @@ async def run_subtask( logger.debug( "Start to schedule subtask %s on %s.", subtask.subtask_id, self.address ) + + await self._stat_monitor_ref.register_subtask(subtask, supervisor_address) + self._submitted_subtask_count.record(1, {"band": self.address}) with mo.debug.no_message_trace(): task = asyncio.create_task( @@ -564,6 +672,9 @@ async def run_subtask( self._subtask_info.pop(subtask.subtask_id, None) self._finished_subtask_count.record(1, {"band": self.address}) logger.debug("Subtask %s finished with result %s", subtask.subtask_id, result) + await self._stat_monitor_ref.report_stage( + (subtask.session_id, subtask.subtask_id), SubtaskStage.FINISH + ) return result async def cancel_subtask(self, subtask_id: str, kill_timeout: Optional[int] = 5): diff --git a/python/xorbits/_mars/services/scheduling/worker/quota.py b/python/xorbits/_mars/services/scheduling/worker/quota.py index 430afe133..c6ab7853a 100644 
--- a/python/xorbits/_mars/services/scheduling/worker/quota.py +++ b/python/xorbits/_mars/services/scheduling/worker/quota.py @@ -307,9 +307,15 @@ def __init__( self._stat_refresh_task = None self._slot_manager_ref = None + self._stat_monitor_ref = None async def __post_create__(self): await super().__post_create__() + from .execution import StageMonitorActor + + self._stat_monitor_ref = await mo.actor_ref( + uid=StageMonitorActor.default_uid(), address=self.address + ) self._stat_refresh_task = self.ref().update_mem_stats.tell_delay( delay=self._refresh_time ) @@ -332,7 +338,7 @@ async def update_mem_stats(self): """ cur_mem_available = mars_resource.virtual_memory().available if cur_mem_available > self._last_memory_available: - # memory usage reduced: try reallocate existing requests + # memory usage reduced: try to reallocate existing requests await self._process_requests() self._last_memory_available = cur_mem_available self._report_quota_info() diff --git a/python/xorbits/_mars/services/scheduling/worker/service.py b/python/xorbits/_mars/services/scheduling/worker/service.py index a5fad5cc1..0d12fab05 100644 --- a/python/xorbits/_mars/services/scheduling/worker/service.py +++ b/python/xorbits/_mars/services/scheduling/worker/service.py @@ -17,7 +17,11 @@ from ....utils import calc_size_by_str from ...core import AbstractService -from .execution import DEFAULT_SUBTASK_MAX_RETRIES, SubtaskExecutionActor +from .execution import ( + DEFAULT_SUBTASK_MAX_RETRIES, + StageMonitorActor, + SubtaskExecutionActor, +) from .quota import WorkerQuotaManagerActor from .workerslot import WorkerSlotManagerActor @@ -58,6 +62,12 @@ async def start(self): ) data_prepare_timeout = scheduling_config.get("data_prepare_timeout", 600) + await mo.create_actor( + StageMonitorActor, + monitoring_config=scheduling_config.get("stage_monitor", {}), + uid=StageMonitorActor.default_uid(), + address=address, + ) await mo.create_actor( WorkerSlotManagerActor, uid=WorkerSlotManagerActor.default_uid(), @@ -100,3 +110,6 @@ async def stop(self): uid=WorkerSlotManagerActor.default_uid(), address=address ) ) + await mo.destroy_actor( + mo.create_actor_ref(uid=StageMonitorActor.default_uid(), address=address) + ) diff --git a/python/xorbits/_mars/services/scheduling/worker/tests/test_execution.py b/python/xorbits/_mars/services/scheduling/worker/tests/test_execution.py index 53f05f2ce..e174598c7 100644 --- a/python/xorbits/_mars/services/scheduling/worker/tests/test_execution.py +++ b/python/xorbits/_mars/services/scheduling/worker/tests/test_execution.py @@ -54,7 +54,12 @@ from ....task.supervisor.manager import TaskManagerActor from ....task.task_info_collector import TaskInfoCollectorActor from ...supervisor import GlobalResourceManagerActor -from ...worker import BandSlotManagerActor, QuotaActor, SubtaskExecutionActor +from ...worker import ( + BandSlotManagerActor, + QuotaActor, + StageMonitorActor, + SubtaskExecutionActor, +) class CancelDetectActorMixin: @@ -158,7 +163,7 @@ def collect_task_info_enabled(self): @pytest.fixture async def actor_pool(request): - n_slots, enable_kill = request.param + n_slots, enable_kill, enable_stage_check = request.param pool = await create_actor_pool( "127.0.0.1", labels=[None] + ["numa-0"] * n_slots, n_process=n_slots ) @@ -181,7 +186,18 @@ async def actor_pool(request): pool.external_address, storage_handler_cls=MockStorageHandlerActor, ) - + # create monitor actor + monitor_ref = await mo.create_actor( + StageMonitorActor, + monitoring_config={ + "enable_check": True, + 
"execution_timeout": 5, + } + if enable_stage_check + else {}, + uid=StageMonitorActor.default_uid(), + address=pool.external_address, + ) # create assigner actor execution_ref = await mo.create_actor( SubtaskExecutionActor, @@ -230,6 +246,7 @@ async def actor_pool(request): try: yield pool, session_id, meta_api, worker_meta_api, storage_api, execution_ref finally: + await mo.destroy_actor(monitor_ref) await mo.destroy_actor(task_manager_ref) await mo.destroy_actor(band_slot_ref) await mo.destroy_actor(global_resource_ref) @@ -242,7 +259,7 @@ async def actor_pool(request): @pytest.mark.asyncio -@pytest.mark.parametrize("actor_pool", [(1, True)], indirect=True) +@pytest.mark.parametrize("actor_pool", [(1, True, False)], indirect=True) async def test_execute_tensor(actor_pool): pool, session_id, meta_api, worker_meta_api, storage_api, execution_ref = actor_pool @@ -323,7 +340,7 @@ async def test_execute_tensor(actor_pool): @pytest.mark.asyncio @pytest.mark.parametrize( "actor_pool,cancel_phase", - [((1, True), phase) for phase in _cancel_phases], + [((1, True, False), phase) for phase in _cancel_phases], indirect=["actor_pool"], ) async def test_execute_with_cancel(actor_pool, cancel_phase): @@ -427,7 +444,7 @@ def delay_fun(delay, _inp1): @pytest.mark.asyncio -@pytest.mark.parametrize("actor_pool", [(1, True)], indirect=True) +@pytest.mark.parametrize("actor_pool", [(1, True, False)], indirect=True) async def test_execute_with_pure_deps(actor_pool): pool, session_id, meta_api, worker_meta_api, storage_api, execution_ref = actor_pool @@ -508,7 +525,7 @@ def test_estimate_size(): @pytest.mark.asyncio -@pytest.mark.parametrize("actor_pool", [(1, False)], indirect=True) +@pytest.mark.parametrize("actor_pool", [(1, False, False)], indirect=True) async def test_cancel_without_kill(actor_pool): pool, session_id, meta_api, worker_meta_api, storage_api, execution_ref = actor_pool executed_file = os.path.join( @@ -611,3 +628,68 @@ def test_fetch_data_from_both_cpu_and_gpu(data_type, chunked, setup_gpu): pd.testing.assert_frame_equal(expected, actual.execute().fetch(to_cpu=True)) else: pd.testing.assert_series_equal(expected, actual.execute().fetch(to_cpu=True)) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("actor_pool", [(1, True, False)], indirect=True) +async def test_stage_monitor_actor(actor_pool): + pool, session_id, meta_api, worker_meta_api, storage_api, execution_ref = actor_pool + subtask_id = f"test_subtask_{uuid.uuid4()}" + subtask = Subtask( + subtask_id=subtask_id, + session_id=session_id, + task_id=f"test_task_{uuid.uuid4()}", + # chunk_graph=chunk_graph, + ) + + monitor_ref = await mo.actor_ref( + StageMonitorActor.default_uid(), address=pool.external_address + ) + await asyncio.wait_for( + execution_ref.run_subtask(subtask, "numa-0", pool.external_address), timeout=30 + ) + + stale_tasks = await monitor_ref.get_all_stale_tasks() + assert len(stale_tasks) == 0 + + # task has been finished + records = await monitor_ref.get_records() + assert len(records) == 0 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("actor_pool", [(1, True, True)], indirect=True) +async def test_terminate_stale_tasks(actor_pool, caplog): + pool, session_id, meta_api, worker_meta_api, storage_api, execution_ref = actor_pool + + def delay_fun(delay): + time.sleep(delay) + return delay + + remote_result = RemoteFunction( + function=delay_fun, function_args=[10], function_kwargs={} + ).new_chunk([]) + chunk_graph = ChunkGraph([remote_result]) + chunk_graph.add_node(remote_result) + + subtask = Subtask( + 
f"test_subtask_{uuid.uuid4()}", + session_id=session_id, + task_id=f"test_task_{uuid.uuid4()}", + chunk_graph=chunk_graph, + ) + + with Timer() as timer: + aiotask = asyncio.create_task( + execution_ref.run_subtask(subtask, "numa-0", pool.external_address) + ) + + r = await asyncio.wait_for(aiotask, timeout=20) + assert r.status == SubtaskStatus.succeeded + + assert 5 < timer.duration < 20 + + import re + + match = re.search(r"Subtask\[.*?\].*stage.*", caplog.text) + assert match is not None diff --git a/python/xorbits/_mars/services/scheduling/worker/tests/test_quota.py b/python/xorbits/_mars/services/scheduling/worker/tests/test_quota.py index 99a406f91..bd8b6c5e7 100644 --- a/python/xorbits/_mars/services/scheduling/worker/tests/test_quota.py +++ b/python/xorbits/_mars/services/scheduling/worker/tests/test_quota.py @@ -25,6 +25,7 @@ from .....tests.core import mock from .....utils import get_next_port from ...worker import BandSlotManagerActor, MemQuotaActor, QuotaActor +from .. import StageMonitorActor class MockBandSlotManagerActor(mo.Actor): @@ -40,11 +41,17 @@ async def actor_pool(): start_method = ( os.environ.get("POOL_START_METHOD", "fork") if sys.platform != "win32" else None ) + # create monitor actor pool = await create_actor_pool( f"127.0.0.1:{get_next_port()}", n_process=0, subprocess_start_method=start_method, ) + await mo.create_actor( + StageMonitorActor, + uid=StageMonitorActor.default_uid(), + address=pool.external_address, + ) await pool.start() try: yield pool diff --git a/python/xorbits/_mars/services/subtask/__init__.py b/python/xorbits/_mars/services/subtask/__init__.py index 0b6fda518..0f81469c1 100644 --- a/python/xorbits/_mars/services/subtask/__init__.py +++ b/python/xorbits/_mars/services/subtask/__init__.py @@ -14,5 +14,5 @@ # limitations under the License. from .api import MockSubtaskAPI, SubtaskAPI -from .core import Subtask, SubtaskGraph, SubtaskResult, SubtaskStatus +from .core import Subtask, SubtaskGraph, SubtaskResult, SubtaskStage, SubtaskStatus from .errors import SlotOccupiedAlready, SubtaskNotExist diff --git a/python/xorbits/_mars/services/subtask/core.py b/python/xorbits/_mars/services/subtask/core.py index ac562cfbc..9823d39c0 100644 --- a/python/xorbits/_mars/services/subtask/core.py +++ b/python/xorbits/_mars/services/subtask/core.py @@ -36,6 +36,15 @@ from ...typing import BandType, ChunkType +class SubtaskStage(Enum): + PREPARE_DATA = 0 + REQUEST_QUOTA = 1 + ACQUIRE_SLOT = 2 + EXECUTE = 3 + RELEASE_SLOT = 4 + FINISH = 5 + + class SubtaskStatus(Enum): pending = 0 running = 1 diff --git a/python/xorbits/_mars/tensor/base/tile.py b/python/xorbits/_mars/tensor/base/tile.py index ee654ee80..326434b0a 100644 --- a/python/xorbits/_mars/tensor/base/tile.py +++ b/python/xorbits/_mars/tensor/base/tile.py @@ -29,7 +29,7 @@ def tile(A, reps): behavior, promote `A` to d-dimensions manually before calling this function. - If ``A.ndim > d``, `reps` is promoted to `A`.ndim by pre-pending 1's to it. + If ``A.ndim > d``, `reps` is promoted to `A`.ndim by prepending 1's to it. Thus for an `A` of shape (2, 3, 4, 5), a `reps` of (2, 2) is treated as (1, 1, 2, 2). 
diff --git a/python/xorbits/_mars/tensor/core.py b/python/xorbits/_mars/tensor/core.py
index b16e3c03e..c92fe6f62 100644
--- a/python/xorbits/_mars/tensor/core.py
+++ b/python/xorbits/_mars/tensor/core.py
@@ -415,6 +415,9 @@ def imag(self, new_imag):
     def __array__(self, dtype=None):
         return np.asarray(self.to_numpy(), dtype=dtype)
 
+    def tolist(self):
+        return self.to_numpy().tolist()
+
     def __array_function__(self, func, types, args, kwargs):
         from .. import tensor as module
 
diff --git a/python/xorbits/_mars/tensor/statistics/bincount.py b/python/xorbits/_mars/tensor/statistics/bincount.py
index bb25fe03e..fcec8603a 100644
--- a/python/xorbits/_mars/tensor/statistics/bincount.py
+++ b/python/xorbits/_mars/tensor/statistics/bincount.py
@@ -286,8 +286,10 @@ def bincount(x, weights=None, minlength=0, chunk_size_limit=None):
     x = astensor(x)
     weights = astensor(weights) if weights is not None else None
 
-    if not np.issubdtype(x.dtype, np.int_):
-        raise TypeError(f"Cannot cast array data from {x.dtype} to {np.dtype(np.int_)}")
+    if not np.issubdtype(x.dtype, np.int64):
+        raise TypeError(
+            f"Cannot cast array data from {x.dtype} to {np.dtype(np.int64)}"
+        )
     if x.ndim != 1:
         raise ValueError("'x' must be 1 dimension")
     if minlength < 0:
diff --git a/python/xorbits/_mars/tensor/tests/test_core_execution.py b/python/xorbits/_mars/tensor/tests/test_core_execution.py
index 99d803925..0c59a4e81 100644
--- a/python/xorbits/_mars/tensor/tests/test_core_execution.py
+++ b/python/xorbits/_mars/tensor/tests/test_core_execution.py
@@ -14,6 +14,7 @@
 # limitations under the License.
 
 import numpy as np
+import pytest
 
 from .. import (
     add,
@@ -281,3 +282,10 @@ def test_flat(setup):
 
     np.testing.assert_array_equal(b.execute(), npb)
     np.testing.assert_array_equal(a.execute(), npa)
+
+
+@pytest.mark.parametrize("chunk_size", [None, 1, 4])
+def test_tolist(setup, chunk_size):
+    data = np.random.rand(10, 20)
+    a = tensor(data, chunk_size=chunk_size)
+    assert a.tolist() == data.tolist()
diff --git a/python/xorbits/_mars/tensor/utils.py b/python/xorbits/_mars/tensor/utils.py
index ba36abb5b..86bcd483d 100644
--- a/python/xorbits/_mars/tensor/utils.py
+++ b/python/xorbits/_mars/tensor/utils.py
@@ -774,7 +774,7 @@ def fetch_corner_data(tensor, session=None):
     # the tensor must have been executed,
     # thus the size could not be NaN
     if tensor.size > threshold:
-        # two edges for each exis
+        # two edges for each axis
         indices_iter = list(itertools.product(*(range(2) for _ in range(tensor.ndim))))
         corners = np.empty(shape=(2,) * tensor.ndim, dtype=object)
         shape = [0 for _ in range(tensor.ndim)]
diff --git a/python/xorbits/_mars/tests/core.py b/python/xorbits/_mars/tests/core.py
index 4e3c5ed4e..8e2decab9 100644
--- a/python/xorbits/_mars/tests/core.py
+++ b/python/xorbits/_mars/tests/core.py
@@ -320,6 +320,10 @@ def assert_dtype_consistent(expected_dtype, real_dtype):
         expected_dtype, cate_dtypes
     ):
         return
+    if isinstance(real_dtype, pd.ArrowDtype) or isinstance(
+        expected_dtype, pd.ArrowDtype
+    ):
+        return
     if not np.can_cast(real_dtype, expected_dtype) and not np.can_cast(
         expected_dtype, real_dtype
     ):
diff --git a/python/xorbits/_mars/utils.py b/python/xorbits/_mars/utils.py
index 7c54ddd3d..91f00ede4 100644
--- a/python/xorbits/_mars/utils.py
+++ b/python/xorbits/_mars/utils.py
@@ -489,13 +489,16 @@ def calc_data_size(dt: Any, shape: Tuple[int] = None) -> int:
         return 0
 
     if isinstance(dt, tuple):
-        return sum(calc_data_size(c) for c in dt)
+        # int() for Windows CI; otherwise `sum` may return numpy.int32
+        return int(sum(calc_data_size(c) for c in dt))
 
     shape = getattr(dt, "shape", None) or shape
     if isinstance(dt, (pd.DataFrame, pd.Series)):
         return estimate_pandas_size(dt)
     if hasattr(dt, "estimate_size"):
-        return dt.estimate_size()
+        # int() for Windows CI; otherwise numpy.int32 may be returned.
+        # Call estimate_size() only once, as it may be expensive.
+        size = dt.estimate_size()
+        return int(size) if size is not None else None
     if hasattr(dt, "nbytes"):
         return max(sys.getsizeof(dt), dt.nbytes)
     if hasattr(dt, "shape") and len(dt.shape) == 0:
diff --git a/python/xorbits/core/adapter.py b/python/xorbits/core/adapter.py
index bd3d93317..92e1dbdc5 100644
--- a/python/xorbits/core/adapter.py
+++ b/python/xorbits/core/adapter.py
@@ -495,6 +495,12 @@ def collect_cls_members(
 ) -> Dict[str, Any]:
     cls_members: Dict[str, Any] = {}
     for name, cls_member in inspect.getmembers(cls):
+        # A Tileable and its TileableData counterpart may define methods with the
+        # same name but completely different semantics, e.g. Index.copy vs.
+        # IndexData.copy. Once Index's `copy` has been collected, the method of
+        # the same name on IndexData must not be collected again.
+        if cls.__name__.endswith("Data") and name in DATA_MEMBERS[data_type]:  # type: ignore
+            continue
         if inspect.isfunction(cls_member) and not name.startswith("_"):
             cls_members[name] = wrap_mars_callable(
                 cls_member,
diff --git a/python/xorbits/deploy/docker/Dockerfile.base b/python/xorbits/deploy/docker/Dockerfile.base
index b2ba05971..adb1f6008 100644
--- a/python/xorbits/deploy/docker/Dockerfile.base
+++ b/python/xorbits/deploy/docker/Dockerfile.base
@@ -48,9 +48,11 @@ RUN /opt/conda/bin/conda install \
     jaxlib \
     uvloop \
     libnuma \
+  && pip install -U pip \
   && pip install -U \
     xoscar \
     cloudpickle \
+    "azure-storage-blob>=12.18.1" \
     adlfs \
     fsspec>=2022.7.1,!=2022.8.0 \
     s3fs \
diff --git a/python/xorbits/lightgbm/tests/test_classifier.py b/python/xorbits/lightgbm/tests/test_classifier.py
index 2424665f6..47bd17b5c 100644
--- a/python/xorbits/lightgbm/tests/test_classifier.py
+++ b/python/xorbits/lightgbm/tests/test_classifier.py
@@ -156,7 +156,7 @@ def test_local_classifier_from_to_parquet(setup):
         df.iloc[:500].to_parquet(os.path.join(d, "data", "data1.parquet"))
         df.iloc[500:].to_parquet(os.path.join(d, "data", "data2.parquet"))
 
-        df = xpd.read_parquet(data_dir)
+        df = xpd.read_parquet(data_dir, use_arrow_dtype=False)
         model = lgb.LGBMClassifier()
         model.load_model(classifier)
         result = model.predict(df, run=False)
@@ -164,7 +164,12 @@ def test_local_classifier_from_to_parquet(setup):
 
         r.execute()
 
-        ret = xpd.read_parquet(result_dir).to_pandas().iloc[:, 0].to_numpy()
+        ret = (
+            xpd.read_parquet(result_dir, use_arrow_dtype=False)
+            .to_pandas()
+            .iloc[:, 0]
+            .to_numpy()
+        )
         expected = classifier.predict(X)
         expected = np.stack([1 - expected, expected]).argmax(axis=0)
         np.testing.assert_array_equal(ret, expected)
diff --git a/python/xorbits/numpy/numpy_adapters/tests/test_numpy_adapters.py b/python/xorbits/numpy/numpy_adapters/tests/test_numpy_adapters.py
index c03f4ded0..5b62ec55c 100644
--- a/python/xorbits/numpy/numpy_adapters/tests/test_numpy_adapters.py
+++ b/python/xorbits/numpy/numpy_adapters/tests/test_numpy_adapters.py
@@ -13,9 +13,11 @@
 # limitations under the License.
 
 import numpy as np
+import pandas as pd
 import pytest
 
 from .... import numpy as xnp
+from ....
import pandas as xpd @pytest.mark.parametrize( @@ -145,20 +147,6 @@ def test_tensorinv_fallback(setup): assert np.equal(xnp_output.all(), np_output.all()) -def test_ndarray_fallback(setup): - with pytest.warns(Warning) as w: - a = np.array([1, 2, 3]) - b = xnp.array([1, 2, 3]) - xnp_output = b.tolist().fetch() - np_output = a.tolist() - - assert f"Tensor.tolist will fallback to Numpy" == str(w[0].message) - assert isinstance(xnp_output, list) - for i in range(0, len(b)): - assert np_output[i] == xnp_output[i] - assert xnp_output[i] == i + 1 - - def test_busday_offset(setup): with pytest.warns(Warning) as w: xnp_output = xnp.busday_offset("2011-10", 0, roll="forward").execute().fetch() @@ -216,3 +204,17 @@ def test_docstring(): assert docstring is not None and docstring.endswith( "This docstring was copied from numpy.ndarray." ) + + +def test_tensor_tolist(setup): + data = np.random.rand(15, 25) + tensor = xnp.array(data) + assert data.tolist() == tensor.tolist() + + expected = pd.unique(pd.Series([i for i in range(100)])).tolist() + result = xpd.unique(xpd.Series([i for i in range(100)])).tolist() + assert expected == result + + data = np.array([1, 2, 3, 4]) + tensor = xnp.array([1, 2, 3, 4]) + assert data.tolist() == tensor.tolist() diff --git a/python/xorbits/pandas/pandas_adapters/tests/test_pandas_adapters.py b/python/xorbits/pandas/pandas_adapters/tests/test_pandas_adapters.py index cd2ce21f8..14f2f5595 100644 --- a/python/xorbits/pandas/pandas_adapters/tests/test_pandas_adapters.py +++ b/python/xorbits/pandas/pandas_adapters/tests/test_pandas_adapters.py @@ -22,6 +22,7 @@ from .... import pandas as xpd from ....core.data import DataRef +from ....core.execution import need_to_execute def test_pandas_dataframe_methods(setup): @@ -499,3 +500,36 @@ def test_read_pickle(setup): assert (x == y).all() finally: shutil.rmtree(tempdir) + + +def test_copy(setup): + index = xpd.Index([i for i in range(100)], name="test") + index_iloc = index[:20] + assert need_to_execute(index_iloc) is True + repr(index_iloc) + + index_copy = index_iloc.copy() + assert need_to_execute(index_copy) is False + pd.testing.assert_index_equal(index_copy.to_pandas(), index_iloc.to_pandas()) + + index_copy = index_iloc.copy(name="abc") + assert need_to_execute(index_copy) is True + pd.testing.assert_index_equal( + index_copy.to_pandas(), index_iloc.to_pandas().copy(name="abc") + ) + + series = xpd.Series([1, 2, 3, 4, np.nan, 6]) + series = series + 1 + assert need_to_execute(series) is True + repr(series) + + sc = series.copy() + assert need_to_execute(sc) is False + expected = series.to_pandas() + pd.testing.assert_series_equal(sc.to_pandas(), expected) + + sc[0] = np.nan + assert need_to_execute(sc) is True + ec = expected.copy() + ec[0] = np.nan + pd.testing.assert_series_equal(sc.to_pandas(), ec) diff --git a/python/xorbits/sklearn/__init__.py b/python/xorbits/sklearn/__init__.py new file mode 100644 index 000000000..9c9f2b596 --- /dev/null +++ b/python/xorbits/sklearn/__init__.py @@ -0,0 +1,31 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + + +def _install(): + """Nothing required for installing sklearn.""" + + +__all__ = [ + "cluster", + "datasets", + "decomposition", + "ensemble", + "linear_model", + "metrics", + "model_selection", + "neighbors", + "preprocessing", + "semi_supervised", +] diff --git a/python/xorbits/sklearn/cluster/__init__.py b/python/xorbits/sklearn/cluster/__init__.py new file mode 100644 index 000000000..da13e6baa --- /dev/null +++ b/python/xorbits/sklearn/cluster/__init__.py @@ -0,0 +1,49 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from ...core.utils.fallback import unimplemented_func + + +def _install(): + """Nothing required for installing sklearn.""" + + +def __dir__(): # pragma: no cover + try: + import sklearn + except ImportError: + raise AttributeError("sklearn is required but not installed.") + from .mars_adapters import MARS_SKLEARN_CLUSTER_CALLABLES + + return list(MARS_SKLEARN_CLUSTER_CALLABLES.keys()) + + +def __getattr__(name: str): # pragma: no cover + import inspect + + try: + import sklearn.cluster as sk_cluster + except ImportError: + raise AttributeError("sklearn is required but not installed.") + from .mars_adapters import MARS_SKLEARN_CLUSTER_CALLABLES + + if name in MARS_SKLEARN_CLUSTER_CALLABLES: + return MARS_SKLEARN_CLUSTER_CALLABLES[name] + else: + if not hasattr(sk_cluster, name): + raise AttributeError(name) + else: + if inspect.ismethod(getattr(sk_cluster, name)): + return unimplemented_func() + else: + raise AttributeError diff --git a/python/xorbits/sklearn/cluster/mars_adapters/__init__.py b/python/xorbits/sklearn/cluster/mars_adapters/__init__.py new file mode 100644 index 000000000..e9aabb1cc --- /dev/null +++ b/python/xorbits/sklearn/cluster/mars_adapters/__init__.py @@ -0,0 +1,14 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .core import MARS_SKLEARN_CLUSTER_CALLABLES diff --git a/python/xorbits/sklearn/cluster/mars_adapters/core.py b/python/xorbits/sklearn/cluster/mars_adapters/core.py new file mode 100644 index 000000000..cdd3f302e --- /dev/null +++ b/python/xorbits/sklearn/cluster/mars_adapters/core.py @@ -0,0 +1,35 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sklearn.cluster as sk_cluster + +from ...._mars.learn import cluster as mars_cluster +from ...._mars.learn.cluster import KMeans as MarsKMeans +from ....core.utils.docstring import attach_module_callable_docstring +from ...utils import SKLearnBase, _collect_module_callables, _install_cls_members + + +class KMeans(SKLearnBase): + _marscls = MarsKMeans + + +SKLEARN_CLUSTER_CLS_MAP = {KMeans: MarsKMeans} + +MARS_SKLEARN_CLUSTER_CALLABLES = _collect_module_callables( + mars_cluster, sk_cluster, skip_members=["register_op"] +) +_install_cls_members( + SKLEARN_CLUSTER_CLS_MAP, MARS_SKLEARN_CLUSTER_CALLABLES, sk_cluster +) +attach_module_callable_docstring(KMeans, sk_cluster, sk_cluster.KMeans) diff --git a/python/xorbits/sklearn/cluster/tests/__init__.py b/python/xorbits/sklearn/cluster/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/sklearn/cluster/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/sklearn/cluster/tests/test_core.py b/python/xorbits/sklearn/cluster/tests/test_core.py new file mode 100644 index 000000000..c2a10ec61 --- /dev/null +++ b/python/xorbits/sklearn/cluster/tests/test_core.py @@ -0,0 +1,57 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +try: + import sklearn +except ImportError: # pragma: no cover + sklearn = None + +import numpy as np +import pytest + +from .... import numpy as xnp +from .. import KMeans + +n_rows = 1000 +n_clusters = 8 +n_columns = 10 +chunk_size = 200 +rs = xnp.random.RandomState(0) +X = rs.rand(n_rows, n_columns, chunk_size=chunk_size) +X_new = rs.rand(n_rows, n_columns, chunk_size=chunk_size) + + +@pytest.mark.skipif(sklearn is None, reason="scikit-learn not installed") +def test_doc(): + docstring = KMeans.__doc__ + assert docstring is not None and docstring.endswith( + "This docstring was copied from sklearn.cluster." 
+    )
+
+    docstring = KMeans.fit.__doc__
+    assert docstring is not None and docstring.endswith(
+        "This docstring was copied from sklearn.cluster._kmeans.KMeans."
+    )
+
+
+@pytest.mark.skipif(sklearn is None, reason="scikit-learn not installed")
+def test_kmeans_cluster():
+    kms = KMeans(n_clusters=n_clusters, random_state=0)
+    kms.fit(X)
+    predict = kms.predict(X_new).fetch()
+
+    assert kms.n_clusters == n_clusters
+    assert np.shape(kms.labels_.fetch()) == (n_rows,)
+    assert np.shape(kms.cluster_centers_.fetch()) == (n_clusters, n_columns)
+    assert np.shape(predict) == (n_rows,)
diff --git a/python/xorbits/sklearn/datasets/__init__.py b/python/xorbits/sklearn/datasets/__init__.py
new file mode 100644
index 000000000..accf8fbcb
--- /dev/null
+++ b/python/xorbits/sklearn/datasets/__init__.py
@@ -0,0 +1,48 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+def _install():
+    """Nothing required for installing sklearn."""
+
+
+def __dir__():  # pragma: no cover
+    try:
+        import sklearn
+    except ImportError:
+        raise AttributeError("sklearn is required but not installed.")
+    from .mars_adapters import MARS_SKLEARN_DATASETS_CALLABLES
+
+    return list(MARS_SKLEARN_DATASETS_CALLABLES.keys())
+
+
+def __getattr__(name: str):  # pragma: no cover
+    import inspect
+
+    try:
+        import sklearn.datasets as sk_datasets
+    except ImportError:
+        raise AttributeError("sklearn is required but not installed.")
+    from .mars_adapters import MARS_SKLEARN_DATASETS_CALLABLES
+
+    if name in MARS_SKLEARN_DATASETS_CALLABLES:
+        return MARS_SKLEARN_DATASETS_CALLABLES[name]
+    else:
+        if not hasattr(sk_datasets, name):
+            raise AttributeError(name)
+        else:
+            if inspect.ismethod(getattr(sk_datasets, name)):
+                raise NotImplementedError("This function is not implemented yet.")
+            else:
+                raise AttributeError
diff --git a/python/xorbits/sklearn/datasets/mars_adapters/__init__.py b/python/xorbits/sklearn/datasets/mars_adapters/__init__.py
new file mode 100644
index 000000000..050a5f86b
--- /dev/null
+++ b/python/xorbits/sklearn/datasets/mars_adapters/__init__.py
@@ -0,0 +1,14 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .core import MARS_SKLEARN_DATASETS_CALLABLES
diff --git a/python/xorbits/sklearn/datasets/mars_adapters/core.py b/python/xorbits/sklearn/datasets/mars_adapters/core.py
new file mode 100644
index 000000000..a312031b5
--- /dev/null
+++ b/python/xorbits/sklearn/datasets/mars_adapters/core.py
@@ -0,0 +1,22 @@
+# Copyright 2022-2023 XProbe Inc.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sklearn.datasets as sk_datasets + +from ...._mars.learn import datasets as mars_datasets +from ...utils import _collect_module_callables + +MARS_SKLEARN_DATASETS_CALLABLES = _collect_module_callables( + mars_datasets, sk_datasets, skip_members=["register_op"] +) diff --git a/python/xorbits/sklearn/datasets/tests/__init__.py b/python/xorbits/sklearn/datasets/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/sklearn/datasets/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/sklearn/datasets/tests/test_core.py b/python/xorbits/sklearn/datasets/tests/test_core.py new file mode 100644 index 000000000..a7a06f71b --- /dev/null +++ b/python/xorbits/sklearn/datasets/tests/test_core.py @@ -0,0 +1,131 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +try: + import sklearn +except ImportError: # pragma: no cover + sklearn = None + +import pytest + +import xorbits.numpy as np + +from ... import datasets +from ...datasets import ( + make_blobs, + make_classification, + make_low_rank_matrix, + make_regression, +) + + +@pytest.mark.skipif(sklearn is None, reason="scikit-learn not installed") +def test_doc(): + docstring = datasets.make_blobs.__doc__ + assert docstring is not None and docstring.endswith( + "This docstring was copied from sklearn.datasets." + ) + + docstring = datasets.make_classification.__doc__ + assert docstring is not None and docstring.endswith( + "This docstring was copied from sklearn.datasets." + ) + + docstring = datasets.make_low_rank_matrix.__doc__ + assert docstring is not None and docstring.endswith( + "This docstring was copied from sklearn.datasets." + ) + + docstring = datasets.make_regression.__doc__ + assert docstring is not None and docstring.endswith( + "This docstring was copied from sklearn.datasets." 
+    )
+
+
+@pytest.mark.skipif(sklearn is None, reason="scikit-learn not installed")
+def test_make_classification():
+    weights = [0.1, 0.25]
+    X, y = make_classification(
+        n_samples=100,
+        n_features=20,
+        n_informative=5,
+        n_redundant=1,
+        n_repeated=1,
+        n_classes=3,
+        n_clusters_per_class=1,
+        hypercube=False,
+        shift=None,
+        scale=None,
+        weights=weights,
+        random_state=0,
+        flip_y=-1,
+    )
+    X, y = X.execute().fetch(), y.execute().fetch()
+    assert X.shape == (100, 20)
+    assert y.shape == (100,)
+    assert np.unique(y).shape == (3,)
+    assert (y == 0).sum() == 10
+    assert (y == 1).sum() == 25
+    assert (y == 2).sum() == 65
+
+
+@pytest.mark.skipif(sklearn is None, reason="scikit-learn not installed")
+def test_make_regression():
+    X, y, c = make_regression(
+        n_samples=100,
+        n_features=10,
+        n_informative=3,
+        effective_rank=5,
+        coef=True,
+        bias=0.0,
+        noise=1.0,
+        random_state=0,
+    )
+    X, y, c = X.execute().fetch(), y.execute().fetch(), c.execute().fetch()
+    assert X.shape == (100, 10), "X shape mismatch"
+    assert y.shape == (100,), "y shape mismatch"
+    assert c.shape == (10,), "coef shape mismatch"
+    assert sum(c != 0.0) == 3, "Unexpected number of informative features"
+
+
+@pytest.mark.skipif(sklearn is None, reason="scikit-learn not installed")
+def test_make_blobs():
+    cluster_stds = np.array([0.05, 0.2, 0.4])
+    cluster_centers = np.array([[0.0, 0.0], [1.0, 1.0], [0.0, 1.0]])
+    X, y = make_blobs(
+        random_state=0,
+        n_samples=50,
+        n_features=2,
+        centers=cluster_centers,
+        cluster_std=cluster_stds,
+    )
+    X, y = X.execute().fetch(), y.execute().fetch()
+    assert X.shape == (50, 2)
+    assert y.shape == (50,)
+    assert np.unique(y).shape == (3,)
+
+
+@pytest.mark.skipif(sklearn is None, reason="scikit-learn not installed")
+def test_make_low_rank_matrix():
+    X = make_low_rank_matrix(
+        n_samples=50,
+        n_features=25,
+        effective_rank=5,
+        tail_strength=0.01,
+        random_state=0,
+    )
+    X = X.execute().fetch()
+    assert X.shape == (50, 25)
+    _, s, _ = np.linalg.svd(X)
+    s = s.execute().fetch()
+    assert (s.sum() - 5) < 0.1
diff --git a/python/xorbits/sklearn/decomposition/__init__.py b/python/xorbits/sklearn/decomposition/__init__.py
new file mode 100644
index 000000000..a54d6b392
--- /dev/null
+++ b/python/xorbits/sklearn/decomposition/__init__.py
@@ -0,0 +1,49 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
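+
+# Attribute access on this module is resolved lazily via the PEP 562
+# `__getattr__`/`__dir__` hooks defined below: names with Mars-backed
+# implementations are served from MARS_SKLEARN_DECOMP_CALLABLES, while any
+# other public name is looked up on plain sklearn.decomposition and either
+# reported as unimplemented or rejected with an AttributeError.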
+from ...core.utils.fallback import unimplemented_func + + +def _install(): + """Nothing required for installing sklearn.""" + + +def __dir__(): # pragma: no cover + try: + import sklearn + except ImportError: + raise AttributeError("sklearn is required but not installed.") + from .mars_adapters import MARS_SKLEARN_DECOMP_CALLABLES + + return list(MARS_SKLEARN_DECOMP_CALLABLES.keys()) + + +def __getattr__(name: str): # pragma: no cover + import inspect + + try: + import sklearn.decomposition as sk_decomp + except ImportError: + raise AttributeError("sklearn is required but not installed.") + from .mars_adapters import MARS_SKLEARN_DECOMP_CALLABLES + + if name in MARS_SKLEARN_DECOMP_CALLABLES: + return MARS_SKLEARN_DECOMP_CALLABLES[name] + else: + if not hasattr(sk_decomp, name): + raise AttributeError(name) + else: + if inspect.ismethod(getattr(sk_decomp, name)): + return unimplemented_func() + else: + raise AttributeError diff --git a/python/xorbits/sklearn/decomposition/mars_adapters/__init__.py b/python/xorbits/sklearn/decomposition/mars_adapters/__init__.py new file mode 100644 index 000000000..8c9727dc3 --- /dev/null +++ b/python/xorbits/sklearn/decomposition/mars_adapters/__init__.py @@ -0,0 +1,14 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .core import MARS_SKLEARN_DECOMP_CALLABLES diff --git a/python/xorbits/sklearn/decomposition/mars_adapters/core.py b/python/xorbits/sklearn/decomposition/mars_adapters/core.py new file mode 100644 index 000000000..49f3242ed --- /dev/null +++ b/python/xorbits/sklearn/decomposition/mars_adapters/core.py @@ -0,0 +1,43 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
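+
+# A minimal usage sketch for the wrappers assembled below (illustrative only,
+# mirroring tests/test_core.py; assumes scikit-learn is installed):
+#
+#     from xorbits.sklearn.decomposition import PCA
+#     pca = PCA(n_components=2, svd_solver="full")
+#     pca.fit(X)                            # X: array-like or xorbits tensor
+#     X_reduced = pca.transform(X).fetch()  # results are lazy until fetched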
+ +import sklearn.decomposition as sk_decomposition + +from ...._mars.learn import decomposition as mars_decomposition +from ...._mars.learn.decomposition import PCA as MarsPCA +from ...._mars.learn.decomposition import TruncatedSVD as MarsTruncatedSVD +from ....core.utils.docstring import attach_module_callable_docstring +from ...utils import SKLearnBase, _collect_module_callables, _install_cls_members + + +class PCA(SKLearnBase): + _marscls = MarsPCA + + +class TruncatedSVD(SKLearnBase): + _marscls = MarsTruncatedSVD + + +SKLEARN_DECOMP_CLS_MAP = {PCA: MarsPCA, TruncatedSVD: MarsTruncatedSVD} + +MARS_SKLEARN_DECOMP_CALLABLES = _collect_module_callables( + mars_decomposition, sk_decomposition, skip_members=["register_op"] +) +_install_cls_members( + SKLEARN_DECOMP_CLS_MAP, MARS_SKLEARN_DECOMP_CALLABLES, sk_decomposition +) +attach_module_callable_docstring(PCA, sk_decomposition, sk_decomposition.PCA) +attach_module_callable_docstring( + TruncatedSVD, sk_decomposition, sk_decomposition.TruncatedSVD +) diff --git a/python/xorbits/sklearn/decomposition/tests/__init__.py b/python/xorbits/sklearn/decomposition/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/sklearn/decomposition/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/sklearn/decomposition/tests/test_core.py b/python/xorbits/sklearn/decomposition/tests/test_core.py new file mode 100644 index 000000000..361b7cd3d --- /dev/null +++ b/python/xorbits/sklearn/decomposition/tests/test_core.py @@ -0,0 +1,87 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +try: + import sklearn +except ImportError: # pragma: no cover + sklearn = None + +import numpy as np +import pytest +import scipy.sparse as sp +from numpy.testing import assert_array_almost_equal, assert_equal +from sklearn import datasets +from sklearn.utils import check_random_state + +from .. import PCA, TruncatedSVD + +iris = np.asarray(datasets.load_iris().data) + + +@pytest.mark.skipif(sklearn is None, reason="scikit-learn not installed") +def test_doc(): + docstring = PCA.__doc__ + assert docstring is not None and docstring.endswith( + "This docstring was copied from sklearn.decomposition." + ) + + docstring = PCA.fit.__doc__ + assert docstring is not None and docstring.endswith( + "This docstring was copied from sklearn.decomposition._pca.PCA." 
+    )
+
+    docstring = TruncatedSVD.__doc__
+    assert docstring is not None and docstring.endswith(
+        "This docstring was copied from sklearn.decomposition."
+    )
+
+    docstring = TruncatedSVD.fit.__doc__
+    assert docstring is not None and docstring.endswith(
+        "This docstring was copied from sklearn.decomposition._truncated_svd.TruncatedSVD."
+    )
+
+
+@pytest.mark.skipif(sklearn is None, reason="scikit-learn not installed")
+def test_pca():
+    X = iris
+
+    for n_comp in np.arange(X.shape[1]):
+        pca = PCA(n_components=n_comp, svd_solver="full")
+        pca.fit(X)
+        X_r = pca.transform(X).fetch()
+        assert_equal(X_r.shape[1], n_comp)
+
+        X_r2 = pca.fit_transform(X).fetch()
+        assert_array_almost_equal(X_r, X_r2)
+
+        X_r = pca.transform(X).fetch()
+        X_r2 = pca.fit_transform(X).fetch()
+        assert_array_almost_equal(X_r, X_r2)
+
+        # Test get_covariance and get_precision
+        cov = pca.get_covariance()
+        precision = pca.get_precision()
+        assert_array_almost_equal(np.dot(cov, precision), np.eye(X.shape[1]), 12)
+
+
+@pytest.mark.skipif(sklearn is None, reason="scikit-learn not installed")
+def test_truncated_svd():
+    shape = 60, 55
+    n_samples, n_features = shape
+    rng = check_random_state(42)
+    X = rng.randint(-100, 20, np.prod(shape)).reshape(shape)
+    X = sp.csr_matrix(np.maximum(X, 0), dtype=np.float64)
+    for n_components in (10, 25, 41):
+        tsvd = TruncatedSVD(n_components).fit(X)
+        assert tsvd.n_components == n_components
+        assert tsvd.components_.shape == (n_components, n_features)
diff --git a/python/xorbits/sklearn/ensemble/__init__.py b/python/xorbits/sklearn/ensemble/__init__.py
new file mode 100644
index 000000000..92ad66397
--- /dev/null
+++ b/python/xorbits/sklearn/ensemble/__init__.py
@@ -0,0 +1,49 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from ...core.utils.fallback import unimplemented_func
+
+
+def _install():
+    """Nothing required for installing sklearn."""
+
+
+def __dir__():  # pragma: no cover
+    try:
+        import sklearn
+    except ImportError:
+        raise AttributeError("sklearn is required but not installed.")
+    from .mars_adapters import MARS_SKLEARN_EN_CALLABLES
+
+    return list(MARS_SKLEARN_EN_CALLABLES.keys())
+
+
+def __getattr__(name: str):  # pragma: no cover
+    import inspect
+
+    try:
+        import sklearn.ensemble as sk_en
+    except ImportError:
+        raise AttributeError("sklearn is required but not installed.")
+    from .mars_adapters import MARS_SKLEARN_EN_CALLABLES
+
+    if name in MARS_SKLEARN_EN_CALLABLES:
+        return MARS_SKLEARN_EN_CALLABLES[name]
+    else:
+        if not hasattr(sk_en, name):
+            raise AttributeError(name)
+        else:
+            if inspect.ismethod(getattr(sk_en, name)):
+                return unimplemented_func()
+            else:
+                raise AttributeError
diff --git a/python/xorbits/sklearn/ensemble/mars_adapters/__init__.py b/python/xorbits/sklearn/ensemble/mars_adapters/__init__.py
new file mode 100644
index 000000000..8b02cbb39
--- /dev/null
+++ b/python/xorbits/sklearn/ensemble/mars_adapters/__init__.py
@@ -0,0 +1,14 @@
+# Copyright 2022-2023 XProbe Inc.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .core import MARS_SKLEARN_EN_CALLABLES diff --git a/python/xorbits/sklearn/ensemble/mars_adapters/core.py b/python/xorbits/sklearn/ensemble/mars_adapters/core.py new file mode 100644 index 000000000..d7025e5a3 --- /dev/null +++ b/python/xorbits/sklearn/ensemble/mars_adapters/core.py @@ -0,0 +1,49 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sklearn.ensemble as sk_en + +from ...._mars.learn import ensemble as mars_en +from ...._mars.learn.ensemble import BaggingClassifier as MarsBaggingClassifier +from ...._mars.learn.ensemble import BaggingRegressor as MarsBaggingRegressor +from ...._mars.learn.ensemble import IsolationForest as MarsIsolationForest +from ....core.utils.docstring import attach_module_callable_docstring +from ...utils import SKLearnBase, _collect_module_callables, _install_cls_members + + +class BaggingClassifier(SKLearnBase): + _marscls = MarsBaggingClassifier + + +class BaggingRegressor(SKLearnBase): + _marscls = MarsBaggingRegressor + + +class IsolationForest(SKLearnBase): + _marscls = MarsIsolationForest + + +SKLEARN_EN_CLS_MAP = { + BaggingClassifier: MarsBaggingClassifier, + IsolationForest: MarsIsolationForest, + BaggingRegressor: MarsBaggingRegressor, +} + +MARS_SKLEARN_EN_CALLABLES = _collect_module_callables( + mars_en, sk_en, skip_members=["register_op"] +) +_install_cls_members(SKLEARN_EN_CLS_MAP, MARS_SKLEARN_EN_CALLABLES, sk_en) +attach_module_callable_docstring(BaggingClassifier, sk_en, sk_en.BaggingClassifier) +attach_module_callable_docstring(BaggingRegressor, sk_en, sk_en.BaggingRegressor) +attach_module_callable_docstring(IsolationForest, sk_en, sk_en.IsolationForest) diff --git a/python/xorbits/sklearn/ensemble/tests/__init__.py b/python/xorbits/sklearn/ensemble/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/sklearn/ensemble/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/sklearn/ensemble/tests/test_core.py b/python/xorbits/sklearn/ensemble/tests/test_core.py new file mode 100644 index 000000000..07cdfc537 --- /dev/null +++ b/python/xorbits/sklearn/ensemble/tests/test_core.py @@ -0,0 +1,124 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +try: + import sklearn +except ImportError: # pragma: no cover + sklearn = None + +import numpy as np +import pytest +from sklearn.linear_model import LinearRegression +from sklearn.svm import SVC + +from ...datasets import make_classification, make_regression +from ...ensemble import BaggingClassifier, BaggingRegressor, IsolationForest + + +@pytest.mark.skipif(sklearn is None, reason="scikit-learn not installed") +def test_doc(): + docstring = BaggingClassifier.__doc__ + assert docstring is not None and docstring.endswith( + "This docstring was copied from sklearn.ensemble." + ) + + docstring = BaggingRegressor.__doc__ + assert docstring is not None and docstring.endswith( + "This docstring was copied from sklearn.ensemble." + ) + + docstring = IsolationForest.__doc__ + assert docstring is not None and docstring.endswith( + "This docstring was copied from sklearn.ensemble." + ) + + docstring = BaggingClassifier.fit.__doc__ + assert docstring is not None and docstring.endswith( + "This docstring was copied from sklearn.ensemble._bagging.BaggingClassifier." + ) + + docstring = BaggingRegressor.fit.__doc__ + assert docstring is not None and docstring.endswith( + "This docstring was copied from sklearn.ensemble._bagging.BaggingRegressor." + ) + + docstring = IsolationForest.fit.__doc__ + assert docstring is not None and docstring.endswith( + "This docstring was copied from sklearn.ensemble._iforest.IsolationForest." 
+    )
+
+
+@pytest.mark.skipif(sklearn is None, reason="scikit-learn not installed")
+def test_baggingclassifier():
+    rs = np.random.RandomState(0)
+
+    raw_x, raw_y = make_classification(
+        n_samples=100,
+        n_features=4,
+        n_informative=2,
+        n_redundant=0,
+        random_state=rs,
+        shuffle=False,
+    )
+
+    clf = BaggingClassifier(
+        base_estimator=SVC(),
+        n_estimators=10,
+        max_samples=10,
+        max_features=1,
+        random_state=rs,
+        warm_start=True,
+    )
+
+    clf.fit(raw_x, raw_y)
+    log_proba = clf.predict_log_proba(raw_x)
+    log_proba = log_proba.fetch()
+    exp_log_proba_array = np.exp(log_proba)
+    assert clf.n_estimators == 10
+    assert np.all((exp_log_proba_array >= 0) & (exp_log_proba_array <= 1))
+    assert np.allclose(np.sum(exp_log_proba_array, axis=1), 1.0)
+
+
+@pytest.mark.skipif(sklearn is None, reason="scikit-learn not installed")
+def test_bagging_regression():
+    rs = np.random.RandomState(0)
+
+    raw_x, raw_y = make_regression(
+        n_samples=100, n_features=4, n_informative=2, random_state=rs, shuffle=False
+    )
+    clf = BaggingRegressor(
+        base_estimator=LinearRegression(),
+        n_estimators=10,
+        max_samples=10,
+        max_features=0.5,
+        random_state=rs,
+        warm_start=True,
+    )
+    clf.fit(raw_x, raw_y)
+
+    predict_y = clf.predict(raw_x)
+    predict_y_array = predict_y.fetch()
+    assert predict_y_array.shape == raw_y.shape
+
+
+@pytest.mark.skipif(sklearn is None, reason="scikit-learn not installed")
+def test_iforest():
+    rs = np.random.RandomState(0)
+    raw_train = rs.poisson(size=(100, 10))
+    raw_test = rs.poisson(size=(200, 10))
+
+    clf = IsolationForest(random_state=rs, n_estimators=10, max_samples=1)
+    pred = clf.fit(raw_train).predict(raw_test).fetch()
+    score = clf.score_samples(raw_test).fetch()
+
+    assert clf.n_estimators == 10
+    assert pred.shape == (200,)
+    assert score.shape == (200,)
diff --git a/python/xorbits/sklearn/linear_model/__init__.py b/python/xorbits/sklearn/linear_model/__init__.py
new file mode 100644
index 000000000..1011a91b5
--- /dev/null
+++ b/python/xorbits/sklearn/linear_model/__init__.py
@@ -0,0 +1,49 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
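+
+# A minimal usage sketch for the estimators exposed by this package
+# (illustrative only, mirroring tests/test_core.py; assumes scikit-learn is
+# installed):
+#
+#     from xorbits.sklearn.linear_model import LinearRegression
+#     lr = LinearRegression()
+#     lr.fit(X, y)                # X: (n_rows, n_columns), y: (n_rows,)
+#     preds = lr.predict(X_new)   # lazy; materialize with preds.fetch()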
+from ...core.utils.fallback import unimplemented_func + + +def _install(): + """Nothing required for installing sklearn.""" + + +def __dir__(): # pragma: no cover + try: + import sklearn + except ImportError: + raise AttributeError("sklearn is required but not installed.") + from .mars_adapters import MARS_SKLEARN_LM_CALLABLES + + return list(MARS_SKLEARN_LM_CALLABLES.keys()) + + +def __getattr__(name: str): # pragma: no cover + import inspect + + try: + import sklearn.linear_model as sk_lm + except ImportError: + raise AttributeError("sklearn is required but not installed.") + from .mars_adapters import MARS_SKLEARN_LM_CALLABLES + + if name in MARS_SKLEARN_LM_CALLABLES: + return MARS_SKLEARN_LM_CALLABLES[name] + else: + if not hasattr(sk_lm, name): + raise AttributeError(name) + else: + if inspect.ismethod(getattr(sk_lm, name)): + return unimplemented_func() + else: + raise AttributeError diff --git a/python/xorbits/sklearn/linear_model/mars_adapters/__init__.py b/python/xorbits/sklearn/linear_model/mars_adapters/__init__.py new file mode 100644 index 000000000..dc3dfcca7 --- /dev/null +++ b/python/xorbits/sklearn/linear_model/mars_adapters/__init__.py @@ -0,0 +1,14 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .core import MARS_SKLEARN_LM_CALLABLES diff --git a/python/xorbits/sklearn/linear_model/mars_adapters/core.py b/python/xorbits/sklearn/linear_model/mars_adapters/core.py new file mode 100644 index 000000000..7be6ff8e3 --- /dev/null +++ b/python/xorbits/sklearn/linear_model/mars_adapters/core.py @@ -0,0 +1,42 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
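+
+# The classes below are thin shims: each SKLearnBase subclass only pins
+# `_marscls` to its Mars counterpart; `_install_cls_members` then attaches the
+# Mars-backed methods to the shim, and `attach_module_callable_docstring`
+# borrows the corresponding docstrings from sklearn.linear_model.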
+ +import sklearn.linear_model as sk_lm + +from ...._mars.learn import linear_model as mars_lm +from ...._mars.learn.glm import LogisticRegression as MarsLogisticRegression +from ...._mars.learn.linear_model import LinearRegression as MarsLinearRegression +from ....core.utils.docstring import attach_module_callable_docstring +from ...utils import SKLearnBase, _collect_module_callables, _install_cls_members + + +class LinearRegression(SKLearnBase): + _marscls = MarsLinearRegression + + +class LogisticRegression(SKLearnBase): + _marscls = MarsLogisticRegression + + +SKLEARN_LM_CLS_MAP = { + LinearRegression: MarsLinearRegression, + LogisticRegression: MarsLogisticRegression, +} + +MARS_SKLEARN_LM_CALLABLES = _collect_module_callables( + mars_lm, sk_lm, skip_members=["register_op"] +) +_install_cls_members(SKLEARN_LM_CLS_MAP, MARS_SKLEARN_LM_CALLABLES, sk_lm) +attach_module_callable_docstring(LinearRegression, sk_lm, sk_lm.LinearRegression) +attach_module_callable_docstring(LogisticRegression, sk_lm, sk_lm.LogisticRegression) diff --git a/python/xorbits/sklearn/linear_model/tests/__init__.py b/python/xorbits/sklearn/linear_model/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/sklearn/linear_model/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/sklearn/linear_model/tests/test_core.py b/python/xorbits/sklearn/linear_model/tests/test_core.py new file mode 100644 index 000000000..4e002dc89 --- /dev/null +++ b/python/xorbits/sklearn/linear_model/tests/test_core.py @@ -0,0 +1,73 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +try: + import sklearn +except ImportError: # pragma: no cover + sklearn = None + +import numpy as np +import pytest + +from .. import LinearRegression, LogisticRegression + +n_rows = 100 +n_columns = 5 +X = np.random.rand(n_rows, n_columns) +y = np.random.rand(n_rows) +y_cat = np.random.randint(0, 2, n_rows) +X_new = np.random.rand(n_rows, n_columns) + + +@pytest.mark.skipif(sklearn is None, reason="scikit-learn not installed") +def test_doc(): + docstring = LogisticRegression.__doc__ + assert docstring is not None and docstring.endswith( + "This docstring was copied from sklearn.linear_model." + ) + + docstring = LogisticRegression.fit.__doc__ + assert docstring is not None and docstring.endswith( + "This docstring was copied from sklearn.linear_model._logistic.LogisticRegression." 
+ ) + + docstring = LinearRegression.__doc__ + assert docstring is not None and docstring.endswith( + "This docstring was copied from sklearn.linear_model." + ) + + docstring = LinearRegression.fit.__doc__ + assert docstring is not None and docstring.endswith( + "This docstring was copied from sklearn.linear_model._base.LinearRegression." + ) + + +@pytest.mark.skipif(sklearn is None, reason="scikit-learn not installed") +def test_linear_regression(): + lr = LinearRegression() + lr.fit(X, y) + predict = lr.predict(X_new) + + assert np.shape(lr.coef_.fetch()) == (n_columns,) + assert np.shape(lr.intercept_.fetch()) == () + assert np.shape(predict) == (n_rows,) + + +@pytest.mark.skipif(sklearn is None, reason="scikit-learn not installed") +def test_logistic_regression(): + lr = LogisticRegression(max_iter=1) + lr.fit(X, y_cat) + predict = lr.predict(X_new).fetch() + + assert np.shape(predict) == (n_rows,) diff --git a/python/xorbits/sklearn/metrics/__init__.py b/python/xorbits/sklearn/metrics/__init__.py new file mode 100644 index 000000000..c0365ddd5 --- /dev/null +++ b/python/xorbits/sklearn/metrics/__init__.py @@ -0,0 +1,49 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from ...core.utils.fallback import unimplemented_func + + +def _install(): + """Nothing required for installing sklearn.""" + + +def __dir__(): # pragma: no cover + try: + import sklearn + except ImportError: + raise AttributeError("sklearn is required but not installed.") + from .mars_adapters import MARS_SKLEARN_METRICS_CALLABLES + + return list(MARS_SKLEARN_METRICS_CALLABLES.keys()) + + +def __getattr__(name: str): # pragma: no cover + import inspect + + try: + import sklearn.metrics as sk_metrics + except ImportError: + raise AttributeError("sklearn is required but not installed.") + from .mars_adapters import MARS_SKLEARN_METRICS_CALLABLES + + if name in MARS_SKLEARN_METRICS_CALLABLES: + return MARS_SKLEARN_METRICS_CALLABLES[name] + else: + if not hasattr(sk_metrics, name): + raise AttributeError(name) + else: + if inspect.ismethod(getattr(sk_metrics, name)): + return unimplemented_func() + else: + raise AttributeError diff --git a/python/xorbits/sklearn/metrics/mars_adapters/__init__.py b/python/xorbits/sklearn/metrics/mars_adapters/__init__.py new file mode 100644 index 000000000..d1e23cf5e --- /dev/null +++ b/python/xorbits/sklearn/metrics/mars_adapters/__init__.py @@ -0,0 +1,14 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
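+# Re-export the callable map so that `xorbits.sklearn.metrics` can resolve
+# Mars-backed metric functions lazily through its module-level `__getattr__`.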
+from .core import MARS_SKLEARN_METRICS_CALLABLES diff --git a/python/xorbits/sklearn/metrics/mars_adapters/core.py b/python/xorbits/sklearn/metrics/mars_adapters/core.py new file mode 100644 index 000000000..b0fa6a862 --- /dev/null +++ b/python/xorbits/sklearn/metrics/mars_adapters/core.py @@ -0,0 +1,22 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sklearn.metrics as sk_metrics + +from ...._mars.learn import metrics as mars_metrics +from ...utils import _collect_module_callables + +MARS_SKLEARN_METRICS_CALLABLES = _collect_module_callables( + mars_metrics, sk_metrics, skip_members=["register_op"] +) diff --git a/python/xorbits/sklearn/metrics/tests/__init__.py b/python/xorbits/sklearn/metrics/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/sklearn/metrics/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/sklearn/metrics/tests/test_core.py b/python/xorbits/sklearn/metrics/tests/test_core.py new file mode 100644 index 000000000..347b05aee --- /dev/null +++ b/python/xorbits/sklearn/metrics/tests/test_core.py @@ -0,0 +1,142 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +try: + import sklearn +except ImportError: # pragma: no cover + sklearn = None + +import inspect + +import numpy as np +import pytest + +from ... 
import metrics + + +@pytest.mark.skipif(sklearn is None, reason="scikit-learn not installed") +def test_doc(): + for name, f in inspect.getmembers(metrics, inspect.isfunction): + if name.startswith("_"): + continue + docstring = f.__doc__ + assert docstring is not None + + +@pytest.mark.skipif(sklearn is None, reason="scikit-learn not installed") +def test_classification(): + from sklearn.metrics import f1_score as sklearn_f1_score + from sklearn.metrics import fbeta_score as sklearn_fbeta_score + from sklearn.metrics import ( + multilabel_confusion_matrix as sklearn_multilabel_confusion_matrix, + ) + from sklearn.metrics import ( + precision_recall_fscore_support as sklearn_precision_recall_fscore_support, + ) + from sklearn.metrics import precision_score as sklearn_precision_score + from sklearn.metrics import recall_score as sklearn_recall_score + + from ...metrics import ( + f1_score, + fbeta_score, + multilabel_confusion_matrix, + precision_recall_fscore_support, + precision_score, + recall_score, + ) + + y_true = np.array([0, 1, 2, 0, 1, 2], dtype=np.int64) + y_pred = np.array([0, 2, 1, 0, 0, 1], dtype=np.int64) + + np.testing.assert_array_almost_equal( + f1_score(y_true, y_pred, average="macro").execute().fetch(), + sklearn_f1_score(y_true, y_pred, average="macro"), + ) + np.testing.assert_array_almost_equal( + fbeta_score(y_true, y_pred, beta=0.5, average="macro").execute().fetch(), + sklearn_fbeta_score(y_true, y_pred, beta=0.5, average="macro"), + ) + + np.testing.assert_array_almost_equal( + precision_score(y_true, y_pred, average="macro").execute().fetch(), + sklearn_precision_score(y_true, y_pred, average="macro"), + ) + + np.testing.assert_array_almost_equal( + recall_score(y_true, y_pred, average="macro").execute().fetch(), + sklearn_recall_score(y_true, y_pred, average="macro"), + ) + + np.testing.assert_array_almost_equal( + multilabel_confusion_matrix(y_true, y_pred).execute().fetch(), + sklearn_multilabel_confusion_matrix(y_true, y_pred), + ) + + np.testing.assert_array_almost_equal( + precision_recall_fscore_support(y_true, y_pred)[0].execute().fetch(), + sklearn_precision_recall_fscore_support(y_true, y_pred)[0], + ) + + +@pytest.mark.skipif(sklearn is None, reason="scikit-learn not installed") +def test_scorer(): + from sklearn.metrics import r2_score + + from ...metrics import get_scorer + + assert get_scorer("r2") is not None + assert get_scorer(r2_score) is not None + + +@pytest.mark.skipif(sklearn is None, reason="scikit-learn not installed") +def test_r2_score(): + from ...metrics import r2_score + + y_true = np.array([[1, 0, 0, 1], [0, 1, 1, 1], [1, 1, 0, 1]]) + y_pred = np.array([[0, 0, 0, 1], [1, 0, 1, 1], [0, 0, 0, 1]]) + + error = r2_score(y_true, y_pred, multioutput="variance_weighted") + np.testing.assert_almost_equal(error.fetch(), 1.0 - 5.0 / 2) + + +@pytest.mark.skipif(sklearn is None, reason="scikit-learn not installed") +def test_ranking(): + from sklearn.metrics import accuracy_score as sklearn_accuracy_score + from sklearn.metrics import auc as sklearn_auc + from sklearn.metrics import roc_curve as sklearn_roc_curve + from sklearn.metrics.tests.test_ranking import make_prediction + + from ...metrics import accuracy_score, auc, roc_auc_score, roc_curve + + y_true, y_score, _ = make_prediction(binary=True) + + np.testing.assert_almost_equal( + accuracy_score(y_true, y_score).fetch(), + sklearn_accuracy_score(y_true, y_score), + ) + rs = np.random.RandomState(0) + y = rs.randint(0, 10, (10,)) + pred = rs.rand(10) + fpr, tpr, thresholds = roc_curve(y, 
pred, pos_label=2)
+    m = auc(fpr, tpr)
+
+    sk_fpr, sk_tpr, sk_threshold = sklearn_roc_curve(
+        y,
+        pred,
+        pos_label=2,
+    )
+    expect_m = sklearn_auc(sk_fpr, sk_tpr)
+    assert pytest.approx(m.fetch()) == expect_m
+    y_true = np.array([0, 0, 1, 1], dtype=np.int64)
+    assert roc_auc_score(y_true, y_true, max_fpr=1) == 1
diff --git a/python/xorbits/sklearn/model_selection/__init__.py b/python/xorbits/sklearn/model_selection/__init__.py
new file mode 100644
index 000000000..3b18a2c03
--- /dev/null
+++ b/python/xorbits/sklearn/model_selection/__init__.py
@@ -0,0 +1,49 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from ...core.utils.fallback import unimplemented_func
+
+
+def _install():
+    """Nothing required for installing sklearn."""
+
+
+def __dir__():  # pragma: no cover
+    try:
+        import sklearn
+    except ImportError:
+        raise AttributeError("sklearn is required but not installed.")
+    from .mars_adapters import MARS_SKLEARN_ML_CALLABLES
+
+    return list(MARS_SKLEARN_ML_CALLABLES.keys())
+
+
+def __getattr__(name: str):  # pragma: no cover
+    import inspect
+
+    try:
+        import sklearn.model_selection as sk_ml
+    except ImportError:
+        raise AttributeError("sklearn is required but not installed.")
+    from .mars_adapters import MARS_SKLEARN_ML_CALLABLES
+
+    if name in MARS_SKLEARN_ML_CALLABLES:
+        return MARS_SKLEARN_ML_CALLABLES[name]
+    else:
+        if not hasattr(sk_ml, name):
+            raise AttributeError(name)
+        else:
+            if inspect.ismethod(getattr(sk_ml, name)):
+                return unimplemented_func()
+            else:
+                raise AttributeError
diff --git a/python/xorbits/sklearn/model_selection/mars_adapters/__init__.py b/python/xorbits/sklearn/model_selection/mars_adapters/__init__.py
new file mode 100644
index 000000000..9a8b3c370
--- /dev/null
+++ b/python/xorbits/sklearn/model_selection/mars_adapters/__init__.py
@@ -0,0 +1,14 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .core import MARS_SKLEARN_ML_CALLABLES
diff --git a/python/xorbits/sklearn/model_selection/mars_adapters/core.py b/python/xorbits/sklearn/model_selection/mars_adapters/core.py
new file mode 100644
index 000000000..7f7869fbc
--- /dev/null
+++ b/python/xorbits/sklearn/model_selection/mars_adapters/core.py
@@ -0,0 +1,51 @@
+# Copyright 2022-2023 XProbe Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sklearn.model_selection as sk_ml + +from ...._mars.learn import model_selection as mars_ml +from ...._mars.learn.model_selection import KFold as MarsKFold +from ...._mars.learn.model_selection import ParameterGrid as MarsParameterGrid +from ....core.utils.docstring import attach_module_callable_docstring +from ...utils import SKLearnBase, _collect_module_callables, _install_cls_members + + +class KFold(SKLearnBase): + _marscls = MarsKFold + + +class ParameterGrid(SKLearnBase): + _marscls = MarsParameterGrid + + def __len__(self): + return len(self.mars_instance) + + def __iter__(self): + return iter(self.mars_instance) + + def __getitem__(self, index): + return self.mars_instance[index] + + +SKLEARN_ML_CLS_MAP = { + KFold: MarsKFold, + ParameterGrid: MarsParameterGrid, +} + +MARS_SKLEARN_ML_CALLABLES = _collect_module_callables( + mars_ml, sk_ml, skip_members=["register_op"] +) +_install_cls_members(SKLEARN_ML_CLS_MAP, MARS_SKLEARN_ML_CALLABLES, sk_ml) +attach_module_callable_docstring(KFold, sk_ml, sk_ml.KFold) +attach_module_callable_docstring(ParameterGrid, sk_ml, sk_ml.ParameterGrid) diff --git a/python/xorbits/sklearn/model_selection/tests/__init__.py b/python/xorbits/sklearn/model_selection/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/sklearn/model_selection/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/sklearn/model_selection/tests/test_core.py b/python/xorbits/sklearn/model_selection/tests/test_core.py new file mode 100644 index 000000000..e9cd89328 --- /dev/null +++ b/python/xorbits/sklearn/model_selection/tests/test_core.py @@ -0,0 +1,66 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
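+# The tests below exercise the wrapped classes end to end and are skipped
+# when scikit-learn is absent, mirroring the optional-dependency handling
+# of the package itself.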
+try: + import sklearn +except ImportError: # pragma: no cover + sklearn = None + +from typing import Iterable, Sized + +import numpy as np +import pytest + +from ...model_selection import KFold, ParameterGrid, train_test_split + + +@pytest.mark.skipif(sklearn is None, reason="scikit-learn not installed") +def test_doc(): + docstring = KFold.__doc__ + assert docstring is not None and docstring.endswith( + "This docstring was copied from sklearn.model_selection." + ) + + docstring = ParameterGrid.__doc__ + assert docstring is not None and docstring.endswith( + "This docstring was copied from sklearn.model_selection." + ) + + +@pytest.mark.skipif(sklearn is None, reason="scikit-learn not installed") +def test_parameter_grid(): + arr1 = [1, 2, 3] + params1 = {"foo": arr1} + grid1 = ParameterGrid(params1) + assert isinstance(grid1, Iterable) + assert isinstance(grid1, Sized) + assert len(grid1) == 3 + + +@pytest.mark.skipif(sklearn is None, reason="scikit-learn not installed") +def test_kfold(): + X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]]) + kf = KFold(n_splits=2) + splits = kf.get_n_splits(X) + assert splits == 2 + + +@pytest.mark.skipif(sklearn is None, reason="scikit-learn not installed") +def test_train_test_split(): + X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]]) + y = np.array([1, 2, 3, 4]) + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33) + assert X_train.shape == (2, 2) + assert X_test.shape == (2, 2) + assert y_train.shape == (2,) + assert y_test.shape == (2,) diff --git a/python/xorbits/sklearn/neighbors/__init__.py b/python/xorbits/sklearn/neighbors/__init__.py new file mode 100644 index 000000000..07bd6edea --- /dev/null +++ b/python/xorbits/sklearn/neighbors/__init__.py @@ -0,0 +1,49 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
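+# As in the sibling sub-packages, public names resolve lazily via
+# module-level __getattr__/__dir__ (PEP 562): names backed by a Mars
+# implementation come from MARS_SKLEARN_NEIGHBORS_CALLABLES, unknown names
+# raise AttributeError, and names that exist in sklearn but have no Mars
+# counterpart either map to unimplemented_func or raise AttributeError.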
+from ...core.utils.fallback import unimplemented_func + + +def _install(): + """Nothing required for installing sklearn.""" + + +def __dir__(): # pragma: no cover + try: + import sklearn + except ImportError: + raise AttributeError("sklearn is required but not installed.") + from .mars_adapters import MARS_SKLEARN_NEIGHBORS_CALLABLES + + return list(MARS_SKLEARN_NEIGHBORS_CALLABLES.keys()) + + +def __getattr__(name: str): # pragma: no cover + import inspect + + try: + import sklearn.neighbors as sk_neigh + except ImportError: + raise AttributeError("sklearn is required but not installed.") + from .mars_adapters import MARS_SKLEARN_NEIGHBORS_CALLABLES + + if name in MARS_SKLEARN_NEIGHBORS_CALLABLES: + return MARS_SKLEARN_NEIGHBORS_CALLABLES[name] + else: + if not hasattr(sk_neigh, name): + raise AttributeError(name) + else: + if inspect.ismethod(getattr(sk_neigh, name)): + return unimplemented_func() + else: + raise AttributeError diff --git a/python/xorbits/sklearn/neighbors/mars_adapters/__init__.py b/python/xorbits/sklearn/neighbors/mars_adapters/__init__.py new file mode 100644 index 000000000..4c58c1f1f --- /dev/null +++ b/python/xorbits/sklearn/neighbors/mars_adapters/__init__.py @@ -0,0 +1,14 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .core import MARS_SKLEARN_NEIGHBORS_CALLABLES diff --git a/python/xorbits/sklearn/neighbors/mars_adapters/core.py b/python/xorbits/sklearn/neighbors/mars_adapters/core.py new file mode 100644 index 000000000..bfa906478 --- /dev/null +++ b/python/xorbits/sklearn/neighbors/mars_adapters/core.py @@ -0,0 +1,39 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
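+# Adapter construction: NearestNeighbors is a thin SKLearnBase wrapper
+# whose public methods are bound from the Mars estimator by
+# _install_cls_members, with docstrings copied from scikit-learn via
+# attach_module_callable_docstring.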
+ +import sklearn.neighbors as sk_neighbors + +from ...._mars.learn import neighbors as mars_neighbors +from ...._mars.learn.neighbors import NearestNeighbors as MarsNearestNeighbors +from ....core.utils.docstring import attach_module_callable_docstring +from ...utils import SKLearnBase, _collect_module_callables, _install_cls_members + + +class NearestNeighbors(SKLearnBase): + _marscls = MarsNearestNeighbors + + +SKLEARN_NEIGHBORS_CLS_MAP = { + NearestNeighbors: MarsNearestNeighbors, +} + +MARS_SKLEARN_NEIGHBORS_CALLABLES = _collect_module_callables( + mars_neighbors, sk_neighbors, skip_members=["register_op"] +) +_install_cls_members( + SKLEARN_NEIGHBORS_CLS_MAP, MARS_SKLEARN_NEIGHBORS_CALLABLES, sk_neighbors +) +attach_module_callable_docstring( + NearestNeighbors, sk_neighbors, sk_neighbors.NearestNeighbors +) diff --git a/python/xorbits/sklearn/neighbors/tests/__init__.py b/python/xorbits/sklearn/neighbors/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/sklearn/neighbors/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/sklearn/neighbors/tests/test_core.py b/python/xorbits/sklearn/neighbors/tests/test_core.py new file mode 100644 index 000000000..990931fff --- /dev/null +++ b/python/xorbits/sklearn/neighbors/tests/test_core.py @@ -0,0 +1,34 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +try: + import sklearn +except ImportError: # pragma: no cover + sklearn = None + +import pytest + +from ...neighbors import NearestNeighbors + + +@pytest.mark.skipif(sklearn is None, reason="scikit-learn not installed") +def test_doc(): + docstring = NearestNeighbors.__doc__ + assert docstring is not None and docstring.endswith( + "This docstring was copied from sklearn.neighbors." + ) + + docstring = NearestNeighbors.fit.__doc__ + assert docstring is not None and docstring.endswith( + "This docstring was copied from sklearn.neighbors._unsupervised.NearestNeighbors." + ) diff --git a/python/xorbits/sklearn/preprocessing/__init__.py b/python/xorbits/sklearn/preprocessing/__init__.py new file mode 100644 index 000000000..bf05574d2 --- /dev/null +++ b/python/xorbits/sklearn/preprocessing/__init__.py @@ -0,0 +1,49 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from ...core.utils.fallback import unimplemented_func + + +def _install(): + """Nothing required for installing sklearn.""" + + +def __dir__(): # pragma: no cover + try: + import sklearn + except ImportError: + raise AttributeError("sklearn is required but not installed.") + from .mars_adapters import MARS_SKLEARN_PREPROC_CALLABLES + + return list(MARS_SKLEARN_PREPROC_CALLABLES.keys()) + + +def __getattr__(name: str): # pragma: no cover + import inspect + + try: + import sklearn.preprocessing as sk_preproc + except ImportError: + raise AttributeError("sklearn is required but not installed.") + from .mars_adapters import MARS_SKLEARN_PREPROC_CALLABLES + + if name in MARS_SKLEARN_PREPROC_CALLABLES: + return MARS_SKLEARN_PREPROC_CALLABLES[name] + else: + if not hasattr(sk_preproc, name): + raise AttributeError(name) + else: + if inspect.ismethod(getattr(sk_preproc, name)): + return unimplemented_func() + else: + raise AttributeError diff --git a/python/xorbits/sklearn/preprocessing/mars_adapters/__init__.py b/python/xorbits/sklearn/preprocessing/mars_adapters/__init__.py new file mode 100644 index 000000000..ecfb158b7 --- /dev/null +++ b/python/xorbits/sklearn/preprocessing/mars_adapters/__init__.py @@ -0,0 +1,14 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .core import MARS_SKLEARN_PREPROC_CALLABLES diff --git a/python/xorbits/sklearn/preprocessing/mars_adapters/core.py b/python/xorbits/sklearn/preprocessing/mars_adapters/core.py new file mode 100644 index 000000000..b71799039 --- /dev/null +++ b/python/xorbits/sklearn/preprocessing/mars_adapters/core.py @@ -0,0 +1,51 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
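+# Same wrapping recipe as the neighbors adapter, applied to three
+# estimators: MinMaxScaler, LabelBinarizer and LabelEncoder each delegate
+# to their Mars counterpart and carry the matching scikit-learn docstring.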
+ +import sklearn.preprocessing as sk_preproc + +from ...._mars.learn import preprocessing as mars_preproc +from ...._mars.learn.preprocessing import LabelBinarizer as MarsLabelBinarizer +from ...._mars.learn.preprocessing import LabelEncoder as MarsLabelEncoder +from ...._mars.learn.preprocessing import MinMaxScaler as MarsMinMaxScaler +from ....core.utils.docstring import attach_module_callable_docstring +from ...utils import SKLearnBase, _collect_module_callables, _install_cls_members + + +class MinMaxScaler(SKLearnBase): + _marscls = MarsMinMaxScaler + + +class LabelBinarizer(SKLearnBase): + _marscls = MarsLabelBinarizer + + +class LabelEncoder(SKLearnBase): + _marscls = MarsLabelEncoder + + +SKLEARN_PREPROC_CLS_MAP = { + MinMaxScaler: MarsMinMaxScaler, + LabelEncoder: MarsLabelEncoder, + LabelBinarizer: MarsLabelBinarizer, +} + +MARS_SKLEARN_PREPROC_CALLABLES = _collect_module_callables( + mars_preproc, sk_preproc, skip_members=["register_op"] +) +_install_cls_members( + SKLEARN_PREPROC_CLS_MAP, MARS_SKLEARN_PREPROC_CALLABLES, sk_preproc +) +attach_module_callable_docstring(MinMaxScaler, sk_preproc, sk_preproc.MinMaxScaler) +attach_module_callable_docstring(LabelBinarizer, sk_preproc, sk_preproc.LabelBinarizer) +attach_module_callable_docstring(LabelEncoder, sk_preproc, sk_preproc.LabelEncoder) diff --git a/python/xorbits/sklearn/preprocessing/tests/__init__.py b/python/xorbits/sklearn/preprocessing/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/sklearn/preprocessing/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/sklearn/preprocessing/tests/test_core.py b/python/xorbits/sklearn/preprocessing/tests/test_core.py new file mode 100644 index 000000000..68f4cf3e3 --- /dev/null +++ b/python/xorbits/sklearn/preprocessing/tests/test_core.py @@ -0,0 +1,82 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +try: + import sklearn +except ImportError: # pragma: no cover + sklearn = None + +import numpy as np +import pytest + +from ...preprocessing import LabelBinarizer, LabelEncoder, MinMaxScaler + + +@pytest.mark.skipif(sklearn is None, reason="scikit-learn not installed") +def test_doc(): + docstring = MinMaxScaler.__doc__ + assert docstring is not None and docstring.endswith( + "This docstring was copied from sklearn.preprocessing." 
+ ) + + docstring = LabelBinarizer.__doc__ + assert docstring is not None and docstring.endswith( + "This docstring was copied from sklearn.preprocessing." + ) + + docstring = LabelEncoder.__doc__ + assert docstring is not None and docstring.endswith( + "This docstring was copied from sklearn.preprocessing." + ) + + docstring = MinMaxScaler.fit.__doc__ + assert docstring is not None and docstring.endswith( + "This docstring was copied from sklearn.preprocessing._data.MinMaxScaler." + ) + + docstring = LabelBinarizer.fit.__doc__ + assert docstring is not None and docstring.endswith( + "This docstring was copied from sklearn.preprocessing._label.LabelBinarizer." + ) + + docstring = LabelEncoder.fit.__doc__ + assert docstring is not None and docstring.endswith( + "This docstring was copied from sklearn.preprocessing._label.LabelEncoder." + ) + + +@pytest.mark.skipif(sklearn is None, reason="scikit-learn not installed") +def test_min_max_scaler(): + X = np.array([[1, 2], [2, 4], [4, 8], [8, 16]], dtype=np.float64) + scaler = MinMaxScaler() + scaler.fit(X) + np.testing.assert_array_equal(scaler.data_min_, [1.0, 2.0]) + np.testing.assert_array_equal(scaler.data_max_, [8.0, 16.0]) + np.testing.assert_array_equal(scaler.data_range_, [7.0, 14.0]) + + X_transformed = scaler.transform(X).fetch() + assert X_transformed.shape == (4, 2) + + +@pytest.mark.skipif(sklearn is None, reason="scikit-learn not installed") +def test_label_binarizer(): + lb = LabelBinarizer() + lb.fit([1, 2, 6, 4, 2]) + assert lb.classes_.tolist() == [1, 2, 4, 6] + + +@pytest.mark.skipif(sklearn is None, reason="scikit-learn not installed") +def test_label_encoder(): + le = LabelEncoder() + le.fit([1, 2, 2, 6]) + assert le.classes_.tolist() == [1, 2, 6] diff --git a/python/xorbits/sklearn/semi_supervised/__init__.py b/python/xorbits/sklearn/semi_supervised/__init__.py new file mode 100644 index 000000000..e4d2a1aca --- /dev/null +++ b/python/xorbits/sklearn/semi_supervised/__init__.py @@ -0,0 +1,49 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
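+# Same lazy-dispatch boilerplate as the metrics, model_selection, neighbors
+# and preprocessing packages, here backed by sklearn.semi_supervised.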
+from ...core.utils.fallback import unimplemented_func + + +def _install(): + """Nothing required for installing sklearn.""" + + +def __dir__(): # pragma: no cover + try: + import sklearn + except ImportError: + raise AttributeError("sklearn is required but not installed.") + from .mars_adapters import MARS_SKLEARN_SS_CALLABLES + + return list(MARS_SKLEARN_SS_CALLABLES.keys()) + + +def __getattr__(name: str): # pragma: no cover + import inspect + + try: + import sklearn.semi_supervised as sk_ss + except ImportError: + raise AttributeError("sklearn is required but not installed.") + from .mars_adapters import MARS_SKLEARN_SS_CALLABLES + + if name in MARS_SKLEARN_SS_CALLABLES: + return MARS_SKLEARN_SS_CALLABLES[name] + else: + if not hasattr(sk_ss, name): + raise AttributeError(name) + else: + if inspect.ismethod(getattr(sk_ss, name)): + return unimplemented_func() + else: + raise AttributeError diff --git a/python/xorbits/sklearn/semi_supervised/mars_adapters/__init__.py b/python/xorbits/sklearn/semi_supervised/mars_adapters/__init__.py new file mode 100644 index 000000000..b53765590 --- /dev/null +++ b/python/xorbits/sklearn/semi_supervised/mars_adapters/__init__.py @@ -0,0 +1,14 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .core import MARS_SKLEARN_SS_CALLABLES diff --git a/python/xorbits/sklearn/semi_supervised/mars_adapters/core.py b/python/xorbits/sklearn/semi_supervised/mars_adapters/core.py new file mode 100644 index 000000000..9f144f49b --- /dev/null +++ b/python/xorbits/sklearn/semi_supervised/mars_adapters/core.py @@ -0,0 +1,35 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
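+# LabelPropagation is the only semi-supervised estimator wrapped so far;
+# module-level functions are still collected via _collect_module_callables
+# as in the other adapters.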
+ +import sklearn.semi_supervised as sk_ss + +from ...._mars.learn import semi_supervised as mars_ss +from ...._mars.learn.semi_supervised import LabelPropagation as MarsLabelPropagation +from ....core.utils.docstring import attach_module_callable_docstring +from ...utils import SKLearnBase, _collect_module_callables, _install_cls_members + + +class LabelPropagation(SKLearnBase): + _marscls = MarsLabelPropagation + + +SKLEARN_SS_CLS_MAP = { + LabelPropagation: MarsLabelPropagation, +} + +MARS_SKLEARN_SS_CALLABLES = _collect_module_callables( + mars_ss, sk_ss, skip_members=["register_op"] +) +_install_cls_members(SKLEARN_SS_CLS_MAP, MARS_SKLEARN_SS_CALLABLES, sk_ss) +attach_module_callable_docstring(LabelPropagation, sk_ss, sk_ss.LabelPropagation) diff --git a/python/xorbits/sklearn/semi_supervised/tests/__init__.py b/python/xorbits/sklearn/semi_supervised/tests/__init__.py new file mode 100644 index 000000000..37f6558d9 --- /dev/null +++ b/python/xorbits/sklearn/semi_supervised/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/xorbits/sklearn/semi_supervised/tests/test_core.py b/python/xorbits/sklearn/semi_supervised/tests/test_core.py new file mode 100644 index 000000000..3739664cc --- /dev/null +++ b/python/xorbits/sklearn/semi_supervised/tests/test_core.py @@ -0,0 +1,48 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +try: + import sklearn +except ImportError: # pragma: no cover + sklearn = None + +import numpy as np +import pytest + +from ...semi_supervised import LabelPropagation + + +@pytest.mark.skipif(sklearn is None, reason="scikit-learn not installed") +def test_doc(): + docstring = LabelPropagation.__doc__ + assert docstring is not None and docstring.endswith( + "This docstring was copied from sklearn.semi_supervised." + ) + + docstring = LabelPropagation.fit.__doc__ + assert docstring is not None and docstring.endswith( + "This docstring was copied from sklearn.semi_supervised._label_propagation.LabelPropagation." 
+ ) + + +@pytest.mark.skipif(sklearn is None, reason="scikit-learn not installed") +def test_label_propagation(): + rng = np.random.RandomState(0) + X = rng.rand(10, 5) + y = np.array([0, 0, 0, 1, 1, -1, -1, -1, -1, -1]) + lp = LabelPropagation() + lp.fit(X, y) + assert lp.classes_.tolist() == [0, 1] + assert lp.transduction_.tolist() == [0, 0, 0, 1, 1, 0, 0, 0, 0, 0] + assert lp.predict(X).tolist() == [0, 0, 0, 1, 1, 0, 0, 0, 0, 0] + assert lp.score(X, y) == 0.5 diff --git a/python/xorbits/sklearn/utils.py b/python/xorbits/sklearn/utils.py new file mode 100644 index 000000000..ac2834e86 --- /dev/null +++ b/python/xorbits/sklearn/utils.py @@ -0,0 +1,72 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import functools +import inspect +from typing import Callable, Dict, List, Optional + +from ..core.adapter import to_mars, wrap_mars_callable + + +class SKLearnBase: + def __init__(self, *args, **kwargs): + self.mars_instance = self._marscls(*to_mars(args), **to_mars(kwargs)) + + def __getattr__(self, name): + return getattr(self.mars_instance, name) + + +def wrap_cls_func(marscls: Callable, name: str, submodule): + @functools.wraps(getattr(marscls, name)) + def wrapped(self, *args, **kwargs): + return getattr(self.mars_instance, name)(*args, **kwargs) + + return wrap_mars_callable( + wrapped, + member_name=name, + attach_docstring=True, + is_cls_member=True, + docstring_src_module=submodule, + docstring_src_cls=getattr(submodule, marscls.__name__, None), + ) + + +def _collect_module_callables( + mars_module, + orig_module, + skip_members: Optional[List[str]] = None, +) -> Dict[str, Callable]: + module_callables: Dict[str, Callable] = dict() + + for name, func in inspect.getmembers(mars_module, inspect.isfunction): + if skip_members is not None and name in skip_members: + continue + module_callables[name] = wrap_mars_callable( + func, + attach_docstring=True, + is_cls_member=False, + docstring_src_module=orig_module, + docstring_src=getattr(orig_module, name, None), + ) + return module_callables + + +def _install_cls_members( + module_cls_map, module_callables: Dict[str, Callable], orig_submodule +): + for x_cls, mars_cls in module_cls_map.items(): + module_callables[x_cls.__name__] = x_cls + for name, _ in inspect.getmembers(mars_cls, inspect.isfunction): + if not name.startswith("_"): + setattr(x_cls, name, wrap_cls_func(mars_cls, name, orig_submodule))
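
Taken together, these files expose a subset of the scikit-learn API under
xorbits.sklearn, with estimators executing deferred on Mars and results
materialized through fetch(). A minimal usage sketch in the spirit of the
tests above (illustrative only; it assumes a default local session and an
installed scikit-learn):

    import numpy as np

    from xorbits.sklearn.model_selection import train_test_split
    from xorbits.sklearn.preprocessing import MinMaxScaler

    X = np.arange(20, dtype=np.float64).reshape(10, 2)
    y = np.arange(10)

    # Wrapped callables accept NumPy inputs; arguments are converted with
    # to_mars() before being handed to the Mars implementation.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    scaler = MinMaxScaler()  # delegates to the Mars MinMaxScaler
    scaler.fit(X_train)
    scaled = scaler.transform(X_train)
    print(scaled.fetch())  # triggers execution and pulls the result locally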