From f2fa7cfca7e116c3d2e0a4104553f938c910e3fd Mon Sep 17 00:00:00 2001 From: Dawnfz <2912706234@qq.com> Date: Mon, 17 Jun 2024 10:48:00 +0800 Subject: [PATCH] ENH: Remove `sum_over_features` parameter from `manhattan_distances` with sklearn > 1.4 (#779) --- .../_mars/learn/metrics/pairwise/manhattan.py | 70 +++++-------------- .../tests/test_manhattan_distances.py | 29 +++----- 2 files changed, 25 insertions(+), 74 deletions(-) diff --git a/python/xorbits/_mars/learn/metrics/pairwise/manhattan.py b/python/xorbits/_mars/learn/metrics/pairwise/manhattan.py index 0798f38c0..059b0906d 100644 --- a/python/xorbits/_mars/learn/metrics/pairwise/manhattan.py +++ b/python/xorbits/_mars/learn/metrics/pairwise/manhattan.py @@ -24,8 +24,7 @@ from .... import opcodes as OperandDef from ....core import recursive_tile -from ....serialization.serializables import BoolField, KeyField -from ....tensor.arithmetic import abs as mt_abs +from ....serialization.serializables import KeyField from ....tensor.array_utils import as_same_device, device from ....tensor.core import TensorOrder from ....tensor.spatial.distance import cdist @@ -38,13 +37,11 @@ class ManhattanDistances(PairwiseDistances): _x = KeyField("x") _y = KeyField("y") - _sum_over_features = BoolField("sum_over_features") - def __init__(self, x=None, y=None, sum_over_features=None, use_sklearn=None, **kw): + def __init__(self, x=None, y=None, use_sklearn=None, **kw): super().__init__( _x=x, _y=y, - _sum_over_features=sum_over_features, _use_sklearn=use_sklearn, **kw, ) @@ -57,10 +54,6 @@ def x(self): def y(self): return self._y - @property - def sum_over_features(self): - return self._sum_over_features - def _set_inputs(self, inputs): super()._set_inputs(inputs) self._x = self._inputs[0] @@ -71,16 +64,7 @@ def __call__(self, X, Y=None): if self._y is None: self._y = Y - if (X.issparse() or Y.issparse()) and not self._sum_over_features: - raise TypeError( - f"sum_over_features={self._sum_over_features} not supported" - " for 
sparse matrices" - ) - - if not self._sum_over_features: - shape = (X.shape[0] * Y.shape[0], X.shape[1]) - else: - shape = (X.shape[0], Y.shape[0]) + shape = (X.shape[0], Y.shape[0]) return self.new_tensor([X, Y], shape=shape, order=TensorOrder.C_ORDER) @@ -92,17 +76,10 @@ def tile(cls, op): return cls._tile_one_chunk(op) if x.issparse() or y.issparse(): - assert op.sum_over_features return cls._tile_chunks(op, x, y) - elif op.sum_over_features: - # if x, y are not sparse and `sum_over_features` is True - # just use cdist - return [(yield from recursive_tile(cdist(x, y, "cityblock")))] else: - d = x[:, np.newaxis, :] - y[np.newaxis, :, :] - d = mt_abs(d) - d = d.reshape((-1, x.shape[1])) - return [(yield from recursive_tile(d))] + # if x, y are not sparse, just use cdist + return [(yield from recursive_tile(cdist(x, y, "cityblock")))] @classmethod def execute(cls, ctx, op): @@ -116,7 +93,6 @@ def execute(cls, ctx, op): ctx[out.key] = sklearn_manhattan_distances( ensure_own_data(x), ensure_own_data(y), - sum_over_features=op.sum_over_features, ) else: # pragma: no cover # we cannot support sparse @@ -125,11 +101,8 @@ def execute(cls, ctx, op): ) -def manhattan_distances(X, Y=None, sum_over_features=True): - """ Compute the L1 distances between the vectors in X and Y. - - With sum_over_features equal to False it returns the componentwise - distances. +def manhattan_distances(X, Y=None): + """Compute the L1 distances between the vectors in X and Y. Read more in the :ref:`User Guide <metrics>`. @@ -141,19 +114,16 @@ def manhattan_distances(X, Y=None, sum_over_features=True): Y : array_like, optional A tensor with shape (n_samples_Y, n_features). - sum_over_features : bool, default=True - If True the function returns the pairwise distance matrix - else it returns the componentwise L1 pairwise-distances. - Not supported for sparse matrix inputs. 
- Returns ------- - D : Tensor - If sum_over_features is False shape is - (n_samples_X * n_samples_Y, n_features) and D contains the - componentwise L1 pairwise-distances (ie. absolute difference), - else shape is (n_samples_X, n_samples_Y) and D contains - the pairwise L1 distances. + distances : ndarray of shape (n_samples_X, n_samples_Y) + Pairwise L1 distances. + + Notes + ----- + When X and/or Y are CSR sparse matrices and they are not already + in canonical format, this function modifies them in-place to + make them canonical. Examples -------- @@ -168,14 +138,6 @@ def manhattan_distances(X, Y=None, sum_over_features=True): [[1, 2], [0, 3]]).execute() #doctest:+ELLIPSIS array([[0., 2.], [4., 4.]]) - >>> import mars.tensor as mt - >>> X = mt.ones((1, 2)) - >>> y = mt.full((2, 2), 2.) - >>> manhattan_distances(X, y, sum_over_features=False).execute() #doctest:+ELLIPSIS - array([[1., 1.], - [1., 1.]]) """ - op = ManhattanDistances( - x=X, y=Y, sum_over_features=sum_over_features, dtype=np.dtype(np.float64) - ) + op = ManhattanDistances(x=X, y=Y, dtype=np.dtype(np.float64)) return op(X, Y=Y) diff --git a/python/xorbits/_mars/learn/metrics/pairwise/tests/test_manhattan_distances.py b/python/xorbits/_mars/learn/metrics/pairwise/tests/test_manhattan_distances.py index b8ba5b747..acfeee0e9 100644 --- a/python/xorbits/_mars/learn/metrics/pairwise/tests/test_manhattan_distances.py +++ b/python/xorbits/_mars/learn/metrics/pairwise/tests/test_manhattan_distances.py @@ -26,16 +26,8 @@ def test_manhattan_distances(): x = mt.random.randint(10, size=(10, 3), density=0.4) y = mt.random.randint(10, size=(11, 3), density=0.5) - with pytest.raises(TypeError): - manhattan_distances(x, y, sum_over_features=False) - - x = x.todense() - y = y.todense() - - d = manhattan_distances(x, y, sum_over_features=True) + d = manhattan_distances(x, y) assert d.shape == (10, 11) - d = manhattan_distances(x, y, sum_over_features=False) - assert d.shape == (110, 3) raw_x = np.random.rand(20, 5) 
@@ -67,19 +59,16 @@ def test_manhattan_distances_execution(setup, x, y, is_sparse): else: rx, ry = raw_x, raw_y - sv = [True, False] if not is_sparse else [True] - - for sum_over_features in sv: - d = manhattan_distances(x, y, sum_over_features) + d = manhattan_distances(x, y) - result = d.execute().fetch() - expected = sk_manhattan_distances(rx, ry, sum_over_features=sum_over_features) + result = d.execute().fetch() + expected = sk_manhattan_distances(rx, ry) - np.testing.assert_almost_equal(result, expected) + np.testing.assert_almost_equal(result, expected) - d = manhattan_distances(x, sum_over_features=sum_over_features) + d = manhattan_distances(x) - result = d.execute().fetch() - expected = sk_manhattan_distances(rx, sum_over_features=sum_over_features) + result = d.execute().fetch() + expected = sk_manhattan_distances(rx) - np.testing.assert_almost_equal(result, expected) + np.testing.assert_almost_equal(result, expected)