From f2fa7cfca7e116c3d2e0a4104553f938c910e3fd Mon Sep 17 00:00:00 2001
From: Dawnfz <2912706234@qq.com>
Date: Mon, 17 Jun 2024 10:48:00 +0800
Subject: [PATCH] ENH: Remove `sum_over_features` parameter from
 `manhattan_distances` with sklearn >= 1.4 (#779)

---
 .../_mars/learn/metrics/pairwise/manhattan.py | 70 +++++--------------
 .../tests/test_manhattan_distances.py         | 29 +++-----
 2 files changed, 25 insertions(+), 74 deletions(-)

diff --git a/python/xorbits/_mars/learn/metrics/pairwise/manhattan.py b/python/xorbits/_mars/learn/metrics/pairwise/manhattan.py
index 0798f38c0..059b0906d 100644
--- a/python/xorbits/_mars/learn/metrics/pairwise/manhattan.py
+++ b/python/xorbits/_mars/learn/metrics/pairwise/manhattan.py
@@ -24,8 +24,7 @@
 
 from .... import opcodes as OperandDef
 from ....core import recursive_tile
-from ....serialization.serializables import BoolField, KeyField
-from ....tensor.arithmetic import abs as mt_abs
+from ....serialization.serializables import KeyField
 from ....tensor.array_utils import as_same_device, device
 from ....tensor.core import TensorOrder
 from ....tensor.spatial.distance import cdist
@@ -38,13 +37,11 @@ class ManhattanDistances(PairwiseDistances):
 
     _x = KeyField("x")
     _y = KeyField("y")
-    _sum_over_features = BoolField("sum_over_features")
 
-    def __init__(self, x=None, y=None, sum_over_features=None, use_sklearn=None, **kw):
+    def __init__(self, x=None, y=None, use_sklearn=None, **kw):
         super().__init__(
             _x=x,
             _y=y,
-            _sum_over_features=sum_over_features,
             _use_sklearn=use_sklearn,
             **kw,
         )
@@ -57,10 +54,6 @@ def x(self):
     def y(self):
         return self._y
 
-    @property
-    def sum_over_features(self):
-        return self._sum_over_features
-
     def _set_inputs(self, inputs):
         super()._set_inputs(inputs)
         self._x = self._inputs[0]
@@ -71,16 +64,7 @@ def __call__(self, X, Y=None):
         if self._y is None:
             self._y = Y
 
-        if (X.issparse() or Y.issparse()) and not self._sum_over_features:
-            raise TypeError(
-                f"sum_over_features={self._sum_over_features} not supported"
-                " for sparse matrices"
-            )
-
-        if not self._sum_over_features:
-            shape = (X.shape[0] * Y.shape[0], X.shape[1])
-        else:
-            shape = (X.shape[0], Y.shape[0])
+        shape = (X.shape[0], Y.shape[0])
 
         return self.new_tensor([X, Y], shape=shape, order=TensorOrder.C_ORDER)
 
@@ -92,17 +76,10 @@ def tile(cls, op):
             return cls._tile_one_chunk(op)
 
         if x.issparse() or y.issparse():
-            assert op.sum_over_features
             return cls._tile_chunks(op, x, y)
-        elif op.sum_over_features:
-            # if x, y are not sparse and `sum_over_features` is True
-            # just use cdist
-            return [(yield from recursive_tile(cdist(x, y, "cityblock")))]
         else:
-            d = x[:, np.newaxis, :] - y[np.newaxis, :, :]
-            d = mt_abs(d)
-            d = d.reshape((-1, x.shape[1]))
-            return [(yield from recursive_tile(d))]
+            # if x, y are not sparse, just use cdist
+            return [(yield from recursive_tile(cdist(x, y, "cityblock")))]
 
     @classmethod
     def execute(cls, ctx, op):
@@ -116,7 +93,6 @@ def execute(cls, ctx, op):
                 ctx[out.key] = sklearn_manhattan_distances(
                     ensure_own_data(x),
                     ensure_own_data(y),
-                    sum_over_features=op.sum_over_features,
                 )
             else:  # pragma: no cover
                 # we cannot support sparse
@@ -125,11 +101,8 @@ def execute(cls, ctx, op):
                 )
 
 
-def manhattan_distances(X, Y=None, sum_over_features=True):
-    """ Compute the L1 distances between the vectors in X and Y.
-
-    With sum_over_features equal to False it returns the componentwise
-    distances.
+def manhattan_distances(X, Y=None):
+    """Compute the L1 distances between the vectors in X and Y.
 
     Read more in the :ref:`User Guide <metrics>`.
 
@@ -141,19 +114,16 @@ def manhattan_distances(X, Y=None, sum_over_features=True):
     Y : array_like, optional
         A tensor with shape (n_samples_Y, n_features).
 
-    sum_over_features : bool, default=True
-        If True the function returns the pairwise distance matrix
-        else it returns the componentwise L1 pairwise-distances.
-        Not supported for sparse matrix inputs.
-
     Returns
     -------
-    D : Tensor
-        If sum_over_features is False shape is
-        (n_samples_X * n_samples_Y, n_features) and D contains the
-        componentwise L1 pairwise-distances (ie. absolute difference),
-        else shape is (n_samples_X, n_samples_Y) and D contains
-        the pairwise L1 distances.
+    distances : Tensor of shape (n_samples_X, n_samples_Y)
+        Pairwise L1 distances.
+
+    Notes
+    -----
+    When X and/or Y are CSR sparse matrices and they are not already
+    in canonical format, this function modifies them in-place to
+    make them canonical.
 
     Examples
     --------
@@ -168,14 +138,6 @@ def manhattan_distances(X, Y=None, sum_over_features=True):
          [[1, 2], [0, 3]]).execute() #doctest:+ELLIPSIS
     array([[0., 2.],
            [4., 4.]])
-    >>> import mars.tensor as mt
-    >>> X = mt.ones((1, 2))
-    >>> y = mt.full((2, 2), 2.)
-    >>> manhattan_distances(X, y, sum_over_features=False).execute() #doctest:+ELLIPSIS
-    array([[1., 1.],
-           [1., 1.]])
     """
-    op = ManhattanDistances(
-        x=X, y=Y, sum_over_features=sum_over_features, dtype=np.dtype(np.float64)
-    )
+    op = ManhattanDistances(x=X, y=Y, dtype=np.dtype(np.float64))
     return op(X, Y=Y)
diff --git a/python/xorbits/_mars/learn/metrics/pairwise/tests/test_manhattan_distances.py b/python/xorbits/_mars/learn/metrics/pairwise/tests/test_manhattan_distances.py
index b8ba5b747..acfeee0e9 100644
--- a/python/xorbits/_mars/learn/metrics/pairwise/tests/test_manhattan_distances.py
+++ b/python/xorbits/_mars/learn/metrics/pairwise/tests/test_manhattan_distances.py
@@ -26,16 +26,8 @@ def test_manhattan_distances():
     x = mt.random.randint(10, size=(10, 3), density=0.4)
     y = mt.random.randint(10, size=(11, 3), density=0.5)
 
-    with pytest.raises(TypeError):
-        manhattan_distances(x, y, sum_over_features=False)
-
-    x = x.todense()
-    y = y.todense()
-
-    d = manhattan_distances(x, y, sum_over_features=True)
+    d = manhattan_distances(x, y)
     assert d.shape == (10, 11)
-    d = manhattan_distances(x, y, sum_over_features=False)
-    assert d.shape == (110, 3)
 
 
 raw_x = np.random.rand(20, 5)
@@ -67,19 +59,16 @@ def test_manhattan_distances_execution(setup, x, y, is_sparse):
     else:
         rx, ry = raw_x, raw_y
 
-    sv = [True, False] if not is_sparse else [True]
-
-    for sum_over_features in sv:
-        d = manhattan_distances(x, y, sum_over_features)
+    d = manhattan_distances(x, y)
 
-        result = d.execute().fetch()
-        expected = sk_manhattan_distances(rx, ry, sum_over_features=sum_over_features)
+    result = d.execute().fetch()
+    expected = sk_manhattan_distances(rx, ry)
 
-        np.testing.assert_almost_equal(result, expected)
+    np.testing.assert_almost_equal(result, expected)
 
-        d = manhattan_distances(x, sum_over_features=sum_over_features)
+    d = manhattan_distances(x)
 
-        result = d.execute().fetch()
-        expected = sk_manhattan_distances(rx, sum_over_features=sum_over_features)
+    result = d.execute().fetch()
+    expected = sk_manhattan_distances(rx)
 
-        np.testing.assert_almost_equal(result, expected)
+    np.testing.assert_almost_equal(result, expected)