Skip to content

Commit

Permalink
ENH: Remove sum_over_features parameter from manhattan_distances
Browse files (browse the repository at this point in the history)
…with sklearn > 1.4 (#779)
  • Loading branch information
Dawnfz-Lenfeng authored Jun 17, 2024
1 parent 79c3bc8 commit f2fa7cf
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 74 deletions.
70 changes: 16 additions & 54 deletions python/xorbits/_mars/learn/metrics/pairwise/manhattan.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,7 @@

from .... import opcodes as OperandDef
from ....core import recursive_tile
from ....serialization.serializables import BoolField, KeyField
from ....tensor.arithmetic import abs as mt_abs
from ....serialization.serializables import KeyField
from ....tensor.array_utils import as_same_device, device
from ....tensor.core import TensorOrder
from ....tensor.spatial.distance import cdist
Expand All @@ -38,13 +37,11 @@ class ManhattanDistances(PairwiseDistances):

_x = KeyField("x")
_y = KeyField("y")
_sum_over_features = BoolField("sum_over_features")

def __init__(self, x=None, y=None, sum_over_features=None, use_sklearn=None, **kw):
def __init__(self, x=None, y=None, use_sklearn=None, **kw):
super().__init__(
_x=x,
_y=y,
_sum_over_features=sum_over_features,
_use_sklearn=use_sklearn,
**kw,
)
Expand All @@ -57,10 +54,6 @@ def x(self):
def y(self):
return self._y

@property
def sum_over_features(self):
return self._sum_over_features

def _set_inputs(self, inputs):
super()._set_inputs(inputs)
self._x = self._inputs[0]
Expand All @@ -71,16 +64,7 @@ def __call__(self, X, Y=None):
if self._y is None:
self._y = Y

if (X.issparse() or Y.issparse()) and not self._sum_over_features:
raise TypeError(
f"sum_over_features={self._sum_over_features} not supported"
" for sparse matrices"
)

if not self._sum_over_features:
shape = (X.shape[0] * Y.shape[0], X.shape[1])
else:
shape = (X.shape[0], Y.shape[0])
shape = (X.shape[0], Y.shape[0])

return self.new_tensor([X, Y], shape=shape, order=TensorOrder.C_ORDER)

Expand All @@ -92,17 +76,10 @@ def tile(cls, op):
return cls._tile_one_chunk(op)

if x.issparse() or y.issparse():
assert op.sum_over_features
return cls._tile_chunks(op, x, y)
elif op.sum_over_features:
# if x, y are not sparse and `sum_over_features` is True
# just use cdist
return [(yield from recursive_tile(cdist(x, y, "cityblock")))]
else:
d = x[:, np.newaxis, :] - y[np.newaxis, :, :]
d = mt_abs(d)
d = d.reshape((-1, x.shape[1]))
return [(yield from recursive_tile(d))]
# if x, y are not sparse, just use cdist
return [(yield from recursive_tile(cdist(x, y, "cityblock")))]

@classmethod
def execute(cls, ctx, op):
Expand All @@ -116,7 +93,6 @@ def execute(cls, ctx, op):
ctx[out.key] = sklearn_manhattan_distances(
ensure_own_data(x),
ensure_own_data(y),
sum_over_features=op.sum_over_features,
)
else: # pragma: no cover
# we cannot support sparse
Expand All @@ -125,11 +101,8 @@ def execute(cls, ctx, op):
)


def manhattan_distances(X, Y=None, sum_over_features=True):
""" Compute the L1 distances between the vectors in X and Y.
With sum_over_features equal to False it returns the componentwise
distances.
def manhattan_distances(X, Y=None):
"""Compute the L1 distances between the vectors in X and Y.
Read more in the :ref:`User Guide <metrics>`.
Expand All @@ -141,19 +114,16 @@ def manhattan_distances(X, Y=None, sum_over_features=True):
Y : array_like, optional
A tensor with shape (n_samples_Y, n_features).
sum_over_features : bool, default=True
If True the function returns the pairwise distance matrix
else it returns the componentwise L1 pairwise-distances.
Not supported for sparse matrix inputs.
Returns
-------
D : Tensor
If sum_over_features is False shape is
(n_samples_X * n_samples_Y, n_features) and D contains the
componentwise L1 pairwise-distances (ie. absolute difference),
else shape is (n_samples_X, n_samples_Y) and D contains
the pairwise L1 distances.
distances : Tensor of shape (n_samples_X, n_samples_Y)
Pairwise L1 distances.
Notes
-----
When X and/or Y are CSR sparse matrices and they are not already
in canonical format, this function modifies them in-place to
make them canonical.
Examples
--------
Expand All @@ -168,14 +138,6 @@ def manhattan_distances(X, Y=None, sum_over_features=True):
[[1, 2], [0, 3]]).execute() #doctest:+ELLIPSIS
array([[0., 2.],
[4., 4.]])
>>> import mars.tensor as mt
>>> X = mt.ones((1, 2))
>>> y = mt.full((2, 2), 2.)
>>> manhattan_distances(X, y, sum_over_features=False).execute() #doctest:+ELLIPSIS
array([[1., 1.],
[1., 1.]])
"""
op = ManhattanDistances(
x=X, y=Y, sum_over_features=sum_over_features, dtype=np.dtype(np.float64)
)
op = ManhattanDistances(x=X, y=Y, dtype=np.dtype(np.float64))
return op(X, Y=Y)
Original file line number Diff line number Diff line change
Expand Up @@ -26,16 +26,8 @@ def test_manhattan_distances():
x = mt.random.randint(10, size=(10, 3), density=0.4)
y = mt.random.randint(10, size=(11, 3), density=0.5)

with pytest.raises(TypeError):
manhattan_distances(x, y, sum_over_features=False)

x = x.todense()
y = y.todense()

d = manhattan_distances(x, y, sum_over_features=True)
d = manhattan_distances(x, y)
assert d.shape == (10, 11)
d = manhattan_distances(x, y, sum_over_features=False)
assert d.shape == (110, 3)


raw_x = np.random.rand(20, 5)
Expand Down Expand Up @@ -67,19 +59,16 @@ def test_manhattan_distances_execution(setup, x, y, is_sparse):
else:
rx, ry = raw_x, raw_y

sv = [True, False] if not is_sparse else [True]

for sum_over_features in sv:
d = manhattan_distances(x, y, sum_over_features)
d = manhattan_distances(x, y)

result = d.execute().fetch()
expected = sk_manhattan_distances(rx, ry, sum_over_features=sum_over_features)
result = d.execute().fetch()
expected = sk_manhattan_distances(rx, ry)

np.testing.assert_almost_equal(result, expected)
np.testing.assert_almost_equal(result, expected)

d = manhattan_distances(x, sum_over_features=sum_over_features)
d = manhattan_distances(x)

result = d.execute().fetch()
expected = sk_manhattan_distances(rx, sum_over_features=sum_over_features)
result = d.execute().fetch()
expected = sk_manhattan_distances(rx)

np.testing.assert_almost_equal(result, expected)
np.testing.assert_almost_equal(result, expected)

0 comments on commit f2fa7cf

Please sign in to comment.