diff --git a/.travis.yml b/.travis.yml index ba646954..77767f9b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -8,6 +8,10 @@ python: - 3.5 - 3.6 +before_install: + - sudo apt-get -qq update + - sudo apt-get install -y libflann-dev + addons: apt: packages: diff --git a/README.rst b/README.rst index 5ae3f297..7b8890ae 100644 --- a/README.rst +++ b/README.rst @@ -37,7 +37,7 @@ The documentation is available on `Read the Docs `_ and development takes place on `GitHub `_. -A (mostly unmaintained) `Matlab version `_ exists. +(A (mostly unmaintained) `Matlab version `_ exists.) The PyGSP facilitates a wide variety of operations on graphs, like computing their Fourier basis, filtering or interpolating signals, plotting graphs, diff --git a/pygsp/graphs/nngraphs/nngraph.py b/pygsp/graphs/nngraphs/nngraph.py index 3415d773..ec91afe5 100644 --- a/pygsp/graphs/nngraphs/nngraph.py +++ b/pygsp/graphs/nngraphs/nngraph.py @@ -3,23 +3,197 @@ import traceback import numpy as np -from scipy import sparse, spatial +from scipy import sparse +import scipy.spatial as sps +import scipy.spatial.distance as spsd +import multiprocessing from pygsp import utils from pygsp.graphs import Graph # prevent circular import in Python < 3.5 _logger = utils.build_logger(__name__) - -def _import_pfl(): +# conversion between the FLANN conventions and the various backend functions +_dist_translation = { + 'scipy-kdtree': { + 'euclidean': 2, + 'manhattan': 1, + 'max_dist': np.inf + }, + 'scipy-ckdtree': { + 'euclidean': 2, + 'manhattan': 1, + 'max_dist': np.inf + }, + 'scipy-pdist' : { + 'euclidean': 'euclidean', + 'manhattan': 'cityblock', + 'max_dist': 'chebyshev', + 'minkowski': 'minkowski' + }, + 'nmslib' : { + 'euclidean': 'l2', + 'manhattan': 'l1', + 'max_dist': 'linf', + 'minkowski': 'lp' + } + } + +def _import_cfl(): try: - import pyflann as pfl + import cyflann as cfl except Exception: - raise ImportError('Cannot import pyflann. Choose another nearest ' + raise ImportError('Cannot import cyflann. Choose another nearest ' 'neighbors method or try to install it with ' - 'pip (or conda) install pyflann (or pyflann3).') - return pfl + 'pip (or conda) install cyflann.') + return cfl +def _import_nmslib(): + try: + import nmslib as nms + except Exception: + raise ImportError('Cannot import nmslib. Choose another nearest ' + 'neighbors method or try to install it with ' + 'pip (or conda) install nmslib.') + return nms + +def _knn_sp_kdtree(X, num_neighbors, dist_type, order=0): + kdt = sps.KDTree(X) + D, NN = kdt.query(X, k=(num_neighbors + 1), + p=_dist_translation['scipy-kdtree'][dist_type]) + return NN, D + +def _knn_sp_ckdtree(X, num_neighbors, dist_type, order=0): + kdt = sps.cKDTree(X) + D, NN = kdt.query(X, k=(num_neighbors + 1), + p=_dist_translation['scipy-ckdtree'][dist_type], + n_jobs=-1) + return NN, D + + +def _knn_flann(X, num_neighbors, dist_type, order): + # the combination FLANN + max_dist produces incorrect results + # do not allow it + if dist_type == 'max_dist': + raise ValueError('FLANN and max_dist is not supported') + + cfl = _import_cfl() + cfl.set_distance_type(dist_type, order=order) + c = cfl.FLANNIndex(algorithm='kdtree') + c.build_index(X) + # Default FLANN parameters (I tried changing the algorithm and + # testing performance on huge matrices, but the default one + # seems to work best). + NN, D = c.nn_index(X, num_neighbors + 1) + c.free_index() + if dist_type == 'euclidean': # flann returns squared distances + return NN, np.sqrt(D) + return NN, D + +def _radius_sp_kdtree(X, epsilon, dist_type, order=0): + kdt = sps.KDTree(X) + D, NN = kdt.query(X, k=None, distance_upper_bound=epsilon, + p=_dist_translation['scipy-kdtree'][dist_type]) + return NN, D + +def _radius_sp_ckdtree(X, epsilon, dist_type, order=0): + N, dim = np.shape(X) + kdt = sps.cKDTree(X) + nn = kdt.query_ball_point(X, r=epsilon, + p=_dist_translation['scipy-ckdtree'][dist_type], + n_jobs=-1) + D = [] + NN = [] + for k in range(N): + x = np.matlib.repmat(X[k, :], len(nn[k]), 1) + d = np.linalg.norm(x - X[nn[k], :], + ord=_dist_translation['scipy-ckdtree'][dist_type], + axis=1) + nidx = d.argsort() + NN.append(np.take(nn[k], nidx)) + D.append(np.sort(d)) + return NN, D + + +def _knn_sp_pdist(X, num_neighbors, dist_type, order): + if dist_type == 'minkowski': + p = spsd.pdist(X, metric=_dist_translation['scipy-pdist'][dist_type], + p=order) + else: + p = spsd.pdist(X, metric=_dist_translation['scipy-pdist'][dist_type]) + pd = spsd.squareform(p) + pds = np.sort(pd)[:, 0:num_neighbors+1] + pdi = pd.argsort()[:, 0:num_neighbors+1] + return pdi, pds + +def _knn_nmslib(X, num_neighbors, dist_type, order): + N, dim = np.shape(X) + ncpu = multiprocessing.cpu_count() + if dist_type == 'minkowski': + raise ValueError('unsupported distance type (lp) for nmslib') + nms = _import_nmslib() + nmsidx = nms.init(space=_dist_translation['nmslib'][dist_type]) + nmsidx.addDataPointBatch(X) + nmsidx.createIndex() + q = nmsidx.knnQueryBatch(X, k=num_neighbors+1, num_threads=int(ncpu/2)) + nn, d = zip(*q) + D = np.concatenate(d).reshape(N, num_neighbors+1) + NN = np.concatenate(nn).reshape(N, num_neighbors+1) + return NN, D + +def _radius_sp_pdist(X, epsilon, dist_type, order): + N, dim = np.shape(X) + if dist_type == 'minkowski': + p = spsd.pdist(X, metric=_dist_translation['scipy-pdist'][dist_type], + p=order) + else: + p = spsd.pdist(X, metric=_dist_translation['scipy-pdist'][dist_type]) + pd = spsd.squareform(p) + pdf = pd < epsilon + D = [] + NN = [] + for k in range(N): + v = pd[k, pdf[k, :]] + d = pd[k, :].argsort() + # use the same conventions as in scipy.distance.kdtree + NN.append(d[0:len(v)]) + D.append(np.sort(v)) + + return NN, D + +def _radius_flann(X, epsilon, dist_type, order=0): + N, dim = np.shape(X) + # the combination FLANN + max_dist produces incorrect results + # do not allow it + if dist_type == 'max_dist': + raise ValueError('FLANN and max_dist is not supported') + + cfl = _import_cfl() + cfl.set_distance_type(dist_type, order=order) + c = cfl.FLANNIndex(algorithm='kdtree') + c.build_index(X) + D = [] + NN = [] + for k in range(N): + nn, d = c.nn_radius(X[k, :], epsilon*epsilon) + D.append(d) + NN.append(nn) + c.free_index() + if dist_type == 'euclidean': # flann returns squared distances + return NN, list(map(np.sqrt, D)) + return NN, D + +def _radius_nmslib(X, epsilon, dist_type, order=0): + raise ValueError('nmslib does not support (yet?) range queries') + +def center_input(X, N): + return X - np.kron(np.ones((N, 1)), np.mean(X, axis=0)) + +def rescale_input(X, N, d): + bounding_radius = 0.5 * np.linalg.norm(np.amax(X, axis=0) - + np.amin(X, axis=0), 2) + scale = np.power(N, 1. / float(min(d, 3))) / 10. + return X * scale / bounding_radius class NNGraph(Graph): r"""Nearest-neighbor graph from given point cloud. @@ -33,9 +207,13 @@ class NNGraph(Graph): Type of nearest neighbor graph to create. The options are 'knn' for k-Nearest Neighbors or 'radius' for epsilon-Nearest Neighbors (default is 'knn'). - use_flann : bool, optional - Use Fast Library for Approximate Nearest Neighbors (FLANN) or not. - (default is False) + backend : {'scipy-kdtree', 'scipy-pdist', 'flann'} + Type of the backend for graph construction. + - 'scipy-kdtree' will use scipy.spatial.KDTree + - 'scipy-ckdtree'(default) will use scipy.spatial.cKDTree + - 'scipy-pdist' will use scipy.spatial.distance.pdist (slowest but exact) + - 'flann' use Fast Library for Approximate Nearest Neighbors (FLANN) + - 'nmslib' use nmslib for approximate nearest neighbors (faster in high-dimensional spaces) center : bool, optional Center the data so that it has zero mean (default is True) rescale : bool, optional @@ -72,19 +250,43 @@ class NNGraph(Graph): """ - def __init__(self, Xin, NNtype='knn', use_flann=False, center=True, + + def __init__(self, Xin, NNtype='knn', backend='scipy-ckdtree', center=True, rescale=True, k=10, sigma=0.1, epsilon=0.01, plotting={}, symmetrize_type='average', dist_type='euclidean', order=0, **kwargs): self.Xin = Xin self.NNtype = NNtype - self.use_flann = use_flann + self.backend = backend self.center = center self.rescale = rescale self.k = k self.sigma = sigma self.epsilon = epsilon + + _dist_translation['scipy-kdtree']['minkowski'] = order + _dist_translation['scipy-ckdtree']['minkowski'] = order + + self._nn_functions = { + 'knn': { + 'scipy-kdtree': _knn_sp_kdtree, + 'scipy-ckdtree': _knn_sp_ckdtree, + 'scipy-pdist': _knn_sp_pdist, + 'flann': _knn_flann, + 'nmslib': _knn_nmslib + }, + 'radius': { + 'scipy-kdtree': _radius_sp_kdtree, + 'scipy-ckdtree': _radius_sp_ckdtree, + 'scipy-pdist': _radius_sp_pdist, + 'flann': _radius_flann, + 'nmslib': _radius_nmslib + }, + } + + + self.symmetrize_type = symmetrize_type self.dist_type = dist_type self.order = order @@ -93,73 +295,40 @@ def __init__(self, Xin, NNtype='knn', use_flann=False, center=True, Xout = self.Xin if self.center: - Xout = self.Xin - np.kron(np.ones((N, 1)), - np.mean(self.Xin, axis=0)) + Xout = center_input(Xout, N) if self.rescale: - bounding_radius = 0.5 * np.linalg.norm(np.amax(Xout, axis=0) - - np.amin(Xout, axis=0), 2) - scale = np.power(N, 1. / float(min(d, 3))) / 10. - Xout *= scale / bounding_radius - - # Translate distance type string to corresponding Minkowski order. - dist_translation = {"euclidean": 2, - "manhattan": 1, - "max_dist": np.inf, - "minkowski": order - } + Xout = rescale_input(Xout, N, d) - if self.NNtype == 'knn': - spi = np.zeros((N * k)) - spj = np.zeros((N * k)) - spv = np.zeros((N * k)) - - if self.use_flann: - pfl = _import_pfl() - pfl.set_distance_type(dist_type, order=order) - flann = pfl.FLANN() - - # Default FLANN parameters (I tried changing the algorithm and - # testing performance on huge matrices, but the default one - # seems to work best). - NN, D = flann.nn(Xout, Xout, num_neighbors=(k + 1), - algorithm='kdtree') - - else: - kdt = spatial.KDTree(Xout) - D, NN = kdt.query(Xout, k=(k + 1), - p=dist_translation[dist_type]) - - for i in range(N): - spi[i * k:(i + 1) * k] = np.kron(np.ones((k)), i) - spj[i * k:(i + 1) * k] = NN[i, 1:] - spv[i * k:(i + 1) * k] = np.exp(-np.power(D[i, 1:], 2) / - float(self.sigma)) + + if self._nn_functions.get(NNtype) == None: + raise ValueError('Invalid NNtype {}'.format(self.NNtype)) + if self._nn_functions[NNtype].get(backend) == None: + raise ValueError('Invalid backend {} for type {}'.format(backend, + self.NNtype)) + if self.NNtype == 'knn': + NN, D = self._nn_functions[NNtype][backend](Xout, k, + dist_type, order) + elif self.NNtype == 'radius': + NN, D = self._nn_functions[NNtype][backend](Xout, epsilon, + dist_type, order) + countV = list(map(len, NN)) + count = sum(countV) + spi = np.zeros((count)) + spj = np.zeros((count)) + spv = np.zeros((count)) + + start = 0 + for i in range(N): + leng = countV[i] - 1 + spi[start:start + leng] = np.kron(np.ones((leng)), i) + spj[start:start + leng] = NN[i][1:] + spv[start:start + leng] = np.exp(-np.power(D[i][1:], 2) / + float(self.sigma)) + start = start + leng - kdt = spatial.KDTree(Xout) - D, NN = kdt.query(Xout, k=None, distance_upper_bound=epsilon, - p=dist_translation[dist_type]) - count = 0 - for i in range(N): - count = count + len(NN[i]) - - spi = np.zeros((count)) - spj = np.zeros((count)) - spv = np.zeros((count)) - - start = 0 - for i in range(N): - leng = len(NN[i]) - 1 - spi[start:start + leng] = np.kron(np.ones((leng)), i) - spj[start:start + leng] = NN[i][1:] - spv[start:start + leng] = np.exp(-np.power(D[i][1:], 2) / - float(self.sigma)) - start = start + leng - - else: - raise ValueError('Unknown NNtype {}'.format(self.NNtype)) W = sparse.csc_matrix((spv, (spi, spj)), shape=(N, N)) @@ -176,7 +345,7 @@ def __init__(self, Xin, NNtype='knn', use_flann=False, center=True, def _get_extra_repr(self): return {'NNtype': self.NNtype, - 'use_flann': self.use_flann, + 'backend': self.backend, 'center': self.center, 'rescale': self.rescale, 'k': self.k, diff --git a/pygsp/tests/test_graphs.py b/pygsp/tests/test_graphs.py index 1680defc..b2720629 100644 --- a/pygsp/tests/test_graphs.py +++ b/pygsp/tests/test_graphs.py @@ -6,9 +6,10 @@ """ import unittest - +import os import numpy as np import scipy.linalg +import scipy.sparse.linalg from skimage import data, img_as_float from pygsp import graphs @@ -182,20 +183,93 @@ def test_set_coordinates(self): def test_nngraph(self): Xin = np.arange(90).reshape(30, 3) dist_types = ['euclidean', 'manhattan', 'max_dist', 'minkowski'] - - for dist_type in dist_types: - - # Only p-norms with 1<=p<=infinity permitted. - if dist_type != 'minkowski': - graphs.NNGraph(Xin, NNtype='radius', dist_type=dist_type) - graphs.NNGraph(Xin, NNtype='knn', dist_type=dist_type) - - # Distance type unsupported in the C bindings, - # use the C++ bindings instead. - if dist_type != 'max_dist': - graphs.NNGraph(Xin, use_flann=True, NNtype='knn', - dist_type=dist_type) - + backends = ['scipy-kdtree', 'scipy-ckdtree', 'scipy-pdist', 'nmslib'] + if os.name != 'nt': + backends.append('flann') + order=3 # for minkowski, FLANN only accepts integer orders + + for cur_backend in backends: + for dist_type in dist_types: + #print("backend={} dist={}".format(cur_backend, dist_type)) + if (cur_backend == 'flann' and + dist_type == 'max_dist') or (cur_backend == 'nmslib' and + dist_type == 'minkowski'): + self.assertRaises(ValueError, graphs.NNGraph, Xin, + NNtype='knn', backend=cur_backend, + dist_type=dist_type) + self.assertRaises(ValueError, graphs.NNGraph, Xin, + NNtype='radius', backend=cur_backend, + dist_type=dist_type) + else: + if cur_backend == 'nmslib': + self.assertRaises(ValueError, graphs.NNGraph, Xin, + NNtype='radius', backend=cur_backend, + dist_type=dist_type, order=order) + else: + graphs.NNGraph(Xin, NNtype='radius', + backend=cur_backend, + dist_type=dist_type, order=order) + graphs.NNGraph(Xin, NNtype='knn', + backend=cur_backend, + dist_type=dist_type, order=order) + graphs.NNGraph(Xin, NNtype='knn', + backend=cur_backend, + dist_type=dist_type, order=order, + center=False) + graphs.NNGraph(Xin, NNtype='knn', + backend=cur_backend, + dist_type=dist_type, order=order, + rescale=False) + graphs.NNGraph(Xin, NNtype='knn', + backend=cur_backend, + dist_type=dist_type, order=order, + rescale=False, center=False) + self.assertRaises(ValueError, graphs.NNGraph, Xin, + NNtype='badtype', backend=cur_backend, + dist_type=dist_type) + self.assertRaises(ValueError, graphs.NNGraph, Xin, + NNtype='knn', backend='badtype', + dist_type=dist_type) + + def test_nngraph_consistency(self): + Xin = np.arange(90).reshape(30, 3) + dist_types = ['euclidean', 'manhattan', 'max_dist', 'minkowski'] + backends = ['scipy-kdtree', 'scipy-ckdtree', 'nmslib'] + if os.name != 'nt': + backends.append('flann') + num_neighbors=4 + epsilon=0.1 + + # use pdist as ground truth + G = graphs.NNGraph(Xin, NNtype='knn', + backend='scipy-pdist', k=num_neighbors) + for cur_backend in backends: + for dist_type in dist_types: + if cur_backend == 'flann' and dist_type == 'max_dist': + continue + if cur_backend == 'nmslib' and dist_type == 'minkowski': + continue + #print("backend={} dist={}".format(cur_backend, dist_type)) + Gt = graphs.NNGraph(Xin, NNtype='knn', + backend=cur_backend, k=num_neighbors) + d = scipy.sparse.linalg.norm(G.W - Gt.W) + self.assertTrue(d < 0.01, 'Graphs (knn {}/{}) are not identical error={}'.format(cur_backend, dist_type, d)) + + G = graphs.NNGraph(Xin, NNtype='radius', + backend='scipy-pdist', epsilon=epsilon) + for cur_backend in backends: + for dist_type in dist_types: + if cur_backend == 'flann' and dist_type == 'max_dist': + continue + if cur_backend == 'nmslib': #unsupported + continue + #print("backend={} dist={}".format(cur_backend, dist_type)) + Gt = graphs.NNGraph(Xin, NNtype='radius', + backend=cur_backend, epsilon=epsilon) + d = scipy.sparse.linalg.norm(G.W - Gt.W, ord=1) + self.assertTrue(d < 0.01, + 'Graphs (radius {}/{}) are not identical error={}'.format(cur_backend, dist_type, d)) + def test_bunny(self): graphs.Bunny() diff --git a/setup.py b/setup.py index 5d0e2143..4e48bba5 100644 --- a/setup.py +++ b/setup.py @@ -36,8 +36,9 @@ # Construct patch graphs from images. 'scikit-image', # Approximate nearest neighbors for kNN graphs. - 'pyflann; python_version == "2.*"', - 'pyflann3; python_version == "3.*"', + 'cyflann', + 'pybind11', + 'nmslib', # Convex optimization on graph. 'pyunlocbox', # Plot graphs, signals, and filters. @@ -66,7 +67,7 @@ # Dependencies to build and upload packages. 'pkg': [ 'wheel', - 'twine', + 'twine' ], }, license="BSD",