epfl-lts2 · mdeff · Mar 19, 2018 · Mar 19, 2018 · Mar 19, 2018 · Mar 19, 2018
diff --git a/.travis.yml b/.travis.yml
@@ -25,9 +25,10 @@ addons:
     - ubuntu-toolchain-r-test
     - sourceline: 'deb http://downloads.skewed.de/apt/trusty trusty universe'
     packages:
+    - libqt5gui5  # pyqt5>5.11 otherwise cannot load the xcb platform plugin
+    - libflann-dev
     - python3-graph-tool
     - python-graph-tool
-    - libqt5gui5  # pyqt5>5.11 fails to load the xcb platform plugin without it
 
 install:
   - pip install --upgrade --upgrade-strategy eager .[dev]

diff --git a/README.rst b/README.rst
@@ -39,6 +39,15 @@ A (mostly unmaintained) `Matlab version <https://lts2.epfl.ch/gsp>`_ exists.
 .. |conda| image:: https://anaconda.org/conda-forge/pygsp/badges/installer/conda.svg
    :target: https://anaconda.org/conda-forge/pygsp
 
+
+The PyGSP is a Python package to ease
+`Signal Processing on Graphs <https://arxiv.org/abs/1211.0053>`_.
+The documentation is available on
+`Read the Docs <https://pygsp.readthedocs.io>`_
+and development takes place on
+`GitHub <https://github.com/epfl-lts2/pygsp>`_.
+A (mostly unmaintained) `Matlab version <https://epfl-lts2.github.io/gspbox-html>`_ exists.
+
 The PyGSP facilitates a wide variety of operations on graphs, like computing
 their Fourier basis, filtering or interpolating signals, plotting graphs,
 signals, and filters. Its core is spectral graph theory, and many of the

diff --git a/doc/history.rst b/doc/history.rst
@@ -24,13 +24,17 @@ History
 * New implementation of the Sensor graph that is simpler and scales better.
 * A new learning module with three functions to solve standard semi-supervised
   classification and regression problems.
+* A much improved, fixed, documented, and tested NNGraph. The user can now
+  select the backend and similarity kernel. The radius can be estimated and
+  features standardized. (PR #43)
 * Import and export graphs and their signals to NetworkX and graph-tool.
 * Save and load graphs and theirs signals to / from GraphML, GML, and GEXF.
 * Documentation: path graph linked to DCT, ring graph linked to DFT.
 * We now have a gallery of examples! That is convenient for users to get a
   taste of what the library can do, and to start working from a code snippet.
 * Merged all the extra requirements in a single dev requirement.
 
+
 Experimental filter API (to be tested and validated):
 
 * evaluate a filter bank with ``g(values)``

diff --git a/doc/tutorials/optimization.rst b/doc/tutorials/optimization.rst
@@ -85,7 +85,7 @@ We start with the graph TV regularization. We will use the :class:`pyunlocbox.so
     >>> prob1 = pyunlocbox.solvers.solve([d, r, f], solver=solver,
     ...                                  x0=x0, rtol=0, maxit=1000)
     Solution found after 1000 iterations:
-        objective function f(sol) = 2.250584e+02
+        objective function f(sol) = 2.256055e+02
         stopping criterion: MAXIT
     >>>
     >>> fig, ax = G.plot(prob1['sol'])
@@ -107,7 +107,7 @@ This figure shows the label signal recovered by graph total variation regulariza
     >>> prob2 = pyunlocbox.solvers.solve([r, f], solver=solver,
     ...                                  x0=x0, rtol=0, maxit=1000)
     Solution found after 1000 iterations:
-        objective function f(sol) = 6.504290e+01
+        objective function f(sol) = 4.376481e+01
         stopping criterion: MAXIT
     >>>
     >>> fig, ax = G.plot(prob2['sol'])

diff --git a/pygsp/_nearest_neighbor.py b/pygsp/_nearest_neighbor.py
@@ -0,0 +1,247 @@
+# -*- coding: utf-8 -*-
+
+from __future__ import division
+
+import numpy as np
+from scipy import sparse, spatial
+from pygsp import utils
+
+_logger = utils.build_logger(__name__)
+
+def _scipy_pdist(features, metric, order, kind, k, radius, params):
+    if params:
+        raise ValueError('unexpected parameters {}'.format(params))
+    metric = 'cityblock' if metric == 'manhattan' else metric
+    metric = 'chebyshev' if metric == 'max_dist' else metric
+    params = dict(metric=metric)
+    if metric == 'minkowski':
+        params['p'] = order
+    dist = spatial.distance.pdist(features, **params)
+    dist = spatial.distance.squareform(dist)
+    if kind == 'knn':
+        neighbors = np.argsort(dist)[:, :k+1]
+        distances = np.take_along_axis(dist, neighbors, axis=-1)
+    elif kind == 'radius':
+        distances = []
+        neighbors = []
+        for distance in dist:
+            neighbor = np.flatnonzero(distance < radius)
+            neighbors.append(neighbor)
+            distances.append(distance[neighbor])
+    return neighbors, distances
+
+
+def _scipy_kdtree(features, _, order, kind, k, radius, params):
+    if order is None:
+        raise ValueError('invalid metric for scipy-kdtree')
+    eps = params.pop('eps', 0)
+    tree = spatial.KDTree(features, **params)
+    params = dict(p=order, eps=eps)
+    if kind == 'knn':
+        params['k'] = k + 1
+    elif kind == 'radius':
+        params['k'] = None
+        params['distance_upper_bound'] = radius
+    distances, neighbors = tree.query(features, **params)
+    return neighbors, distances
+
+
+def _scipy_ckdtree(features, _, order, kind, k, radius, params):
+    if order is None:
+        raise ValueError('invalid metric for scipy-kdtree')
+    eps = params.pop('eps', 0)
+    tree = spatial.cKDTree(features, **params)
+    params = dict(p=order, eps=eps, n_jobs=-1)
+    if kind == 'knn':
+        params['k'] = k + 1
+    elif kind == 'radius':
+        params['k'] = features.shape[0]  # number of vertices
+        params['distance_upper_bound'] = radius
+    distances, neighbors = tree.query(features, **params)
+    if kind == 'knn':
+        return neighbors, distances
+    elif kind == 'radius':
+        dist = []
+        neigh = []
+        for distance, neighbor in zip(distances, neighbors):
+            mask = (distance != np.inf)
+            dist.append(distance[mask])
+            neigh.append(neighbor[mask])
+        return neigh, dist
+
+
+def _flann(features, metric, order, kind, k, radius, params):
+    if metric == 'max_dist':
+        raise ValueError('flann gives wrong results for metric="max_dist".')
+    try:
+        import cyflann as cfl
+    except Exception as e:
+        raise ImportError('Cannot import cyflann. Choose another nearest '
+                          'neighbors backend or try to install it with '
+                          'pip (or conda) install cyflann. '
+                          'Original exception: {}'.format(e))
+    cfl.set_distance_type(metric, order=order)
+    index = cfl.FLANNIndex()
+    index.build_index(features, **params)
+    # I tried changing the algorithm and testing performance on huge matrices,
+    # but the default parameters seems to work best.
+    if kind == 'knn':
+        neighbors, distances = index.nn_index(features, k+1)
+        if metric == 'euclidean':
+            np.sqrt(distances, out=distances)
+        elif metric == 'minkowski':
+            np.power(distances, 1/order, out=distances)
+    elif kind == 'radius':
+        distances = []
+        neighbors = []
+        if metric == 'euclidean':
+            radius = radius**2
+        elif metric == 'minkowski':
+            radius = radius**order
+        n_vertices, _ = features.shape
+        for vertex in range(n_vertices):
+            neighbor, distance = index.nn_radius(features[vertex, :], radius)
+            distances.append(distance)
+            neighbors.append(neighbor)
+        if metric == 'euclidean':
+            distances = list(map(np.sqrt, distances))
+        elif metric == 'minkowski':
+            distances = list(map(lambda d: np.power(d, 1/order), distances))
+    index.free_index()
+    return neighbors, distances
+
+
+def _nmslib(features, metric, order, kind, k, _, params):
+    if kind == 'radius':
+        raise ValueError('nmslib does not support kind="radius".')
+    if metric == 'minkowski':
+        raise ValueError('nmslib does not support metric="minkowski".')
+    try:
+        import nmslib as nms
+    except Exception as e:
+        raise ImportError('Cannot import nmslib. Choose another nearest '
+                          'neighbors backend or try to install it with '
+                          'pip (or conda) install nmslib. '
+                          'Original exception: {}'.format(e))
+    n_vertices, _ = features.shape
+    params_index = params.pop('index', None)
+    params_query = params.pop('query', None)
+    metric = 'l2' if metric == 'euclidean' else metric
+    metric = 'l1' if metric == 'manhattan' else metric
+    metric = 'linf' if metric == 'max_dist' else metric
+    index = nms.init(space=metric, **params)
+    index.addDataPointBatch(features)
+    index.createIndex(params_index)
+    if params_query is not None:
+        index.setQueryTimeParams(params_query)
+    results = index.knnQueryBatch(features, k=k+1)
+    neighbors, distances = zip(*results)
+    distances = np.concatenate(distances).reshape(n_vertices, k+1)
+    neighbors = np.concatenate(neighbors).reshape(n_vertices, k+1)
+    return neighbors, distances
+
+def nearest_neighbor(features, metric='euclidean', order=2, kind='knn', k=10, radius=None, backend='scipy-ckdtree', **kwargs):
+    '''Find nearest neighboors.
+
+    Parameters
+    ----------
+    features : data numpy array 
+    metric : {'euclidean', 'manhattan', 'minkowski', 'max_dist'}, optional
+        Metric used to compute pairwise distances.
+
+        * ``'euclidean'`` defines pairwise distances as
+          :math:`d(v_i, v_j) = \| x_i - x_j \|_2`.
+        * ``'manhattan'`` defines pairwise distances as
+          :math:`d(v_i, v_j) = \| x_i - x_j \|_1`.
+        * ``'minkowski'`` generalizes the above and defines distances as
+          :math:`d(v_i, v_j) = \| x_i - x_j \|_p`
+          where :math:`p` is the ``order`` of the norm.
+        * ``'max_dist'`` defines pairwise distances as
+          :math:`d(v_i, v_j) = \| x_i - x_j \|_\infty = \max(x_i - x_j)`, where
+          the maximum is taken over the elements of the vector.
+
+        More metrics may be supported for some backends.
+        Please refer to the documentation of the chosen backend.
+    kind : 'knn' or 'radius' (default 'knn')
+    k : number of nearest neighboors if 'knn' is selected
+    radius : radius of the search if 'radius' is slected
+
+    order : float, optional
+        The order of the Minkowski distance for ``metric='minkowski'``.
+    backend : string, optional
+        * ``'scipy-pdist'`` uses :func:`scipy.spatial.distance.pdist` to
+          compute pairwise distances. The method is brute force and computes
+          all distances. That is the slowest method.
+        * ``'scipy-kdtree'`` uses :class:`scipy.spatial.KDTree`. The method
+          builds a k-d tree to prune the number of pairwise distances it has to
+          compute. That is an efficient strategy for low-dimensional spaces.
+        * ``'scipy-ckdtree'`` uses :class:`scipy.spatial.cKDTree`. The same as
+          ``'scipy-kdtree'`` but with C bindings, which should be faster.
+          That is the default.
+        * ``'flann'`` uses the `Fast Library for Approximate Nearest Neighbors
+          (FLANN) <https://github.com/mariusmuja/flann>`_. That method is an
+          approximation.
+        * ``'nmslib'`` uses the `Non-Metric Space Library (NMSLIB)
+          <https://github.com/nmslib/nmslib>`_. That method is an
+          approximation. It should be the fastest in high-dimensional spaces.
+
+        You can look at this `benchmark
+        <https://github.com/erikbern/ann-benchmarks>`_ to get an idea of the
+        relative performance of those backends. It's nonetheless wise to run
+        some tests on your own data.   
+    '''
+    if kind=='knn':
+        radius = None
+    elif kind=='radius':
+        k = None
+    else:
+        raise ValueError('"kind" must be "knn" or "radius"')
+
+    _orders = {
+        'euclidean': 2,
+        'manhattan': 1,
+        'max_dist': np.inf,
+        'minkowski': order,
+    }
+    order = _orders.pop(metric, None)  
+    try:
+        function = globals()['_' + backend.replace('-', '_')]
+    except KeyError:
+        raise ValueError('Invalid backend "{}".'.format(backend))
+    neighbors, distances = function(features, metric, order,
+                                    kind, k, radius, kwargs)
+    return neighbors, distances
+
+
+def sparse_distance_matrix(neighbors, distances, symmetrize=True, safe=False, kind = None, k=None):
+    '''Build a sparse distance matrix from nearest neighbors'''
+    n_edges = [len(n) - 1 for n in neighbors]  # remove distance to self
+    if safe and kind is None:
+        raise ValueError('Please specify "kind" to "knn" or "radius" to use the safe mode')
+
+    n_vertices = len(n_edges)
+    if safe and kind == 'radius':
+        n_disconnected = np.sum(np.asarray(n_edges) == 0)
+        if n_disconnected > 0:
+            _logger.warning('{} points (out of {}) have no neighboors. '
+                            'Consider increasing the radius or setting '
+                            'kind=knn.'.format(n_disconnected, n_vertices))
+
+    value = np.empty(sum(n_edges), dtype=np.float)
+    row = np.empty_like(value, dtype=np.int)
+    col = np.empty_like(value, dtype=np.int)
+    start = 0
+    for vertex in range(n_vertices):
+        if safe and kind == 'knn':
+            assert n_edges[vertex] == k
+        end = start + n_edges[vertex]
+        value[start:end] = distances[vertex][1:]
+        row[start:end] = np.full(n_edges[vertex], vertex)
+        col[start:end] = neighbors[vertex][1:]
+        start = end
+    W = sparse.csr_matrix((value, (row, col)), (n_vertices, n_vertices))
+    if symmetrize:
+        # Enforce symmetry. May have been broken by k-NN. Checking symmetry
+        # with np.abs(W - W.T).sum() is as costly as the symmetrization itself.
+        W = utils.symmetrize(W, method='fill')
+    return W
diff --git a/pygsp/filters/filter.py b/pygsp/filters/filter.py
@@ -255,7 +255,7 @@ def filter(self, s, method='chebyshev', order=30):
         >>> _ = G.plot(s1, ax=axes[0])
         >>> _ = G.plot(s2, ax=axes[1])
         >>> print('{:.5f}'.format(np.linalg.norm(s1 - s2)))
-        0.26808
+        0.26995
 
         Perfect reconstruction with Itersine, a tight frame:
 
@@ -448,7 +448,7 @@ def estimate_frame_bounds(self, x=None):
         A=1.708, B=2.359
         >>> A, B = g.estimate_frame_bounds(G.e)
         >>> print('A={:.3f}, B={:.3f}'.format(A, B))
-        A=1.723, B=2.359
+        A=1.839, B=2.359
 
         The frame bounds can be seen in the plot of the filter bank as the
         minimum and maximum of their squared sum (the black curve):

diff --git a/pygsp/graphs/fourier.py b/pygsp/graphs/fourier.py
@@ -85,7 +85,7 @@ def coherence(self):
         >>> graph.compute_fourier_basis()
         >>> minimum = 1 / np.sqrt(graph.n_vertices)
         >>> print('{:.2f} in [{:.2f}, 1]'.format(graph.coherence, minimum))
-        0.75 in [0.12, 1]
+        0.87 in [0.12, 1]
         >>>
         >>> # Plot the most localized eigenvector.
         >>> import matplotlib.pyplot as plt

diff --git a/pygsp/graphs/nngraphs/bunny.py b/pygsp/graphs/nngraphs/bunny.py
@@ -34,7 +34,5 @@ def __init__(self, **kwargs):
             'distance': 8,
         }
 
-        super(Bunny, self).__init__(Xin=data['bunny'],
-                                    epsilon=0.02, NNtype='radius',
-                                    center=False, rescale=False,
+        super(Bunny, self).__init__(data['bunny'], kind='radius', radius=0.02,
                                     plotting=plotting, **kwargs)