From a3d5aa38bc79bf8207c9bd77480d832c8aa00caa Mon Sep 17 00:00:00 2001
From: Jonathan Ellis <jbellis@datastax.com>
Date: Mon, 8 Jan 2024 15:22:34 -0600
Subject: [PATCH 1/2] only rerank candidates whose approximate score is greater
 than rerankFloor (experimental)

---
 .../jbellis/jvector/graph/GraphSearcher.java  | 49 +++++++++++++++++--
 .../jbellis/jvector/graph/NodeQueue.java      | 16 +++---
 2 files changed, 52 insertions(+), 13 deletions(-)
diff --git a/jvector-base/src/main/java/io/github/jbellis/jvector/graph/GraphSearcher.java b/jvector-base/src/main/java/io/github/jbellis/jvector/graph/GraphSearcher.java
index 710bb6104..ec243ebcf 100644
--- a/jvector-base/src/main/java/io/github/jbellis/jvector/graph/GraphSearcher.java
+++ b/jvector-base/src/main/java/io/github/jbellis/jvector/graph/GraphSearcher.java
@@ -107,6 +107,32 @@ public GraphSearcher<T> build() {
         }
     }
 
+
+    /**
+     * @param scoreFunction   a function returning the similarity of a given node to the query vector
+     * @param reRanker        if scoreFunction is approximate, this should be non-null and perform exact
+     *                        comparisons of the vectors for re-ranking at the end of the search.
+     * @param topK            the number of results to look for
+     * @param threshold       the minimum similarity (0..1) to accept; 0 will accept everything. (Experimental!)
+     * @param rerankFloor     (Experimental!) Candidates whose approximate similarity is below this value
+     *                        will not be reranked with the exact score (which requires loading the raw vector).
+     *                        This is intended for use when your dataset is split across multiple indices.
+     * @param acceptOrds      a Bits instance indicating which nodes are acceptable results.
+     *                        If {@link Bits#ALL}, all nodes are acceptable.
+     *                        It is caller's responsibility to ensure that there are enough acceptable nodes
+     *                        that we don't search the entire graph trying to satisfy topK.
+     * @return a SearchResult containing the topK results and the number of nodes visited during the search.
+     */
+    @Experimental
+    public SearchResult search(NodeSimilarity.ScoreFunction scoreFunction,
+                               NodeSimilarity.ReRanker reRanker,
+                               int topK,
+                               float threshold,
+                               float rerankFloor,
+                               Bits acceptOrds) {
+        return searchInternal(scoreFunction, reRanker, topK, threshold, rerankFloor, view.entryNode(), acceptOrds);
+    }
+
     /**
      * @param scoreFunction a function returning the similarity of a given node to the query vector
      * @param reRanker      if scoreFunction is approximate, this should be non-null and perform exact
@@ -125,9 +151,10 @@ public SearchResult search(NodeSimilarity.ScoreFunction scoreFunction,
                                int topK,
                                float threshold,
                                Bits acceptOrds) {
-        return searchInternal(scoreFunction, reRanker, topK, threshold, view.entryNode(), acceptOrds);
+        return search(scoreFunction, reRanker, topK, threshold, 0.0f, acceptOrds);
     }
 
+
     /**
      * @param scoreFunction a function returning the similarity of a given node to the query vector
      * @param reRanker      if scoreFunction is approximate, this should be non-null and perform exact
@@ -147,6 +174,16 @@ public SearchResult search(NodeSimilarity.ScoreFunction scoreFunction,
         return search(scoreFunction, reRanker, topK, 0.0f, acceptOrds);
     }
 
+    SearchResult searchInternal(NodeSimilarity.ScoreFunction scoreFunction,
+                                NodeSimilarity.ReRanker reRanker,
+                                int topK,
+                                float threshold,
+                                int ep,
+                                Bits acceptOrds)
+    {
+        return searchInternal(scoreFunction, reRanker, topK, threshold, 0, ep, acceptOrds);
+    }
+
     /**
      * Add the closest neighbors found to a priority queue (heap). These are returned in
      * proximity order -- the closest neighbor of the topK found, i.e. the one with the highest
@@ -160,6 +197,7 @@ SearchResult searchInternal(NodeSimilarity.ScoreFunction scoreFunction,
                                 NodeSimilarity.ReRanker reRanker,
                                 int topK,
                                 float threshold,
+                                float rerankFloor,
                                 int ep,
                                 Bits acceptOrds)
     {
@@ -231,13 +269,14 @@ SearchResult searchInternal(NodeSimilarity.ScoreFunction scoreFunction,
         }
 
         assert resultsQueue.size() <= topK;
-        SearchResult.NodeScore[] nodes = extractScores(scoreFunction, reRanker, resultsQueue);
+        SearchResult.NodeScore[] nodes = extractScores(scoreFunction, reRanker, resultsQueue, rerankFloor);
         return new SearchResult(nodes, visited, numVisited);
     }
 
     private static SearchResult.NodeScore[] extractScores(NodeSimilarity.ScoreFunction sf,
                                                           NodeSimilarity.ReRanker reRanker,
-                                                          NodeQueue resultsQueue)
+                                                          NodeQueue resultsQueue,
+                                                          float rerankFloor)
     {
         SearchResult.NodeScore[] nodes;
         if (sf.isExact()) {
@@ -248,8 +287,8 @@ private static SearchResult.NodeScore[] extractScores(NodeSimilarity.ScoreFuncti
                 nodes[i] = new SearchResult.NodeScore(n, nScore);
             }
         } else {
-            nodes = resultsQueue.nodesCopy(reRanker::similarityTo);
-            Arrays.sort(nodes, 0, resultsQueue.size(), Comparator.comparingDouble((SearchResult.NodeScore nodeScore) -> nodeScore.score).reversed());
+            nodes = resultsQueue.nodesCopy(reRanker::similarityTo, rerankFloor);
+            Arrays.sort(nodes, 0, nodes.length, Comparator.comparingDouble((SearchResult.NodeScore nodeScore) -> nodeScore.score).reversed());
         }
         return nodes;
     }
diff --git a/jvector-base/src/main/java/io/github/jbellis/jvector/graph/NodeQueue.java b/jvector-base/src/main/java/io/github/jbellis/jvector/graph/NodeQueue.java
index fc709ae36..b79e34884 100644
--- a/jvector-base/src/main/java/io/github/jbellis/jvector/graph/NodeQueue.java
+++ b/jvector-base/src/main/java/io/github/jbellis/jvector/graph/NodeQueue.java
@@ -27,6 +27,8 @@
 import io.github.jbellis.jvector.util.AbstractLongHeap;
 import io.github.jbellis.jvector.util.NumericUtils;
 
+import java.util.stream.IntStream;
+
 /**
  * NodeQueue uses a {@link io.github.jbellis.jvector.util.AbstractLongHeap} to store lists of nodes in a graph,
  * represented as a node id with an associated score packed together as a sortable long, which is sorted
@@ -135,14 +137,12 @@ public int[] nodesCopy() {
         return nodes;
     }
 
-    public SearchResult.NodeScore[] nodesCopy(NodeSimilarity.ExactScoreFunction sf) {
-        int size = size();
-        SearchResult.NodeScore[] ns = new SearchResult.NodeScore[size];
-        for (int i = 0; i < size; i++) {
-            var node = decodeNodeId(heap.get(i + 1));
-            ns[i] = new SearchResult.NodeScore(node, sf.similarityTo(node));
-        }
-        return ns;
+    public SearchResult.NodeScore[] nodesCopy(NodeSimilarity.ExactScoreFunction sf, float rerankFloor) {
+        return IntStream.range(0, size())
+                .mapToLong(i -> heap.get(i + 1)) // read heap slots 1..size(); stay primitive to avoid boxing each entry
+                .filter(m -> decodeScore(m) >= rerankFloor) // entries below the floor never reach the exact-score call
+                .mapToObj(m -> new SearchResult.NodeScore(decodeNodeId(m), sf.similarityTo(decodeNodeId(m))))
+                .toArray(SearchResult.NodeScore[]::new);
     }
 
     /** Returns the top element's node id. */

From 47c10e0d246ec7f0afa1b44a51782080896d6cd3 Mon Sep 17 00:00:00 2001
From: Jonathan Ellis <jbellis@datastax.com>
Date: Mon, 8 Jan 2024 20:24:33 -0600
Subject: [PATCH 2/2] parameter `threshold` no longer experimental, and clarify
 javadoc for search()

---
 .../jbellis/jvector/graph/GraphSearcher.java  | 56 +++++++++++--------
 1 file changed, 33 insertions(+), 23 deletions(-)

diff --git a/jvector-base/src/main/java/io/github/jbellis/jvector/graph/GraphSearcher.java b/jvector-base/src/main/java/io/github/jbellis/jvector/graph/GraphSearcher.java
index ec243ebcf..70da3be98 100644
--- a/jvector-base/src/main/java/io/github/jbellis/jvector/graph/GraphSearcher.java
+++ b/jvector-base/src/main/java/io/github/jbellis/jvector/graph/GraphSearcher.java
@@ -112,11 +112,17 @@ public GraphSearcher<T> build() {
      * @param scoreFunction   a function returning the similarity of a given node to the query vector
      * @param reRanker        if scoreFunction is approximate, this should be non-null and perform exact
      *                        comparisons of the vectors for re-ranking at the end of the search.
-     * @param topK            the number of results to look for
-     * @param threshold       the minimum similarity (0..1) to accept; 0 will accept everything. (Experimental!)
-     * @param rerankFloor     (Experimental!) Candidates whose approximate similarity is below this value
-     *                        will not be reranked with the exact score (which requires loading the raw vector).
-     *                        This is intended for use when your dataset is split across multiple indices.
+     * @param topK            the number of results to look for. With threshold=0, the search will continue until at least
+     *                        `topK` results have been found, or until the entire graph has been searched.
+     * @param threshold       the minimum similarity (0..1) to accept; 0 will accept everything. May be used
+     *                        with a large topK to find (approximately) all nodes above the given threshold.
+     *                        If threshold > 0 then the search will stop when it is probabilistically unlikely
+     *                        to find more nodes above the threshold, even if `topK` results have not yet been found.
+     * @param rerankFloor     (Experimental!) Candidates whose approximate similarity is at least this value
+     *                        will be reranked with the exact score (which requires loading the raw vector)
+     *                        and included in the final results.  Other candidates will be discarded,
+     *                        potentially leaving fewer than topK entries in the results.  This is intended
+     *                        for use when combining results from multiple indexes.
      * @param acceptOrds      a Bits instance indicating which nodes are acceptable results.
      *                        If {@link Bits#ALL}, all nodes are acceptable.
      *                        It is caller's responsibility to ensure that there are enough acceptable nodes
@@ -134,18 +140,21 @@ public SearchResult search(NodeSimilarity.ScoreFunction scoreFunction,
     }
 
     /**
-     * @param scoreFunction a function returning the similarity of a given node to the query vector
-     * @param reRanker      if scoreFunction is approximate, this should be non-null and perform exact
-     *                      comparisons of the vectors for re-ranking at the end of the search.
-     * @param topK          the number of results to look for
-     * @param threshold     the minimum similarity (0..1) to accept; 0 will accept everything. (Experimental!)
-     * @param acceptOrds    a Bits instance indicating which nodes are acceptable results.
-     *                      If {@link Bits#ALL}, all nodes are acceptable.
-     *                      It is caller's responsibility to ensure that there are enough acceptable nodes
-     *                      that we don't search the entire graph trying to satisfy topK.
+     * @param scoreFunction   a function returning the similarity of a given node to the query vector
+     * @param reRanker        if scoreFunction is approximate, this should be non-null and perform exact
+     *                        comparisons of the vectors for re-ranking at the end of the search.
+     * @param topK            the number of results to look for. With threshold=0, the search will continue until at least
+     *                        `topK` results have been found, or until the entire graph has been searched.
+     * @param threshold       the minimum similarity (0..1) to accept; 0 will accept everything. May be used
+     *                        with a large topK to find (approximately) all nodes above the given threshold.
+     *                        If threshold > 0 then the search will stop when it is probabilistically unlikely
+     *                        to find more nodes above the threshold, even if `topK` results have not yet been found.
+     * @param acceptOrds      a Bits instance indicating which nodes are acceptable results.
+     *                        If {@link Bits#ALL}, all nodes are acceptable.
+     *                        It is caller's responsibility to ensure that there are enough acceptable nodes
+     *                        that we don't search the entire graph trying to satisfy topK.
      * @return a SearchResult containing the topK results and the number of nodes visited during the search.
      */
-    @Experimental
     public SearchResult search(NodeSimilarity.ScoreFunction scoreFunction,
                                NodeSimilarity.ReRanker reRanker,
                                int topK,
@@ -156,14 +165,15 @@ public SearchResult search(NodeSimilarity.ScoreFunction scoreFunction,
 
 
     /**
-     * @param scoreFunction a function returning the similarity of a given node to the query vector
-     * @param reRanker      if scoreFunction is approximate, this should be non-null and perform exact
-     *                      comparisons of the vectors for re-ranking at the end of the search.
-     * @param topK          the number of results to look for
-     * @param acceptOrds    a Bits instance indicating which nodes are acceptable results.
-     *                      If {@link Bits#ALL}, all nodes are acceptable.
-     *                      It is caller's responsibility to ensure that there are enough acceptable nodes
-     *                      that we don't search the entire graph trying to satisfy topK.
+     * @param scoreFunction   a function returning the similarity of a given node to the query vector
+     * @param reRanker        if scoreFunction is approximate, this should be non-null and perform exact
+     *                        comparisons of the vectors for re-ranking at the end of the search.
+     * @param topK            the number of results to look for. With threshold=0, the search will continue until at least
+     *                        `topK` results have been found, or until the entire graph has been searched.
+     * @param acceptOrds      a Bits instance indicating which nodes are acceptable results.
+     *                        If {@link Bits#ALL}, all nodes are acceptable.
+     *                        It is caller's responsibility to ensure that there are enough acceptable nodes
+     *                        that we don't search the entire graph trying to satisfy topK.
      * @return a SearchResult containing the topK results and the number of nodes visited during the search.
      */
     public SearchResult search(NodeSimilarity.ScoreFunction scoreFunction,