From a3d5aa38bc79bf8207c9bd77480d832c8aa00caa Mon Sep 17 00:00:00 2001 From: Jonathan Ellis Date: Mon, 8 Jan 2024 15:22:34 -0600 Subject: [PATCH 1/2] only rerank candidates whose approximate score is greater than rerankFloor (experimental) --- .../jbellis/jvector/graph/GraphSearcher.java | 49 +++++++++++++++++-- .../jbellis/jvector/graph/NodeQueue.java | 16 +++--- 2 files changed, 52 insertions(+), 13 deletions(-) diff --git a/jvector-base/src/main/java/io/github/jbellis/jvector/graph/GraphSearcher.java b/jvector-base/src/main/java/io/github/jbellis/jvector/graph/GraphSearcher.java index 710bb6104..ec243ebcf 100644 --- a/jvector-base/src/main/java/io/github/jbellis/jvector/graph/GraphSearcher.java +++ b/jvector-base/src/main/java/io/github/jbellis/jvector/graph/GraphSearcher.java @@ -107,6 +107,32 @@ public GraphSearcher build() { } } + + /** + * @param scoreFunction a function returning the similarity of a given node to the query vector + * @param reRanker if scoreFunction is approximate, this should be non-null and perform exact + * comparisons of the vectors for re-ranking at the end of the search. + * @param topK the number of results to look for + * @param threshold the minimum similarity (0..1) to accept; 0 will accept everything. (Experimental!) + * @param rerankFloor (Experimental!) Candidates whose approximate similarity is below this value + * will not be reranked with the exact score (which requires loading the raw vector). + * This is intended for use when your dataset is split across multiple indices. + * @param acceptOrds a Bits instance indicating which nodes are acceptable results. + * If {@link Bits#ALL}, all nodes are acceptable. + * It is caller's responsibility to ensure that there are enough acceptable nodes + * that we don't search the entire graph trying to satisfy topK. + * @return a SearchResult containing the topK results and the number of nodes visited during the search. 
+ */ + @Experimental + public SearchResult search(NodeSimilarity.ScoreFunction scoreFunction, + NodeSimilarity.ReRanker reRanker, + int topK, + float threshold, + float rerankFloor, + Bits acceptOrds) { + return searchInternal(scoreFunction, reRanker, topK, threshold, rerankFloor, view.entryNode(), acceptOrds); + } + /** * @param scoreFunction a function returning the similarity of a given node to the query vector * @param reRanker if scoreFunction is approximate, this should be non-null and perform exact @@ -125,9 +151,10 @@ public SearchResult search(NodeSimilarity.ScoreFunction scoreFunction, int topK, float threshold, Bits acceptOrds) { - return searchInternal(scoreFunction, reRanker, topK, threshold, view.entryNode(), acceptOrds); + return search(scoreFunction, reRanker, topK, threshold, 0.0f, acceptOrds); } + /** * @param scoreFunction a function returning the similarity of a given node to the query vector * @param reRanker if scoreFunction is approximate, this should be non-null and perform exact @@ -147,6 +174,16 @@ public SearchResult search(NodeSimilarity.ScoreFunction scoreFunction, return search(scoreFunction, reRanker, topK, 0.0f, acceptOrds); } + SearchResult searchInternal(NodeSimilarity.ScoreFunction scoreFunction, + NodeSimilarity.ReRanker reRanker, + int topK, + float threshold, + int ep, + Bits acceptOrds) + { + return searchInternal(scoreFunction, reRanker, topK, threshold, 0, ep, acceptOrds); + } + /** * Add the closest neighbors found to a priority queue (heap). These are returned in * proximity order -- the closest neighbor of the topK found, i.e. 
the one with the highest @@ -160,6 +197,7 @@ SearchResult searchInternal(NodeSimilarity.ScoreFunction scoreFunction, NodeSimilarity.ReRanker reRanker, int topK, float threshold, + float rerankFloor, int ep, Bits acceptOrds) { @@ -231,13 +269,14 @@ SearchResult searchInternal(NodeSimilarity.ScoreFunction scoreFunction, } assert resultsQueue.size() <= topK; - SearchResult.NodeScore[] nodes = extractScores(scoreFunction, reRanker, resultsQueue); + SearchResult.NodeScore[] nodes = extractScores(scoreFunction, reRanker, resultsQueue, rerankFloor); return new SearchResult(nodes, visited, numVisited); } private static SearchResult.NodeScore[] extractScores(NodeSimilarity.ScoreFunction sf, NodeSimilarity.ReRanker reRanker, - NodeQueue resultsQueue) + NodeQueue resultsQueue, + float rerankFloor) { SearchResult.NodeScore[] nodes; if (sf.isExact()) { @@ -248,8 +287,8 @@ private static SearchResult.NodeScore[] extractScores(NodeSimilarity.ScoreFuncti nodes[i] = new SearchResult.NodeScore(n, nScore); } } else { - nodes = resultsQueue.nodesCopy(reRanker::similarityTo); - Arrays.sort(nodes, 0, resultsQueue.size(), Comparator.comparingDouble((SearchResult.NodeScore nodeScore) -> nodeScore.score).reversed()); + nodes = resultsQueue.nodesCopy(reRanker::similarityTo, rerankFloor); + Arrays.sort(nodes, 0, nodes.length, Comparator.comparingDouble((SearchResult.NodeScore nodeScore) -> nodeScore.score).reversed()); } return nodes; } diff --git a/jvector-base/src/main/java/io/github/jbellis/jvector/graph/NodeQueue.java b/jvector-base/src/main/java/io/github/jbellis/jvector/graph/NodeQueue.java index fc709ae36..b79e34884 100644 --- a/jvector-base/src/main/java/io/github/jbellis/jvector/graph/NodeQueue.java +++ b/jvector-base/src/main/java/io/github/jbellis/jvector/graph/NodeQueue.java @@ -27,6 +27,8 @@ import io.github.jbellis.jvector.util.AbstractLongHeap; import io.github.jbellis.jvector.util.NumericUtils; +import java.util.stream.IntStream; + /** * NodeQueue uses a {@link 
io.github.jbellis.jvector.util.AbstractLongHeap} to store lists of nodes in a graph, * represented as a node id with an associated score packed together as a sortable long, which is sorted @@ -135,14 +137,12 @@ public int[] nodesCopy() { return nodes; } - public SearchResult.NodeScore[] nodesCopy(NodeSimilarity.ExactScoreFunction sf) { - int size = size(); - SearchResult.NodeScore[] ns = new SearchResult.NodeScore[size]; - for (int i = 0; i < size; i++) { - var node = decodeNodeId(heap.get(i + 1)); - ns[i] = new SearchResult.NodeScore(node, sf.similarityTo(node)); - } - return ns; + public SearchResult.NodeScore[] nodesCopy(NodeSimilarity.ExactScoreFunction sf, float rerankFloor) { + return IntStream.range(0, size()) + .mapToObj(i -> heap.get(i + 1)) + .filter(m -> decodeScore(m) >= rerankFloor) + .map(m -> new SearchResult.NodeScore(decodeNodeId(m), sf.similarityTo(decodeNodeId(m)))) + .toArray(SearchResult.NodeScore[]::new); } /** Returns the top element's node id. */ From 47c10e0d246ec7f0afa1b44a51782080896d6cd3 Mon Sep 17 00:00:00 2001 From: Jonathan Ellis Date: Mon, 8 Jan 2024 20:24:33 -0600 Subject: [PATCH 2/2] parameter `threshold` no longer experimental, and clarify javdoc for search() --- .../jbellis/jvector/graph/GraphSearcher.java | 56 +++++++++++-------- 1 file changed, 33 insertions(+), 23 deletions(-) diff --git a/jvector-base/src/main/java/io/github/jbellis/jvector/graph/GraphSearcher.java b/jvector-base/src/main/java/io/github/jbellis/jvector/graph/GraphSearcher.java index ec243ebcf..70da3be98 100644 --- a/jvector-base/src/main/java/io/github/jbellis/jvector/graph/GraphSearcher.java +++ b/jvector-base/src/main/java/io/github/jbellis/jvector/graph/GraphSearcher.java @@ -112,11 +112,17 @@ public GraphSearcher build() { * @param scoreFunction a function returning the similarity of a given node to the query vector * @param reRanker if scoreFunction is approximate, this should be non-null and perform exact * comparisons of the vectors for re-ranking at 
the end of the search. - * @param topK the number of results to look for - * @param threshold the minimum similarity (0..1) to accept; 0 will accept everything. (Experimental!) - * @param rerankFloor (Experimental!) Candidates whose approximate similarity is below this value - * will not be reranked with the exact score (which requires loading the raw vector). - * This is intended for use when your dataset is split across multiple indices. + * @param topK the number of results to look for. With threshold=0, the search will continue until at least + * `topK` results have been found, or until the entire graph has been searched. + * @param threshold the minimum similarity (0..1) to accept; 0 will accept everything. May be used + * with a large topK to find (approximately) all nodes above the given threshold. + * If threshold > 0 then the search will stop when it is probabilistically unlikely + * to find more nodes above the threshold, even if `topK` results have not yet been found. + * @param rerankFloor (Experimental!) Candidates whose approximate similarity is at least this value + * will be reranked with the exact score (which requires loading the raw vector) + * and included in the final results. (Potentially leaving fewer than topK entries + * in the results.) Other candidates will be discarded. This is intended for use + * when combining results from multiple indexes. * @param acceptOrds a Bits instance indicating which nodes are acceptable results. * If {@link Bits#ALL}, all nodes are acceptable. * It is caller's responsibility to ensure that there are enough acceptable nodes @@ -134,18 +140,21 @@ public SearchResult search(NodeSimilarity.ScoreFunction scoreFunction, } /** - * @param scoreFunction a function returning the similarity of a given node to the query vector - * @param reRanker if scoreFunction is approximate, this should be non-null and perform exact - * comparisons of the vectors for re-ranking at the end of the search.
- * @param topK the number of results to look for - * @param threshold the minimum similarity (0..1) to accept; 0 will accept everything. (Experimental!) - * @param acceptOrds a Bits instance indicating which nodes are acceptable results. - * If {@link Bits#ALL}, all nodes are acceptable. - * It is caller's responsibility to ensure that there are enough acceptable nodes - * that we don't search the entire graph trying to satisfy topK. + * @param scoreFunction a function returning the similarity of a given node to the query vector + * @param reRanker if scoreFunction is approximate, this should be non-null and perform exact + * comparisons of the vectors for re-ranking at the end of the search. + * @param topK the number of results to look for. With threshold=0, the search will continue until at least + * `topK` results have been found, or until the entire graph has been searched. + * @param threshold the minimum similarity (0..1) to accept; 0 will accept everything. May be used + * with a large topK to find (approximately) all nodes above the given threshold. + * If threshold > 0 then the search will stop when it is probabilistically unlikely + * to find more nodes above the threshold, even if `topK` results have not yet been found. + * @param acceptOrds a Bits instance indicating which nodes are acceptable results. + * If {@link Bits#ALL}, all nodes are acceptable. + * It is caller's responsibility to ensure that there are enough acceptable nodes + * that we don't search the entire graph trying to satisfy topK. * @return a SearchResult containing the topK results and the number of nodes visited during the search. 
*/ - @Experimental public SearchResult search(NodeSimilarity.ScoreFunction scoreFunction, NodeSimilarity.ReRanker reRanker, int topK, @@ -156,14 +165,15 @@ public SearchResult search(NodeSimilarity.ScoreFunction scoreFunction, /** - * @param scoreFunction a function returning the similarity of a given node to the query vector - * @param reRanker if scoreFunction is approximate, this should be non-null and perform exact - * comparisons of the vectors for re-ranking at the end of the search. - * @param topK the number of results to look for - * @param acceptOrds a Bits instance indicating which nodes are acceptable results. - * If {@link Bits#ALL}, all nodes are acceptable. - * It is caller's responsibility to ensure that there are enough acceptable nodes - * that we don't search the entire graph trying to satisfy topK. + * @param scoreFunction a function returning the similarity of a given node to the query vector + * @param reRanker if scoreFunction is approximate, this should be non-null and perform exact + * comparisons of the vectors for re-ranking at the end of the search. + * @param topK the number of results to look for. With threshold=0, the search will continue until at least + * `topK` results have been found, or until the entire graph has been searched. + * @param acceptOrds a Bits instance indicating which nodes are acceptable results. + * If {@link Bits#ALL}, all nodes are acceptable. + * It is caller's responsibility to ensure that there are enough acceptable nodes + * that we don't search the entire graph trying to satisfy topK. * @return a SearchResult containing the topK results and the number of nodes visited during the search. */ public SearchResult search(NodeSimilarity.ScoreFunction scoreFunction,