From 9e5c2e8e70cbaff019db4f2b6961ed3a84881270 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Thu, 11 Jul 2024 15:54:02 -0400 Subject: [PATCH 1/9] Iniial commit for rbc move --- cpp/CMakeLists.txt | 9 +- cpp/include/cuvs/neighbors/ball_cover.hpp | 362 ++++ cpp/include/cuvs/neighbors/brute_force.hpp | 13 +- cpp/include/cuvs/neighbors/ivf_pq.hpp | 2 +- cpp/src/neighbors/ball_cover.cu | 76 + cpp/src/neighbors/ball_cover.cuh | 492 +++++ cpp/src/neighbors/ball_cover/ball_cover.cuh | 718 ++++++++ cpp/src/neighbors/ball_cover/common.cuh | 69 + .../neighbors/ball_cover/registers-ext.cuh | 205 +++ .../neighbors/ball_cover/registers-inl.cuh | 1630 +++++++++++++++++ cpp/src/neighbors/ball_cover/registers.cuh | 24 + .../neighbors/ball_cover/registers_types.cuh | 76 + cpp/src/neighbors/brute_force.cu | 51 +- .../neighbors/faiss_select/Comparators.cuh | 29 + .../neighbors/faiss_select/DistanceUtils.h | 52 + .../faiss_select/MergeNetworkBlock.cuh | 277 +++ .../faiss_select/MergeNetworkUtils.cuh | 25 + .../faiss_select/MergeNetworkWarp.cuh | 519 ++++++ cpp/src/neighbors/faiss_select/Select.cuh | 569 ++++++ cpp/src/neighbors/faiss_select/StaticUtils.h | 48 + .../faiss_select/key_value_block_select.cuh | 229 +++ cpp/test/CMakeLists.txt | 14 +- cpp/test/neighbors/ball_cover.cu | 392 ++++ cpp/test/neighbors/spatial_data.h | 38 + .../VectorSearch_QuestionRetrieval.ipynb | 2 +- notebooks/ivf_flat_example.ipynb | 319 +++- notebooks/rmm_log.txt | 2 + notebooks/tutorial_ivf_pq.ipynb | 475 ++++- 28 files changed, 6557 insertions(+), 160 deletions(-) create mode 100644 cpp/include/cuvs/neighbors/ball_cover.hpp create mode 100644 cpp/src/neighbors/ball_cover.cu create mode 100644 cpp/src/neighbors/ball_cover.cuh create mode 100644 cpp/src/neighbors/ball_cover/ball_cover.cuh create mode 100644 cpp/src/neighbors/ball_cover/common.cuh create mode 100644 cpp/src/neighbors/ball_cover/registers-ext.cuh create mode 100644 cpp/src/neighbors/ball_cover/registers-inl.cuh create mode 100644 
cpp/src/neighbors/ball_cover/registers.cuh create mode 100644 cpp/src/neighbors/ball_cover/registers_types.cuh create mode 100644 cpp/src/neighbors/faiss_select/Comparators.cuh create mode 100644 cpp/src/neighbors/faiss_select/DistanceUtils.h create mode 100644 cpp/src/neighbors/faiss_select/MergeNetworkBlock.cuh create mode 100644 cpp/src/neighbors/faiss_select/MergeNetworkUtils.cuh create mode 100644 cpp/src/neighbors/faiss_select/MergeNetworkWarp.cuh create mode 100644 cpp/src/neighbors/faiss_select/Select.cuh create mode 100644 cpp/src/neighbors/faiss_select/StaticUtils.h create mode 100644 cpp/src/neighbors/faiss_select/key_value_block_select.cuh create mode 100644 cpp/test/neighbors/ball_cover.cu create mode 100644 cpp/test/neighbors/spatial_data.h create mode 100644 notebooks/rmm_log.txt diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 0fe44f511..7c035b9df 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -101,12 +101,8 @@ message(VERBOSE "cuVS: Disable OpenMP: ${DISABLE_OPENMP}") message(VERBOSE "cuVS: Enable kernel resource usage info: ${CUDA_ENABLE_KERNELINFO}") message(VERBOSE "cuVS: Enable lineinfo in nvcc: ${CUDA_ENABLE_LINEINFO}") message(VERBOSE "cuVS: Enable nvtx markers: ${CUVS_NVTX}") -message(VERBOSE - "cuVS: Statically link the CUDA runtime: ${CUDA_STATIC_RUNTIME}" -) -message(VERBOSE - "cuVS: Statically link the CUDA math libraries: ${CUDA_STATIC_MATH_LIBRARIES}" -) +message(VERBOSE "cuVS: Statically link the CUDA runtime: ${CUDA_STATIC_RUNTIME}") +message(VERBOSE "cuVS: Statically link the CUDA math libraries: ${CUDA_STATIC_MATH_LIBRARIES}") message(VERBOSE "cuVS: Build and statically link RAFT libraries: ${CUVS_USE_RAFT_STATIC}") # Set RMM logging level @@ -243,6 +239,7 @@ add_library( src/distance/detail/fused_distance_nn.cu src/distance/distance.cu src/distance/pairwise_distance.cu + src/neighbors/ball_cover.cu src/neighbors/brute_force.cu src/neighbors/cagra_build_float.cu src/neighbors/cagra_build_int8.cu diff --git 
a/cpp/include/cuvs/neighbors/ball_cover.hpp b/cpp/include/cuvs/neighbors/ball_cover.hpp new file mode 100644 index 000000000..1ca588aa2 --- /dev/null +++ b/cpp/include/cuvs/neighbors/ball_cover.hpp @@ -0,0 +1,362 @@ +/* + * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include + +#include + +namespace cuvs::neighbors::ball_cover { + +/** + * @ingroup random_ball_cover + * @{ + */ + +/** + * Stores raw index data points, sampled landmarks, the 1-nns of index points + * to their closest landmarks, and the ball radii of each landmark. This + * class is intended to be constructed once and reused across subsequent + * queries. 
+ * @tparam int64_t + * @tparam float + * @tparam int + */ +template +struct index : cuvs::neighbors::index { + public: + explicit index(raft::resources const& handle_, + raft::device_matrix_view X_, + cuvs::distance::DistanceType metric_) + : handle(handle_), + X(X_), + m(X_.extent(0)), + n(X_.extent(1)), + metric(metric_), + /** + * the sqrt() here makes the sqrt(m)^2 a linear-time lower bound + * + * Total memory footprint of index: (2 * sqrt(m)) + (n * sqrt(m)) + (2 * m) + */ + n_landmarks(sqrt(X_.extent(0))), + R_indptr(raft::make_device_vector(handle, sqrt(X_.extent(0)) + 1)), + R_1nn_cols(raft::make_device_vector(handle, X_.extent(0))), + R_1nn_dists(raft::make_device_vector(handle, X_.extent(0))), + R_closest_landmark_dists(raft::make_device_vector(handle, X_.extent(0))), + R(raft::make_device_matrix(handle, sqrt(X_.extent(0)), X_.extent(1))), + X_reordered( + raft::make_device_matrix(handle, X_.extent(0), X_.extent(1))), + R_radius(raft::make_device_vector(handle, sqrt(X_.extent(0)))), + index_trained(false) + { + } + + auto get_R_indptr() const -> raft::device_vector_view + { + return R_indptr.view(); + } + auto get_R_1nn_cols() const -> raft::device_vector_view + { + return R_1nn_cols.view(); + } + auto get_R_1nn_dists() const -> raft::device_vector_view + { + return R_1nn_dists.view(); + } + auto get_R_radius() const -> raft::device_vector_view + { + return R_radius.view(); + } + auto get_R() const -> raft::device_matrix_view + { + return R.view(); + } + auto get_R_closest_landmark_dists() const -> raft::device_vector_view + { + return R_closest_landmark_dists.view(); + } + auto get_X_reordered() const + -> raft::device_matrix_view + { + return X_reordered.view(); + } + + raft::device_vector_view get_R_indptr() { return R_indptr.view(); } + raft::device_vector_view get_R_1nn_cols() { return R_1nn_cols.view(); } + raft::device_vector_view get_R_1nn_dists() { return R_1nn_dists.view(); } + raft::device_vector_view get_R_radius() { return R_radius.view(); 
} + raft::device_matrix_view get_R() { return R.view(); } + raft::device_vector_view get_R_closest_landmark_dists() + { + return R_closest_landmark_dists.view(); + } + raft::device_matrix_view get_X_reordered() + { + return X_reordered.view(); + } + raft::device_matrix_view get_X() const { return X; } + + cuvs::distance::DistanceType get_metric() const { return metric; } + + int get_n_landmarks() const { return n_landmarks; } + bool is_index_trained() const { return index_trained; }; + + // This should only be set by internal functions + void set_index_trained() { index_trained = true; } + + raft::resources const& handle; + + int_t m; + int_t n; + int_t n_landmarks; + + raft::device_matrix_view X; + + cuvs::distance::DistanceType metric; + + private: + // CSR storing the neighborhoods for each data point + raft::device_vector R_indptr; + raft::device_vector R_1nn_cols; + raft::device_vector R_1nn_dists; + raft::device_vector R_closest_landmark_dists; + + raft::device_vector R_radius; + + raft::device_matrix R; + raft::device_matrix X_reordered; + + protected: + bool index_trained; +}; + +/** @} */ + +/** + * @defgroup random_ball_cover Random Ball Cover algorithm + * @{ + */ + +/** + * Builds and populates a previously unbuilt cuvs::neighbors::ball_cover::index + * + * Usage example: + * @code{.cpp} + * + * #include + * #include + * #include + * using namespace raft::neighbors; + * + * raft::resources handle; + * ... 
+ * auto metric = cuvs::distance::DistanceType::L2SqrtExpanded; + * cuvs::neighbors::ball_cover::index index(handle, X, metric); + * + * ball_cover::build(handle, index); + * @endcode + * + * @param[in] handle library resource management handle + * @param[inout] index an empty (and not previously built) instance of + * cuvs::neighbors::ball_cover::index + */ +void build(raft::resources const& handle, index& index); + +/** @} */ // end group random_ball_cover + +/** + * @ingroup random_ball_cover + * @{ + */ + +/** + * Performs a faster exact knn in metric spaces using the triangle + * inequality with a number of landmark points to reduce the + * number of distance computations from O(n^2) to O(sqrt(n)). This + * performs an all neighbors knn, which can reuse memory when + * the index and query are the same array. This function will + * build the index and assumes build() has not already + * been called. + * + * Usage example: + * @code{.cpp} + * + * #include + * #include + * #include + * using namespace cuvs::neighbors; + * + * raft::resources handle; + * ... + * auto metric = cuvs::distance::DistanceType::L2SqrtExpanded; + * + * // Construct a ball cover index + * cuvs::neighbors::ball_cover::index index(handle, X, metric); + * + * // Perform all neighbors knn query + * ball_cover::all_knn_query(handle, index, inds, dists, k); + * @endcode + * + * @param[in] handle raft handle for resource management + * @param[inout] index ball cover index which has not yet been built + * @param[out] inds output knn indices + * @param[out] dists output knn distances + * @param[in] k number of nearest neighbors to find + * @param[in] perform_post_filtering if this is false, only the closest k landmarks + * are considered (which will return approximate + * results). + * @param[in] weight a weight for overlap between the closest landmark and + * the radius of other landmarks when pruning distances.
+ * Setting this value below 1 can effectively turn off + * computing distances against many other balls, enabling + * approximate nearest neighbors. Recall can be adjusted + * based on how many relevant balls are ignored. Note that + * many datasets can still have great recall even by only + * looking in the closest landmark. + */ +void all_knn_query(raft::resources const& handle, + index& index, + raft::device_matrix_view inds, + raft::device_matrix_view dists, + int k, + bool perform_post_filtering = true, + float weight = 1.0); + +/** @} */ + +/** + * @brief Computes epsilon neighborhood for the L2 distance metric using rbc + * + * @param[in] handle raft handle for resource management + * @param[in] index ball cover index which has been built + * @param[out] adj adjacency matrix [row-major] [on device] [dim = m x n] + * @param[out] vd vertex degree array [on device] [len = m + 1] + * `vd + m` stores the total number of edges in the adjacency + * matrix. Pass a nullptr if you don't need this info. + * @param[in] query first matrix [row-major] [on device] [dim = m x k] + * @param[in] eps defines epsilon neighborhood radius + */ +void eps_nn(raft::resources const& handle, + const index& index, + raft::device_matrix_view adj, + raft::device_vector_view vd, + raft::device_matrix_view query, + float eps); +/** + * @brief Computes epsilon neighborhood for the L2 distance metric using rbc + * + * @param[in] handle raft handle for resource management + * @param[in] index ball cover index which has been built + * @param[out] adj_ia adjacency matrix CSR row offsets + * @param[out] adj_ja adjacency matrix CSR column indices, needs to be nullptr + * in first pass with max_k nullopt + * @param[out] vd vertex degree array [on device] [len = m + 1] + * `vd + m` stores the total number of edges in the adjacency + * matrix. Pass a nullptr if you don't need this info. 
+ * @param[in] query first matrix [row-major] [on device] [dim = m x k] + * @param[in] eps defines epsilon neighborhood radius + * @param[inout] max_k if nullopt (default), the user needs to make 2 subsequent calls: + * The first call computes row offsets in adj_ia, where adj_ia[m] + * contains the minimum required size for adj_ja. + * The second call fills in adj_ja based on adj_ia. + * If max_k != nullopt the algorithm only fills up neighbors up to a + * maximum number of max_k for each row in a single pass. Note + * that it is not guaranteed to return the nearest neighbors. + * Upon return max_k is overwritten with the actual max_k found during + * computation. + */ +void eps_nn(raft::resources const& handle, + const index& index, + raft::device_vector_view adj_ia, + raft::device_vector_view adj_ja, + raft::device_vector_view vd, + raft::device_matrix_view query, + float eps, + std::optional> max_k = std::nullopt); + +/** + * @ingroup random_ball_cover + * @{ + */ + +/** + * Performs a faster exact knn in metric spaces using the triangle + * inequality with a number of landmark points to reduce the + * number of distance computations from O(n^2) to O(sqrt(n)). This + * function does not build the index and assumes build() has + * already been called. Use this function when the index and + * query arrays are different, otherwise use all_knn_query(). + * + * Usage example: + * @code{.cpp} + * + * #include + * #include + * #include + * using namespace cuvs::neighbors; + * + * raft::resources handle; + * ...
+ * auto metric = cuvs::distance::DistanceType::L2SqrtExpanded; + * + * // Build a ball cover index + * cuvs::neighbors::ball_cover::index index(handle, X, metric); + * ball_cover::build(handle, index); + * + * // Perform knn query + * ball_cover::knn_query(handle, index, query, inds, dists, k); + * @endcode + * @param[in] handle raft handle for resource management + * @param[in] index ball cover index which has already been built + * @param[in] query device matrix containing query data points + * @param[out] inds output knn indices + * @param[out] dists output knn distances + * @param[in] k number of nearest neighbors to find + * @param[in] perform_post_filtering if this is false, only the closest k landmarks + * are considered (which will return approximate + * results). + * @param[in] weight a weight for overlap between the closest landmark and + * the radius of other landmarks when pruning distances. + * Setting this value below 1 can effectively turn off + * computing distances against many other balls, enabling + * approximate nearest neighbors. Recall can be adjusted + * based on how many relevant balls are ignored. Note that + * many datasets can still have great recall even by only + * looking in the closest landmark.
+ */ +void knn_query(raft::resources const& handle, + const index& index, + raft::device_matrix_view query, + raft::device_matrix_view inds, + raft::device_matrix_view dists, + int k, + bool perform_post_filtering = true, + float weight = 1.0); + +/** @} */ + +} // namespace cuvs::neighbors::ball_cover diff --git a/cpp/include/cuvs/neighbors/brute_force.hpp b/cpp/include/cuvs/neighbors/brute_force.hpp index 13a5ea0cb..d9e72bdac 100644 --- a/cpp/include/cuvs/neighbors/brute_force.hpp +++ b/cpp/include/cuvs/neighbors/brute_force.hpp @@ -194,12 +194,13 @@ auto build(raft::resources const& handle, * @param[in] sample_filter a optional device bitmap filter function that greenlights samples for a * given */ -void search(raft::resources const& handle, - const cuvs::neighbors::brute_force::index& index, - raft::device_matrix_view queries, - raft::device_matrix_view neighbors, - raft::device_matrix_view distances, - std::optional> sample_filter); +void search( + raft::resources const& handle, + const cuvs::neighbors::brute_force::index& index, + raft::device_matrix_view queries, + raft::device_matrix_view neighbors, + raft::device_matrix_view distances, + std::optional> sample_filter = std::nullopt); /** * @} */ diff --git a/cpp/include/cuvs/neighbors/ivf_pq.hpp b/cpp/include/cuvs/neighbors/ivf_pq.hpp index f38b6cbc4..ce102eb46 100644 --- a/cpp/include/cuvs/neighbors/ivf_pq.hpp +++ b/cpp/include/cuvs/neighbors/ivf_pq.hpp @@ -107,7 +107,7 @@ struct index_params : cuvs::neighbors::index_params { * // create index_params for a [N. 
D] dataset and have InnerProduct as the distance metric * auto dataset = raft::make_device_matrix(res, N, D); * ivf_pq::index_params index_params = - * ivf_pq::index_params::from_dataset(dataset.extents(), raft::distance::InnerProduct); + * ivf_pq::index_params::from_dataset(dataset.extents(), cuvs::distance::InnerProduct); * // modify/update index_params as needed * index_params.add_data_on_build = true; * @endcode diff --git a/cpp/src/neighbors/ball_cover.cu b/cpp/src/neighbors/ball_cover.cu new file mode 100644 index 000000000..84402bb4e --- /dev/null +++ b/cpp/src/neighbors/ball_cover.cu @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "ball_cover.cuh" +#include + +namespace cuvs::neighbors::ball_cover { + +void build(raft::resources const& handle, + cuvs::neighbors::ball_cover::index& index) +{ + detail::build_index(handle, index); +} + +void all_knn_query(raft::resources const& handle, + cuvs::neighbors::ball_cover::index& index, + raft::device_matrix_view inds, + raft::device_matrix_view dists, + int64_t k, + bool perform_post_filtering, + float weight) +{ + detail::all_knn_query( + handle, index, inds, dists, k, perform_post_filtering, weight); +} + +void eps_nn(raft::resources const& handle, + const cuvs::neighbors::ball_cover::index& index, + raft::device_matrix_view adj, + raft::device_vector_view vd, + raft::device_matrix_view query, + float eps) +{ + detail::eps_nn(handle, index, adj, vd, query, eps); +} + +void eps_nn(raft::resources const& handle, + const cuvs::neighbors::ball_cover::index& index, + raft::device_vector_view adj_ia, + raft::device_vector_view adj_ja, + raft::device_vector_view vd, + raft::device_matrix_view query, + float eps, + std::optional> max_k) +{ + detail::eps_nn( + handle, index, adj_ia, adj_ja, vd, query, eps, max_k); +} + +void knn_query(raft::resources const& handle, + const cuvs::neighbors::ball_cover::index& index, + raft::device_matrix_view query, + raft::device_matrix_view inds, + raft::device_matrix_view dists, + int64_t k, + bool perform_post_filtering, + float weight) +{ + detail::knn_query( + handle, index, query, inds, dists, k, perform_post_filtering, weight); +} + +} // namespace cuvs::neighbors::ball_cover \ No newline at end of file diff --git a/cpp/src/neighbors/ball_cover.cuh b/cpp/src/neighbors/ball_cover.cuh new file mode 100644 index 000000000..4e06881a4 --- /dev/null +++ b/cpp/src/neighbors/ball_cover.cuh @@ -0,0 +1,492 @@ +/* + * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "ball_cover/ball_cover.cuh" +#include "ball_cover/common.cuh" +#include +#include + +#include + +#include + +namespace cuvs::neighbors::ball_cover::detail { + +/** + * @defgroup random_ball_cover Random Ball Cover algorithm + * @{ + */ + +/** + * Builds and populates a previously unbuilt cuvs::neighbors::ball_cover::index + * + * Usage example: + * @code{.cpp} + * + * #include + * #include + * #include + * using namespace cuvs::neighbors; + * + * raft::resources handle; + * ... + * auto metric = cuvs::distance::DistanceType::L2SqrtExpanded; + * cuvs::neighbors::ball_cover::index index(handle, X, metric); + * + * ball_cover::build_index(handle, index); + * @endcode + * + * @tparam idx_t knn index type + * @tparam value_t knn value type + * @tparam int_t integral type for knn params + * @tparam matrix_idx_t matrix indexing type + * @param[in] handle library resource management handle + * @param[inout] index an empty (and not previously built) instance of + * cuvs::neighbors::ball_cover::index + */ +template +void build_index(raft::resources const& handle, + cuvs::neighbors::ball_cover::index& index) +{ + if (index.metric == cuvs::distance::DistanceType::Haversine) { + cuvs::neighbors::detail::rbc_build_index( + handle, index, cuvs::neighbors::detail::HaversineFunc()); + } else if (index.metric == cuvs::distance::DistanceType::L2SqrtExpanded || + index.metric == cuvs::distance::DistanceType::L2SqrtUnexpanded) { + cuvs::neighbors::detail::rbc_build_index( + handle, index, cuvs::neighbors::detail::EuclideanFunc()); + } else { +
RAFT_FAIL("Metric not support"); + } + + index.set_index_trained(); +} + +/** @} */ // end group random_ball_cover + +/** + * Performs a faster exact knn in metric spaces using the triangle + * inequality with a number of landmark points to reduce the + * number of distance computations from O(n^2) to O(sqrt(n)). This + * performs an all neighbors knn, which can reuse memory when + * the index and query are the same array. This function will + * build the index and assumes rbc_build_index() has not already + * been called. + * @tparam idx_t knn index type + * @tparam value_t knn distance type + * @tparam int_t type for integers, such as number of rows/cols + * @param[in] handle raft handle for resource management + * @param[inout] index ball cover index which has not yet been built + * @param[in] k number of nearest neighbors to find + * @param[in] perform_post_filtering if this is false, only the closest k landmarks + * are considered (which will return approximate + * results). + * @param[out] inds output knn indices + * @param[out] dists output knn distances + * @param[in] weight a weight for overlap between the closest landmark and + * the radius of other landmarks when pruning distances. + * Setting this value below 1 can effectively turn off + * computing distances against many other balls, enabling + * approximate nearest neighbors. Recall can be adjusted + * based on how many relevant balls are ignored. Note that + * many datasets can still have great recall even by only + * looking in the closest landmark. 
+ */ +template +void all_knn_query(raft::resources const& handle, + cuvs::neighbors::ball_cover::index& index, + int_t k, + idx_t* inds, + value_t* dists, + bool perform_post_filtering = true, + float weight = 1.0) +{ + ASSERT(index.n <= 3, "only 2d and 3d vectors are supported in current implementation"); + if (index.metric == cuvs::distance::DistanceType::Haversine) { + cuvs::neighbors::detail::rbc_all_knn_query( + handle, + index, + k, + inds, + dists, + cuvs::neighbors::detail::HaversineFunc(), + perform_post_filtering, + weight); + } else if (index.metric == cuvs::distance::DistanceType::L2SqrtExpanded || + index.metric == cuvs::distance::DistanceType::L2SqrtUnexpanded) { + cuvs::neighbors::detail::rbc_all_knn_query( + handle, + index, + k, + inds, + dists, + cuvs::neighbors::detail::EuclideanFunc(), + perform_post_filtering, + weight); + } else { + RAFT_FAIL("Metric not supported"); + } + + index.set_index_trained(); +} + +/** + * @ingroup random_ball_cover + * @{ + */ + +/** + * Performs a faster exact knn in metric spaces using the triangle + * inequality with a number of landmark points to reduce the + * number of distance computations from O(n^2) to O(sqrt(n)). This + * performs an all neighbors knn, which can reuse memory when + * the index and query are the same array. This function will + * build the index and assumes rbc_build_index() has not already + * been called. + * + * Usage example: + * @code{.cpp} + * + * #include + * #include + * #include + * using namespace raft::neighbors; + * + * raft::resources handle; + * ... 
+ * auto metric = cuvs::distance::DistanceType::L2SqrtExpanded; + * + * // Construct a ball cover index + * cuvs::neighbors::ball_cover::index index(handle, X, metric); + * + * // Perform all neighbors knn query + * ball_cover::all_knn_query(handle, index, inds, dists, k); + * @endcode + * + * @tparam idx_t knn index type + * @tparam value_t knn distance type + * @tparam int_t type for integers, such as number of rows/cols + * @tparam matrix_idx_t matrix indexing type + * + * @param[in] handle raft handle for resource management + * @param[inout] index ball cover index which has not yet been built + * @param[out] inds output knn indices + * @param[out] dists output knn distances + * @param[in] k number of nearest neighbors to find + * @param[in] perform_post_filtering if this is false, only the closest k landmarks + * are considered (which will return approximate + * results). + * @param[in] weight a weight for overlap between the closest landmark and + * the radius of other landmarks when pruning distances. + * Setting this value below 1 can effectively turn off + * computing distances against many other balls, enabling + * approximate nearest neighbors. Recall can be adjusted + * based on how many relevant balls are ignored. Note that + * many datasets can still have great recall even by only + * looking in the closest landmark.
+ */ +template +void all_knn_query(raft::resources const& handle, + cuvs::neighbors::ball_cover::index& index, + raft::device_matrix_view inds, + raft::device_matrix_view dists, + int_t k, + bool perform_post_filtering = true, + float weight = 1.0) +{ + RAFT_EXPECTS(index.n <= 3, "only 2d and 3d vectors are supported in current implementation"); + RAFT_EXPECTS(k <= index.m, + "k must be less than or equal to the number of data points in the index"); + RAFT_EXPECTS(inds.extent(1) == dists.extent(1) && dists.extent(1) == static_cast(k), + "Number of columns in output indices and distances matrices must be equal to k"); + + RAFT_EXPECTS(inds.extent(0) == dists.extent(0) && dists.extent(0) == index.get_X().extent(0), + "Number of rows in output indices and distances matrices must equal number of rows " + "in index matrix."); + + all_knn_query( + handle, index, k, inds.data_handle(), dists.data_handle(), perform_post_filtering, weight); +} + +/** @} */ + +/** + * Performs a faster exact knn in metric spaces using the triangle + * inequality with a number of landmark points to reduce the + * number of distance computations from O(n^2) to O(sqrt(n)). This + * function does not build the index and assumes rbc_build_index() has + * already been called. Use this function when the index and + * query arrays are different, otherwise use rbc_all_knn_query(). + * @tparam idx_t index type + * @tparam value_t distances type + * @tparam int_t integer type for size info + * @param[in] handle raft handle for resource management + * @param[in] index ball cover index which has already been built + * @param[in] k number of nearest neighbors to find + * @param[in] query the query data points + * @param[in] perform_post_filtering if this is false, only the closest k landmarks + * are considered (which will return approximate + * results).
+ * @param[out] inds output knn indices + * @param[out] dists output knn distances + * @param[in] weight a weight for overlap between the closest landmark and + * the radius of other landmarks when pruning distances. + * Setting this value below 1 can effectively turn off + * computing distances against many other balls, enabling + * approximate nearest neighbors. Recall can be adjusted + * based on how many relevant balls are ignored. Note that + * many datasets can still have great recall even by only + * looking in the closest landmark. + * @param[in] n_query_pts number of query points + */ +template +void knn_query(raft::resources const& handle, + const cuvs::neighbors::ball_cover::index& index, + int_t k, + const value_t* query, + int_t n_query_pts, + idx_t* inds, + value_t* dists, + bool perform_post_filtering = true, + float weight = 1.0) +{ + ASSERT(index.n <= 3, "only 2d and 3d vectors are supported in current implementation"); + if (index.metric == cuvs::distance::DistanceType::Haversine) { + cuvs::neighbors::detail::rbc_knn_query(handle, + index, + k, + query, + n_query_pts, + inds, + dists, + cuvs::neighbors::detail::HaversineFunc(), + perform_post_filtering, + weight); + } else if (index.metric == cuvs::distance::DistanceType::L2SqrtExpanded || + index.metric == cuvs::distance::DistanceType::L2SqrtUnexpanded) { + cuvs::neighbors::detail::rbc_knn_query(handle, + index, + k, + query, + n_query_pts, + inds, + dists, + cuvs::neighbors::detail::EuclideanFunc(), + perform_post_filtering, + weight); + } else { + RAFT_FAIL("Metric not supported"); + } +} + +/** + * @brief Computes epsilon neighborhood for the L2 distance metric using rbc + * + * @tparam value_t IO and math type + * @tparam idx_t Index type + * + * @param[in] handle raft handle for resource management + * @param[in] index ball cover index which has been built + * @param[out] adj adjacency matrix [row-major] [on device] [dim = m x n] + * @param[out] vd vertex degree array [on device] [len = m + 
1] + * `vd + m` stores the total number of edges in the adjacency + * matrix. Pass a nullptr if you don't need this info. + * @param[in] query first matrix [row-major] [on device] [dim = m x k] + * @param[in] eps defines epsilon neighborhood radius + */ +template +void eps_nn(raft::resources const& handle, + const cuvs::neighbors::ball_cover::index& index, + raft::device_matrix_view adj, + raft::device_vector_view vd, + raft::device_matrix_view query, + value_t eps) +{ + ASSERT(index.n == query.extent(1), "vector dimension needs to be the same for index and queries"); + ASSERT(index.metric == cuvs::distance::DistanceType::L2SqrtExpanded || + index.metric == cuvs::distance::DistanceType::L2SqrtUnexpanded, + "Metric not supported"); + ASSERT(index.is_index_trained(), "index must be previously trained"); + + // run query + cuvs::neighbors::detail::rbc_eps_nn_query( + handle, + index, + eps, + query.data_handle(), + query.extent(0), + adj.data_handle(), + vd.data_handle(), + cuvs::neighbors::detail::EuclideanSqFunc()); +} + +/** + * @brief Computes epsilon neighborhood for the L2 distance metric using rbc + * + * @tparam value_t IO and math type + * @tparam idx_t Index type + * + * @param[in] handle raft handle for resource management + * @param[in] index ball cover index which has been built + * @param[out] adj_ia adjacency matrix CSR row offsets + * @param[out] adj_ja adjacency matrix CSR column indices, needs to be nullptr + * in first pass with max_k nullopt + * @param[out] vd vertex degree array [on device] [len = m + 1] + * `vd + m` stores the total number of edges in the adjacency + * matrix. Pass a nullptr if you don't need this info. 
+ * @param[in] query first matrix [row-major] [on device] [dim = m x k] + * @param[in] eps defines epsilon neighborhood radius + * @param[inout] max_k if nullopt (default), the user needs to make 2 subsequent calls: + * The first call computes row offsets in adj_ia, where adj_ia[m] + * contains the minimum required size for adj_ja. + * The second call fills in adj_ja based on adj_ia. + * If max_k != nullopt the algorithm only fills up neighbors up to a + * maximum number of max_k for each row in a single pass. Note + * that it is not guaranteed to return the nearest neighbors. + * Upon return max_k is overwritten with the actual max_k found during + * computation. + */ +template +void eps_nn(raft::resources const& handle, + const cuvs::neighbors::ball_cover::index& index, + raft::device_vector_view adj_ia, + raft::device_vector_view adj_ja, + raft::device_vector_view vd, + raft::device_matrix_view query, + value_t eps, + std::optional> max_k = std::nullopt) +{ + ASSERT(index.n == query.extent(1), "vector dimension needs to be the same for index and queries"); + ASSERT(index.metric == cuvs::distance::DistanceType::L2SqrtExpanded || + index.metric == cuvs::distance::DistanceType::L2SqrtUnexpanded, + "Metric not supported"); + ASSERT(index.is_index_trained(), "index must be previously trained"); + + int_t* max_k_ptr = nullptr; + if (max_k.has_value()) { max_k_ptr = max_k.value().data_handle(); } + + // run query + cuvs::neighbors::detail::rbc_eps_nn_query( + handle, + index, + eps, + max_k_ptr, + query.data_handle(), + query.extent(0), + adj_ia.data_handle(), + adj_ja.data_handle(), + vd.data_handle(), + cuvs::neighbors::detail::EuclideanSqFunc()); +} + +/** + * @ingroup random_ball_cover + * @{ + */ + +/** + * Performs a faster exact knn in metric spaces using the triangle + * inequality with a number of landmark points to reduce the + * number of distance computations from O(n^2) to O(sqrt(n)). 
This + * function does not build the index and assumes rbc_build_index() has + * already been called. Use this function when the index and + * query arrays are different, otherwise use rbc_all_knn_query(). + * + * Usage example: + * @code{.cpp} + * + * #include + * #include + * #include + * using namespace cuvs::neighbors; + * + * raft::resources handle; + * ... + * auto metric = cuvs::distance::DistanceType::L2SqrtExpanded; + * + * // Build a ball cover index + * cuvs::neighbors::ball_cover::index index(handle, X, metric); + * ball_cover::build_index(handle, index); + * + * // Query the built index with a separate set of query points + * ball_cover::knn_query(handle, index, query, inds, dists, k); + * @endcode + + * + * @tparam idx_t index type + * @tparam value_t distances type + * @tparam int_t integer type for size info + * @tparam matrix_idx_t + * @param[in] handle raft handle for resource management + * @param[in] index ball cover index which has already been built + * @param[in] query device matrix containing query data points + * @param[out] inds output knn indices + * @param[out] dists output knn distances + * @param[in] k number of nearest neighbors to find + * @param[in] perform_post_filtering if this is false, only the closest k landmarks + * are considered (which will return approximate + * results). + * @param[in] weight a weight for overlap between the closest landmark and + * the radius of other landmarks when pruning distances. + * Setting this value below 1 can effectively turn off + * computing distances against many other balls, enabling + * approximate nearest neighbors. Recall can be adjusted + * based on how many relevant balls are ignored. Note that + * many datasets can still have great recall even by only + * looking in the closest landmark. 
+ */ +template +void knn_query(raft::resources const& handle, + const cuvs::neighbors::ball_cover::index& index, + raft::device_matrix_view query, + raft::device_matrix_view inds, + raft::device_matrix_view dists, + int_t k, + bool perform_post_filtering = true, + float weight = 1.0) +{ + RAFT_EXPECTS(k <= index.m, + "k must be less than or equal to the number of data points in the index"); + RAFT_EXPECTS(inds.extent(1) == dists.extent(1) && dists.extent(1) == static_cast(k), + "Number of columns in output indices and distances matrices must be equal to k"); + + RAFT_EXPECTS(inds.extent(0) == dists.extent(0) && dists.extent(0) == query.extent(0), + "Number of rows in output indices and distances matrices must equal number of rows " + "in search matrix."); + + RAFT_EXPECTS(query.extent(1) == index.get_X().extent(1), + "Number of columns in query and index matrices must match."); + + knn_query(handle, + index, + k, + query.data_handle(), + (int_t)query.extent(0), + inds.data_handle(), + dists.data_handle(), + perform_post_filtering, + weight); +} + +/** @} */ + +// TODO: implement functions for: +// 4. rbc_eps_neigh() - given a populated index, perform query against different query array +// 5. rbc_all_eps_neigh() - populate a cuvs::neighbors::ball_cover::index and query against +// training data + +} // namespace cuvs::neighbors::ball_cover::detail \ No newline at end of file diff --git a/cpp/src/neighbors/ball_cover/ball_cover.cuh b/cpp/src/neighbors/ball_cover/ball_cover.cuh new file mode 100644 index 000000000..8b03a18e6 --- /dev/null +++ b/cpp/src/neighbors/ball_cover/ball_cover.cuh @@ -0,0 +1,718 @@ +/* + * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "../detail/haversine_distance.cuh" +#include "common.cuh" +#include "registers.cuh" +#include "registers_types.cuh" +#include + +#include "../faiss_select/key_value_block_select.cuh" +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +namespace cuvs::neighbors::detail { + +/** + * Given a set of points in row-major order which are to be + * used as a set of index points, uniformly samples a subset + * of points to be used as landmarks. + * @tparam value_idx + * @tparam value_t + * @param handle + * @param index + */ +template +void sample_landmarks( + raft::resources const& handle, + cuvs::neighbors::ball_cover::index& index) +{ + rmm::device_uvector R_1nn_cols2(index.n_landmarks, + raft::resource::get_cuda_stream(handle)); + rmm::device_uvector R_1nn_ones(index.m, raft::resource::get_cuda_stream(handle)); + rmm::device_uvector R_indices(index.n_landmarks, + raft::resource::get_cuda_stream(handle)); + + thrust::sequence(raft::resource::get_thrust_policy(handle), + index.get_R_1nn_cols().data_handle(), + index.get_R_1nn_cols().data_handle() + index.m, + (value_idx)0); + + thrust::fill(raft::resource::get_thrust_policy(handle), + R_1nn_ones.data(), + R_1nn_ones.data() + R_1nn_ones.size(), + 1.0); + + thrust::fill(raft::resource::get_thrust_policy(handle), + R_indices.data(), + R_indices.data() + R_indices.size(), + 0.0); + + /** + * 1. 
Randomly sample sqrt(n) points from X + */ + raft::random::RngState rng_state(12345); + raft::random::sampleWithoutReplacement(handle, + rng_state, + R_indices.data(), + R_1nn_cols2.data(), + index.get_R_1nn_cols().data_handle(), + R_1nn_ones.data(), + (value_idx)index.n_landmarks, + (value_idx)index.m); + + auto x = index.get_X(); + auto r = index.get_R(); + + raft::matrix::copy_rows( + handle, + raft::make_device_matrix_view( + x.data_handle(), x.extent(0), x.extent(1)), + raft::make_device_matrix_view(r.data_handle(), r.extent(0), r.extent(1)), + raft::make_device_vector_view(R_1nn_cols2.data(), index.n_landmarks)); +} + +/** + * Constructs a 1-nn index mapping each landmark to their closest points. + * @tparam value_idx + * @tparam value_t + * @param handle + * @param R_knn_inds_ptr + * @param R_knn_dists_ptr + * @param k + * @param index + */ +template +void construct_landmark_1nn( + raft::resources const& handle, + const value_idx* R_knn_inds_ptr, + const value_t* R_knn_dists_ptr, + value_int k, + cuvs::neighbors::ball_cover::index& index) +{ + rmm::device_uvector R_1nn_inds(index.m, raft::resource::get_cuda_stream(handle)); + + thrust::fill(raft::resource::get_thrust_policy(handle), + R_1nn_inds.data(), + R_1nn_inds.data() + index.m, + std::numeric_limits::max()); + + value_idx* R_1nn_inds_ptr = R_1nn_inds.data(); + value_t* R_1nn_dists_ptr = index.get_R_1nn_dists().data_handle(); + + auto idxs = thrust::make_counting_iterator(0); + thrust::for_each( + raft::resource::get_thrust_policy(handle), idxs, idxs + index.m, [=] __device__(value_idx i) { + R_1nn_inds_ptr[i] = R_knn_inds_ptr[i * k]; + R_1nn_dists_ptr[i] = R_knn_dists_ptr[i * k]; + }); + + auto keys = thrust::make_zip_iterator( + thrust::make_tuple(R_1nn_inds.data(), index.get_R_1nn_dists().data_handle())); + + // group neighborhoods for each reference landmark and sort each group by distance + thrust::sort_by_key(raft::resource::get_thrust_policy(handle), + keys, + keys + index.m, + 
index.get_R_1nn_cols().data_handle(), + NNComp()); + + // convert to CSR for fast lookup + raft::sparse::convert::sorted_coo_to_csr(R_1nn_inds.data(), + index.m, + index.get_R_indptr().data_handle(), + index.n_landmarks + 1, + raft::resource::get_cuda_stream(handle)); + + // reorder X to allow aligned access + raft::matrix::copy_rows( + handle, index.get_X(), index.get_X_reordered(), index.get_R_1nn_cols()); +} + +/** + * Computes the k closest landmarks to a set of query points. + * @tparam value_idx + * @tparam value_t + * @tparam value_int + * @param handle + * @param index + * @param query_pts + * @param n_query_pts + * @param k + * @param R_knn_inds + * @param R_knn_dists + */ +template +void k_closest_landmarks( + raft::resources const& handle, + const cuvs::neighbors::ball_cover::index& index, + const value_t* query_pts, + value_int n_query_pts, + value_int k, + value_idx* R_knn_inds, + value_t* R_knn_dists) +{ + raft::device_matrix_view inputs = index.get_R(); + + auto bfknn = cuvs::neighbors::brute_force::build(handle, inputs, index.get_metric()); + cuvs::neighbors::brute_force::search( + handle, + bfknn, + raft::make_device_matrix_view(query_pts, n_query_pts, inputs.extent(1)), + raft::make_device_matrix_view(R_knn_inds, n_query_pts, k), + raft::make_device_matrix_view(R_knn_dists, n_query_pts, k)); +} + +/** + * Uses the sorted data points in the 1-nn landmark index to compute + * an array of radii for each landmark. 
+ * @tparam value_idx + * @tparam value_t + * @param handle + * @param index + */ +template +void compute_landmark_radii( + raft::resources const& handle, + cuvs::neighbors::ball_cover::index& index) +{ + auto entries = thrust::make_counting_iterator(0); + + const value_idx* R_indptr_ptr = index.get_R_indptr().data_handle(); + const value_t* R_1nn_dists_ptr = index.get_R_1nn_dists().data_handle(); + value_t* R_radius_ptr = index.get_R_radius().data_handle(); + thrust::for_each(raft::resource::get_thrust_policy(handle), + entries, + entries + index.n_landmarks, + [=] __device__(value_idx input) { + value_idx last_row_idx = R_indptr_ptr[input + 1] - 1; + R_radius_ptr[input] = R_1nn_dists_ptr[last_row_idx]; + }); +} + +/** + * 4. Perform k-select over original KNN, using L_r to filter distances + * + * a. Map 1 row to each warp/block + * b. Add closest k R points to heap + * c. Iterate through batches of R, having each thread in the warp load a set + * of distances y from R (only if d(q, r) < 3 * distance to closest r) and + * marking the distance to be computed between x, y only + * if knn[k].distance >= d(x_i, R_k) + d(R_k, y) + */ +template +void perform_rbc_query( + raft::resources const& handle, + const cuvs::neighbors::ball_cover::index& index, + const value_t* query, + value_int n_query_pts, + value_int k, + const value_idx* R_knn_inds, + const value_t* R_knn_dists, + dist_func dfunc, + value_idx* inds, + value_t* dists, + value_int* dists_counter, + value_int* post_dists_counter, + float weight = 1.0, + bool perform_post_filtering = true) +{ + // initialize output inds and dists + thrust::fill(raft::resource::get_thrust_policy(handle), + inds, + inds + (k * n_query_pts), + std::numeric_limits::max()); + thrust::fill(raft::resource::get_thrust_policy(handle), + dists, + dists + (k * n_query_pts), + std::numeric_limits::max()); + + if (index.n == 2) { + // Compute nearest k for each neighborhood in each closest R + rbc_low_dim_pass_one(handle, + index, + query, 
+ n_query_pts, + k, + R_knn_inds, + R_knn_dists, + dfunc, + inds, + dists, + weight, + dists_counter); + + if (perform_post_filtering) { + rbc_low_dim_pass_two(handle, + index, + query, + n_query_pts, + k, + R_knn_inds, + R_knn_dists, + dfunc, + inds, + dists, + weight, + post_dists_counter); + } + + } else if (index.n == 3) { + // Compute nearest k for each neighborhood in each closest R + rbc_low_dim_pass_one(handle, + index, + query, + n_query_pts, + k, + R_knn_inds, + R_knn_dists, + dfunc, + inds, + dists, + weight, + dists_counter); + + if (perform_post_filtering) { + rbc_low_dim_pass_two(handle, + index, + query, + n_query_pts, + k, + R_knn_inds, + R_knn_dists, + dfunc, + inds, + dists, + weight, + post_dists_counter); + } + } +} + +/** + * Perform eps-select + * + */ +template +void perform_rbc_eps_nn_query( + raft::resources const& handle, + const cuvs::neighbors::ball_cover::index& index, + const value_t* query, + value_int n_query_pts, + value_t eps, + const value_t* landmarks, + dist_func dfunc, + bool* adj, + value_idx* vd) +{ + // initialize output + RAFT_CUDA_TRY(cudaMemsetAsync( + adj, 0, index.m * n_query_pts * sizeof(bool), raft::resource::get_cuda_stream(handle))); + + raft::resource::sync_stream(handle); + + rbc_eps_pass( + handle, index, query, n_query_pts, eps, landmarks, dfunc, adj, vd); + + raft::resource::sync_stream(handle); +} + +template +void perform_rbc_eps_nn_query( + raft::resources const& handle, + const cuvs::neighbors::ball_cover::index& index, + const value_t* query, + value_int n_query_pts, + value_t eps, + value_int* max_k, + const value_t* landmarks, + dist_func dfunc, + value_idx* adj_ia, + value_idx* adj_ja, + value_idx* vd) +{ + rbc_eps_pass( + handle, index, query, n_query_pts, eps, max_k, landmarks, dfunc, adj_ia, adj_ja, vd); + + raft::resource::sync_stream(handle); +} + +/** + * Similar to a ball tree, the random ball cover algorithm + * uses the triangle inequality to prune distance computations + * in any metric space 
with a guarantee of sqrt(n) * c^{3/2} + * where `c` is an expansion constant based on the distance + * metric. + * + * This function variant performs an all nearest neighbors + * query which is useful for algorithms that need to perform + * A * A.T. + */ +template +void rbc_build_index( + raft::resources const& handle, + cuvs::neighbors::ball_cover::index& index, + distance_func dfunc) +{ + ASSERT(!index.is_index_trained(), "index cannot be previously trained"); + + rmm::device_uvector R_knn_inds(index.m, raft::resource::get_cuda_stream(handle)); + + // Initialize the uvectors + thrust::fill(raft::resource::get_thrust_policy(handle), + R_knn_inds.begin(), + R_knn_inds.end(), + std::numeric_limits::max()); + thrust::fill(raft::resource::get_thrust_policy(handle), + index.get_R_closest_landmark_dists().data_handle(), + index.get_R_closest_landmark_dists().data_handle() + index.m, + std::numeric_limits::max()); + + /** + * 1. Randomly sample sqrt(n) points from X + */ + sample_landmarks(handle, index); + + /** + * 2. Perform knn = bfknn(X, R, k) + */ + value_int k = 1; + k_closest_landmarks(handle, + index, + index.get_X().data_handle(), + index.m, + k, + R_knn_inds.data(), + index.get_R_closest_landmark_dists().data_handle()); + + /** + * 3. Create L_r = knn[:,0].T (CSR) + * + * Slice closest neighboring R + * Secondary sort by (R_knn_inds, R_knn_dists) + */ + construct_landmark_1nn( + handle, R_knn_inds.data(), index.get_R_closest_landmark_dists().data_handle(), k, index); + + /** + * Compute radius of each R for filtering: p(q, r) <= p(q, q_r) + radius(r) + * (need to take the + */ + compute_landmark_radii(handle, index); +} + +/** + * Performs an all neighbors knn query (e.g. 
index == query) + */ +template +void rbc_all_knn_query( + raft::resources const& handle, + cuvs::neighbors::ball_cover::index& index, + value_int k, + value_idx* inds, + value_t* dists, + distance_func dfunc, + // approximate nn options + bool perform_post_filtering = true, + float weight = 1.0) +{ + ASSERT(index.n <= 3, "only 2d and 3d vectors are supported in current implementation"); + ASSERT(index.n_landmarks >= k, "number of landmark samples must be >= k"); + ASSERT(!index.is_index_trained(), "index cannot be previously trained"); + + rmm::device_uvector R_knn_inds(k * index.m, raft::resource::get_cuda_stream(handle)); + rmm::device_uvector R_knn_dists(k * index.m, raft::resource::get_cuda_stream(handle)); + + // Initialize the uvectors + thrust::fill(raft::resource::get_thrust_policy(handle), + R_knn_inds.begin(), + R_knn_inds.end(), + std::numeric_limits::max()); + thrust::fill(raft::resource::get_thrust_policy(handle), + R_knn_dists.begin(), + R_knn_dists.end(), + std::numeric_limits::max()); + + thrust::fill(raft::resource::get_thrust_policy(handle), + inds, + inds + (k * index.m), + std::numeric_limits::max()); + thrust::fill(raft::resource::get_thrust_policy(handle), + dists, + dists + (k * index.m), + std::numeric_limits::max()); + + // For debugging / verification. 
Remove before releasing + rmm::device_uvector dists_counter(index.m, raft::resource::get_cuda_stream(handle)); + rmm::device_uvector post_dists_counter(index.m, + raft::resource::get_cuda_stream(handle)); + + sample_landmarks(handle, index); + + k_closest_landmarks( + handle, index, index.get_X().data_handle(), index.m, k, R_knn_inds.data(), R_knn_dists.data()); + + construct_landmark_1nn(handle, R_knn_inds.data(), R_knn_dists.data(), k, index); + + compute_landmark_radii(handle, index); + + perform_rbc_query(handle, + index, + index.get_X().data_handle(), + index.m, + k, + R_knn_inds.data(), + R_knn_dists.data(), + dfunc, + inds, + dists, + dists_counter.data(), + post_dists_counter.data(), + weight, + perform_post_filtering); +} + +/** + * Performs a knn query against an index. This assumes the index has + * already been built. + */ +template +void rbc_knn_query( + raft::resources const& handle, + const cuvs::neighbors::ball_cover::index& index, + value_int k, + const value_t* query, + value_int n_query_pts, + value_idx* inds, + value_t* dists, + distance_func dfunc, + // approximate nn options + bool perform_post_filtering = true, + float weight = 1.0) +{ + ASSERT(index.n <= 3, "only 2d and 3d vectors are supported in current implementation"); + ASSERT(index.n_landmarks >= k, "number of landmark samples must be >= k"); + ASSERT(index.is_index_trained(), "index must be previously trained"); + + rmm::device_uvector R_knn_inds(k * n_query_pts, + raft::resource::get_cuda_stream(handle)); + rmm::device_uvector R_knn_dists(k * n_query_pts, + raft::resource::get_cuda_stream(handle)); + + // Initialize the uvectors + thrust::fill(raft::resource::get_thrust_policy(handle), + R_knn_inds.begin(), + R_knn_inds.end(), + std::numeric_limits::max()); + thrust::fill(raft::resource::get_thrust_policy(handle), + R_knn_dists.begin(), + R_knn_dists.end(), + std::numeric_limits::max()); + + thrust::fill(raft::resource::get_thrust_policy(handle), + inds, + inds + (k * n_query_pts), + 
std::numeric_limits::max()); + thrust::fill(raft::resource::get_thrust_policy(handle), + dists, + dists + (k * n_query_pts), + std::numeric_limits::max()); + + k_closest_landmarks(handle, index, query, n_query_pts, k, R_knn_inds.data(), R_knn_dists.data()); + + // For debugging / verification. Remove before releasing + rmm::device_uvector dists_counter(index.m, raft::resource::get_cuda_stream(handle)); + rmm::device_uvector post_dists_counter(index.m, + raft::resource::get_cuda_stream(handle)); + thrust::fill(raft::resource::get_thrust_policy(handle), + post_dists_counter.data(), + post_dists_counter.data() + post_dists_counter.size(), + 0); + thrust::fill(raft::resource::get_thrust_policy(handle), + dists_counter.data(), + dists_counter.data() + dists_counter.size(), + 0); + + perform_rbc_query(handle, + index, + query, + n_query_pts, + k, + R_knn_inds.data(), + R_knn_dists.data(), + dfunc, + inds, + dists, + dists_counter.data(), + post_dists_counter.data(), + weight, + perform_post_filtering); +} + +template +void compute_landmark_dists( + raft::resources const& handle, + const cuvs::neighbors::ball_cover::index& index, + const value_t* query_pts, + value_int n_query_pts, + value_t* R_dists) +{ + // compute distances for all queries against all landmarks + // index.get_R() -- landmark points in row order (index.n_landmarks x index.k) + // query_pts -- query points in row order (n_query_pts x index.k) + RAFT_EXPECTS(std::max(index.n_landmarks, n_query_pts) * index.n < + static_cast(std::numeric_limits::max()), + "Too large input for pairwise_distance with `int` index."); + RAFT_EXPECTS(n_query_pts * static_cast(index.n_landmarks) < + static_cast(std::numeric_limits::max()), + "Too large input for pairwise_distance with `int` index."); + cuvs::distance::pairwise_distance(handle, + query_pts, + index.get_R().data_handle(), + R_dists, + n_query_pts, + index.n_landmarks, + index.n, + index.get_metric()); +} + +/** + * Performs a knn query against an index. 
This assumes the index has + * already been built. + * Modified version that takes an eps as threshold and outputs to a dense adj matrix (row-major) + * we are assuming that there are sufficiently many landmarks + */ +template +void rbc_eps_nn_query( + raft::resources const& handle, + const cuvs::neighbors::ball_cover::index& index, + const value_t eps, + const value_t* query, + value_int n_query_pts, + bool* adj, + value_idx* vd, + distance_func dfunc) +{ + ASSERT(index.is_index_trained(), "index must be previously trained"); + + // query all points and write to adj + perform_rbc_eps_nn_query( + handle, index, query, n_query_pts, eps, index.get_R().data_handle(), dfunc, adj, vd); +} + +template +void rbc_eps_nn_query( + raft::resources const& handle, + const cuvs::neighbors::ball_cover::index& index, + const value_t eps, + value_int* max_k, + const value_t* query, + value_int n_query_pts, + value_idx* adj_ia, + value_idx* adj_ja, + value_idx* vd, + distance_func dfunc) +{ + ASSERT(index.is_index_trained(), "index must be previously trained"); + + // query all points and write to adj + perform_rbc_eps_nn_query(handle, + index, + query, + n_query_pts, + eps, + max_k, + index.get_R().data_handle(), + dfunc, + adj_ia, + adj_ja, + vd); +} + +}; // namespace cuvs::neighbors::detail diff --git a/cpp/src/neighbors/ball_cover/common.cuh b/cpp/src/neighbors/ball_cover/common.cuh new file mode 100644 index 000000000..505c58a11 --- /dev/null +++ b/cpp/src/neighbors/ball_cover/common.cuh @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "../detail/haversine_distance.cuh" +#include "registers_types.cuh" + +#include +#include + +#include + +namespace cuvs::neighbors::detail { + +struct NNComp { + template + __host__ __device__ bool operator()(const one& t1, const two& t2) + { + // sort first by each sample's reference landmark, + if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true; + if (thrust::get<0>(t1) > thrust::get<0>(t2)) return false; + + // then by closest neighbor, + return thrust::get<1>(t1) < thrust::get<1>(t2); + } +}; + +/** + * Zeros the bit at location h in a one-hot encoded 32-bit int array + */ +__device__ inline void _zero_bit(std::uint32_t* arr, std::uint32_t h) +{ + int bit = h % 32; + int idx = h / 32; + + std::uint32_t assumed; + std::uint32_t old = arr[idx]; + do { + assumed = old; + old = atomicCAS(arr + idx, assumed, assumed & ~(1 << bit)); + } while (assumed != old); +} + +/** + * Returns whether or not bit at location h is nonzero in a one-hot + * encoded 32-bit in array. + */ +__device__ inline bool _get_val(std::uint32_t* arr, std::uint32_t h) +{ + int bit = h % 32; + int idx = h / 32; + return (arr[idx] & (1 << bit)) > 0; +} + +}; // namespace cuvs::neighbors::detail diff --git a/cpp/src/neighbors/ball_cover/registers-ext.cuh b/cpp/src/neighbors/ball_cover/registers-ext.cuh new file mode 100644 index 000000000..10ff30a1f --- /dev/null +++ b/cpp/src/neighbors/ball_cover/registers-ext.cuh @@ -0,0 +1,205 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "registers_types.cuh" // DistFunc +#include // cuvs::neighbors::ball_cover::index + +#include //RAFT_EXPLICIT + +#include // uint32_t + +#if defined(RAFT_EXPLICIT_INSTANTIATE_ONLY) + +namespace cuvs::neighbors::detail { + +template +void rbc_low_dim_pass_one( + raft::resources const& handle, + const cuvs::neighbors::ball_cover::index& index, + const value_t* query, + const value_int n_query_rows, + value_int k, + const value_idx* R_knn_inds, + const value_t* R_knn_dists, + dist_func& dfunc, + value_idx* inds, + value_t* dists, + float weight, + value_int* dists_counter) RAFT_EXPLICIT; + +template +void rbc_low_dim_pass_two( + raft::resources const& handle, + const cuvs::neighbors::ball_cover::index& index, + const value_t* query, + const value_int n_query_rows, + value_int k, + const value_idx* R_knn_inds, + const value_t* R_knn_dists, + dist_func& dfunc, + value_idx* inds, + value_t* dists, + float weight, + value_int* post_dists_counter) RAFT_EXPLICIT; + +template +void rbc_eps_pass( + raft::resources const& handle, + const cuvs::neighbors::ball_cover::index& index, + const value_t* query, + const value_int n_query_rows, + value_t eps, + const value_t* R_dists, + dist_func& dfunc, + bool* adj, + value_idx* vd) RAFT_EXPLICIT; + +template +void rbc_eps_pass( + raft::resources const& handle, + const cuvs::neighbors::ball_cover::index& index, + const value_t* query, + const 
value_int n_query_rows, + value_t eps, + value_int* max_k, + const value_t* R_dists, + dist_func& dfunc, + value_idx* adj_ia, + value_idx* adj_ja, + value_idx* vd) RAFT_EXPLICIT; + +}; // namespace cuvs::neighbors::detail + +#endif // RAFT_EXPLICIT_INSTANTIATE_ONLY + +#define instantiate_cuvs_neighbors_detail_rbc_low_dim_pass_one( \ + Mvalue_idx, Mvalue_t, Mvalue_int, Mmatrix_idx, Mdims, Mdist_func) \ + extern template void cuvs::neighbors::detail:: \ + rbc_low_dim_pass_one( \ + raft::resources const& handle, \ + const cuvs::neighbors::ball_cover::index& \ + index, \ + const Mvalue_t* query, \ + const Mvalue_int n_query_rows, \ + Mvalue_int k, \ + const Mvalue_idx* R_knn_inds, \ + const Mvalue_t* R_knn_dists, \ + Mdist_func& dfunc, \ + Mvalue_idx* inds, \ + Mvalue_t* dists, \ + float weight, \ + Mvalue_int* dists_counter) + +#define instantiate_cuvs_neighbors_detail_rbc_low_dim_pass_two( \ + Mvalue_idx, Mvalue_t, Mvalue_int, Mmatrix_idx, Mdims, Mdist_func) \ + extern template void cuvs::neighbors::detail:: \ + rbc_low_dim_pass_two( \ + raft::resources const& handle, \ + const cuvs::neighbors::ball_cover::index& \ + index, \ + const Mvalue_t* query, \ + const Mvalue_int n_query_rows, \ + Mvalue_int k, \ + const Mvalue_idx* R_knn_inds, \ + const Mvalue_t* R_knn_dists, \ + Mdist_func& dfunc, \ + Mvalue_idx* inds, \ + Mvalue_t* dists, \ + float weight, \ + Mvalue_int* dists_counter) + +#define instantiate_cuvs_neighbors_detail_rbc_eps_pass( \ + Mvalue_idx, Mvalue_t, Mvalue_int, Mmatrix_idx, Mdist_func) \ + extern template void \ + cuvs::neighbors::detail::rbc_eps_pass( \ + raft::resources const& handle, \ + const cuvs::neighbors::ball_cover::index& \ + index, \ + const Mvalue_t* query, \ + const Mvalue_int n_query_rows, \ + Mvalue_t eps, \ + const Mvalue_t* R_dists, \ + Mdist_func& dfunc, \ + bool* adj, \ + Mvalue_idx* vd); \ + \ + extern template void \ + cuvs::neighbors::detail::rbc_eps_pass( \ + raft::resources const& handle, \ + const 
cuvs::neighbors::ball_cover::index& \ + index, \ + const Mvalue_t* query, \ + const Mvalue_int n_query_rows, \ + Mvalue_t eps, \ + Mvalue_int* max_k, \ + const Mvalue_t* R_dists, \ + Mdist_func& dfunc, \ + Mvalue_idx* adj_ia, \ + Mvalue_idx* adj_ja, \ + Mvalue_idx* vd); + +instantiate_cuvs_neighbors_detail_rbc_low_dim_pass_one( + std::int64_t, float, std::int64_t, std::int64_t, 2, cuvs::neighbors::detail::HaversineFunc); +instantiate_cuvs_neighbors_detail_rbc_low_dim_pass_one( + std::int64_t, float, std::int64_t, std::int64_t, 3, cuvs::neighbors::detail::HaversineFunc); +instantiate_cuvs_neighbors_detail_rbc_low_dim_pass_one( + std::int64_t, float, std::int64_t, std::int64_t, 2, cuvs::neighbors::detail::EuclideanFunc); +instantiate_cuvs_neighbors_detail_rbc_low_dim_pass_one( + std::int64_t, float, std::int64_t, std::int64_t, 3, cuvs::neighbors::detail::EuclideanFunc); +instantiate_cuvs_neighbors_detail_rbc_low_dim_pass_one( + std::int64_t, float, std::int64_t, std::int64_t, 2, cuvs::neighbors::detail::DistFunc); +instantiate_cuvs_neighbors_detail_rbc_low_dim_pass_one( + std::int64_t, float, std::int64_t, std::int64_t, 3, cuvs::neighbors::detail::DistFunc); + +instantiate_cuvs_neighbors_detail_rbc_low_dim_pass_two( + std::int64_t, float, std::int64_t, std::int64_t, 2, cuvs::neighbors::detail::HaversineFunc); +instantiate_cuvs_neighbors_detail_rbc_low_dim_pass_two( + std::int64_t, float, std::int64_t, std::int64_t, 3, cuvs::neighbors::detail::HaversineFunc); +instantiate_cuvs_neighbors_detail_rbc_low_dim_pass_two( + std::int64_t, float, std::int64_t, std::int64_t, 2, cuvs::neighbors::detail::EuclideanFunc); +instantiate_cuvs_neighbors_detail_rbc_low_dim_pass_two( + std::int64_t, float, std::int64_t, std::int64_t, 3, cuvs::neighbors::detail::EuclideanFunc); +instantiate_cuvs_neighbors_detail_rbc_low_dim_pass_two( + std::int64_t, float, std::int64_t, std::int64_t, 2, cuvs::neighbors::detail::DistFunc); +instantiate_cuvs_neighbors_detail_rbc_low_dim_pass_two( + 
std::int64_t, float, std::int64_t, std::int64_t, 3, cuvs::neighbors::detail::DistFunc); + +instantiate_cuvs_neighbors_detail_rbc_eps_pass( + std::int64_t, float, std::int64_t, std::int64_t, cuvs::neighbors::detail::EuclideanSqFunc); + +#undef instantiate_cuvs_neighbors_detail_rbc_low_dim_pass_two +#undef instantiate_cuvs_neighbors_detail_rbc_low_dim_pass_one +#undef instantiate_cuvs_neighbors_detail_rbc_eps_pass diff --git a/cpp/src/neighbors/ball_cover/registers-inl.cuh b/cpp/src/neighbors/ball_cover/registers-inl.cuh new file mode 100644 index 000000000..2565a48fc --- /dev/null +++ b/cpp/src/neighbors/ball_cover/registers-inl.cuh @@ -0,0 +1,1630 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "../detail/haversine_distance.cuh" +#include "common.cuh" +#include "registers_types.cuh" // DistFunc +#include + +#include "../faiss_select/key_value_block_select.cuh" +#include +#include +#include + +#include +#include +#include + +#include + +#include + +namespace cuvs::neighbors::detail { + +/** + * To find exact neighbors, we perform a post-processing stage + * that filters out those points which might have neighbors outside + * of their k closest landmarks. This is usually a very small portion + * of the total points. 
+ * @tparam value_idx + * @tparam value_t + * @tparam value_int + * @tparam tpb + * @param X + * @param n_cols + * @param R_knn_inds + * @param R_knn_dists + * @param R_radius + * @param landmarks + * @param n_landmarks + * @param bitset_size + * @param k + * @param output + * @param weight + */ +template +RAFT_KERNEL perform_post_filter_registers(const value_t* X, + value_int n_cols, + const value_idx* R_knn_inds, + const value_t* R_knn_dists, + const value_t* R_radius, + const value_t* landmarks, + int n_landmarks, + value_int bitset_size, + value_int k, + distance_func dfunc, + std::uint32_t* output, + float weight = 1.0) +{ + // allocate array of size n_landmarks / 32 ints + extern __shared__ std::uint32_t shared_mem[]; + + // Start with all bits on + for (value_int i = threadIdx.x; i < bitset_size; i += tpb) { + shared_mem[i] = 0xffffffff; + } + + __syncthreads(); + + // TODO: Would it be faster to use L1 for this? + value_t local_x_ptr[col_q]; + for (value_int j = 0; j < n_cols; ++j) { + local_x_ptr[j] = X[n_cols * blockIdx.x + j]; + } + + value_t closest_R_dist = R_knn_dists[blockIdx.x * k + (k - 1)]; + + // zero out bits for closest k landmarks + for (value_int j = threadIdx.x; j < k; j += tpb) { + _zero_bit(shared_mem, (std::uint32_t)R_knn_inds[blockIdx.x * k + j]); + } + + __syncthreads(); + + // Discard any landmarks where p(q, r) > p(q, r_q) + radius(r) + // That is, the distance between the current point and the current + // landmark is > the distance between the current point and + // its closest landmark + the radius of the current landmark. 
+ for (value_int l = threadIdx.x; l < n_landmarks; l += tpb) { + // compute p(q, r) + value_t dist = dfunc(local_x_ptr, landmarks + (n_cols * l), n_cols); + if (dist > weight * (closest_R_dist + R_radius[l]) || dist > 3 * closest_R_dist) { + _zero_bit(shared_mem, l); + } + } + + __syncthreads(); + + /** + * Output bitset + */ + for (value_int l = threadIdx.x; l < bitset_size; l += tpb) { + output[blockIdx.x * bitset_size + l] = shared_mem[l]; + } +} + +/** + * @tparam value_idx + * @tparam value_t + * @tparam value_int + * @tparam bitset_type + * @tparam warp_q number of registers to use per warp + * @tparam thread_q number of registers to use within each thread + * @tparam tpb number of threads per block + * @param X + * @param n_cols + * @param bitset + * @param bitset_size + * @param R_knn_dists + * @param R_indptr + * @param R_1nn_inds + * @param R_1nn_dists + * @param knn_inds + * @param knn_dists + * @param n_landmarks + * @param k + * @param dist_counter + */ +template +RAFT_KERNEL compute_final_dists_registers(const value_t* X_reordered, + const value_t* X, + const value_int n_cols, + bitset_type* bitset, + value_int bitset_size, + const value_t* R_closest_landmark_dists, + const value_idx* R_indptr, + const value_idx* R_1nn_inds, + const value_t* R_1nn_dists, + value_idx* knn_inds, + value_t* knn_dists, + value_int n_landmarks, + value_int k, + dist_func dfunc, + value_int* dist_counter) +{ + static constexpr int kNumWarps = tpb / raft::WarpSize; + + __shared__ value_t shared_memK[kNumWarps * warp_q]; + __shared__ raft::KeyValuePair shared_memV[kNumWarps * warp_q]; + + const value_t* x_ptr = X + (n_cols * blockIdx.x); + value_t local_x_ptr[col_q]; + for (value_int j = 0; j < n_cols; ++j) { + local_x_ptr[j] = x_ptr[j]; + } + + using namespace cuvs::neighbors::detail::faiss_select; + KeyValueBlockSelect, warp_q, thread_q, tpb> heap( + std::numeric_limits::max(), + std::numeric_limits::max(), + -1, + shared_memK, + shared_memV, + k); + + const value_int n_k = 
raft::Pow2::roundDown(k); + value_int i = threadIdx.x; + for (; i < n_k; i += tpb) { + value_idx ind = knn_inds[blockIdx.x * k + i]; + heap.add(knn_dists[blockIdx.x * k + i], R_closest_landmark_dists[ind], ind); + } + + if (i < k) { + value_idx ind = knn_inds[blockIdx.x * k + i]; + heap.addThreadQ(knn_dists[blockIdx.x * k + i], R_closest_landmark_dists[ind], ind); + } + + heap.checkThreadQ(); + + for (value_int cur_R_ind = 0; cur_R_ind < n_landmarks; ++cur_R_ind) { + // if cur R overlaps cur point's closest R, it could be a + // candidate + if (_get_val(bitset + (blockIdx.x * bitset_size), cur_R_ind)) { + value_idx R_start_offset = R_indptr[cur_R_ind]; + value_idx R_stop_offset = R_indptr[cur_R_ind + 1]; + value_idx R_size = R_stop_offset - R_start_offset; + + // Loop through R's neighborhood in parallel + + // Round R_size to the nearest warp threads so they can + // all be computing in parallel. + + const value_int limit = raft::Pow2::roundDown(R_size); + + i = threadIdx.x; + for (; i < limit; i += tpb) { + value_idx cur_candidate_ind = R_1nn_inds[R_start_offset + i]; + value_t cur_candidate_dist = R_1nn_dists[R_start_offset + i]; + + value_t z = heap.warpKTopRDist == 0.00 ? 0.0 + : (abs(heap.warpKTop - heap.warpKTopRDist) * + abs(heap.warpKTopRDist - cur_candidate_dist) - + heap.warpKTop * cur_candidate_dist) / + heap.warpKTopRDist; + z = isnan(z) || isinf(z) ? 0.0 : z; + + // If lower bound on distance could possibly be in + // the closest k neighbors, compute it and add to k-select + value_t dist = std::numeric_limits::max(); + if (z <= heap.warpKTop) { + const value_t* y_ptr = X_reordered + (n_cols * (R_start_offset + i)); + value_t local_y_ptr[col_q]; + for (value_int j = 0; j < n_cols; ++j) { + local_y_ptr[j] = y_ptr[j]; + } + + dist = dfunc(local_x_ptr, local_y_ptr, n_cols); + } + + heap.add(dist, cur_candidate_dist, cur_candidate_ind); + } + + // second round guarantees to be only a single warp. 
+ if (i < R_size) { + value_idx cur_candidate_ind = R_1nn_inds[R_start_offset + i]; + value_t cur_candidate_dist = R_1nn_dists[R_start_offset + i]; + + value_t z = heap.warpKTopRDist == 0.00 ? 0.0 + : (abs(heap.warpKTop - heap.warpKTopRDist) * + abs(heap.warpKTopRDist - cur_candidate_dist) - + heap.warpKTop * cur_candidate_dist) / + heap.warpKTopRDist; + + z = isnan(z) || isinf(z) ? 0.0 : z; + + // If lower bound on distance could possibly be in + // the closest k neighbors, compute it and add to k-select + value_t dist = std::numeric_limits::max(); + if (z <= heap.warpKTop) { + const value_t* y_ptr = X_reordered + (n_cols * (R_start_offset + i)); + value_t local_y_ptr[col_q]; + for (value_int j = 0; j < n_cols; ++j) { + local_y_ptr[j] = y_ptr[j]; + } + dist = dfunc(local_x_ptr, local_y_ptr, n_cols); + } + heap.addThreadQ(dist, cur_candidate_dist, cur_candidate_ind); + } + heap.checkThreadQ(); + } + } + + heap.reduce(); + + for (value_int i = threadIdx.x; i < k; i += tpb) { + knn_dists[blockIdx.x * k + i] = shared_memK[i]; + knn_inds[blockIdx.x * k + i] = shared_memV[i].value; + } +} + +/** + * Random ball cover kernel for n_dims == 2 + * @tparam value_idx + * @tparam value_t + * @tparam warp_q + * @tparam thread_q + * @tparam tpb + * @tparam value_idx + * @tparam value_t + * @param R_knn_inds + * @param R_knn_dists + * @param m + * @param k + * @param R_indptr + * @param R_1nn_cols + * @param R_1nn_dists + */ +template +RAFT_KERNEL block_rbc_kernel_registers(const value_t* X_reordered, + const value_t* X, + value_int n_cols, // n_cols should be 2 or 3 dims + const value_idx* R_knn_inds, + const value_t* R_knn_dists, + value_int m, + value_int k, + const value_idx* R_indptr, + const value_idx* R_1nn_cols, + const value_t* R_1nn_dists, + value_idx* out_inds, + value_t* out_dists, + value_int* dist_counter, + const value_t* R_radius, + distance_func dfunc, + float weight = 1.0) +{ + static constexpr value_int kNumWarps = tpb / raft::WarpSize; + + __shared__ value_t 
shared_memK[kNumWarps * warp_q]; + __shared__ raft::KeyValuePair shared_memV[kNumWarps * warp_q]; + + // TODO: Separate kernels for different widths: + // 1. Very small (between 3 and 32) just use registers for columns of "blockIdx.x" + // 2. Can fit comfortably in shared memory (32 to a few thousand?) + // 3. Load each time individually. + const value_t* x_ptr = X + (n_cols * blockIdx.x); + + // Use registers only for 2d or 3d + value_t local_x_ptr[col_q]; + for (value_int i = 0; i < n_cols; ++i) { + local_x_ptr[i] = x_ptr[i]; + } + + // Each warp works on 1 R + using namespace cuvs::neighbors::detail::faiss_select; + KeyValueBlockSelect, warp_q, thread_q, tpb> heap( + std::numeric_limits::max(), + std::numeric_limits::max(), + -1, + shared_memK, + shared_memV, + k); + + value_t min_R_dist = R_knn_dists[blockIdx.x * k + (k - 1)]; + value_int n_dists_computed = 0; + + /** + * First add distances for k closest neighbors of R + * to the heap + */ + // Start iterating through elements of each set from closest R elements, + // determining if the distance could even potentially be in the heap. 
+ for (value_int cur_k = 0; cur_k < k; ++cur_k) { + // index and distance to current blockIdx.x's closest landmark + value_t cur_R_dist = R_knn_dists[blockIdx.x * k + cur_k]; + value_idx cur_R_ind = R_knn_inds[blockIdx.x * k + cur_k]; + + // Equation (2) in Cayton's paper- prune out R's which are > 3 * p(q, r_q) + if (cur_R_dist > weight * (min_R_dist + R_radius[cur_R_ind])) continue; + if (cur_R_dist > 3 * min_R_dist) return; + + // The whole warp should iterate through the elements in the current R + value_idx R_start_offset = R_indptr[cur_R_ind]; + value_idx R_stop_offset = R_indptr[cur_R_ind + 1]; + + value_idx R_size = R_stop_offset - R_start_offset; + + value_int limit = raft::Pow2::roundDown(R_size); + value_int i = threadIdx.x; + for (; i < limit; i += tpb) { + // Index and distance of current candidate's nearest landmark + value_idx cur_candidate_ind = R_1nn_cols[R_start_offset + i]; + value_t cur_candidate_dist = R_1nn_dists[R_start_offset + i]; + + // Take 2 landmarks l_1 and l_2 where l_1 is the furthest point in the heap + // and l_2 is the current landmark R. s is the current data point and + // t is the new candidate data point. We know that: + // d(s, t) cannot possibly be any smaller than | d(s, l_1) - d(l_1, l_2) | * | d(l_1, l_2) - + // d(l_2, t) | - d(s, l_1) * d(l_2, t) + + // Therefore, if d(s, t) >= d(s, l_1) from the computation above, we know that the distance to + // the candidate point cannot possibly be in the nearest neighbors. However, if d(s, t) < d(s, + // l_1) then we should compute the distance because it's possible it could be smaller. + // + value_t z = heap.warpKTopRDist == 0.00 ? 0.0 + : (abs(heap.warpKTop - heap.warpKTopRDist) * + abs(heap.warpKTopRDist - cur_candidate_dist) - + heap.warpKTop * cur_candidate_dist) / + heap.warpKTopRDist; + + z = isnan(z) || isinf(z) ? 
0.0 : z; + value_t dist = std::numeric_limits::max(); + + if (z <= heap.warpKTop) { + const value_t* y_ptr = X_reordered + (n_cols * (R_start_offset + i)); + value_t local_y_ptr[col_q]; + for (value_int j = 0; j < n_cols; ++j) { + local_y_ptr[j] = y_ptr[j]; + } + dist = dfunc(local_x_ptr, local_y_ptr, n_cols); + ++n_dists_computed; + } + + heap.add(dist, cur_candidate_dist, cur_candidate_ind); + } + + if (i < R_size) { + value_idx cur_candidate_ind = R_1nn_cols[R_start_offset + i]; + value_t cur_candidate_dist = R_1nn_dists[R_start_offset + i]; + value_t z = heap.warpKTopRDist == 0.0 ? 0.0 + : (abs(heap.warpKTop - heap.warpKTopRDist) * + abs(heap.warpKTopRDist - cur_candidate_dist) - + heap.warpKTop * cur_candidate_dist) / + heap.warpKTopRDist; + + z = isnan(z) || isinf(z) ? 0.0 : z; + value_t dist = std::numeric_limits::max(); + + if (z <= heap.warpKTop) { + const value_t* y_ptr = X_reordered + (n_cols * (R_start_offset + i)); + value_t local_y_ptr[col_q]; + for (value_int j = 0; j < n_cols; ++j) { + local_y_ptr[j] = y_ptr[j]; + } + dist = dfunc(local_x_ptr, local_y_ptr, n_cols); + ++n_dists_computed; + } + + heap.addThreadQ(dist, cur_candidate_dist, cur_candidate_ind); + } + + heap.checkThreadQ(); + } + + heap.reduce(); + + for (int i = threadIdx.x; i < k; i += tpb) { + out_dists[blockIdx.x * k + i] = shared_memK[i]; + out_inds[blockIdx.x * k + i] = shared_memV[i].value; + } +} + +template +__device__ value_t squared(const value_t& a) +{ + return a * a; +} + +template +RAFT_KERNEL block_rbc_kernel_eps_dense(const value_t* X_reordered, + const value_t* X, + const value_int n_queries, + const value_int n_cols, + const value_t* R, + const value_int m, + const value_t eps, + const value_int n_landmarks, + const value_idx* R_indptr, + const value_idx* R_1nn_cols, + const value_t* R_1nn_dists, + const value_t* R_radius, + distance_func dfunc, + bool* adj, + value_idx* vd) +{ + constexpr int num_warps = tpb / raft::WarpSize; + + // process 1 query per warp + const 
uint32_t lid = raft::laneId(); + + // this should help the compiler to prevent branches + const int query_id = raft::shfl(blockIdx.x * num_warps + (threadIdx.x / raft::WarpSize), 0); + + // this is an early out for a full warp + if (query_id >= n_queries) return; + + value_idx column_count = 0; + + const value_t* x_ptr = X + (n_cols * query_id); + adj += query_id * m; + + // we omit the sqrt() in the inner distance compute + const value_t eps2 = eps * eps; + +#pragma nounroll + for (uint32_t cur_k0 = 0; cur_k0 < n_landmarks; cur_k0 += raft::WarpSize) { + // Pre-compute landmark_dist & triangularization checks for 32 iterations + const uint32_t lane_k = cur_k0 + lid; + const value_t lane_R_dist_sq = lane_k < n_landmarks ? dfunc(x_ptr, R + lane_k * n_cols, n_cols) + : std::numeric_limits::max(); + const int lane_check = lane_k < n_landmarks + ? static_cast(lane_R_dist_sq <= squared(eps + R_radius[lane_k])) + : 0; + + int lane_mask = raft::ballot(lane_check); + if (lane_mask == 0) continue; + + // reverse to use __clz instead of __ffs + lane_mask = __brev(lane_mask); + do { + // look for next k_offset + const uint32_t k_offset = __clz(lane_mask); + + const uint32_t cur_k = cur_k0 + k_offset; + + // The whole warp should iterate through the elements in the current R + const value_idx R_start_offset = R_indptr[cur_k]; + + // update lane_mask for next iteration - erase bits up to k_offset + lane_mask &= (0x7fffffff >> k_offset); + + const uint32_t R_size = R_indptr[cur_k + 1] - R_start_offset; + + // we have precomputed the query<->landmark distance + const value_t cur_R_dist = raft::sqrt(raft::shfl(lane_R_dist_sq, k_offset)); + + const uint32_t limit = raft::Pow2::roundDown(R_size); + uint32_t i = limit + lid; + + // R_1nn_dists are sorted ascendingly for each landmark + // Iterating backwards, after pruning the first point w.r.t. 
triangle + // inequality all subsequent points can be pruned as well + const value_t* y_ptr = X_reordered + (n_cols * (R_start_offset + i)); + { + const value_t min_warp_dist = + limit < R_size ? R_1nn_dists[R_start_offset + limit] : cur_R_dist; + const value_t dist = + (i < R_size) ? dfunc(x_ptr, y_ptr, n_cols) : std::numeric_limits::max(); + const bool in_range = (dist <= eps2); + if (in_range) { + auto index = R_1nn_cols[R_start_offset + i]; + column_count++; + adj[index] = true; + } + // abort in case subsequent points cannot possibly be in reach + i *= (cur_R_dist - min_warp_dist <= eps); + } + + uint32_t i0 = raft::shfl(i, 0); + + while (i0 >= raft::WarpSize) { + y_ptr -= raft::WarpSize * n_cols; + i0 -= raft::WarpSize; + const value_t min_warp_dist = R_1nn_dists[R_start_offset + i0]; + const value_t dist = dfunc(x_ptr, y_ptr, n_cols); + const bool in_range = (dist <= eps2); + if (in_range) { + auto index = R_1nn_cols[R_start_offset + i0 + lid]; + column_count++; + adj[index] = true; + } + // abort in case subsequent points cannot possibly be in reach + i0 *= (cur_R_dist - min_warp_dist <= eps); + } + } while (lane_mask); + } + + if (vd != nullptr) { + value_idx row_sum = raft::warpReduce(column_count); + if (lid == 0) vd[query_id] = row_sum; + } +} + +template +RAFT_KERNEL block_rbc_kernel_eps_csr_pass(const value_t* X_reordered, + const value_t* X, + const value_int n_queries, + const value_int n_cols, + const value_t* R, + const value_int m, + const value_t eps, + const value_int n_landmarks, + const value_idx* R_indptr, + const value_idx* R_1nn_cols, + const value_t* R_1nn_dists, + const value_t* R_radius, + distance_func dfunc, + value_idx* adj_ia, + value_idx* adj_ja) +{ + constexpr int num_warps = tpb / raft::WarpSize; + + // process 1 query per warp + const uint32_t lid = raft::laneId(); + const uint32_t lid_mask = (1 << lid) - 1; + + // this should help the compiler to prevent branches + const int query_id = raft::shfl(blockIdx.x * num_warps + 
(threadIdx.x / raft::WarpSize), 0); + + // this is an early out for a full warp + if (query_id >= n_queries) return; + + uint32_t column_index_offset = 0; + + if constexpr (write_pass) { + value_idx offset = adj_ia[query_id]; + // we have no neighbors to fill for this query + if (offset == adj_ia[query_id + 1]) return; + adj_ja += offset; + } + + const value_t* x_ptr = X + (n_cols * query_id); + + // we omit the sqrt() in the inner distance compute + const value_t eps2 = eps * eps; + +#pragma nounroll + for (uint32_t cur_k0 = 0; cur_k0 < n_landmarks; cur_k0 += raft::WarpSize) { + // Pre-compute landmark_dist & triangularization checks for 32 iterations + const uint32_t lane_k = cur_k0 + lid; + const value_t lane_R_dist_sq = lane_k < n_landmarks ? dfunc(x_ptr, R + lane_k * n_cols, n_cols) + : std::numeric_limits::max(); + const int lane_check = lane_k < n_landmarks + ? static_cast(lane_R_dist_sq <= squared(eps + R_radius[lane_k])) + : 0; + + int lane_mask = raft::ballot(lane_check); + if (lane_mask == 0) continue; + + // reverse to use __clz instead of __ffs + lane_mask = __brev(lane_mask); + do { + // look for next k_offset + const uint32_t k_offset = __clz(lane_mask); + + const uint32_t cur_k = cur_k0 + k_offset; + + // The whole warp should iterate through the elements in the current R + const value_idx R_start_offset = R_indptr[cur_k]; + + // update lane_mask for next iteration - erase bits up to k_offset + lane_mask &= (0x7fffffff >> k_offset); + + const uint32_t R_size = R_indptr[cur_k + 1] - R_start_offset; + + // we have precomputed the query<->landmark distance + const value_t cur_R_dist = raft::sqrt(raft::shfl(lane_R_dist_sq, k_offset)); + + const uint32_t limit = raft::Pow2::roundDown(R_size); + uint32_t i = limit + lid; + + // R_1nn_dists are sorted ascendingly for each landmark + // Iterating backwards, after pruning the first point w.r.t. 
triangle + // inequality all subsequent points can be pruned as well + const value_t* y_ptr = X_reordered + (n_cols * (R_start_offset + i)); + { + const value_t min_warp_dist = + limit < R_size ? R_1nn_dists[R_start_offset + limit] : cur_R_dist; + const value_t dist = + (i < R_size) ? dfunc(x_ptr, y_ptr, n_cols) : std::numeric_limits::max(); + const bool in_range = (dist <= eps2); + if constexpr (write_pass) { + const int mask = raft::ballot(in_range); + if (in_range) { + const uint32_t index = R_1nn_cols[R_start_offset + i]; + const uint32_t row_pos = __popc(mask & lid_mask); + adj_ja[row_pos] = index; + } + adj_ja += __popc(mask); + } else { + column_index_offset += (in_range); + } + // abort in case subsequent points cannot possibly be in reach + i *= (cur_R_dist - min_warp_dist <= eps); + } + + uint32_t i0 = raft::shfl(i, 0); + + while (i0 >= raft::WarpSize) { + y_ptr -= raft::WarpSize * n_cols; + i0 -= raft::WarpSize; + const value_t min_warp_dist = R_1nn_dists[R_start_offset + i0]; + const value_t dist = dfunc(x_ptr, y_ptr, n_cols); + const bool in_range = (dist <= eps2); + if constexpr (write_pass) { + const int mask = raft::ballot(in_range); + if (in_range) { + const uint32_t index = R_1nn_cols[R_start_offset + i0 + lid]; + const uint32_t row_pos = __popc(mask & lid_mask); + adj_ja[row_pos] = index; + } + adj_ja += __popc(mask); + } else { + column_index_offset += (in_range); + } + // abort in case subsequent points cannot possibly be in reach + i0 *= (cur_R_dist - min_warp_dist <= eps); + } + } while (lane_mask); + } + + if constexpr (!write_pass) { + value_idx row_sum = raft::warpReduce(column_index_offset); + if (lid == 0) adj_ia[query_id] = row_sum; + } +} + +template +RAFT_KERNEL __launch_bounds__(tpb) + block_rbc_kernel_eps_csr_pass_xd(const value_t* __restrict__ X_reordered, + const value_t* __restrict__ X, + const value_int n_queries, + const value_int n_cols, + const value_t* __restrict__ R, + const value_int m, + const value_t eps, + const 
value_int n_landmarks, + const value_idx* __restrict__ R_indptr, + const value_idx* __restrict__ R_1nn_cols, + const value_t* __restrict__ R_1nn_dists, + const value_t* __restrict__ R_radius, + distance_func dfunc, + value_idx* __restrict__ adj_ia, + value_idx* adj_ja) +{ + constexpr int num_warps = tpb / raft::WarpSize; + + // process 1 query per warp + const uint32_t lid = raft::laneId(); + const uint32_t lid_mask = (1 << lid) - 1; + + // this should help the compiler to prevent branches + const int query_id = raft::shfl(blockIdx.x * num_warps + (threadIdx.x / raft::WarpSize), 0); + + // this is an early out for a full warp + if (query_id >= n_queries) return; + + uint32_t column_index_offset = 0; + + if constexpr (write_pass) { + value_idx offset = adj_ia[query_id]; + // we have no neighbors to fill for this query + if (offset == adj_ia[query_id + 1]) return; + adj_ja += offset; + } + + const value_t* x_ptr = X + (dim * query_id); + value_t local_x_ptr[dim]; +#pragma unroll + for (uint32_t i = 0; i < dim; ++i) { + local_x_ptr[i] = x_ptr[i]; + } + + // we omit the sqrt() in the inner distance compute + const value_t eps2 = eps * eps; + +#pragma nounroll + for (uint32_t cur_k0 = 0; cur_k0 < n_landmarks; cur_k0 += raft::WarpSize) { + // Pre-compute landmark_dist & triangularization checks for 32 iterations + const uint32_t lane_k = cur_k0 + lid; + const value_t lane_R_dist_sq = lane_k < n_landmarks ? dfunc(local_x_ptr, R + lane_k * dim, dim) + : std::numeric_limits::max(); + const int lane_check = lane_k < n_landmarks + ? 
static_cast(lane_R_dist_sq <= squared(eps + R_radius[lane_k])) + : 0; + + int lane_mask = raft::ballot(lane_check); + if (lane_mask == 0) continue; + + // reverse to use __clz instead of __ffs + lane_mask = __brev(lane_mask); + do { + // look for next k_offset + const uint32_t k_offset = __clz(lane_mask); + + const uint32_t cur_k = cur_k0 + k_offset; + + // The whole warp should iterate through the elements in the current R + const value_idx R_start_offset = R_indptr[cur_k]; + + // update lane_mask for next iteration - erase bits up to k_offset + lane_mask &= (0x7fffffff >> k_offset); + + const uint32_t R_size = R_indptr[cur_k + 1] - R_start_offset; + + // we have precomputed the query<->landmark distance + const value_t cur_R_dist = raft::sqrt(raft::shfl(lane_R_dist_sq, k_offset)); + + const uint32_t limit = raft::Pow2::roundDown(R_size); + uint32_t i = limit + lid; + + // R_1nn_dists are sorted ascendingly for each landmark + // Iterating backwards, after pruning the first point w.r.t. triangle + // inequality all subsequent points can be pruned as well + const value_t* y_ptr = X_reordered + (dim * (R_start_offset + i)); + { + const value_t min_warp_dist = + limit < R_size ? R_1nn_dists[R_start_offset + limit] : cur_R_dist; + const value_t dist = + (i < R_size) ? 
dfunc(local_x_ptr, y_ptr, dim) : std::numeric_limits::max(); + const bool in_range = (dist <= eps2); + if constexpr (write_pass) { + const int mask = raft::ballot(in_range); + if (in_range) { + const uint32_t index = R_1nn_cols[R_start_offset + i]; + const uint32_t row_pos = __popc(mask & lid_mask); + adj_ja[row_pos] = index; + } + adj_ja += __popc(mask); + } else { + column_index_offset += (in_range); + } + // abort in case subsequent points cannot possibly be in reach + i *= (cur_R_dist - min_warp_dist <= eps); + } + + uint32_t i0 = raft::shfl(i, 0); + + while (i0 >= raft::WarpSize) { + y_ptr -= raft::WarpSize * dim; + i0 -= raft::WarpSize; + const value_t min_warp_dist = R_1nn_dists[R_start_offset + i0]; + const value_t dist = dfunc(local_x_ptr, y_ptr, dim); + const bool in_range = (dist <= eps2); + if constexpr (write_pass) { + const int mask = raft::ballot(in_range); + if (in_range) { + const uint32_t index = R_1nn_cols[R_start_offset + i0 + lid]; + const uint32_t row_pos = __popc(mask & lid_mask); + adj_ja[row_pos] = index; + } + adj_ja += __popc(mask); + } else { + column_index_offset += (in_range); + } + // abort in case subsequent points cannot possibly be in reach + i0 *= (cur_R_dist - min_warp_dist <= eps); + } + } while (lane_mask); + } + + if constexpr (!write_pass) { + value_idx row_sum = raft::warpReduce(column_index_offset); + if (lid == 0) adj_ia[query_id] = row_sum; + } +} + +template +RAFT_KERNEL block_rbc_kernel_eps_max_k(const value_t* X_reordered, + const value_t* X, + const value_int n_queries, + const value_int n_cols, + const value_t* R, + const value_int m, + const value_t eps, + const value_int n_landmarks, + const value_idx* R_indptr, + const value_idx* R_1nn_cols, + const value_t* R_1nn_dists, + const value_t* R_radius, + distance_func dfunc, + value_idx* vd, + const value_int max_k, + value_idx* tmp) +{ + constexpr int num_warps = tpb / raft::WarpSize; + + // process 1 query per warp + const uint32_t lid = raft::laneId(); + const 
uint32_t lid_mask = (1 << lid) - 1; + + // this should help the compiler to prevent branches + const int query_id = raft::shfl(blockIdx.x * num_warps + (threadIdx.x / raft::WarpSize), 0); + + // this is an early out for a full warp + if (query_id >= n_queries) return; + + value_idx column_count = 0; + + const value_t* x_ptr = X + (n_cols * query_id); + tmp += query_id * max_k; + + // we omit the sqrt() in the inner distance compute + const value_t eps2 = eps * eps; + +#pragma nounroll + for (uint32_t cur_k0 = 0; cur_k0 < n_landmarks; cur_k0 += raft::WarpSize) { + // Pre-compute landmark_dist & triangularization checks for 32 iterations + const uint32_t lane_k = cur_k0 + lid; + const value_t lane_R_dist_sq = lane_k < n_landmarks ? dfunc(x_ptr, R + lane_k * n_cols, n_cols) + : std::numeric_limits::max(); + const int lane_check = lane_k < n_landmarks + ? static_cast(lane_R_dist_sq <= squared(eps + R_radius[lane_k])) + : 0; + + int lane_mask = raft::ballot(lane_check); + if (lane_mask == 0) continue; + + // reverse to use __clz instead of __ffs + lane_mask = __brev(lane_mask); + do { + // look for next k_offset + const uint32_t k_offset = __clz(lane_mask); + + const uint32_t cur_k = cur_k0 + k_offset; + + // The whole warp should iterate through the elements in the current R + const value_idx R_start_offset = R_indptr[cur_k]; + + // update lane_mask for next iteration - erase bits up to k_offset + lane_mask &= (0x7fffffff >> k_offset); + + const uint32_t R_size = R_indptr[cur_k + 1] - R_start_offset; + + // we have precomputed the query<->landmark distance + const value_t cur_R_dist = raft::sqrt(raft::shfl(lane_R_dist_sq, k_offset)); + + const uint32_t limit = raft::Pow2::roundDown(R_size); + uint32_t i = limit + lid; + + // R_1nn_dists are sorted ascendingly for each landmark + // Iterating backwards, after pruning the first point w.r.t. 
triangle + // inequality all subsequent points can be pruned as well + const value_t* y_ptr = X_reordered + (n_cols * (R_start_offset + i)); + { + const value_t min_warp_dist = + limit < R_size ? R_1nn_dists[R_start_offset + limit] : cur_R_dist; + const value_t dist = + (i < R_size) ? dfunc(x_ptr, y_ptr, n_cols) : std::numeric_limits::max(); + const bool in_range = (dist <= eps2); + const int mask = raft::ballot(in_range); + if (in_range) { + auto row_pos = column_count + __popc(mask & lid_mask); + // we still continue to look for more hits to return valid vd + if (row_pos < max_k) { + auto index = R_1nn_cols[R_start_offset + i]; + tmp[row_pos] = index; + } + } + column_count += __popc(mask); + // abort in case subsequent points cannot possibly be in reach + i *= (cur_R_dist - min_warp_dist <= eps); + } + + uint32_t i0 = raft::shfl(i, 0); + + while (i0 >= raft::WarpSize) { + y_ptr -= raft::WarpSize * n_cols; + i0 -= raft::WarpSize; + const value_t min_warp_dist = R_1nn_dists[R_start_offset + i0]; + const value_t dist = dfunc(x_ptr, y_ptr, n_cols); + const bool in_range = (dist <= eps2); + const int mask = raft::ballot(in_range); + if (in_range) { + auto row_pos = column_count + __popc(mask & lid_mask); + // we still continue to look for more hits to return valid vd + if (row_pos < max_k) { + auto index = R_1nn_cols[R_start_offset + i0 + lid]; + tmp[row_pos] = index; + } + } + column_count += __popc(mask); + // abort in case subsequent points cannot possibly be in reach + i0 *= (cur_R_dist - min_warp_dist <= eps); + } + } while (lane_mask); + } + + if (lid == 0) vd[query_id] = column_count; +} + +template +RAFT_KERNEL block_rbc_kernel_eps_max_k_copy(const value_int max_k, + const value_idx* adj_ia, + const value_idx* tmp, + value_idx* adj_ja) +{ + value_int offset = blockIdx.x * max_k; + + value_int row_idx = blockIdx.x; + value_idx col_start_idx = adj_ia[row_idx]; + value_idx num_cols = adj_ia[row_idx + 1] - col_start_idx; + + value_int limit = 
raft::Pow2::roundDown(num_cols); + value_int i = threadIdx.x; + for (; i < limit; i += tpb) { + adj_ja[col_start_idx + i] = tmp[offset + i]; + } + if (i < num_cols) { adj_ja[col_start_idx + i] = tmp[offset + i]; } +} + +template +void rbc_low_dim_pass_one( + raft::resources const& handle, + const cuvs::neighbors::ball_cover::index& index, + const value_t* query, + const value_int n_query_rows, + value_int k, + const value_idx* R_knn_inds, + const value_t* R_knn_dists, + dist_func& dfunc, + value_idx* inds, + value_t* dists, + float weight, + value_int* dists_counter) +{ + if (k <= 32) + block_rbc_kernel_registers + <<>>( + index.get_X_reordered().data_handle(), + query, + index.n, + R_knn_inds, + R_knn_dists, + index.m, + k, + index.get_R_indptr().data_handle(), + index.get_R_1nn_cols().data_handle(), + index.get_R_1nn_dists().data_handle(), + inds, + dists, + dists_counter, + index.get_R_radius().data_handle(), + dfunc, + weight); + + else if (k <= 64) + block_rbc_kernel_registers + <<>>( + index.get_X_reordered().data_handle(), + query, + index.n, + R_knn_inds, + R_knn_dists, + index.m, + k, + index.get_R_indptr().data_handle(), + index.get_R_1nn_cols().data_handle(), + index.get_R_1nn_dists().data_handle(), + inds, + dists, + dists_counter, + index.get_R_radius().data_handle(), + dfunc, + weight); + else if (k <= 128) + block_rbc_kernel_registers + <<>>( + index.get_X_reordered().data_handle(), + query, + index.n, + R_knn_inds, + R_knn_dists, + index.m, + k, + index.get_R_indptr().data_handle(), + index.get_R_1nn_cols().data_handle(), + index.get_R_1nn_dists().data_handle(), + inds, + dists, + dists_counter, + index.get_R_radius().data_handle(), + dfunc, + weight); + + else if (k <= 256) + block_rbc_kernel_registers + <<>>( + index.get_X_reordered().data_handle(), + query, + index.n, + R_knn_inds, + R_knn_dists, + index.m, + k, + index.get_R_indptr().data_handle(), + index.get_R_1nn_cols().data_handle(), + index.get_R_1nn_dists().data_handle(), + inds, + dists, 
+ dists_counter, + index.get_R_radius().data_handle(), + dfunc, + weight); + + else if (k <= 512) + block_rbc_kernel_registers + <<>>( + index.get_X_reordered().data_handle(), + query, + index.n, + R_knn_inds, + R_knn_dists, + index.m, + k, + index.get_R_indptr().data_handle(), + index.get_R_1nn_cols().data_handle(), + index.get_R_1nn_dists().data_handle(), + inds, + dists, + dists_counter, + index.get_R_radius().data_handle(), + dfunc, + weight); + + else if (k <= 1024) + block_rbc_kernel_registers + <<>>( + index.get_X_reordered().data_handle(), + query, + index.n, + R_knn_inds, + R_knn_dists, + index.m, + k, + index.get_R_indptr().data_handle(), + index.get_R_1nn_cols().data_handle(), + index.get_R_1nn_dists().data_handle(), + inds, + dists, + dists_counter, + index.get_R_radius().data_handle(), + dfunc, + weight); +} + +template +void rbc_low_dim_pass_two( + raft::resources const& handle, + const cuvs::neighbors::ball_cover::index& index, + const value_t* query, + const value_int n_query_rows, + value_int k, + const value_idx* R_knn_inds, + const value_t* R_knn_dists, + dist_func& dfunc, + value_idx* inds, + value_t* dists, + float weight, + value_int* post_dists_counter) +{ + const value_int bitset_size = ceil(index.n_landmarks / 32.0); + + rmm::device_uvector bitset(bitset_size * n_query_rows, + raft::resource::get_cuda_stream(handle)); + thrust::fill( + raft::resource::get_thrust_policy(handle), bitset.data(), bitset.data() + bitset.size(), 0); + + perform_post_filter_registers + <<>>(query, + index.n, + R_knn_inds, + R_knn_dists, + index.get_R_radius().data_handle(), + index.get_R().data_handle(), + index.n_landmarks, + bitset_size, + k, + dfunc, + bitset.data(), + weight); + + if (k <= 32) + compute_final_dists_registers + <<>>( + index.get_X_reordered().data_handle(), + query, + index.n, + bitset.data(), + bitset_size, + index.get_R_closest_landmark_dists().data_handle(), + index.get_R_indptr().data_handle(), + index.get_R_1nn_cols().data_handle(), + 
index.get_R_1nn_dists().data_handle(), + inds, + dists, + index.n_landmarks, + k, + dfunc, + post_dists_counter); + else if (k <= 64) + compute_final_dists_registers + <<>>( + index.get_X_reordered().data_handle(), + query, + index.n, + bitset.data(), + bitset_size, + index.get_R_closest_landmark_dists().data_handle(), + index.get_R_indptr().data_handle(), + index.get_R_1nn_cols().data_handle(), + index.get_R_1nn_dists().data_handle(), + inds, + dists, + index.n_landmarks, + k, + dfunc, + post_dists_counter); + else if (k <= 128) + compute_final_dists_registers + <<>>( + index.get_X_reordered().data_handle(), + query, + index.n, + bitset.data(), + bitset_size, + index.get_R_closest_landmark_dists().data_handle(), + index.get_R_indptr().data_handle(), + index.get_R_1nn_cols().data_handle(), + index.get_R_1nn_dists().data_handle(), + inds, + dists, + index.n_landmarks, + k, + dfunc, + post_dists_counter); + else if (k <= 256) + compute_final_dists_registers + <<>>( + index.get_X_reordered().data_handle(), + query, + index.n, + bitset.data(), + bitset_size, + index.get_R_closest_landmark_dists().data_handle(), + index.get_R_indptr().data_handle(), + index.get_R_1nn_cols().data_handle(), + index.get_R_1nn_dists().data_handle(), + inds, + dists, + index.n_landmarks, + k, + dfunc, + post_dists_counter); + else if (k <= 512) + compute_final_dists_registers + <<>>( + index.get_X_reordered().data_handle(), + query, + index.n, + bitset.data(), + bitset_size, + index.get_R_closest_landmark_dists().data_handle(), + index.get_R_indptr().data_handle(), + index.get_R_1nn_cols().data_handle(), + index.get_R_1nn_dists().data_handle(), + inds, + dists, + index.n_landmarks, + k, + dfunc, + post_dists_counter); + else if (k <= 1024) + compute_final_dists_registers + <<>>( + index.get_X_reordered().data_handle(), + query, + index.n, + bitset.data(), + bitset_size, + index.get_R_closest_landmark_dists().data_handle(), + index.get_R_indptr().data_handle(), + 
index.get_R_1nn_cols().data_handle(), + index.get_R_1nn_dists().data_handle(), + inds, + dists, + index.n_landmarks, + k, + dfunc, + post_dists_counter); +} + +template +void rbc_eps_pass( + raft::resources const& handle, + const cuvs::neighbors::ball_cover::index& index, + const value_t* query, + const value_int n_query_rows, + value_t eps, + const value_t* R, + dist_func& dfunc, + bool* adj, + value_idx* vd) +{ + block_rbc_kernel_eps_dense + <<>>( + index.get_X_reordered().data_handle(), + query, + n_query_rows, + index.n, + R, + index.m, + eps, + index.n_landmarks, + index.get_R_indptr().data_handle(), + index.get_R_1nn_cols().data_handle(), + index.get_R_1nn_dists().data_handle(), + index.get_R_radius().data_handle(), + dfunc, + adj, + vd); + + if (vd != nullptr) { + value_idx sum = + thrust::reduce(raft::resource::get_thrust_policy(handle), vd, vd + n_query_rows); + // copy sum to last element + RAFT_CUDA_TRY(cudaMemcpyAsync(vd + n_query_rows, + &sum, + sizeof(value_idx), + cudaMemcpyHostToDevice, + raft::resource::get_cuda_stream(handle))); + } + + raft::resource::sync_stream(handle); +} + +template +void rbc_eps_pass( + raft::resources const& handle, + const cuvs::neighbors::ball_cover::index& index, + const value_t* query, + const value_int n_query_rows, + value_t eps, + value_int* max_k, + const value_t* R, + dist_func& dfunc, + value_idx* adj_ia, + value_idx* adj_ja, + value_idx* vd) +{ + // if max_k == nullptr we are either pass 1 or pass 2 + if (max_k == nullptr) { + if (adj_ja == nullptr) { + // pass 1 -> only compute adj_ia / vd + value_idx* vd_ptr = (vd != nullptr) ? 
vd : adj_ia; + if (index.n == 2) { + block_rbc_kernel_eps_csr_pass_xd + <<(n_query_rows, 2), + 64, + 0, + raft::resource::get_cuda_stream(handle)>>>(index.get_X_reordered().data_handle(), + query, + n_query_rows, + index.n, + R, + index.m, + eps, + index.n_landmarks, + index.get_R_indptr().data_handle(), + index.get_R_1nn_cols().data_handle(), + index.get_R_1nn_dists().data_handle(), + index.get_R_radius().data_handle(), + dfunc, + vd_ptr, + nullptr); + } else if (index.n == 3) { + block_rbc_kernel_eps_csr_pass_xd + <<(n_query_rows, 2), + 64, + 0, + raft::resource::get_cuda_stream(handle)>>>(index.get_X_reordered().data_handle(), + query, + n_query_rows, + index.n, + R, + index.m, + eps, + index.n_landmarks, + index.get_R_indptr().data_handle(), + index.get_R_1nn_cols().data_handle(), + index.get_R_1nn_dists().data_handle(), + index.get_R_radius().data_handle(), + dfunc, + vd_ptr, + nullptr); + } else { + block_rbc_kernel_eps_csr_pass + <<(n_query_rows, 2), + 64, + 0, + raft::resource::get_cuda_stream(handle)>>>(index.get_X_reordered().data_handle(), + query, + n_query_rows, + index.n, + R, + index.m, + eps, + index.n_landmarks, + index.get_R_indptr().data_handle(), + index.get_R_1nn_cols().data_handle(), + index.get_R_1nn_dists().data_handle(), + index.get_R_radius().data_handle(), + dfunc, + vd_ptr, + nullptr); + } + + thrust::exclusive_scan(raft::resource::get_thrust_policy(handle), + vd_ptr, + vd_ptr + n_query_rows + 1, + adj_ia, + (value_idx)0); + + } else { + // pass 2 -> fill in adj_ja + if (index.n == 2) { + block_rbc_kernel_eps_csr_pass_xd + <<(n_query_rows, 2), + 64, + 0, + raft::resource::get_cuda_stream(handle)>>>(index.get_X_reordered().data_handle(), + query, + n_query_rows, + index.n, + R, + index.m, + eps, + index.n_landmarks, + index.get_R_indptr().data_handle(), + index.get_R_1nn_cols().data_handle(), + index.get_R_1nn_dists().data_handle(), + index.get_R_radius().data_handle(), + dfunc, + adj_ia, + adj_ja); + } else if (index.n == 3) { + 
block_rbc_kernel_eps_csr_pass_xd + <<(n_query_rows, 2), + 64, + 0, + raft::resource::get_cuda_stream(handle)>>>(index.get_X_reordered().data_handle(), + query, + n_query_rows, + index.n, + R, + index.m, + eps, + index.n_landmarks, + index.get_R_indptr().data_handle(), + index.get_R_1nn_cols().data_handle(), + index.get_R_1nn_dists().data_handle(), + index.get_R_radius().data_handle(), + dfunc, + adj_ia, + adj_ja); + } else { + block_rbc_kernel_eps_csr_pass + <<(n_query_rows, 2), + 64, + 0, + raft::resource::get_cuda_stream(handle)>>>(index.get_X_reordered().data_handle(), + query, + n_query_rows, + index.n, + R, + index.m, + eps, + index.n_landmarks, + index.get_R_indptr().data_handle(), + index.get_R_1nn_cols().data_handle(), + index.get_R_1nn_dists().data_handle(), + index.get_R_radius().data_handle(), + dfunc, + adj_ia, + adj_ja); + } + } + } else { + value_int max_k_in = *max_k; + value_idx* vd_ptr = (vd != nullptr) ? vd : adj_ia; + + rmm::device_uvector tmp(n_query_rows * max_k_in, + raft::resource::get_cuda_stream(handle)); + + block_rbc_kernel_eps_max_k + <<(n_query_rows, 2), + 64, + 0, + raft::resource::get_cuda_stream(handle)>>>(index.get_X_reordered().data_handle(), + query, + n_query_rows, + index.n, + R, + index.m, + eps, + index.n_landmarks, + index.get_R_indptr().data_handle(), + index.get_R_1nn_cols().data_handle(), + index.get_R_1nn_dists().data_handle(), + index.get_R_radius().data_handle(), + dfunc, + vd_ptr, + max_k_in, + tmp.data()); + + value_int actual_max = thrust::reduce(raft::resource::get_thrust_policy(handle), + vd_ptr, + vd_ptr + n_query_rows, + (value_idx)0, + thrust::maximum()); + + if (actual_max > max_k_in) { + // ceil vd to max_k + thrust::transform(raft::resource::get_thrust_policy(handle), + vd_ptr, + vd_ptr + n_query_rows, + vd_ptr, + [max_k_in] __device__(value_idx vd_count) { + return vd_count > max_k_in ? 
max_k_in : vd_count; + }); + } + + thrust::exclusive_scan(raft::resource::get_thrust_policy(handle), + vd_ptr, + vd_ptr + n_query_rows + 1, + adj_ia, + (value_idx)0); + + block_rbc_kernel_eps_max_k_copy + <<>>( + max_k_in, adj_ia, tmp.data(), adj_ja); + + // return 'new' max-k + *max_k = actual_max; + } + + if (vd != nullptr && (max_k != nullptr || adj_ja == nullptr)) { + // copy sum to last element + RAFT_CUDA_TRY(cudaMemcpyAsync(vd + n_query_rows, + adj_ia + n_query_rows, + sizeof(value_idx), + cudaMemcpyDeviceToDevice, + raft::resource::get_cuda_stream(handle))); + } + + raft::resource::sync_stream(handle); +} + +}; // namespace cuvs::neighbors::detail diff --git a/cpp/src/neighbors/ball_cover/registers.cuh b/cpp/src/neighbors/ball_cover/registers.cuh new file mode 100644 index 000000000..1cd32ba00 --- /dev/null +++ b/cpp/src/neighbors/ball_cover/registers.cuh @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#ifndef RAFT_EXPLICIT_INSTANTIATE_ONLY +#include "registers-inl.cuh" +#endif + +#ifdef RAFT_COMPILED +#include "registers-ext.cuh" +#endif diff --git a/cpp/src/neighbors/ball_cover/registers_types.cuh b/cpp/src/neighbors/ball_cover/registers_types.cuh new file mode 100644 index 000000000..bf9d21452 --- /dev/null +++ b/cpp/src/neighbors/ball_cover/registers_types.cuh @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "../detail/haversine_distance.cuh" // compute_haversine + +#include // uint32_t + +namespace cuvs::neighbors::detail { + +template +struct DistFunc { + virtual __device__ __host__ __forceinline__ value_t operator()(const value_t* a, + const value_t* b, + const value_int n_dims) + { + return -1; + }; +}; + +template +struct HaversineFunc : public DistFunc { + __device__ __host__ __forceinline__ value_t operator()(const value_t* a, + const value_t* b, + const value_int n_dims) override + { + return cuvs::neighbors::detail::compute_haversine(a[0], b[0], a[1], b[1]); + } +}; + +template +struct EuclideanFunc : public DistFunc { + __device__ __host__ __forceinline__ value_t operator()(const value_t* a, + const value_t* b, + const value_int n_dims) override + { + value_t sum_sq = 0; + for (value_int i = 0; i < n_dims; ++i) { + value_t diff = a[i] - b[i]; + sum_sq += diff * diff; + } + + return raft::sqrt(sum_sq); + } +}; + +template +struct EuclideanSqFunc : public DistFunc { + __device__ __host__ __forceinline__ value_t operator()(const value_t* a, + const value_t* b, + const value_int n_dims) override + { + value_t sum_sq = 0; + for (value_int i = 0; i < n_dims; ++i) { + value_t diff = a[i] - b[i]; + sum_sq += diff * diff; + } + return sum_sq; + } +}; + +}; // namespace cuvs::neighbors::detail diff --git a/cpp/src/neighbors/brute_force.cu b/cpp/src/neighbors/brute_force.cu index 
13554c0b5..a8ff471ef 100644 --- a/cpp/src/neighbors/brute_force.cu +++ b/cpp/src/neighbors/brute_force.cu @@ -85,32 +85,31 @@ void index::update_dataset(raft::resources const& res, dataset_view_ = raft::make_const_mdspan(dataset_.view()); } -#define CUVS_INST_BFKNN(T) \ - auto build(raft::resources const& res, \ - raft::device_matrix_view dataset, \ - cuvs::distance::DistanceType metric, \ - T metric_arg) \ - ->cuvs::neighbors::brute_force::index \ - { \ - return detail::build(res, dataset, metric, metric_arg); \ - } \ - \ - void search( \ - raft::resources const& res, \ - const cuvs::neighbors::brute_force::index& idx, \ - raft::device_matrix_view queries, \ - raft::device_matrix_view neighbors, \ - raft::device_matrix_view distances, \ - std::optional> sample_filter = std::nullopt) \ - { \ - if (!sample_filter.has_value()) { \ - detail::brute_force_search(res, idx, queries, neighbors, distances); \ - } else { \ - detail::brute_force_search_filtered( \ - res, idx, queries, *sample_filter, neighbors, distances); \ - } \ - } \ - \ +#define CUVS_INST_BFKNN(T) \ + auto build(raft::resources const& res, \ + raft::device_matrix_view dataset, \ + cuvs::distance::DistanceType metric, \ + T metric_arg) \ + ->cuvs::neighbors::brute_force::index \ + { \ + return detail::build(res, dataset, metric, metric_arg); \ + } \ + \ + void search(raft::resources const& res, \ + const cuvs::neighbors::brute_force::index& idx, \ + raft::device_matrix_view queries, \ + raft::device_matrix_view neighbors, \ + raft::device_matrix_view distances, \ + std::optional> sample_filter) \ + { \ + if (!sample_filter.has_value()) { \ + detail::brute_force_search(res, idx, queries, neighbors, distances); \ + } else { \ + detail::brute_force_search_filtered( \ + res, idx, queries, *sample_filter, neighbors, distances); \ + } \ + } \ + \ template struct cuvs::neighbors::brute_force::index; CUVS_INST_BFKNN(float); diff --git a/cpp/src/neighbors/faiss_select/Comparators.cuh 
b/cpp/src/neighbors/faiss_select/Comparators.cuh new file mode 100644 index 000000000..9ced61e13 --- /dev/null +++ b/cpp/src/neighbors/faiss_select/Comparators.cuh @@ -0,0 +1,29 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file thirdparty/LICENSES/LICENSE.faiss + */ + +#pragma once + +#include +#include + +namespace cuvs::neighbors::detail::faiss_select { + +template +struct Comparator { + __device__ static inline bool lt(T a, T b) { return a < b; } + + __device__ static inline bool gt(T a, T b) { return a > b; } +}; + +template <> +struct Comparator { + __device__ static inline bool lt(half a, half b) { return __hlt(a, b); } + + __device__ static inline bool gt(half a, half b) { return __hgt(a, b); } +}; + +} // namespace cuvs::neighbors::detail::faiss_select diff --git a/cpp/src/neighbors/faiss_select/DistanceUtils.h b/cpp/src/neighbors/faiss_select/DistanceUtils.h new file mode 100644 index 000000000..e8a41c1aa --- /dev/null +++ b/cpp/src/neighbors/faiss_select/DistanceUtils.h @@ -0,0 +1,52 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file thirdparty/LICENSES/LICENSE.faiss + */ + +#pragma once + +namespace cuvs::neighbors::detail::faiss_select { +// If the inner size (dim) of the vectors is small, we want a larger query tile +// size, like 1024 +inline void chooseTileSize(size_t numQueries, + size_t numCentroids, + size_t dim, + size_t elementSize, + size_t totalMem, + size_t& tileRows, + size_t& tileCols) +{ + // The matrix multiplication should be large enough to be efficient, but if + // it is too large, we seem to lose efficiency as opposed to + // double-streaming. Each tile size here defines 1/2 of the memory use due + // to double streaming. 
We ignore available temporary memory, as that is + // adjusted independently by the user and can thus meet these requirements + // (or not). For <= 4 GB GPUs, prefer 512 MB of usage. For <= 8 GB GPUs, + // prefer 768 MB of usage. Otherwise, prefer 1 GB of usage. + size_t targetUsage = 0; + + if (totalMem <= ((size_t)4) * 1024 * 1024 * 1024) { + targetUsage = 512 * 1024 * 1024; + } else if (totalMem <= ((size_t)8) * 1024 * 1024 * 1024) { + targetUsage = 768 * 1024 * 1024; + } else { + targetUsage = 1024 * 1024 * 1024; + } + + targetUsage /= 2 * elementSize; + + // 512 seems to be a batch size sweetspot for float32. + // If we are on float16, increase to 512. + // If the k size (vec dim) of the matrix multiplication is small (<= 32), + // increase to 1024. + size_t preferredTileRows = 512; + if (dim <= 32) { preferredTileRows = 1024; } + + tileRows = std::min(preferredTileRows, numQueries); + + // tileCols is the remainder size + tileCols = std::min(targetUsage / preferredTileRows, numCentroids); +} +} // namespace cuvs::neighbors::detail::faiss_select diff --git a/cpp/src/neighbors/faiss_select/MergeNetworkBlock.cuh b/cpp/src/neighbors/faiss_select/MergeNetworkBlock.cuh new file mode 100644 index 000000000..345b9186a --- /dev/null +++ b/cpp/src/neighbors/faiss_select/MergeNetworkBlock.cuh @@ -0,0 +1,277 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. 
+ * + * This source code is licensed under the MIT license found in the + * LICENSE file thirdparty/LICENSES/LICENSE.faiss + */ + +#pragma once + +#include "MergeNetworkUtils.cuh" +#include "StaticUtils.h" + +#include + +namespace cuvs::neighbors::detail::faiss_select { + +// Merge pairs of lists smaller than blockDim.x (NumThreads) +template +inline __device__ void blockMergeSmall(K* listK, V* listV) +{ + static_assert(utils::isPowerOf2(L), "L must be a power-of-2"); + static_assert(utils::isPowerOf2(NumThreads), "NumThreads must be a power-of-2"); + static_assert(L <= NumThreads, "merge list size must be <= NumThreads"); + + // Which pair of lists we are merging + int mergeId = threadIdx.x / L; + + // Which thread we are within the merge + int tid = threadIdx.x % L; + + // listK points to a region of size N * 2 * L + listK += 2 * L * mergeId; + listV += 2 * L * mergeId; + + // It's not a bitonic merge, both lists are in the same direction, + // so handle the first swap assuming the second list is reversed + int pos = L - 1 - tid; + int stride = 2 * tid + 1; + + if (AllThreads || (threadIdx.x < N * L)) { + K ka = listK[pos]; + K kb = listK[pos + stride]; + + bool swap = Dir ? Comp::gt(ka, kb) : Comp::lt(ka, kb); + listK[pos] = swap ? kb : ka; + listK[pos + stride] = swap ? ka : kb; + + V va = listV[pos]; + V vb = listV[pos + stride]; + listV[pos] = swap ? vb : va; + listV[pos + stride] = swap ? va : vb; + + // FIXME: is this a CUDA 9 compiler bug? + // K& ka = listK[pos]; + // K& kb = listK[pos + stride]; + + // bool s = Dir ? Comp::gt(ka, kb) : Comp::lt(ka, kb); + // swap(s, ka, kb); + + // V& va = listV[pos]; + // V& vb = listV[pos + stride]; + // swap(s, va, vb); + } + + __syncthreads(); + +#pragma unroll + for (int stride = L / 2; stride > 0; stride /= 2) { + int pos = 2 * tid - (tid & (stride - 1)); + + if (AllThreads || (threadIdx.x < N * L)) { + K ka = listK[pos]; + K kb = listK[pos + stride]; + + bool swap = Dir ? 
Comp::gt(ka, kb) : Comp::lt(ka, kb); + listK[pos] = swap ? kb : ka; + listK[pos + stride] = swap ? ka : kb; + + V va = listV[pos]; + V vb = listV[pos + stride]; + listV[pos] = swap ? vb : va; + listV[pos + stride] = swap ? va : vb; + + // FIXME: is this a CUDA 9 compiler bug? + // K& ka = listK[pos]; + // K& kb = listK[pos + stride]; + + // bool s = Dir ? Comp::gt(ka, kb) : Comp::lt(ka, kb); + // swap(s, ka, kb); + + // V& va = listV[pos]; + // V& vb = listV[pos + stride]; + // swap(s, va, vb); + } + + __syncthreads(); + } +} + +// Merge pairs of sorted lists larger than blockDim.x (NumThreads) +template +inline __device__ void blockMergeLarge(K* listK, V* listV) +{ + static_assert(utils::isPowerOf2(L), "L must be a power-of-2"); + static_assert(L >= raft::WarpSize, "merge list size must be >= 32"); + static_assert(utils::isPowerOf2(NumThreads), "NumThreads must be a power-of-2"); + static_assert(L >= NumThreads, "merge list size must be >= NumThreads"); + + // For L > NumThreads, each thread has to perform more work + // per each stride. + constexpr int kLoopPerThread = L / NumThreads; + + // It's not a bitonic merge, both lists are in the same direction, + // so handle the first swap assuming the second list is reversed +#pragma unroll + for (int loop = 0; loop < kLoopPerThread; ++loop) { + int tid = loop * NumThreads + threadIdx.x; + int pos = L - 1 - tid; + int stride = 2 * tid + 1; + + K ka = listK[pos]; + K kb = listK[pos + stride]; + + bool swap = Dir ? Comp::gt(ka, kb) : Comp::lt(ka, kb); + listK[pos] = swap ? kb : ka; + listK[pos + stride] = swap ? ka : kb; + + V va = listV[pos]; + V vb = listV[pos + stride]; + listV[pos] = swap ? vb : va; + listV[pos + stride] = swap ? va : vb; + + // FIXME: is this a CUDA 9 compiler bug? + // K& ka = listK[pos]; + // K& kb = listK[pos + stride]; + + // bool s = Dir ? 
Comp::gt(ka, kb) : Comp::lt(ka, kb); + // swap(s, ka, kb); + + // V& va = listV[pos]; + // V& vb = listV[pos + stride]; + // swap(s, va, vb); + } + + __syncthreads(); + + constexpr int kSecondLoopPerThread = FullMerge ? kLoopPerThread : kLoopPerThread / 2; + +#pragma unroll + for (int stride = L / 2; stride > 0; stride /= 2) { +#pragma unroll + for (int loop = 0; loop < kSecondLoopPerThread; ++loop) { + int tid = loop * NumThreads + threadIdx.x; + int pos = 2 * tid - (tid & (stride - 1)); + + K ka = listK[pos]; + K kb = listK[pos + stride]; + + bool swap = Dir ? Comp::gt(ka, kb) : Comp::lt(ka, kb); + listK[pos] = swap ? kb : ka; + listK[pos + stride] = swap ? ka : kb; + + V va = listV[pos]; + V vb = listV[pos + stride]; + listV[pos] = swap ? vb : va; + listV[pos + stride] = swap ? va : vb; + + // FIXME: is this a CUDA 9 compiler bug? + // K& ka = listK[pos]; + // K& kb = listK[pos + stride]; + + // bool s = Dir ? Comp::gt(ka, kb) : Comp::lt(ka, kb); + // swap(s, ka, kb); + + // V& va = listV[pos]; + // V& vb = listV[pos + stride]; + // swap(s, va, vb); + } + + __syncthreads(); + } +} + +/// Class template to prevent static_assert from firing for +/// mixing smaller/larger than block cases +template +struct BlockMerge {}; + +/// Merging lists smaller than a block +template +struct BlockMerge { + static inline __device__ void merge(K* listK, V* listV) + { + constexpr int kNumParallelMerges = NumThreads / L; + constexpr int kNumIterations = N / kNumParallelMerges; + + static_assert(L <= NumThreads, "list must be <= NumThreads"); + static_assert((N < kNumParallelMerges) || (kNumIterations * kNumParallelMerges == N), + "improper selection of N and L"); + + if (N < kNumParallelMerges) { + // We only need L threads per each list to perform the merge + blockMergeSmall(listK, listV); + } else { + // All threads participate +#pragma unroll + for (int i = 0; i < kNumIterations; ++i) { + int start = i * kNumParallelMerges * 2 * L; + + blockMergeSmall(listK + start, + listV + 
start); + } + } + } +}; + +/// Merging lists larger than a block +template +struct BlockMerge { + static inline __device__ void merge(K* listK, V* listV) + { + // Each pair of lists is merged sequentially +#pragma unroll + for (int i = 0; i < N; ++i) { + int start = i * 2 * L; + + blockMergeLarge(listK + start, listV + start); + } + } +}; + +template +inline __device__ void blockMerge(K* listK, V* listV) +{ + constexpr bool kSmallerThanBlock = (L <= NumThreads); + + BlockMerge::merge(listK, listV); +} + +} // namespace cuvs::neighbors::detail::faiss_select diff --git a/cpp/src/neighbors/faiss_select/MergeNetworkUtils.cuh b/cpp/src/neighbors/faiss_select/MergeNetworkUtils.cuh new file mode 100644 index 000000000..7f7796fad --- /dev/null +++ b/cpp/src/neighbors/faiss_select/MergeNetworkUtils.cuh @@ -0,0 +1,25 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file thirdparty/LICENSES/LICENSE.faiss + */ + +#pragma once + +namespace cuvs::neighbors::detail::faiss_select { + +template +inline __device__ void swap(bool swap, T& x, T& y) +{ + T tmp = x; + x = swap ? y : x; + y = swap ? tmp : y; +} + +template +inline __device__ void assign(bool assign, T& x, T y) +{ + x = assign ? y : x; +} +} // namespace cuvs::neighbors::detail::faiss_select diff --git a/cpp/src/neighbors/faiss_select/MergeNetworkWarp.cuh b/cpp/src/neighbors/faiss_select/MergeNetworkWarp.cuh new file mode 100644 index 000000000..0a9226e77 --- /dev/null +++ b/cpp/src/neighbors/faiss_select/MergeNetworkWarp.cuh @@ -0,0 +1,519 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. 
+ * + * This source code is licensed under the MIT license found in the + * LICENSE file thirdparty/LICENSES/LICENSE.faiss + */ + +#pragma once + +#include "MergeNetworkUtils.cuh" +#include "StaticUtils.h" +#include + +namespace cuvs::neighbors::detail::faiss_select { + +// +// This file contains functions to: +// +// -perform bitonic merges on pairs of sorted lists, held in +// registers. Each list contains N * raft::WarpSize (multiple of 32) +// elements for some N. +// The bitonic merge is implemented for arbitrary sizes; +// sorted list A of size N1 * raft::WarpSize registers +// sorted list B of size N2 * raft::WarpSize registers => +// sorted list C of size (N1 + N2) * raft::WarpSize registers. N1 and N2 +// are >= 1 and don't have to be powers of 2. +// +// -perform bitonic sorts on a set of N * raft::WarpSize key/value pairs +// held in registers, by using the above bitonic merge as a +// primitive. +// N can be an arbitrary N >= 1; i.e., the bitonic sort here supports +// odd sizes and doesn't require the input to be a power of 2. +// +// The sort or merge network is completely statically instantiated via +// template specialization / expansion and constexpr, and it uses warp +// shuffles to exchange values between warp lanes. +// +// A note about comparisons: +// +// For a sorting network of keys only, we only need one +// comparison (a < b). However, what we really need to know is +// if one lane chooses to exchange a value, then the +// corresponding lane should also do the exchange. +// Thus, if one just uses the negation !(x < y) in the higher +// lane, this will also include the case where (x == y). Thus, one +// lane in fact performs an exchange and the other doesn't, but +// because the only value being exchanged is equivalent, nothing has +// changed. +// So, you can get away with just one comparison and its negation. 
+// +// If we're sorting keys and values, where equivalent keys can +// exist, then this is a problem, since we want to treat (x, v1) +// as not equivalent to (x, v2). +// +// To remedy this, you can either compare with a lexicographic +// ordering (a.k < b.k || (a.k == b.k && a.v < b.v)), which since +// we're predicating all of the choices results in 3 comparisons +// being executed, or we can invert the selection so that there is no +// middle choice of equality; the other lane will likewise +// check that (b.k > a.k) (the higher lane has the values +// swapped). Then, the first lane swaps if and only if the +// second lane swaps; if both lanes have equivalent keys, no +// swap will be performed. This results in only two comparisons +// being executed. +// +// If you don't consider values as well, then this does not produce a +// consistent ordering among (k, v) pairs with equivalent keys but +// different values; for us, we don't really care about ordering or +// stability here. +// +// I have tried both re-arranging the order in the higher lane to get +// away with one comparison or adding the value to the check; both +// result in greater register consumption or lower speed than just +// performing both < and > comparisons with the variables, so I just +// stick with this. + +// This function merges raft::WarpSize / 2L lists in parallel using warp +// shuffles. +// It works on at most size-16 lists, as we need 32 threads for this +// shuffle merge. +// +// If IsBitonic is false, the first stage is reversed, so we don't +// need to sort directionally. It's still technically a bitonic sort. +template +inline __device__ void warpBitonicMergeLE16(K& k, V& v) +{ + static_assert(utils::isPowerOf2(L), "L must be a power-of-2"); + static_assert(L <= raft::WarpSize / 2, "merge list size must be <= 16"); + + int laneId = raft::laneId(); + + if (!IsBitonic) { + // Reverse the first comparison stage. 
+ // For example, merging a list of size 8 has the exchanges: + // 0 <-> 15, 1 <-> 14, ... + K otherK = raft::shfl_xor(k, 2 * L - 1); + V otherV = raft::shfl_xor(v, 2 * L - 1); + + // Whether we are the lesser thread in the exchange + bool small = !(laneId & L); + + if (Dir) { + // See the comment above how performing both of these + // comparisons in the warp seems to win out over the + // alternatives in practice + bool s = small ? Comp::gt(k, otherK) : Comp::lt(k, otherK); + assign(s, k, otherK); + assign(s, v, otherV); + + } else { + bool s = small ? Comp::lt(k, otherK) : Comp::gt(k, otherK); + assign(s, k, otherK); + assign(s, v, otherV); + } + } + +#pragma unroll + for (int stride = IsBitonic ? L : L / 2; stride > 0; stride /= 2) { + K otherK = raft::shfl_xor(k, stride); + V otherV = raft::shfl_xor(v, stride); + + // Whether we are the lesser thread in the exchange + bool small = !(laneId & stride); + + if (Dir) { + bool s = small ? Comp::gt(k, otherK) : Comp::lt(k, otherK); + assign(s, k, otherK); + assign(s, v, otherV); + + } else { + bool s = small ? Comp::lt(k, otherK) : Comp::gt(k, otherK); + assign(s, k, otherK); + assign(s, v, otherV); + } + } +} + +// Template for performing a bitonic merge of an arbitrary set of +// registers +template +struct BitonicMergeStep {}; + +// +// Power-of-2 merge specialization +// + +// All merges eventually call this +template +struct BitonicMergeStep { + static inline __device__ void merge(K k[1], V v[1]) + { + // Use warp shuffles + warpBitonicMergeLE16(k[0], v[0]); + } +}; + +template +struct BitonicMergeStep { + static inline __device__ void merge(K k[N], V v[N]) + { + static_assert(utils::isPowerOf2(N), "must be power of 2"); + static_assert(N > 1, "must be N > 1"); + +#pragma unroll + for (int i = 0; i < N / 2; ++i) { + K& ka = k[i]; + V& va = v[i]; + + K& kb = k[i + N / 2]; + V& vb = v[i + N / 2]; + + bool s = Dir ? 
Comp::gt(ka, kb) : Comp::lt(ka, kb); + swap(s, ka, kb); + swap(s, va, vb); + } + + { + K newK[N / 2]; + V newV[N / 2]; + +#pragma unroll + for (int i = 0; i < N / 2; ++i) { + newK[i] = k[i]; + newV[i] = v[i]; + } + + BitonicMergeStep::merge(newK, newV); + +#pragma unroll + for (int i = 0; i < N / 2; ++i) { + k[i] = newK[i]; + v[i] = newV[i]; + } + } + + { + K newK[N / 2]; + V newV[N / 2]; + +#pragma unroll + for (int i = 0; i < N / 2; ++i) { + newK[i] = k[i + N / 2]; + newV[i] = v[i + N / 2]; + } + + BitonicMergeStep::merge(newK, newV); + +#pragma unroll + for (int i = 0; i < N / 2; ++i) { + k[i + N / 2] = newK[i]; + v[i + N / 2] = newV[i]; + } + } + } +}; + +// +// Non-power-of-2 merge specialization +// + +// Low recursion +template +struct BitonicMergeStep { + static inline __device__ void merge(K k[N], V v[N]) + { + static_assert(!utils::isPowerOf2(N), "must be non-power-of-2"); + static_assert(N >= 3, "must be N >= 3"); + + constexpr int kNextHighestPowerOf2 = utils::nextHighestPowerOf2(N); + +#pragma unroll + for (int i = 0; i < N - kNextHighestPowerOf2 / 2; ++i) { + K& ka = k[i]; + V& va = v[i]; + + K& kb = k[i + kNextHighestPowerOf2 / 2]; + V& vb = v[i + kNextHighestPowerOf2 / 2]; + + bool s = Dir ? Comp::gt(ka, kb) : Comp::lt(ka, kb); + swap(s, ka, kb); + swap(s, va, vb); + } + + constexpr int kLowSize = N - kNextHighestPowerOf2 / 2; + constexpr int kHighSize = kNextHighestPowerOf2 / 2; + { + K newK[kLowSize]; + V newV[kLowSize]; + +#pragma unroll + for (int i = 0; i < kLowSize; ++i) { + newK[i] = k[i]; + newV[i] = v[i]; + } + + constexpr bool kLowIsPowerOf2 = utils::isPowerOf2(N - kNextHighestPowerOf2 / 2); + // FIXME: compiler doesn't like this expression? compiler bug? 
+ // constexpr bool kLowIsPowerOf2 = utils::isPowerOf2(kLowSize); + BitonicMergeStep::merge(newK, newV); + +#pragma unroll + for (int i = 0; i < kLowSize; ++i) { + k[i] = newK[i]; + v[i] = newV[i]; + } + } + + { + K newK[kHighSize]; + V newV[kHighSize]; + +#pragma unroll + for (int i = 0; i < kHighSize; ++i) { + newK[i] = k[i + kLowSize]; + newV[i] = v[i + kLowSize]; + } + + constexpr bool kHighIsPowerOf2 = utils::isPowerOf2(kNextHighestPowerOf2 / 2); + // FIXME: compiler doesn't like this expression? compiler bug? + // constexpr bool kHighIsPowerOf2 = + // utils::isPowerOf2(kHighSize); + BitonicMergeStep::merge(newK, newV); + +#pragma unroll + for (int i = 0; i < kHighSize; ++i) { + k[i + kLowSize] = newK[i]; + v[i + kLowSize] = newV[i]; + } + } + } +}; + +// High recursion +template +struct BitonicMergeStep { + static inline __device__ void merge(K k[N], V v[N]) + { + static_assert(!utils::isPowerOf2(N), "must be non-power-of-2"); + static_assert(N >= 3, "must be N >= 3"); + + constexpr int kNextHighestPowerOf2 = utils::nextHighestPowerOf2(N); + +#pragma unroll + for (int i = 0; i < N - kNextHighestPowerOf2 / 2; ++i) { + K& ka = k[i]; + V& va = v[i]; + + K& kb = k[i + kNextHighestPowerOf2 / 2]; + V& vb = v[i + kNextHighestPowerOf2 / 2]; + + bool s = Dir ? Comp::gt(ka, kb) : Comp::lt(ka, kb); + swap(s, ka, kb); + swap(s, va, vb); + } + + constexpr int kLowSize = kNextHighestPowerOf2 / 2; + constexpr int kHighSize = N - kNextHighestPowerOf2 / 2; + { + K newK[kLowSize]; + V newV[kLowSize]; + +#pragma unroll + for (int i = 0; i < kLowSize; ++i) { + newK[i] = k[i]; + newV[i] = v[i]; + } + + constexpr bool kLowIsPowerOf2 = utils::isPowerOf2(kNextHighestPowerOf2 / 2); + // FIXME: compiler doesn't like this expression? compiler bug? 
+ // constexpr bool kLowIsPowerOf2 = utils::isPowerOf2(kLowSize); + BitonicMergeStep::merge(newK, newV); + +#pragma unroll + for (int i = 0; i < kLowSize; ++i) { + k[i] = newK[i]; + v[i] = newV[i]; + } + } + + { + K newK[kHighSize]; + V newV[kHighSize]; + +#pragma unroll + for (int i = 0; i < kHighSize; ++i) { + newK[i] = k[i + kLowSize]; + newV[i] = v[i + kLowSize]; + } + + constexpr bool kHighIsPowerOf2 = utils::isPowerOf2(N - kNextHighestPowerOf2 / 2); + // FIXME: compiler doesn't like this expression? compiler bug? + // constexpr bool kHighIsPowerOf2 = + // utils::isPowerOf2(kHighSize); + BitonicMergeStep::merge(newK, newV); + +#pragma unroll + for (int i = 0; i < kHighSize; ++i) { + k[i + kLowSize] = newK[i]; + v[i + kLowSize] = newV[i]; + } + } + } +}; + +/// Merges two sets of registers across the warp of any size; +/// i.e., merges a sorted k/v list of size raft::WarpSize * N1 with a +/// sorted k/v list of size raft::WarpSize * N2, where N1 and N2 are any +/// value >= 1 +template +inline __device__ void warpMergeAnyRegisters(K k1[N1], V v1[N1], K k2[N2], V v2[N2]) +{ + constexpr int kSmallestN = N1 < N2 ? N1 : N2; + +#pragma unroll + for (int i = 0; i < kSmallestN; ++i) { + K& ka = k1[N1 - 1 - i]; + V& va = v1[N1 - 1 - i]; + + K& kb = k2[i]; + V& vb = v2[i]; + + K otherKa; + V otherVa; + + if (FullMerge) { + // We need the other values + otherKa = raft::shfl_xor(ka, raft::WarpSize - 1); + otherVa = raft::shfl_xor(va, raft::WarpSize - 1); + } + + K otherKb = raft::shfl_xor(kb, raft::WarpSize - 1); + V otherVb = raft::shfl_xor(vb, raft::WarpSize - 1); + + // ka is always first in the list, so we needn't use our lane + // in this comparison + bool swapa = Dir ? Comp::gt(ka, otherKb) : Comp::lt(ka, otherKb); + assign(swapa, ka, otherKb); + assign(swapa, va, otherVb); + + // kb is always second in the list, so we needn't use our lane + // in this comparison + if (FullMerge) { + bool swapb = Dir ? 
Comp::lt(kb, otherKa) : Comp::gt(kb, otherKa); + assign(swapb, kb, otherKa); + assign(swapb, vb, otherVa); + + } else { + // We don't care about updating elements in the second list + } + } + + BitonicMergeStep::merge(k1, v1); + if (FullMerge) { + // Only if we care about N2 do we need to bother merging it fully + BitonicMergeStep::merge(k2, v2); + } +} + +// Recursive template that uses the above bitonic merge to perform a +// bitonic sort +template +struct BitonicSortStep { + static inline __device__ void sort(K k[N], V v[N]) + { + static_assert(N > 1, "did not hit specialized case"); + + // Sort recursively + constexpr int kSizeA = N / 2; + constexpr int kSizeB = N - kSizeA; + + K aK[kSizeA]; + V aV[kSizeA]; + +#pragma unroll + for (int i = 0; i < kSizeA; ++i) { + aK[i] = k[i]; + aV[i] = v[i]; + } + + BitonicSortStep::sort(aK, aV); + + K bK[kSizeB]; + V bV[kSizeB]; + +#pragma unroll + for (int i = 0; i < kSizeB; ++i) { + bK[i] = k[i + kSizeA]; + bV[i] = v[i + kSizeA]; + } + + BitonicSortStep::sort(bK, bV); + + // Merge halves + warpMergeAnyRegisters(aK, aV, bK, bV); + +#pragma unroll + for (int i = 0; i < kSizeA; ++i) { + k[i] = aK[i]; + v[i] = aV[i]; + } + +#pragma unroll + for (int i = 0; i < kSizeB; ++i) { + k[i + kSizeA] = bK[i]; + v[i + kSizeA] = bV[i]; + } + } +}; + +// Single warp (N == 1) sorting specialization +template +struct BitonicSortStep { + static inline __device__ void sort(K k[1], V v[1]) + { + // Update this code if this changes + // should go from 1 -> raft::WarpSize in multiples of 2 + static_assert(raft::WarpSize == 32, "unexpected warp size"); + + warpBitonicMergeLE16(k[0], v[0]); + warpBitonicMergeLE16(k[0], v[0]); + warpBitonicMergeLE16(k[0], v[0]); + warpBitonicMergeLE16(k[0], v[0]); + warpBitonicMergeLE16(k[0], v[0]); + } +}; + +/// Sort a list of raft::WarpSize * N elements in registers, where N is an +/// arbitrary >= 1 +template +inline __device__ void warpSortAnyRegisters(K k[N], V v[N]) +{ + BitonicSortStep::sort(k, v); +} + +} // 
namespace cuvs::neighbors::detail::faiss_select diff --git a/cpp/src/neighbors/faiss_select/Select.cuh b/cpp/src/neighbors/faiss_select/Select.cuh new file mode 100644 index 000000000..ccd2a110c --- /dev/null +++ b/cpp/src/neighbors/faiss_select/Select.cuh @@ -0,0 +1,569 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file thirdparty/LICENSES/LICENSE.faiss + */ + +#pragma once + +#include "Comparators.cuh" +#include "MergeNetworkBlock.cuh" +#include "MergeNetworkWarp.cuh" +#include +#include + +namespace cuvs::neighbors::detail::faiss_select { + +// Specialization for block-wide monotonic merges producing a merge sort +// since what we really want is a constexpr loop expansion +template +struct FinalBlockMerge {}; + +template +struct FinalBlockMerge<1, NumThreads, K, V, NumWarpQ, Dir, Comp> { + static inline __device__ void merge(K* sharedK, V* sharedV) + { + // no merge required; single warp + } +}; + +template +struct FinalBlockMerge<2, NumThreads, K, V, NumWarpQ, Dir, Comp> { + static inline __device__ void merge(K* sharedK, V* sharedV) + { + // Final merge doesn't need to fully merge the second list + blockMerge( + sharedK, sharedV); + } +}; + +template +struct FinalBlockMerge<4, NumThreads, K, V, NumWarpQ, Dir, Comp> { + static inline __device__ void merge(K* sharedK, V* sharedV) + { + blockMerge(sharedK, + sharedV); + // Final merge doesn't need to fully merge the second list + blockMerge(sharedK, sharedV); + } +}; + +template +struct FinalBlockMerge<8, NumThreads, K, V, NumWarpQ, Dir, Comp> { + static inline __device__ void merge(K* sharedK, V* sharedV) + { + blockMerge(sharedK, + sharedV); + blockMerge( + sharedK, sharedV); + // Final merge doesn't need to fully merge the second list + blockMerge(sharedK, sharedV); + } +}; + +// `Dir` true, produce largest values. +// `Dir` false, produce smallest values. 
+template +struct BlockSelect { + static constexpr int kNumWarps = ThreadsPerBlock / raft::WarpSize; + static constexpr int kTotalWarpSortSize = NumWarpQ; + + __device__ inline BlockSelect(K initKVal, V initVVal, K* smemK, V* smemV, int k) + : initK(initKVal), + initV(initVVal), + numVals(0), + warpKTop(initKVal), + sharedK(smemK), + sharedV(smemV), + kMinus1(k - 1) + { + static_assert(utils::isPowerOf2(ThreadsPerBlock), "threads must be a power-of-2"); + static_assert(utils::isPowerOf2(NumWarpQ), "warp queue must be power-of-2"); + + // Fill the per-thread queue keys with the default value +#pragma unroll + for (int i = 0; i < NumThreadQ; ++i) { + threadK[i] = initK; + threadV[i] = initV; + } + + int laneId = raft::laneId(); + int warpId = threadIdx.x / raft::WarpSize; + warpK = sharedK + warpId * kTotalWarpSortSize; + warpV = sharedV + warpId * kTotalWarpSortSize; + + // Fill warp queue (only the actual queue space is fine, not where + // we write the per-thread queues for merging) + for (int i = laneId; i < NumWarpQ; i += raft::WarpSize) { + warpK[i] = initK; + warpV[i] = initV; + } + + warpFence(); + } + + __device__ inline void addThreadQ(K k, V v) + { + if (Dir ? 
Comp::gt(k, warpKTop) : Comp::lt(k, warpKTop)) { + // Rotate right +#pragma unroll + for (int i = NumThreadQ - 1; i > 0; --i) { + threadK[i] = threadK[i - 1]; + threadV[i] = threadV[i - 1]; + } + + threadK[0] = k; + threadV[0] = v; + ++numVals; + } + } + + __device__ inline void checkThreadQ() + { + bool needSort = (numVals == NumThreadQ); + +#if CUDA_VERSION >= 9000 + needSort = __any_sync(0xffffffff, needSort); +#else + needSort = __any(needSort); +#endif + + if (!needSort) { + // no lanes have triggered a sort + return; + } + + // This has a trailing warpFence + mergeWarpQ(); + + // Any top-k elements have been merged into the warp queue; we're + // free to reset the thread queues + numVals = 0; + +#pragma unroll + for (int i = 0; i < NumThreadQ; ++i) { + threadK[i] = initK; + threadV[i] = initV; + } + + // We have to beat at least this element + warpKTop = warpK[kMinus1]; + + warpFence(); + } + + /// This function handles sorting and merging together the + /// per-thread queues with the warp-wide queue, creating a sorted + /// list across both + __device__ inline void mergeWarpQ() + { + int laneId = raft::laneId(); + + // Sort all of the per-thread queues + warpSortAnyRegisters(threadK, threadV); + + constexpr int kNumWarpQRegisters = NumWarpQ / raft::WarpSize; + K warpKRegisters[kNumWarpQRegisters]; + V warpVRegisters[kNumWarpQRegisters]; + +#pragma unroll + for (int i = 0; i < kNumWarpQRegisters; ++i) { + warpKRegisters[i] = warpK[i * raft::WarpSize + laneId]; + warpVRegisters[i] = warpV[i * raft::WarpSize + laneId]; + } + + warpFence(); + + // The warp queue is already sorted, and now that we've sorted the + // per-thread queue, merge both sorted lists together, producing + // one sorted list + warpMergeAnyRegisters( + warpKRegisters, warpVRegisters, threadK, threadV); + + // Write back out the warp queue +#pragma unroll + for (int i = 0; i < kNumWarpQRegisters; ++i) { + warpK[i * raft::WarpSize + laneId] = warpKRegisters[i]; + warpV[i * raft::WarpSize + 
laneId] = warpVRegisters[i]; + } + + warpFence(); + } + + /// WARNING: all threads in a warp must participate in this. + /// Otherwise, you must call the constituent parts separately. + __device__ inline void add(K k, V v) + { + addThreadQ(k, v); + checkThreadQ(); + } + + __device__ inline void reduce() + { + // Have all warps dump and merge their queues; this will produce + // the final per-warp results + mergeWarpQ(); + + // block-wide dep; thus far, all warps have been completely + // independent + __syncthreads(); + + // All warp queues are contiguous in smem. + // Now, we have kNumWarps lists of NumWarpQ elements. + // This is a power of 2. + FinalBlockMerge::merge(sharedK, sharedV); + + // The block-wide merge has a trailing syncthreads + } + + // Default element key + const K initK; + + // Default element value + const V initV; + + // Number of valid elements in our thread queue + int numVals; + + // The k-th highest (Dir) or lowest (!Dir) element + K warpKTop; + + // Thread queue values + K threadK[NumThreadQ]; + V threadV[NumThreadQ]; + + // Queues for all warps + K* sharedK; + V* sharedV; + + // Our warp's queue (points into sharedK/sharedV) + // warpK[0] is highest (Dir) or lowest (!Dir) + K* warpK; + V* warpV; + + // This is a cached k-1 value + int kMinus1; +}; + +/// Specialization for k == 1 (NumWarpQ == 1) +template +struct BlockSelect { + static constexpr int kNumWarps = ThreadsPerBlock / raft::WarpSize; + + __device__ inline BlockSelect(K initK, V initV, K* smemK, V* smemV, int k) + : threadK(initK), threadV(initV), sharedK(smemK), sharedV(smemV) + { + } + + __device__ inline void addThreadQ(K k, V v) + { + bool swap = Dir ? Comp::gt(k, threadK) : Comp::lt(k, threadK); + threadK = swap ? k : threadK; + threadV = swap ? 
v : threadV; + } + + __device__ inline void checkThreadQ() + { + // We don't need to do anything here, since the warp doesn't + // cooperate until the end + } + + __device__ inline void add(K k, V v) { addThreadQ(k, v); } + + __device__ inline void reduce() + { + // Reduce within the warp + KeyValuePair pair(threadK, threadV); + + if (Dir) { + pair = warpReduce(pair, max_op{}); + } else { + pair = warpReduce(pair, min_op{}); + } + + // Each warp writes out a single value + int laneId = raft::laneId(); + int warpId = threadIdx.x / raft::WarpSize; + + if (laneId == 0) { + sharedK[warpId] = pair.key; + sharedV[warpId] = pair.value; + } + + __syncthreads(); + + // We typically use this for small blocks (<= 128), just having the + // first thread in the block perform the reduction across warps is + // faster + if (threadIdx.x == 0) { + threadK = sharedK[0]; + threadV = sharedV[0]; + +#pragma unroll + for (int i = 1; i < kNumWarps; ++i) { + K k = sharedK[i]; + V v = sharedV[i]; + + bool swap = Dir ? Comp::gt(k, threadK) : Comp::lt(k, threadK); + threadK = swap ? k : threadK; + threadV = swap ? v : threadV; + } + + // Hopefully a thread's smem reads/writes are ordered wrt + // itself, so no barrier needed :) + sharedK[0] = threadK; + sharedV[0] = threadV; + } + + // In case other threads wish to read this value + __syncthreads(); + } + + // threadK is lowest (Dir) or highest (!Dir) + K threadK; + V threadV; + + // Where we reduce in smem + K* sharedK; + V* sharedV; +}; + +// +// per-warp WarpSelect +// + +// `Dir` true, produce largest values. +// `Dir` false, produce smallest values. 
+template +struct WarpSelect { + static constexpr int kNumWarpQRegisters = NumWarpQ / raft::WarpSize; + + __device__ inline WarpSelect(K initKVal, V initVVal, int k) + : initK(initKVal), + initV(initVVal), + numVals(0), + warpKTop(initKVal), + kLane((k - 1) % raft::WarpSize) + { + static_assert(utils::isPowerOf2(ThreadsPerBlock), "threads must be a power-of-2"); + static_assert(utils::isPowerOf2(NumWarpQ), "warp queue must be power-of-2"); + + // Fill the per-thread queue keys with the default value +#pragma unroll + for (int i = 0; i < NumThreadQ; ++i) { + threadK[i] = initK; + threadV[i] = initV; + } + + // Fill the warp queue with the default value +#pragma unroll + for (int i = 0; i < kNumWarpQRegisters; ++i) { + warpK[i] = initK; + warpV[i] = initV; + } + } + + __device__ inline void addThreadQ(K k, V v) + { + if (Dir ? Comp::gt(k, warpKTop) : Comp::lt(k, warpKTop)) { + // Rotate right +#pragma unroll + for (int i = NumThreadQ - 1; i > 0; --i) { + threadK[i] = threadK[i - 1]; + threadV[i] = threadV[i - 1]; + } + + threadK[0] = k; + threadV[0] = v; + ++numVals; + } + } + + __device__ inline void checkThreadQ() + { + bool needSort = (numVals == NumThreadQ); + +#if CUDA_VERSION >= 9000 + needSort = __any_sync(0xffffffff, needSort); +#else + needSort = __any(needSort); +#endif + + if (!needSort) { + // no lanes have triggered a sort + return; + } + + mergeWarpQ(); + + // Any top-k elements have been merged into the warp queue; we're + // free to reset the thread queues + numVals = 0; + +#pragma unroll + for (int i = 0; i < NumThreadQ; ++i) { + threadK[i] = initK; + threadV[i] = initV; + } + + // We have to beat at least this element + warpKTop = shfl(warpK[kNumWarpQRegisters - 1], kLane); + } + + /// This function handles sorting and merging together the + /// per-thread queues with the warp-wide queue, creating a sorted + /// list across both + __device__ inline void mergeWarpQ() + { + // Sort all of the per-thread queues + warpSortAnyRegisters(threadK, threadV); 
+ + // The warp queue is already sorted, and now that we've sorted the + // per-thread queue, merge both sorted lists together, producing + // one sorted list + warpMergeAnyRegisters( + warpK, warpV, threadK, threadV); + } + + /// WARNING: all threads in a warp must participate in this. + /// Otherwise, you must call the constituent parts separately. + __device__ inline void add(K k, V v) + { + addThreadQ(k, v); + checkThreadQ(); + } + + __device__ inline void reduce() + { + // Have all warps dump and merge their queues; this will produce + // the final per-warp results + mergeWarpQ(); + } + + /// Dump final k selected values for this warp out + __device__ inline void writeOut(K* outK, V* outV, int k) + { + int laneId = raft::laneId(); + +#pragma unroll + for (int i = 0; i < kNumWarpQRegisters; ++i) { + int idx = i * raft::WarpSize + laneId; + + if (idx < k) { + outK[idx] = warpK[i]; + outV[idx] = warpV[i]; + } + } + } + + // Default element key + const K initK; + + // Default element value + const V initV; + + // Number of valid elements in our thread queue + int numVals; + + // The k-th highest (Dir) or lowest (!Dir) element + K warpKTop; + + // Thread queue values + K threadK[NumThreadQ]; + V threadV[NumThreadQ]; + + // warpK[0] is highest (Dir) or lowest (!Dir) + K warpK[kNumWarpQRegisters]; + V warpV[kNumWarpQRegisters]; + + // This is what lane we should load an approximation (>=k) to the + // kth element from the last register in the warp queue (i.e., + // warpK[kNumWarpQRegisters - 1]). + int kLane; +}; + +/// Specialization for k == 1 (NumWarpQ == 1) +template +struct WarpSelect { + static constexpr int kNumWarps = ThreadsPerBlock / raft::WarpSize; + + __device__ inline WarpSelect(K initK, V initV, int k) : threadK(initK), threadV(initV) {} + + __device__ inline void addThreadQ(K k, V v) + { + bool swap = Dir ? Comp::gt(k, threadK) : Comp::lt(k, threadK); + threadK = swap ? k : threadK; + threadV = swap ? 
v : threadV; + } + + __device__ inline void checkThreadQ() + { + // We don't need to do anything here, since the warp doesn't + // cooperate until the end + } + + __device__ inline void add(K k, V v) { addThreadQ(k, v); } + + __device__ inline void reduce() + { + // Reduce within the warp + KeyValuePair pair(threadK, threadV); + + if (Dir) { + pair = warpReduce(pair, max_op{}); + } else { + pair = warpReduce(pair, min_op{}); + } + + threadK = pair.key; + threadV = pair.value; + } + + /// Dump final k selected values for this warp out + __device__ inline void writeOut(K* outK, V* outV, int k) + { + if (raft::laneId() == 0) { + *outK = threadK; + *outV = threadV; + } + } + + // threadK is lowest (Dir) or highest (!Dir) + K threadK; + V threadV; +}; + +} // namespace cuvs::neighbors::detail::faiss_select diff --git a/cpp/src/neighbors/faiss_select/StaticUtils.h b/cpp/src/neighbors/faiss_select/StaticUtils.h new file mode 100644 index 000000000..198c28b60 --- /dev/null +++ b/cpp/src/neighbors/faiss_select/StaticUtils.h @@ -0,0 +1,48 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file thirdparty/LICENSES/LICENSE.faiss + */ + +#pragma once + +#include + +// allow usage for non-CUDA files +#ifndef __host__ +#define __host__ +#define __device__ +#endif + +namespace cuvs::neighbors::detail::faiss_select::utils { + +template +constexpr __host__ __device__ bool isPowerOf2(T v) +{ + return (v && !(v & (v - 1))); +} + +static_assert(isPowerOf2(2048), "isPowerOf2"); +static_assert(!isPowerOf2(3333), "isPowerOf2"); + +template +constexpr __host__ __device__ T nextHighestPowerOf2(T v) +{ + return (isPowerOf2(v) ? 
(T)2 * v : ((T)1 << (log2(v) + (T)1))); +} + +static_assert(nextHighestPowerOf2(1) == 2, "nextHighestPowerOf2"); +static_assert(nextHighestPowerOf2(2) == 4, "nextHighestPowerOf2"); +static_assert(nextHighestPowerOf2(3) == 4, "nextHighestPowerOf2"); +static_assert(nextHighestPowerOf2(4) == 8, "nextHighestPowerOf2"); + +static_assert(nextHighestPowerOf2(15) == 16, "nextHighestPowerOf2"); +static_assert(nextHighestPowerOf2(16) == 32, "nextHighestPowerOf2"); +static_assert(nextHighestPowerOf2(17) == 32, "nextHighestPowerOf2"); + +static_assert(nextHighestPowerOf2(1536000000u) == 2147483648u, "nextHighestPowerOf2"); +static_assert(nextHighestPowerOf2((size_t)2147483648ULL) == (size_t)4294967296ULL, + "nextHighestPowerOf2"); + +} // namespace cuvs::neighbors::detail::faiss_select::utils diff --git a/cpp/src/neighbors/faiss_select/key_value_block_select.cuh b/cpp/src/neighbors/faiss_select/key_value_block_select.cuh new file mode 100644 index 000000000..2bb5f84cc --- /dev/null +++ b/cpp/src/neighbors/faiss_select/key_value_block_select.cuh @@ -0,0 +1,229 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file thirdparty/LICENSES/LICENSE.faiss + */ + +#pragma once + +#include "MergeNetworkUtils.cuh" +#include "Select.cuh" + +// TODO: Need to think further about the impact (and new boundaries created) on the registers +// because this will change the max k that can be processed. One solution might be to break +// up k into multiple batches for larger k. + +namespace cuvs::neighbors::detail::faiss_select { + +// `Dir` true, produce largest values. +// `Dir` false, produce smallest values. 
+template +struct KeyValueBlockSelect { + static constexpr int kNumWarps = ThreadsPerBlock / raft::WarpSize; + static constexpr int kTotalWarpSortSize = NumWarpQ; + + __device__ inline KeyValueBlockSelect( + K initKVal, K initVKey, V initVVal, K* smemK, raft::KeyValuePair* smemV, int k) + : initK(initKVal), + initVk(initVKey), + initVv(initVVal), + numVals(0), + warpKTop(initKVal), + warpKTopRDist(initKVal), + sharedK(smemK), + sharedV(smemV), + kMinus1(k - 1) + { + static_assert(utils::isPowerOf2(ThreadsPerBlock), "threads must be a power-of-2"); + static_assert(utils::isPowerOf2(NumWarpQ), "warp queue must be power-of-2"); + + // Fill the per-thread queue keys with the default value +#pragma unroll + for (int i = 0; i < NumThreadQ; ++i) { + threadK[i] = initK; + threadV[i].key = initVk; + threadV[i].value = initVv; + } + + int laneId = raft::laneId(); + int warpId = threadIdx.x / raft::WarpSize; + warpK = sharedK + warpId * kTotalWarpSortSize; + warpV = sharedV + warpId * kTotalWarpSortSize; + + // Fill warp queue (only the actual queue space is fine, not where + // we write the per-thread queues for merging) + for (int i = laneId; i < NumWarpQ; i += raft::WarpSize) { + warpK[i] = initK; + warpV[i].key = initVk; + warpV[i].value = initVv; + } + + raft::warpFence(); + } + + __device__ inline void addThreadQ(K k, K vk, V vv) + { + if (Dir ? 
Comp::gt(k, warpKTop) : Comp::lt(k, warpKTop)) { + // Rotate right +#pragma unroll + for (int i = NumThreadQ - 1; i > 0; --i) { + threadK[i] = threadK[i - 1]; + threadV[i].key = threadV[i - 1].key; + threadV[i].value = threadV[i - 1].value; + } + + threadK[0] = k; + threadV[0].key = vk; + threadV[0].value = vv; + ++numVals; + } + } + + __device__ inline void checkThreadQ() + { + bool needSort = (numVals == NumThreadQ); + +#if CUDA_VERSION >= 9000 + needSort = __any_sync(0xffffffff, needSort); +#else + needSort = __any(needSort); +#endif + + if (!needSort) { + // no lanes have triggered a sort + return; + } + + // This has a trailing raft::warpFence + mergeWarpQ(); + + // Any top-k elements have been merged into the warp queue; we're + // free to reset the thread queues + numVals = 0; + +#pragma unroll + for (int i = 0; i < NumThreadQ; ++i) { + threadK[i] = initK; + threadV[i].key = initVk; + threadV[i].value = initVv; + } + + // We have to beat at least this element + warpKTop = warpK[kMinus1]; + warpKTopRDist = warpV[kMinus1].key; + + raft::warpFence(); + } + + /// This function handles sorting and merging together the + /// per-thread queues with the warp-wide queue, creating a sorted + /// list across both + __device__ inline void mergeWarpQ() + { + int laneId = raft::laneId(); + + // Sort all of the per-thread queues + warpSortAnyRegisters, NumThreadQ, !Dir, Comp>(threadK, threadV); + + constexpr int kNumWarpQRegisters = NumWarpQ / raft::WarpSize; + K warpKRegisters[kNumWarpQRegisters]; + raft::KeyValuePair warpVRegisters[kNumWarpQRegisters]; + +#pragma unroll + for (int i = 0; i < kNumWarpQRegisters; ++i) { + warpKRegisters[i] = warpK[i * raft::WarpSize + laneId]; + warpVRegisters[i].key = warpV[i * raft::WarpSize + laneId].key; + warpVRegisters[i].value = warpV[i * raft::WarpSize + laneId].value; + } + + raft::warpFence(); + + // The warp queue is already sorted, and now that we've sorted the + // per-thread queue, merge both sorted lists together, producing 
+ // one sorted list + warpMergeAnyRegisters, + kNumWarpQRegisters, + NumThreadQ, + !Dir, + Comp, + false>(warpKRegisters, warpVRegisters, threadK, threadV); + + // Write back out the warp queue +#pragma unroll + for (int i = 0; i < kNumWarpQRegisters; ++i) { + warpK[i * raft::WarpSize + laneId] = warpKRegisters[i]; + warpV[i * raft::WarpSize + laneId].key = warpVRegisters[i].key; + warpV[i * raft::WarpSize + laneId].value = warpVRegisters[i].value; + } + + raft::warpFence(); + } + + /// WARNING: all threads in a warp must participate in this. + /// Otherwise, you must call the constituent parts separately. + __device__ inline void add(K k, K vk, V vv) + { + addThreadQ(k, vk, vv); + checkThreadQ(); + } + + __device__ inline void reduce() + { + // Have all warps dump and merge their queues; this will produce + // the final per-warp results + mergeWarpQ(); + + // block-wide dep; thus far, all warps have been completely + // independent + __syncthreads(); + + // All warp queues are contiguous in smem. + // Now, we have kNumWarps lists of NumWarpQ elements. + // This is a power of 2. 
+ FinalBlockMerge, NumWarpQ, Dir, Comp>:: + merge(sharedK, sharedV); + + // The block-wide merge has a trailing syncthreads + } + + // Default element key + const K initK; + + // Default element value + const K initVk; + const V initVv; + + // Number of valid elements in our thread queue + int numVals; + + // The k-th highest (Dir) or lowest (!Dir) element + K warpKTop; + + K warpKTopRDist; + + // Thread queue values + K threadK[NumThreadQ]; + raft::KeyValuePair threadV[NumThreadQ]; + + // Queues for all warps + K* sharedK; + raft::KeyValuePair* sharedV; + + // Our warp's queue (points into sharedK/sharedV) + // warpK[0] is highest (Dir) or lowest (!Dir) + K* warpK; + raft::KeyValuePair* warpV; + + // This is a cached k-1 value + int kMinus1; +}; + +} // namespace cuvs::neighbors::detail::faiss_select diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index 1fae2f70b..e5997c5f9 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -91,16 +91,8 @@ endfunction() if(BUILD_TESTS) ConfigureTest( - NAME - NEIGHBORS_TEST - PATH - test/neighbors/brute_force.cu - test/neighbors/brute_force_prefiltered.cu - test/neighbors/refine.cu - GPUS - 1 - PERCENT - 100 + NAME NEIGHBORS_TEST PATH test/neighbors/brute_force.cu + test/neighbors/brute_force_prefiltered.cu test/neighbors/refine.cu GPUS 1 PERCENT 100 ) ConfigureTest( @@ -160,6 +152,8 @@ if(BUILD_TESTS) 100 ) + ConfigureTest(NAME NEIGHBORS_BALL_COVER_TEST PATH test/neighbors/ball_cover.cu GPUS 1 PERCENT 100) + ConfigureTest( NAME DISTANCE_TEST diff --git a/cpp/test/neighbors/ball_cover.cu b/cpp/test/neighbors/ball_cover.cu new file mode 100644 index 000000000..1545982f5 --- /dev/null +++ b/cpp/test/neighbors/ball_cover.cu @@ -0,0 +1,392 @@ +/* + * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../test_utils.cuh" +#include "spatial_data.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +#include + +#include +#include +#include + +namespace cuvs::neighbors::ball_cover { +using namespace std; + +template +RAFT_KERNEL count_discrepancies_kernel(value_idx* actual_idx, + value_idx* expected_idx, + value_t* actual, + value_t* expected, + uint32_t m, + uint32_t n, + uint32_t* out, + float thres = 1e-3) +{ + uint32_t row = blockDim.x * blockIdx.x + threadIdx.x; + + int n_diffs = 0; + if (row < m) { + for (uint32_t i = 0; i < n; i++) { + value_t d = actual[row * n + i] - expected[row * n + i]; + bool matches = (fabsf(d) <= thres) || (actual_idx[row * n + i] == expected_idx[row * n + i] && + actual_idx[row * n + i] == row); + + if (!matches) { + printf( + "row=%ud, n=%ud, actual_dist=%f, actual_ind=%ld, expected_dist=%f, expected_ind=%ld\n", + row, + i, + actual[row * n + i], + actual_idx[row * n + i], + expected[row * n + i], + expected_idx[row * n + i]); + } + n_diffs += !matches; + out[row] = n_diffs; + } + } +} + +struct is_nonzero { + __host__ __device__ bool operator()(uint32_t& i) { return i > 0; } +}; + +template +uint32_t count_discrepancies(value_idx* actual_idx, + value_idx* expected_idx, + value_t* actual, + value_t* expected, + uint32_t m, + uint32_t n, + uint32_t* out, + cudaStream_t stream) +{ + uint32_t tpb = 256; + count_discrepancies_kernel<<>>( + actual_idx, expected_idx, actual, expected, m, n, out); + + auto exec_policy 
= rmm::exec_policy(stream); + + uint32_t result = thrust::count_if(exec_policy, out, out + m, is_nonzero()); + return result; +} + +template +void compute_bfknn(const raft::resources& handle, + const value_t* X1, + const value_t* X2, + uint32_t n_rows, + uint32_t n_query_rows, + uint32_t d, + uint32_t k, + const cuvs::distance::DistanceType metric, + value_t* dists, + int64_t* inds) +{ + raft::device_matrix_view input_vec = + raft::make_device_matrix_view(X1, n_rows, d); + + auto bfindex = cuvs::neighbors::brute_force::build(handle, input_vec, metric); + cuvs::neighbors::brute_force::search(handle, + bfindex, + raft::make_device_matrix_view(X2, n_query_rows, d), + raft::make_device_matrix_view(inds, n_query_rows, k), + raft::make_device_matrix_view(dists, n_query_rows, k)); +} + +struct ToRadians { + __device__ __host__ float operator()(float a) { return a * (CUDART_PI_F / 180.0); } +}; + +template +struct BallCoverInputs { + value_int k; + value_int n_rows; + value_int n_cols; + float weight; + value_int n_query; + cuvs::distance::DistanceType metric; +}; + +template +class BallCoverKNNQueryTest : public ::testing::TestWithParam> { + protected: + void basicTest() + { + params = ::testing::TestWithParam>::GetParam(); + raft::resources handle; + + uint32_t k = params.k; + uint32_t n_centers = 25; + float weight = params.weight; + auto metric = params.metric; + + rmm::device_uvector X(params.n_rows * params.n_cols, + raft::resource::get_cuda_stream(handle)); + rmm::device_uvector Y(params.n_rows, raft::resource::get_cuda_stream(handle)); + + // Make sure the train and query sets are completely disjoint + rmm::device_uvector X2(params.n_query * params.n_cols, + raft::resource::get_cuda_stream(handle)); + rmm::device_uvector Y2(params.n_query, raft::resource::get_cuda_stream(handle)); + + raft::random::make_blobs(X.data(), + Y.data(), + params.n_rows, + params.n_cols, + n_centers, + raft::resource::get_cuda_stream(handle)); + + raft::random::make_blobs(X2.data(), + 
Y2.data(), + params.n_query, + params.n_cols, + n_centers, + raft::resource::get_cuda_stream(handle)); + + rmm::device_uvector d_ref_I(params.n_query * k, + raft::resource::get_cuda_stream(handle)); + rmm::device_uvector d_ref_D(params.n_query * k, + raft::resource::get_cuda_stream(handle)); + + if (metric == cuvs::distance::DistanceType::Haversine) { + thrust::transform(raft::resource::get_thrust_policy(handle), + X.data(), + X.data() + X.size(), + X.data(), + ToRadians()); + thrust::transform(raft::resource::get_thrust_policy(handle), + X2.data(), + X2.data() + X2.size(), + X2.data(), + ToRadians()); + } + + compute_bfknn(handle, + X.data(), + X2.data(), + params.n_rows, + params.n_query, + params.n_cols, + k, + metric, + d_ref_D.data(), + d_ref_I.data()); + + raft::resource::sync_stream(handle); + + // Allocate predicted arrays + rmm::device_uvector d_pred_I(params.n_query * k, + raft::resource::get_cuda_stream(handle)); + rmm::device_uvector d_pred_D(params.n_query * k, + raft::resource::get_cuda_stream(handle)); + + auto X_view = + raft::make_device_matrix_view(X.data(), params.n_rows, params.n_cols); + auto X2_view = raft::make_device_matrix_view( + (const value_t*)X2.data(), params.n_query, params.n_cols); + + auto d_pred_I_view = + raft::make_device_matrix_view(d_pred_I.data(), params.n_query, k); + auto d_pred_D_view = + raft::make_device_matrix_view(d_pred_D.data(), params.n_query, k); + + cuvs::neighbors::ball_cover::index index( + handle, X_view, metric); + cuvs::neighbors::ball_cover::build(handle, index); + cuvs::neighbors::ball_cover::knn_query( + handle, index, X2_view, d_pred_I_view, d_pred_D_view, k, true); + + raft::resource::sync_stream(handle); + // What we really want are for the distances to match exactly. The + // indices may or may not match exactly, depending upon the ordering which + // can be nondeterministic. 
+ + rmm::device_uvector discrepancies(params.n_query, + raft::resource::get_cuda_stream(handle)); + thrust::fill(raft::resource::get_thrust_policy(handle), + discrepancies.data(), + discrepancies.data() + discrepancies.size(), + 0); + // + int res = count_discrepancies(d_ref_I.data(), + d_pred_I.data(), + d_ref_D.data(), + d_pred_D.data(), + params.n_query, + k, + discrepancies.data(), + raft::resource::get_cuda_stream(handle)); + + ASSERT_TRUE(res == 0); + } + + void SetUp() override {} + + void TearDown() override {} + + protected: + uint32_t d = 2; + BallCoverInputs params; +}; + +template +class BallCoverAllKNNTest : public ::testing::TestWithParam> { + protected: + void basicTest() + { + params = ::testing::TestWithParam>::GetParam(); + raft::resources handle; + + uint32_t k = params.k; + uint32_t n_centers = 25; + float weight = params.weight; + auto metric = params.metric; + + rmm::device_uvector X(params.n_rows * params.n_cols, + raft::resource::get_cuda_stream(handle)); + rmm::device_uvector Y(params.n_rows, raft::resource::get_cuda_stream(handle)); + + raft::random::make_blobs(X.data(), + Y.data(), + params.n_rows, + params.n_cols, + n_centers, + raft::resource::get_cuda_stream(handle)); + + rmm::device_uvector d_ref_I(params.n_rows * k, + raft::resource::get_cuda_stream(handle)); + rmm::device_uvector d_ref_D(params.n_rows * k, + raft::resource::get_cuda_stream(handle)); + + auto X_view = raft::make_device_matrix_view( + (const value_t*)X.data(), params.n_rows, params.n_cols); + + if (metric == cuvs::distance::DistanceType::Haversine) { + thrust::transform(raft::resource::get_thrust_policy(handle), + X.data(), + X.data() + X.size(), + X.data(), + ToRadians()); + } + + compute_bfknn(handle, + X.data(), + X.data(), + params.n_rows, + params.n_rows, + params.n_cols, + k, + metric, + d_ref_D.data(), + d_ref_I.data()); + + raft::resource::sync_stream(handle); + + // Allocate predicted arrays + rmm::device_uvector d_pred_I(params.n_rows * k, + 
raft::resource::get_cuda_stream(handle)); + rmm::device_uvector d_pred_D(params.n_rows * k, + raft::resource::get_cuda_stream(handle)); + + auto d_pred_I_view = + raft::make_device_matrix_view(d_pred_I.data(), params.n_rows, k); + auto d_pred_D_view = + raft::make_device_matrix_view(d_pred_D.data(), params.n_rows, k); + + cuvs::neighbors::ball_cover::index index(handle, X_view, metric); + + cuvs::neighbors::ball_cover::all_knn_query( + handle, index, d_pred_I_view, d_pred_D_view, k, true); + + raft::resource::sync_stream(handle); + // What we really want is for the distances to match exactly. The + // indices may or may not match exactly, depending upon the ordering which + // can be nondeterministic. + + rmm::device_uvector discrepancies(params.n_rows, + raft::resource::get_cuda_stream(handle)); + thrust::fill(raft::resource::get_thrust_policy(handle), + discrepancies.data(), + discrepancies.data() + discrepancies.size(), + 0); + // + uint32_t res = count_discrepancies(d_ref_I.data(), + d_pred_I.data(), + d_ref_D.data(), + d_pred_D.data(), + params.n_rows, + k, + discrepancies.data(), + raft::resource::get_cuda_stream(handle)); + + // TODO: There seem to be discrepancies here only when + // the entire test suite is executed. 
+ // Ref: https://github.com/rapidsai/raft/issues/ + // 1-5 mismatches in 8000 samples is 0.0125% - 0.0625% + ASSERT_TRUE(res <= 5); + } + + void SetUp() override {} + + void TearDown() override {} + + protected: + BallCoverInputs params; +}; + +typedef BallCoverAllKNNTest BallCoverAllKNNTestF; +typedef BallCoverKNNQueryTest BallCoverKNNQueryTestF; + +const std::vector> ballcover_inputs = { + {11, 5000, 2, 1.0, 10000, cuvs::distance::DistanceType::Haversine}, + {25, 10000, 2, 1.0, 5000, cuvs::distance::DistanceType::Haversine}, + {2, 10000, 2, 1.0, 5000, cuvs::distance::DistanceType::L2SqrtUnexpanded}, + {2, 5000, 2, 1.0, 10000, cuvs::distance::DistanceType::Haversine}, + {11, 10000, 2, 1.0, 5000, cuvs::distance::DistanceType::L2SqrtUnexpanded}, + {25, 5000, 2, 1.0, 10000, cuvs::distance::DistanceType::L2SqrtUnexpanded}, + {5, 8000, 3, 1.0, 10000, cuvs::distance::DistanceType::L2SqrtUnexpanded}, + {11, 6000, 3, 1.0, 10000, cuvs::distance::DistanceType::L2SqrtUnexpanded}, + {25, 10000, 3, 1.0, 5000, cuvs::distance::DistanceType::L2SqrtUnexpanded}}; + +INSTANTIATE_TEST_CASE_P(BallCoverAllKNNTest, + BallCoverAllKNNTestF, + ::testing::ValuesIn(ballcover_inputs)); +INSTANTIATE_TEST_CASE_P(BallCoverKNNQueryTest, + BallCoverKNNQueryTestF, + ::testing::ValuesIn(ballcover_inputs)); + +TEST_P(BallCoverAllKNNTestF, Fit) { basicTest(); } +TEST_P(BallCoverKNNQueryTestF, Fit) { basicTest(); } + +} // namespace cuvs::neighbors::ball_cover diff --git a/cpp/test/neighbors/spatial_data.h b/cpp/test/neighbors/spatial_data.h new file mode 100644 index 000000000..3936d6320 --- /dev/null +++ b/cpp/test/neighbors/spatial_data.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +namespace cuvs { +namespace spatial { + +// Interleaved latitude, longitude coordinates of 52 US states / territories (104 values) +std::vector spatial_data = { + 63.588753, -154.493062, 32.318231, -86.902298, 35.20105, -91.831833, 34.048928, -111.093731, + 36.778261, -119.417932, 39.550051, -105.782067, 41.603221, -73.087749, 38.905985, -77.033418, + 38.910832, -75.52767, 27.664827, -81.515754, 32.157435, -82.907123, 19.898682, -155.665857, + 41.878003, -93.097702, 44.068202, -114.742041, 40.633125, -89.398528, 40.551217, -85.602364, + 39.011902, -98.484246, 37.839333, -84.270018, 31.244823, -92.145024, 42.407211, -71.382437, + 39.045755, -76.641271, 45.253783, -69.445469, 44.314844, -85.602364, 46.729553, -94.6859, + 37.964253, -91.831833, 32.354668, -89.398528, 46.879682, -110.362566, 35.759573, -79.0193, + 47.551493, -101.002012, 41.492537, -99.901813, 43.193852, -71.572395, 40.058324, -74.405661, + 34.97273, -105.032363, 38.80261, -116.419389, 43.299428, -74.217933, 40.417287, -82.907123, + 35.007752, -97.092877, 43.804133, -120.554201, 41.203322, -77.194525, 18.220833, -66.590149, + 41.580095, -71.477429, 33.836081, -81.163725, 43.969515, -99.901813, 35.517491, -86.580447, + 31.968599, -99.901813, 39.32098, -111.093731, 37.431573, -78.656894, 44.558803, -72.577841, + 47.751074, -120.740139, 43.78444, -88.787868, 38.597626, -80.454903, 43.075968, -107.290284}; +}; // namespace spatial +}; // namespace cuvs \ No newline at end of file diff --git a/notebooks/VectorSearch_QuestionRetrieval.ipynb 
b/notebooks/VectorSearch_QuestionRetrieval.ipynb 
index 4023a1821..21d59975b 100644 --- a/notebooks/VectorSearch_QuestionRetrieval.ipynb +++ b/notebooks/VectorSearch_QuestionRetrieval.ipynb @@ -344,7 +344,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.13" + "version": "3.11.9" } }, "nbformat": 4, diff --git a/notebooks/ivf_flat_example.ipynb b/notebooks/ivf_flat_example.ipynb index 38bacb8a7..e39c0ebee 100644 --- a/notebooks/ivf_flat_example.ipynb +++ b/notebooks/ivf_flat_example.ipynb @@ -21,7 +21,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "fe73ada7-7b7f-4005-9440-85428194311b", "metadata": {}, "outputs": [], @@ -46,7 +46,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "5350e4d9-0993-406a-80af-29538b5677c2", "metadata": {}, "outputs": [], @@ -71,10 +71,41 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "a5daa4b4-96de-4e74-bfd6-505b13595f62", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Wed Jul 10 17:19:06 2024 \n", + "+-----------------------------------------------------------------------------+\n", + "| NVIDIA-SMI 520.61.05 Driver Version: 520.61.05 CUDA Version: 11.8 |\n", + "|-------------------------------+----------------------+----------------------+\n", + "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n", + "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n", + "| | | MIG M. 
|\n", + "|===============================+======================+======================|\n", + "| 0 NVIDIA RTX A6000 Off | 00000000:B3:00.0 On | Off |\n", + "| 35% 60C P2 88W / 300W | 3226MiB / 49140MiB | 11% Default |\n", + "| | | N/A |\n", + "+-------------------------------+----------------------+----------------------+\n", + " \n", + "+-----------------------------------------------------------------------------+\n", + "| Processes: |\n", + "| GPU GI CI PID Type Process name GPU Memory |\n", + "| ID ID Usage |\n", + "|=============================================================================|\n", + "| 0 N/A N/A 1346 G /usr/lib/xorg/Xorg 687MiB |\n", + "| 0 N/A N/A 1901 G /usr/bin/gnome-shell 60MiB |\n", + "| 0 N/A N/A 263673 C ...vs_062724_2408/bin/python 2078MiB |\n", + "| 0 N/A N/A 3393713 G ...372896767459192031,262144 253MiB |\n", + "| 0 N/A N/A 3456207 G ...--variations-seed-version 49MiB |\n", + "+-----------------------------------------------------------------------------+\n" + ] + } + ], "source": [ "# Report the GPU in use\n", "!nvidia-smi" @@ -94,10 +125,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "5f529ad6-b0bd-495c-bf7c-43f10fb6aa14", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The index and data will be saved in /tmp/cuvs_example\n" + ] + } + ], "source": [ "WORK_FOLDER = os.path.join(tempfile.gettempdir(), \"cuvs_example\")\n", "f = load_dataset(\"http://ann-benchmarks.com/sift-128-euclidean.hdf5\", work_folder=WORK_FOLDER)" @@ -105,10 +144,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "3d68a7db-bcf4-449c-96c3-1e8ab146c84d", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loaded dataset of size (1000000, 128), 0.5 GiB; metric: 'euclidean'.\n", + "Number of test queries: 10000\n" + ] + } + ], "source": [ "metric = 
f.attrs['distance']\n", "\n", @@ -134,10 +182,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "737f8841-93f9-4c8e-b2e1-787d4474ef94", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 123 ms, sys: 27.7 ms, total: 150 ms\n", + "Wall time: 149 ms\n" + ] + } + ], "source": [ "%%time\n", "build_params = ivf_flat.IndexParams(\n", @@ -161,10 +218,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "1aec7024-6e5d-4d2c-82e6-7b5734aec958", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Index(type=IvfFlat)\n" + ] + } + ], "source": [ "print(index)" ] @@ -187,7 +252,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "id": "46e0421b-9335-47a2-8451-a91f56c2f086", "metadata": {}, "outputs": [], @@ -205,10 +270,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "id": "595454e1-7240-4b43-9a73-963d5670b00c", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 710 ms, sys: 293 ms, total: 1 s\n", + "Wall time: 996 ms\n" + ] + } + ], "source": [ "%%time\n", "n_queries=10000\n", @@ -216,7 +290,7 @@ "search_params = ivf_flat.SearchParams(n_probes=30)\n", "\n", "# Search 10 nearest neighbors.\n", - "distances, indices = ivf_flat.search(search_params, index, cp.asarray(queries[:n_queries,:]), k=10, handle=handle)\n", + "distances, indices = ivf_flat.search(search_params, index, cp.asarray(queries[:n_queries,:]), k=10, resources=handle)\n", " \n", "# cuVS calls are asynchronous (when handle arg is provided), we need to sync before accessing the results.\n", "handle.sync()\n", @@ -233,10 +307,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "id": "8cd9cd20-ca00-4a35-a0a0-86636521b31a", "metadata": 
{}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0.97398" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "calc_recall(neighbors, gt_neighbors)" ] @@ -252,7 +337,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "id": "bf94e45c-e7fb-4aa3-a611-ddaee7ac41ae", "metadata": {}, "outputs": [], @@ -263,7 +348,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "id": "1622d9be-be41-4d25-be99-d348c5e54957", "metadata": {}, "outputs": [], @@ -284,10 +369,57 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "id": "ace0c31f-af75-4352-a438-123a9a03612c", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Benchmarking search with n_probes = 10\n", + "recall 0.86668\n", + "Average search time: 0.075 +/- 0.00267 s\n", + "Queries per second (QPS): 133984\n", + "\n", + "Benchmarking search with n_probes = 20\n", + "recall 0.94766\n", + "Average search time: 0.144 +/- 0.00121 s\n", + "Queries per second (QPS): 69339\n", + "\n", + "Benchmarking search with n_probes = 30\n", + "recall 0.97398\n", + "Average search time: 0.215 +/- 0.000938 s\n", + "Queries per second (QPS): 46452\n", + "\n", + "Benchmarking search with n_probes = 50\n", + "recall 0.99117\n", + "Average search time: 0.356 +/- 0.00109 s\n", + "Queries per second (QPS): 28067\n", + "\n", + "Benchmarking search with n_probes = 100\n", + "recall 0.99831\n", + "Average search time: 0.719 +/- 0.0074 s\n", + "Queries per second (QPS): 13901\n", + "\n", + "Benchmarking search with n_probes = 200\n", + "recall 0.99932\n", + "Average search time: 1.438 +/- 0.00288 s\n", + "Queries per second (QPS): 6953\n", + "\n", + "Benchmarking search with n_probes = 500\n", + "recall 0.99936\n", + "Average search time: 3.302 +/- 0.0646 s\n", + "Queries per second (QPS): 3028\n", + "\n", + 
"Benchmarking search with n_probes = 1024\n", + "recall 0.99933\n", + "Average search time: 2.272 +/- 0.0397 s\n", + "Queries per second (QPS): 4402\n" + ] + } + ], "source": [ "n_probes = np.asarray([10, 20, 30, 50, 100, 200, 500, 1024]);\n", "qps = np.zeros(n_probes.shape);\n", @@ -302,7 +434,7 @@ " index,\n", " cp.asarray(queries),\n", " k=10,\n", - " handle=handle,\n", + " resources=handle,\n", " )\n", " handle.sync()\n", " \n", @@ -327,10 +459,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "id": "e1ac370f-91c8-4054-95c7-a749df5f16d2", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "fig = plt.figure(figsize=(12,3))\n", "ax = fig.add_subplot(131)\n", @@ -368,10 +511,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "id": "addbfff3-7773-4290-9608-5489edf4886d", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 464 ms, sys: 4.68 ms, total: 469 ms\n", + "Wall time: 463 ms\n" + ] + } + ], "source": [ "%%time\n", "build_params = ivf_flat.IndexParams(\n", @@ -382,7 +534,7 @@ " add_data_on_build=True\n", " )\n", "\n", - "index = ivf_flat.build(build_params, dataset, handle=handle)" + "index = ivf_flat.build(build_params, dataset, resources=handle)" ] }, { @@ -395,10 +547,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "id": "8a0149ad-de38-4195-97a5-ce5d5d877036", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 512 ms, sys: 240 ms, total: 752 ms\n", + "Wall time: 745 ms\n" + ] + } + ], "source": [ "%%time\n", "n_queries=10000\n", @@ -406,7 +567,7 @@ "search_params = ivf_flat.SearchParams(n_probes=10)\n", "\n", "# Search 10 nearest neighbors.\n", - "distances, indices = ivf_flat.search(search_params, index, cp.asarray(queries[:n_queries,:]), k=10, handle=handle)\n", + "distances, indices = ivf_flat.search(search_params, index, cp.asarray(queries[:n_queries,:]), k=10, resources=handle)\n", " \n", "handle.sync()\n", "distances, neighbors = cp.asnumpy(distances), cp.asnumpy(indices)" @@ -414,10 +575,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "id": "eedc3ec4-06af-42c5-8cdf-490a5c2bc49a", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0.98719" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "calc_recall(neighbors, 
gt_neighbors)" ] @@ -433,10 +605,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 18, "id": "5a54d190-64d4-4cd4-a497-365cbffda871", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 67.7 ms, sys: 3.97 ms, total: 71.7 ms\n", + "Wall time: 71 ms\n" + ] + } + ], "source": [ "%%time\n", "build_params = ivf_flat.IndexParams( \n", @@ -445,7 +626,7 @@ " kmeans_trainset_fraction=0.1, \n", " kmeans_n_iters=20 \n", " ) \n", - "index = ivf_flat.build(build_params, dataset, handle=handle)" + "index = ivf_flat.build(build_params, dataset, resources=handle)" ] }, { @@ -458,14 +639,25 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "id": "4cc992e8-a5e5-4508-b790-0e934160b660", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0.98814" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "search_params = ivf_flat.SearchParams(n_probes=10)\n", "\n", - "distances, indices = ivf_flat.search(search_params, index, cp.asarray(queries[:n_queries,:]), k=10, handle=handle)\n", + "distances, indices = ivf_flat.search(search_params, index, cp.asarray(queries[:n_queries,:]), k=10, resources=handle)\n", " \n", "handle.sync()\n", "distances, neighbors = cp.asnumpy(distances), cp.asnumpy(indices)\n", @@ -487,10 +679,29 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, "id": "7ebcf970-94ed-4825-9885-277bd984b90c", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Index before adding vectors Index(type=IvfFlat)\n" + ] + }, + { + "ename": "AttributeError", + "evalue": "module 'cuvs.neighbors.ivf_flat' has no attribute 'extend'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + 
"\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[20], line 17\u001b[0m\n\u001b[1;32m 13\u001b[0m index \u001b[38;5;241m=\u001b[39m ivf_flat\u001b[38;5;241m.\u001b[39mbuild(build_params, train_set)\n\u001b[1;32m 15\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mIndex before adding vectors\u001b[39m\u001b[38;5;124m\"\u001b[39m, index)\n\u001b[0;32m---> 17\u001b[0m \u001b[43mivf_flat\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mextend\u001b[49m(index, dataset, cp\u001b[38;5;241m.\u001b[39marange(dataset\u001b[38;5;241m.\u001b[39mshape[\u001b[38;5;241m0\u001b[39m], dtype\u001b[38;5;241m=\u001b[39mcp\u001b[38;5;241m.\u001b[39mint64))\n\u001b[1;32m 19\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mIndex after adding vectors\u001b[39m\u001b[38;5;124m\"\u001b[39m, index)\n", + "\u001b[0;31mAttributeError\u001b[0m: module 'cuvs.neighbors.ivf_flat' has no attribute 'extend'" + ] + } + ], "source": [ "# subsample the dataset\n", "n_train = 10000\n", @@ -520,6 +731,30 @@ "metadata": {}, "outputs": [], "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "23010fbc-8f5a-4403-a112-33f190a85498", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "774848e8-fa45-4223-bd2a-e8585650531e", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6309b8a7-f4eb-4976-a824-cd4499a0000d", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -538,7 +773,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.13" + "version": "3.11.9" } }, "nbformat": 4, diff --git a/notebooks/rmm_log.txt b/notebooks/rmm_log.txt new file mode 100644 index 000000000..681eba61a --- /dev/null +++ b/notebooks/rmm_log.txt @@ -0,0 +1,2 @@ +[266514][18:28:55:663533][info ] 
----- RMM LOG BEGIN [PTDS DISABLED] ----- +[266514][18:40:02:947176][error ] [A][Stream 0x2][Upstream 14270349312B][FAILURE maximum pool size exceeded] diff --git a/notebooks/tutorial_ivf_pq.ipynb b/notebooks/tutorial_ivf_pq.ipynb index cc0fe4142..fb6296228 100644 --- a/notebooks/tutorial_ivf_pq.ipynb +++ b/notebooks/tutorial_ivf_pq.ipynb @@ -14,16 +14,37 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: adjustText in /home/cjnolet/software/miniconda3/envs/cuvs_062724_2408/lib/python3.11/site-packages (1.2.0)\n", + "Requirement already satisfied: h5py in /home/cjnolet/software/miniconda3/envs/cuvs_062724_2408/lib/python3.11/site-packages (3.11.0)\n", + "Requirement already satisfied: matplotlib in /home/cjnolet/software/miniconda3/envs/cuvs_062724_2408/lib/python3.11/site-packages (3.8.4)\n", + "Requirement already satisfied: numpy in /home/cjnolet/software/miniconda3/envs/cuvs_062724_2408/lib/python3.11/site-packages (from adjustText) (1.26.4)\n", + "Requirement already satisfied: scipy in /home/cjnolet/software/miniconda3/envs/cuvs_062724_2408/lib/python3.11/site-packages (from adjustText) (1.14.0)\n", + "Requirement already satisfied: contourpy>=1.0.1 in /home/cjnolet/software/miniconda3/envs/cuvs_062724_2408/lib/python3.11/site-packages (from matplotlib) (1.2.1)\n", + "Requirement already satisfied: cycler>=0.10 in /home/cjnolet/software/miniconda3/envs/cuvs_062724_2408/lib/python3.11/site-packages (from matplotlib) (0.12.1)\n", + "Requirement already satisfied: fonttools>=4.22.0 in /home/cjnolet/software/miniconda3/envs/cuvs_062724_2408/lib/python3.11/site-packages (from matplotlib) (4.53.1)\n", + "Requirement already satisfied: kiwisolver>=1.3.1 in /home/cjnolet/software/miniconda3/envs/cuvs_062724_2408/lib/python3.11/site-packages (from matplotlib) (1.4.5)\n", + "Requirement already 
satisfied: packaging>=20.0 in /home/cjnolet/software/miniconda3/envs/cuvs_062724_2408/lib/python3.11/site-packages (from matplotlib) (24.1)\n", + "Requirement already satisfied: pillow>=8 in /home/cjnolet/software/miniconda3/envs/cuvs_062724_2408/lib/python3.11/site-packages (from matplotlib) (10.4.0)\n", + "Requirement already satisfied: pyparsing>=2.3.1 in /home/cjnolet/software/miniconda3/envs/cuvs_062724_2408/lib/python3.11/site-packages (from matplotlib) (3.1.2)\n", + "Requirement already satisfied: python-dateutil>=2.7 in /home/cjnolet/software/miniconda3/envs/cuvs_062724_2408/lib/python3.11/site-packages (from matplotlib) (2.9.0)\n", + "Requirement already satisfied: six>=1.5 in /home/cjnolet/software/miniconda3/envs/cuvs_062724_2408/lib/python3.11/site-packages (from python-dateutil>=2.7->matplotlib) (1.16.0)\n" + ] + } + ], "source": [ "!pip install adjustText h5py matplotlib" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -47,7 +68,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -62,9 +83,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The index and data will be saved in /tmp/cuvs_ivf_pq_tutorial\n" + ] + } + ], "source": [ "# We'll need to load store some data in this tutorial\n", "WORK_FOLDER = os.path.join(tempfile.gettempdir(), 'cuvs_ivf_pq_tutorial')\n", @@ -76,9 +105,40 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Wed Jul 10 18:28:55 2024 \n", + "+-----------------------------------------------------------------------------+\n", + "| NVIDIA-SMI 520.61.05 Driver Version: 520.61.05 CUDA Version: 11.8 
|\n", + "|-------------------------------+----------------------+----------------------+\n", + "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n", + "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n", + "| | | MIG M. |\n", + "|===============================+======================+======================|\n", + "| 0 NVIDIA RTX A6000 Off | 00000000:B3:00.0 On | Off |\n", + "| 30% 44C P8 40W / 300W | 12334MiB / 49140MiB | 21% Default |\n", + "| | | N/A |\n", + "+-------------------------------+----------------------+----------------------+\n", + " \n", + "+-----------------------------------------------------------------------------+\n", + "| Processes: |\n", + "| GPU GI CI PID Type Process name GPU Memory |\n", + "| ID ID Usage |\n", + "|=============================================================================|\n", + "| 0 N/A N/A 1346 G /usr/lib/xorg/Xorg 574MiB |\n", + "| 0 N/A N/A 1901 G /usr/bin/gnome-shell 70MiB |\n", + "| 0 N/A N/A 263673 C ...vs_062724_2408/bin/python 11250MiB |\n", + "| 0 N/A N/A 3393713 G ...372896767459192031,262144 219MiB |\n", + "| 0 N/A N/A 3456207 G ...--variations-seed-version 54MiB |\n", + "+-----------------------------------------------------------------------------+\n" + ] + } + ], "source": [ "# Report the GPU in use to put the measurements into perspective\n", "!nvidia-smi" @@ -95,7 +155,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -119,11 +179,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The index and data will be saved in /tmp/raft_example\n" + ] + } + ], "source": [ "DATASET_URL = \"http://ann-benchmarks.com/sift-128-euclidean.hdf5\"\n", + "DATASET_NAME = \"SIFT-128\"\n", "f = load_dataset(DATASET_URL)" ] }, @@ -136,9 +205,18 @@ }, { "cell_type": "code", 
- "execution_count": null, + "execution_count": 8, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loaded dataset of size (1000000, 128); metric: 'euclidean'.\n", + "Number of test queries: 10000\n" + ] + } + ], "source": [ "metric = f.attrs['distance']\n", "\n", @@ -165,7 +243,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -176,9 +254,30 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "{'add_data_on_build': True,\n", + " 'codebook_kind': 0,\n", + " 'conservative_memory_allocation': False,\n", + " 'force_random_rotation': False,\n", + " 'kmeans_n_iters': 20,\n", + " 'kmeans_trainset_fraction': 0.5,\n", + " 'metric': 'euclidean',\n", + " 'metric_arg': 2.0,\n", + " 'n_lists': 1024,\n", + " 'pq_bits': 8,\n", + " 'pq_dim': 64}" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# First, we need to initialize the build/indexing parameters.\n", "# One of the more important parameters is the product quantisation (PQ) dim.\n", @@ -197,16 +296,36 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "using ivf_pq::index_params nrows 1000000, dim 128, n_lits 1024, pq_dim 64\n", + "CPU times: user 4.06 s, sys: 299 ms, total: 4.36 s\n", + "Wall time: 4.28 s\n" + ] + }, + { + "data": { + "text/plain": [ + "Index(type=IvfPq)" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "%%time\n", "## Build the index\n", "# This function takes a row-major either numpy or cupy (GPU) array.\n", "# Generally, it's a bit faster with GPU inputs, but the CPU version may come in handy\n", 
"# if the whole dataset cannot fit into GPU memory.\n", - "index = ivf_pq.build(index_params, dataset, handle=resources)\n", + "index = ivf_pq.build(index_params, dataset, resources=resources)\n", "# This function is asynchronous so we need to explicitly synchronize the GPU before we can measure the execution time\n", "resources.sync()\n", "index" @@ -222,9 +341,28 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 75.7 ms, sys: 84.3 ms, total: 160 ms\n", + "Wall time: 158 ms\n" + ] + }, + { + "data": { + "text/plain": [ + "Index(type=IvfPq)" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "%%time\n", "index_filepath = os.path.join(WORK_FOLDER, \"ivf_pq.bin\")\n", @@ -246,9 +384,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "{'internal_distance_dtype': 0, 'lut_dtype': 0, 'n_probes': 20}" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "k = 10\n", "search_params = ivf_pq.SearchParams()\n", @@ -257,12 +406,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 26.3 ms, sys: 16.4 ms, total: 42.8 ms\n", + "Wall time: 42.4 ms\n" + ] + } + ], "source": [ "%%time\n", - "distances, neighbors = ivf_pq.search(search_params, index, queries, k, handle=resources)\n", + "distances, neighbors = ivf_pq.search(search_params, index, queries, k, resources=resources)\n", "# Sync the GPU to make sure we've got the timing right\n", "resources.sync()" ] @@ -277,9 +435,17 @@ }, { "cell_type": "code", - "execution_count": null, + 
"execution_count": 15, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Got recall = 0.85222 with the default parameters (k = 10).\n" + ] + } + ], "source": [ "recall_first_try = calc_recall(neighbors, gt_neighbors)\n", "print(f\"Got recall = {recall_first_try} with the default parameters (k = {k}).\")" @@ -297,22 +463,39 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 92 ms, sys: 16 ms, total: 108 ms\n", + "Wall time: 107 ms\n" + ] + } + ], "source": [ "%%time\n", "\n", - "candidates = ivf_pq.search(search_params, index, queries, k * 2, handle=resources)[1]\n", - "distances, neighbors = refine(dataset, queries, candidates, k, handle=resources)\n", + "candidates = ivf_pq.search(search_params, index, queries, k * 2, resources=resources)[1]\n", + "distances, neighbors = refine(dataset, queries, candidates, k, resources=resources)\n", "resources.sync()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Got recall = 0.94949 with 2x refinement (k = 10).\n" + ] + } + ], "source": [ "recall_refine2x = calc_recall(neighbors, gt_neighbors)\n", "print(f\"Got recall = {recall_refine2x} with 2x refinement (k = {k}).\")" @@ -341,15 +524,42 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 18, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "32.8 ms ± 277 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", + "34.5 ms ± 416 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", + "36.6 ms ± 464 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", + "38.1 ms ± 408 μs per loop (mean ± std. dev. 
of 7 runs, 10 loops each)\n", + "39 ms ± 96.7 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", + "36.9 ms ± 73.1 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", + "42.2 ms ± 264 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", + "53.1 ms ± 710 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", + "37.6 ms ± 582 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", + "37.6 ms ± 450 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "bench_k = np.exp2(np.arange(10)).astype(np.int32)\n", "bench_avg = np.zeros_like(bench_k, dtype=np.float32)\n", "bench_std = np.zeros_like(bench_k, dtype=np.float32)\n", "for i, k in enumerate(bench_k):\n", - " r = %timeit -o ivf_pq.search(search_params, index, queries, k, handle=resources); resources.sync()\n", + " r = %timeit -o ivf_pq.search(search_params, index, queries, k, resources=resources); resources.sync()\n", " bench_avg[i] = (queries.shape[0] * r.loops / np.array(r.all_runs)).mean()\n", " bench_std[i] = (queries.shape[0] * r.loops / np.array(r.all_runs)).std()\n", "\n", @@ -377,9 +587,27 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "3.86 ms ± 96.5 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n", + "6.83 ms ± 150 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n", + "12.8 ms ± 239 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n", + "23.7 ms ± 473 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", + "43.5 ms ± 756 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", + "81.6 ms ± 156 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", + "158 ms ± 500 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", + "305 ms ± 2.29 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n", + "591 ms ± 4.66 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n", + "1.12 s ± 2.16 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n", + "2.23 s ± 12.8 ms per loop (mean ± std. dev. 
of 7 runs, 1 loop each)\n" + ] + } + ], "source": [ "bench_probes = np.exp2(np.arange(11)).astype(np.int32)\n", "bench_qps = np.zeros_like(bench_probes, dtype=np.float32)\n", @@ -387,9 +615,9 @@ "k = 100\n", "for i, n_probes in enumerate(bench_probes):\n", " sp = ivf_pq.SearchParams(n_probes=n_probes)\n", - " r = %timeit -o ivf_pq.search(sp, index, queries, k, handle=resources); resources.sync()\n", + " r = %timeit -o ivf_pq.search(sp, index, queries, k, resources=resources); resources.sync()\n", " bench_qps[i] = (queries.shape[0] * r.loops / np.array(r.all_runs)).mean()\n", - " bench_recall[i] = calc_recall(ivf_pq.search(sp, index, queries, k, handle=resources)[1], gt_neighbors)\n", + " bench_recall[i] = calc_recall(ivf_pq.search(sp, index, queries, k, resources=resources)[1], gt_neighbors)\n", " " ] }, @@ -407,9 +635,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAABR8AAAFzCAYAAAC3uH7uAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8fJSN1AAAACXBIWXMAAA9hAAAPYQGoP6dpAACe/UlEQVR4nOzdd3xUZdrG8d/MpJFOEpJQEgidJNSELigCEWxgAxs2UJCoi6wFXtd1UXfZtbC4GhBExS4WLKuoREVaKCHSQycQCCkkkE6SSTLvH5GsCCikncnk+n5ePuucmXPmOiEvz5l77vM8JpvNZkNERERERERERESkjpmNDiAiIiIiIiIiIiKOScVHERERERERERERqRcqPoqIiIiIiIiIiEi9UPFRRERERERERERE6oWKjyIiIiIiIiIiIlIvVHwUERERERERERGReqHio4iIiIiIiIiIiNQLFR9FRERERERERESkXjgZHaChVVZWcuzYMby8vDCZTEbHERGR32Gz2SgoKKBVq1aYzfq+7Fw0romINB4a1/6YxjURkcbhYsa0Jld8PHbsGCEhIUbHEBGRi3DkyBHatGljdAy7pHFNRKTx0bh2fhrXREQalwsZ05pc8dHLywuo+uF4e3vX6BhWq5Xly5cTExODs7NzXcZThkaYwV5yKIMy2FuGusiRn59PSEhI9b/dcjaNa8qgDI6dQxkcK4PGtT9WF+Pa+djD71F9cMTz0jk1Ho54XjqnC3MxY1qTKz6ebt339vau1Yc0d3d3vL29Db34UQb7yGAvOZRBGewtQ13m0G1X56dxTRmUwbFzKINjZtC4dn51Ma6djz38HtUHRzwvnVPj4YjnpXO6OBcypmmiEREREREREREREakXKj6KiIiIiIiIiIhIvVDxUUREREREREREROqFio8iIiIiIiIiIiJSL1R8FBERaYTi4uIIDw+nb9++RkcRERGpNY1rIiKOS8VHERGRRig2Npbk5GQSExONjiIiIlJrG
tdERByXio8iIiIiIiIiIiJSL1R8FBERERERERERkXqh4qOIiIiIiIiIiIjUCyejA4iI1FRlpY1iawVFpeUUlpZTVFpOXlEpu06acN97HIvFQmUl2ACbzUalDaDqf202sFX/t63qeDYbNhvV207vV72Nqv/+33O/3qfqGDYbWCvK2Z1m4ujqFCwWiyE/m4qKCvYYnOF0jqJcE1calkD+yKmyCmZ8up0e+jpSREQcwOeb08guLGVUZDBtmrsbHUdERLCD4uO8efN4/vnnSU9PJyIigrlz5zJkyJDzvj4uLo5XXnmFQ4cOERoayhNPPMEdd9zRgIlFpKZsNhunrBW/FAqrioZFpeUUlZVT+KvHhdX/+5ttZVX7FZaWU1xaTlFZxXneyQK7NzfouZ0zQ+o+ZQCGBJuMjiC/41/f7uaLrekst1ho2TWLK3u2NjqSiIhIjb2+JoXtaXk8+/Uuurf2YVRkMKMig+nQwtPoaCIiTZahxcclS5Ywbdo05s2bx+DBg1mwYAGjR48mOTmZ0NDQs14/f/58Zs6cyWuvvUbfvn3ZuHEj9957L82bN+eaa64x4AxEBCC7sJSkwyfZlJLD5n1m/vveZoqtlb8qJFZUFxmrug/rlsVswsPFgoerE+4uFkqLC/H18cFsNmEymTABZhO/+m8TVP0fZpMJkwlMpl+2U/U682+eh1+2mcCECbO56n/5ZT8T/zuGrbKStGNptGnTBrPJmHaySlslaUeP0trADKdzeBQcMez95Y/dN7Q9W46cZMuRPKZ+sIVJR/J4fHRXnC1qhRQRkcbFZrNxY1Qb3F0sJB46wfa0PLan5fH8d3voHORJTLdA3Iv+d9eLiIg0DEOLj3PmzGHixIlMmjQJgLlz5/Ldd98xf/58Zs+efdbr33nnHSZPnsz48eMBaN++PevXr+df//qXio8iDaSy0sb+44VVxcZDJ0k6fIJDOcW/eoUZso//7jFMJvBwccLDtapg6Onq9MtjJzx/2eZRvc1S9fzp17mevc3VyYzpl8Kh1Wpl2bJlXHnlAJydnevxJ3F+VRmOcOWVkQZnSDU0w69ziP1q5duM9+7pS+zC5fyUbmbRmhR+Tj3JK7f2oZVvM6PjiYiIXDCTycSdg9px56B2ZBeWEp+cyTc7MkjYn83ezEL2ZhYCTiw5upZR3YMZHdmSnm18qq8jRUSkfhhWfCwrKyMpKYkZM2acsT0mJoaEhIRz7lNaWoqbm9sZ25o1a8bGjRuxWq3n/IBdWlpKaWlp9eP8/Hyg6gOx1WqtUfbT+9V0/7qgDPaTwV5y1FeG4rJyth3N5+fUXH5OzWXzkVzyS8rPel2nQA96tfGhLOcIvSPD8XZ3+aWAaDmj0OjhYqGZswWzua4u8iopL6+sfuTIfxeNLUNd5DA6f1Ph4mTmunaV3HRZb2Ys3cnPqblc9Z/VzL25N5d2bmF0PBERkYsW4OnKLf1CuaVfKHnFVr7flck324/x054sDp8oZsHKgyxYeZCWPm5cERHM6Mhgotv5Yamza1QRETnNsOJjdnY2FRUVBAUFnbE9KCiIjIyMc+5zxRVXsGjRIsaOHUufPn1ISkrijTfewGq1kp2dTcuWLc/aZ/bs2cyaNeus7cuXL8fdvXYTEMfHx9dq/7qgDPaTAewjR20z5JZCSoGJgwUmUgpMpBVBJWdehLmYbbT1tBHmBWFeNtp52XB3ygPyoBVwYiecgBKq/uTUKlHNOMLfhaNkgJrnKC4u/uMXSZ2JCQ8isk1zpr73MzuP5XPXmxt5YFhHpo3orA9jIiLSaPm4O3NDVBuu7RHEZ/9dhmu7PsTvPs6K3Vmk55WwOOEQixMOEeDpwsjwqkLkwA7+moJERKSOGL7gzG9b3G0223nb3p988kkyMjIYMGAANpuNoKAg7rrrLp577rnzruY6c+ZMpk+fXv04Pz+fkJAQYmJi8Pb2rlFmq9VKfHw8I0eONPSWSmWwjwz2kqMmGcorKtmTWcjPqbkkpeayO
TWXY3klZ70u2NuVqNDm9A71ISq0OV2CPc95MdZYfw7KYL85TnerS8Np6+/Bp/cP4pmvknlvQyov/7ifTYdO8tItvQj0cvvjA4iIiNgxVwtc2T2YMX1CKLFWsHpfNt/sSOf75EyyC8v4YGMqH2xMxdvNiRHhQYyObMmQTgG4OZ/786aIiPwxw4qPAQEBWCyWs7ocs7KyzuqGPK1Zs2a88cYbLFiwgMzMTFq2bMnChQvx8vIiICDgnPu4urri6up61nZnZ+dafyCvi2PUljLYTwZ7yfF7GfJLrGxOzSXp0AmSUk+yJTX3rBWjzSYIb+VNVGhzotr5EdW2Oa0vct43e/85KEPjyWEP2ZsiN2cLf7+uO/3C/Ji5dDvrDuZw1X/W8PItvRnQ3t/oeCIiInXCzdnCyPAgRoYHYa2oZN2BHL7ZkUF8cgbZhWUs/TmNpT+n4eFi4bKugYyODGZYl0A8XA3v4RERaVQM+1fTxcWFqKgo4uPjue6666q3x8fHM2bMmN/d19nZmTZt2gDw4YcfcvXVV2M2qyVe5NdsNhtHTpxi0+ETbDp8kp8Pn2RPZgG/XdzPy82JPqHNiWrbnOi2zekZ4qsLKhEBYEyv1kS08mbqez+zN7OQW19bz59junD/pR3qcN5WERER4zlbzAzt3IKhnVvw7NhINh06wTc7MvhuZwbpeSV8vS2dr7el4+pU9bpREcGM6BaEj7u+KBUR+SOGVhimT5/OhAkTiI6OZuDAgSxcuJDU1FSmTJkCVN0ynZaWxttvvw3A3r172bhxI/379+fkyZPMmTOHHTt28NZbbxl5GiJ2obS8kkMF8PraQ2w5ks+mwyfJLiw963Whfu5Et21OVLuqgmPnQC8VEUTkvDoGevF57GD+8vkOlv6cxvPf7WHToRPMGdeL5h4uRscTERGpcxazif7t/enf3p+nrgln69E8vtmRzrc7MjicU0x8cibxyZk4mU0M6hjAqIhgYiKCCPA8+447ERExuPg4fvx4cnJyePrpp0lPTycyMpJly5bRtm1bANLT00lNTa1+fUVFBS+++CJ79uzB2dmZYcOGkZCQQLt27Qw6AxHj2Ww2Pkw8wuxlu8gvcYIde6ufc7aYiGztU1VsbNucPm2ba842Eblo7i5OvHhTT/qH+fHXL3ayYs9xrn55Da/c2pveoc2NjiciIlJvTCYTvUJ86RXiy4xRXdmdUcA3OzL4dkc6ezMLWbX3OKv2Hucvn2+nbzs/RkUGMyoymJY+FzdtkYiIIzP83sqpU6cyderUcz63ePHiMx5369aNzZs3N0AqkcYhLfcUMz7dxup92QB4ONkY0DGQ6Hb+RLdrTvfWPpocW0TqhMlkYnzfULq39mXqe0kcyilm3IJ1zBzdjbsHtzvvYnEiIiKOwmQy0a2lN91aejN9ZGcOHC/k2x0ZfLsjg+1peWxIOcGGlBPM+m8yvUJ8GRVZtXJ2W38Po6OLiBjK8OKjiFy8092Of/96F4Wl5bg6mZk+oiOBuclcfVVvLdIhIvUmvJU3/33wEh7/dBvLtmfw9FfJJB46wb9u7IG3m/7tERGRpqNDC09ih3UkdlhHjp4sri5EJqWeZMuRXLYcyeWf3+ymW0tvRv/SEdkp0FNf2IlIk6Pio0gj89tuxz6hvjx/U09CfV1ZtizZ4HQi0lDi4uKIi4ujoqLij19cx7zcnIm7tQ9vJRzi78t28c2ODHal5xN3Wx8iWvk0eB4REWn8jBzX6kKb5u5MGtKeSUPak5VfwnfJmXy7I531B0+wKz2fXen5zInfS/sWHlWFyIiWRLb2ViFSRJoEFR9FGgmbzcaSxCM8+6tux0diunDPJWFYzCasVqvREUWkAcXGxhIbG0t+fj4+Pg1f8DOZTNw1OIyeIb488P5mDuUUc928BGZdG8HNfUP0YUpERC6K0eNaXQr0dmPCgLZMGNCWE0VlfL8rk293ZLBmXzYHjxcRt+IAcSsO0KZ5M0ZFBDO6ezC9Q5prEUgRc
VgqPoo0AsdyTzFj6XZW7T0O/K/bsUMLT4OTiUhT1zu0OV8/dAnTP9rKj7uzmLl0O4kpJ3j2ukjcXXSZISIiTZufhwvjokMYFx1CfomVFbuz+HZHBj/tOc7Rk6dYtCaFRWtSCPRy5YqIqjki+4X54WQxGx1dRKTO6FOBiB2z2Wx8tOkIz361i4JzdDuKiNgDX3cXFt0RzYJVB3lh+R6Wbk5je1oe827rQ6cgL6PjiYiI2AVvN2fG9GrNmF6tOVVWwcq9VYXIH3ZlkVVQyjvrD/PO+sM0d3dmZHgQoyNbMqijP65OWkBSRBo3FR9F7NRvux17h/rygrodRcROmc0m7r+sA31CfXnwg83syyrk2lfW8o/rI7mudxuj44mIiNiVZi4WRkW2ZFRkS0rLK0jYn8O3OzJYnpzByWIrH206ykebjuLl6sTl3QIZHRnMpZ0DaeaiQqSIND4qPorYmd92O7o4mXkkpjMTL2mvbkcRsXv92/vz9UNDmLZkM2v35/Dwkq1sTDnJU9eE4+asD0wiIiK/5epkYVjXQIZ1DeTvFZFsTDnBNzsy+G5nBlkFpXyx5RhfbDmGm7OZyzoHMrp7MJd3DcTLzdno6CIiF0TFRxE7kp53ihmfbmflr7odn7+xJx0D1e0oIo1HCy9X3r6nPy/9sI+Xf9zHBxtT2Xokl3m39aFdgIfR8UREROyWk8XMoI4BDOoYwKxrI9h85CTfbM/gmx0ZpOWe4tudGXy7MwMXi5nBHf0ZHdmSyzr7GR1bROR3qfgoYgdsNhsfbzrKM18lV3c7/nlkZyYNUbejiDROFrOJ6SM7E922OdOWbCE5PZ9rXl7Dczf2YHT3lkbHExERsXtms4motn5EtfXjiau6sfNYPt/sSOebHRkcPF7Eij3HWbHnOBazifaeZk76p3Jlj9YEersZHV1E5AwqPooY7Lfdjr1CquZ2VLejiDiCoZ1b8PVDl/Dg+5vZdPgk97/3M3cPbsfM0d1wcdJKniIiIhfCZDIR2dqHyNY+PHpFV/ZlFvDNjqqOyF3p+ezLN/O3r3Yz6+vdRIU2Z1RkMFdEBBPi5250dBERFR9FjGKz2fg46ZduxxJ1O4qI42rp04wP7hvA89/tYeGqg7y59hCbU3OJu60PrX2bGR1PRESk0ekU5EWnIC8eGt6J/Zl5/OfTlRyu9GPr0Tw2HT7JpsMnefbrXXRv7cOoyGBGRQZr4UoRMYyKjyIGSM87xcyl2/lpz6+7HXvQMdDL4GQiIvXD2WLm/67sRnTb5jzy8Va2HMnlqv+s5t/jejGsa6DR8URERBqttn7uDG9t48or+5NdXM53v3REJh46wfa0PLan5fH8d3voHOTJqIhgRkW2pFtLL0wmNTyISMNQ8VGkAZ2r23H6yM7cq25HEWkiYiKC+bqlN7Hv/8y2o3ncvTiRqZd14MHLwoyOJiIi0ui19GnGXYPDuGtwGNmFpcQnZ/LNjgwS9mezN7OQvZn7+c+P+2nr717VERkRTK8QXxUiRaReqfgo0kAy8kqYuXQbK37pduwZ4suL6nYUkSYoxM+dj6cM5O9f7+LtdYeZ99MBNh06wdX+RicTERFxHAGertzSL5Rb+oWSV2zlh91VhchVe49zOKeYBSsPsmDlQVr6uHFFRNWt2X3b+akpQkTqnIqPIvXMZrPxSdJRnv5Nt+OkS8JwsmixBRFpmlydLDw9JpK+7fyY8ek2Nh46yZ40C5F98+jTLsDoeCIiIg7Fx92Z6/u04fo+bSgqLWfFniy+3ZHBit1ZpOeVsDjhEIsTDhHg6cLI8KpC5MD2/locTkTqhIqPIvUoI6+E//tsOz/uzgKquh1fuLEHnYLU7SgiAnBNz1aEt/Jm8tub2H+8iFsWJfLCTT25pmcro6OJiIg4JA9XJ67u0Yqre7SixFrB6n3ZfLMjnR92ZZFdWMYHG1P5YGMq3m5OjOgWxKjIYIZ2boGbs8Xo6CLSSKn4KFIPbDYbn/6cxqz/7qzqdrSYe
XhkZ+4dom5HEZHf6tDCk4/u68/tcd+TnAsPfrCZfVmFTBveCbNu/RIREak3bs4WRoYHMTI8CGtFJesP5vDNjgyW78wgu7CMpZvTWLo5DXcXC8O6BDIqMphhXQPxdFUpQUQunP7FEKljmfklzFz6q27HNj68cFNPdTuKiPwOLzcn7u1ayQ5LGK+vPcx/ftjHvswCXhzXE3cXXa6IiIjUN2eLmSGdWjCkUwueGRNJ0uGTfLMjne92ZHAsr4Svt6fz9fZ0XJzMDO0UwBURwYwMD8LX3cXo6CJi53Q1L1JHbDZYujmNvy/bQ/4v3Y7TRnbiviHt1e0oInIBzCaYMaoLXVr68MRn2/lmRwapJ4pZdGc0LX2aGR1PRESkybCYTfQL86NfmB9/vTqcbUfz+GZHBt/uSOdQTjHf78ri+11ZOJlNDOzgzxURwcREBBHo5WZ0dBGxQyo+itSBzPwSXttjZuf6nUBVt+PzN/Wks7odRUQu2rjoEMICPJj8ThI7j+Vz7StrWTghit6hzY2OJiIi0uSYTCZ6hvjSM8SXx0d1YU9mAd/uyODbHRnszihg9b5sVu/L5skvdhDdtjmjIlsyKjKY1r764lBEqqj4KFJLP6ee5K43NpJfYsbZYuLhkZ3V7SgiUkt92/nxRexgJr21iT2ZBYxfuJ7nb+zBmF6tjY4mIiLSZJlMJroGe9M12JtpIzqTkl30SyEyna1H80g8dJLEQyd55qtkerTxYVRkMKMigmnfwtPo6CJiIBUfRWoht7iMB9/fTH5JOSEeNhbcPYjwNurMERGpCyF+7nw6dRDTPtzM97uy+NOHW9ibWcCfR3bRQjQiIiJ2ICzAg/sv68D9l3UgLfcU3+3I4NudGSQeOsG2o3lsO5rHc9/uoUuQV1UhMjKYrsFemEwax0WaEhUfRWrIZrPx+KfbSMs9RVs/d6Z2zKdTkL7RExGpS56uTiyYEM3z3+3h1ZUHiFtxgP1ZhcwZ1wsPrbQpIiJiN1r7NuOeS8K455IwjheUEp+cyTc70ll3IIc9mQXsySzgpR/20c7fnSsigxkd2ZKebXxUiBRpAnTVLlJD764/zHc7M3G2mHhpfA8Ob1ljdCQREYdkMZuYMbornQI9mbl0O9/tzOTGV9ex6M5ozSclIiJih1p4uXJr/1Bu7R9KXrGV73dl8s2ODFbtO86hnGIWrDzIgpUHaenjxhURwYyODCa6nR8W3dkg4pBUfBSpgeRj+Tzz9S4AZo7uRkQrbw5vMTaTiIijuyGqDe0C3Jn8ThK70vMZ88oaFkyIJqqtprsQERGxVz7uztwQ1YYbotpQVFrOij1ZfLsjgxW7s0jPK2FxwiEWJxwiwNOFkeFVt2ZHh3gbHVtE6pCKjyIXqai0nAc++Jmy8kqGdw3k7sHtKC8vNzqWiEiTENXWj89jB3Pv21UFyFsWruefN3Tn+j5tjI4mIiIif8DD1Ymre7Ti6h6tKLFWsGZfNt/syOD7XZlkF5bxwcZUPtiYirebE108zbjsymJYt2DcnC1GRxeRWlDxUeQiPfXlTg4eLyLY243nb+qpOUpEpNZSUlK45557yMzMxGKxsH79ejw8PIyOZbfaNHfnkykDeXjJFpYnZzL9o63szSzk0Su66HYtERE7oHFNLoSbs4UR4UGMCA/CWlHJ+oM5fLMjg+U7M8guLCOxxEzi+1twd7EwrEsgoyKDGdY1EE/N+SzS6Oj/a0Uuwmebj/JJ0lHMJph7cy/8PFyMjiQiDuCuu+7i2WefZciQIZw4cQJXV1ejI9k9D1cnXr09ihfj9xC34gCvrjzA/qwC5t7cWx9KREQMpnFNLpazxcyQTi0Y0qkFz4yJZMOB4yxYtoG9xe6k55Xw9fZ0vt6ejouTmaGdArgiIpiR4UH4uuvzmEhjoKtzkQuUkl3EXz7bAcBDwzsxoL2/wYlExBHs3LkTZ2dnhgwZAoCfn5/BiRoPs9nEo1d0pVOgF499uo3vd2Vx4/wEXrsjmhA/d
6PjiYg0SRrXpLYsZhN92zXneLtKRo8ewq7MYr7dmcG3OzJIyS7i+11ZfL8rCyeziYEd/LkiIpiYiCACvdyMji4i52E2OoBIY1BaXsGDH/xMUVkF/cP8ePDyTkZHEhE7sWrVKq655hpatWqFyWTi888/P+s18+bNIywsDDc3N6Kioli9enX1c/v27cPT05Nrr72WPn368I9//KMB0zuGsb1bs+S+AbTwcmV3RgFj49aSeOiE0bFERBoljWtiT0wmEz1DfHl8VFd+/POlfDdtKNNGdKJrsBfllTZW78vmL5/voP8/fuCmVxN4fU0KabmnjI4tIr+h4qPIBfjXN3vYkZZPc3dn5t7cS3OKiUi1oqIievbsySuvvHLO55csWcK0adN44okn2Lx5M0OGDGH06NGkpqYCYLVaWb16NXFxcaxbt474+Hji4+Mb8hQcQu/Q5nwRO5iIVt7kFJVx62vr+XjTEaNjiYg0OhrXxF6ZTCa6BHsxbURnvp02lBWPXMbjo7rSM8QXmw0SD53kma+SGfzPH7n2lTXM+2k/B48XGh1bRNBt1yJ/6PvkTN5YmwLACzf1pKVPM4MTiYg9GT16NKNHjz7v83PmzGHixIlMmjQJgLlz5/Ldd98xf/58Zs+eTZs2bejbty8hISEAXHnllWzZsoWRI0ee83ilpaWUlpZWP87PzweqPuxZrdYancPp/Wq6f12oiwwtPJx4f2I0jy/dybc7M3n0k23sTs/j0ZjOF/SlkaP8HJTBsXIog2NlMPp3+kI4wrh2Pvbwe1QfHPG8LuSc2vi4MGlwKJMGh5KeV8Ly5Ey+S85i0+GTbDuax7ajeTz37R46B3pyRUQgMeFBdAnyNGzBUEf8ewLHPC+d08Ud80Ko+CjyO9LzTvHoJ1sBuGdwGMO7BRmcSEQak7KyMpKSkpgxY8YZ22NiYkhISACgb9++ZGZmcvLkSXx8fFi1ahWTJ08+7zFnz57NrFmzztq+fPly3N1rN8+hPXSm1EWGK7zA1sbMd0fNvL72MOt2pnBnp0rcLvCqx1F+DspQd+whhzI4Robi4uI6TNLwGtu4dj728HtUHxzxvC7mnFoAt7eEa/1hx0kTW3NM7M03sTerkL1Zhby84iABbjZ6+tno6VdJqCcYUYd0xL8ncMzz0jn9vosZ01R8FDmPikobf/pwCyeLrXRv7cPjo7sYHUlEGpns7GwqKioICjrzi4ugoCAyMjIAcHJy4h//+AdDhw7FZrMRExPD1Vdffd5jzpw5k+nTp1c/zs/PJyQkhJiYGLy9vWuU02q1Eh8fz8iRI3F2dq7RMWqrrjNcDXy9PYPHl+4gORcWHfbi1dt6E/o7C9E44s9BGRp/DmVwrAynu/oaq8Yyrp2PPfwe1QdHPK+6Oqe8U1Z+3H2c5cmZrNqfQ3ZJJT8cM/HDMTPB3q7EhAdxRUQgUaHN631qLUf8ewLHPC+d04W5mDHN8OLjvHnzeP7550lPTyciIoK5c+dWr4x2Lu+99x7PPfcc+/btw8fHh1GjRvHCCy/g76+Vh6VuvfzjPjamnMDDxcLLt/TG1clidCQRaaR+e3uPzWY7Y9sf3eL2a66urri6up613dnZudYXEnVxjNqqywxj+4TQPtCLe9/exL6sIm5csIH5t0cxoP3vXzM42s9BGRwjhzI4Rgajs9eVxjKunY89/B7VB0c8r9qeU4CzM+P6tWVcv7YUlZazYk8W3+7IYMXuLDLyS3l7fSpvr08lwNOFkeHBjIoMZmB7f1yc6m95DEf8ewLHPC+d0x8f60IZuuDMH01W/Ftr1qzhjjvuYOLEiezcuZOPP/6YxMTE6vlGROrK+oM5/OeHfQD84/rutAvwMDiRiDRGAQEBWCyW6m6Q07Kyss7qGpH60aONL1/EXkKPNj6cLLZy+6INfLDx3NcZIiLy+zSuSWPm4erE1T1a8cqtfUh6ciSL7ojmhj5t8GnmTHZhGR9sTOXONzYS/Ww805dsYfnODEqsFUbHF
nEIhhYffz1Zcbdu3Zg7dy4hISHMnz//nK9fv3497dq146GHHiIsLIxLLrmEyZMns2nTpgZOLo7sRFEZ0z7cQqUNbopqw5herY2OJCKNlIuLC1FRUWfNrRIfH8+gQYNqdey4uDjCw8Pp27dvrY7TFAT7uLHkvoFc3aMl5ZU2Zi7dzqz/7qS8otLoaCIijYrGNXEUbs4WRoQH8eK4nmz6ywjemdiP2/qHEuDpSn5JOUs3p3HfO0n0eSae2Pd+5r9bj1FYWm50bJFGy7Dbri9ksuLfGjRoEE888QTLli1j9OjRZGVl8cknn3DVVVed9320KqgyXAybzcafP9pMRn4J7QPc+cuVnS/o2I74s1AGZbCHHEbnvxCFhYXs37+/+nFKSgpbtmzBz8+P0NBQpk+fzoQJE4iOjmbgwIEsXLiQ1NRUpkyZUqv3jY2NJTY2lvz8fHx8fGp7Gg6v2S9TaHQO8mJO/F7eXHuIA8eLeOXW3ni7OdbtNCIitaFxTZoaZ4uZIZ1aMKRTC54eE0nS4ZN8uyOD73ZmkJZ7iq+3p/P19nRcnMwM7RTAqMiWjOgWiK+7i9HRRRoNw4qPFzJZ8W8NGjSI9957j/Hjx1NSUkJ5eTnXXnstL7/88nnfR6uCKsPF+CndxIpDFpxMNm5slc9P3y83JEdtKIMy2FsGqHmOxrAq6KZNmxg2bFj149OT5t95550sXryY8ePHk5OTw9NPP016ejqRkZEsW7aMtm3bGhW5yTKZTDw0vBMdAz2Z/tEWVu09znVxa3n9zr6aXkNE5Bca16Qps5hN9Avzo1+YH09e3Y3taXl8syODb3dkkJJdxPe7svh+VxZOZhMDO/hzRUQwMRFBBHq5GR1dxK4ZvuDMH01W/GvJyck89NBD/PWvf+WKK64gPT2dRx99lClTpvD666+fcx+tCqoMF2pHWj5fbdwA2PjLVd24rX+oITlqShmUwd4y1EWOxrAq6GWXXYbNZvvd10ydOpWpU6c2UCL5I1d2b0monzuT3trEgeNFjIlby/zb+tC3rTptREQ0rolUMZlM9GjjS482vjx2RRf2Zhby7Y4MvtmRzu6MAlbvy2b1vmye/GIH0W2bMyqyJaMig2nt28zo6CJ2x7DiY00mK549ezaDBw/m0UcfBaBHjx54eHgwZMgQnn32WVq2bHnWPloVVBkuRGFpOQ9/vA1rhY0rIoK4c3D78xbB6zNHXVAGZbC3DLXJYQ/ZxTFFtvbhywcGc+87SWw9kssdb2zkyau64mt0MBEREbE7JpOJLsFedAn24k8jOnEou4hvd2bwzY4Mth7JJfHQSRIPneSZr5Lp0caHUZHBjIoIpn0LT6Oji9gFwxacqclkxcXFxZjNZ0a2WCwAf/jtnMj52Gw2nvhsO4dyimnt24znbuhZo8KjiEhD0sT8tRfo7caS+wYwplcryittPPXfXXySYtZCNCIiBtC4Jo1JuwAPplzagS9iB5Mw43KeuiacfmF+mEyw7Wgez327h8tfXMkV/17Fyz/s43BOkdGRRQxl6GrX06dPZ9GiRbzxxhvs2rWLhx9++IzJimfOnMkdd9xR/fprrrmGpUuXMn/+fA4ePMjatWt56KGH6NevH61atTLqNKSR+yTpKF9sOYbFbOKlm3vh465OKxGxf7GxsSQnJ5OYmGh0lEbNzdnC3PG9ePSKLgCszjAz8e2fyS0uMziZiEjTonFNGqtWvs24e3AYH00eyMb/G8E/ruvO0M4tcDKb2JNZwIvxe7n0+Z+4ccEGVqWbyC4s/eODijgYQ+d8/KPJitPT00lNTa1+/V133UVBQQGvvPIKf/7zn/H19eXyyy/nX//6l1GnII3c/qxC/vrFTgCmj+xMdDs/gxOJiEhDM5lMxA7rSJhfM6Yt2UzCwROMjVvLojuj6RjoZXQ8ERERaSRaeLlya/9Qbu0fSl6xleXJGXy59Rhr92ez9WgeW7Hw2XMrGdwxgLG9WhMTEYSXm5pfxPEZvuDM701Wv
Hjx4rO2Pfjggzz44IP1nEqaghJrBQ+8/zOnrBUM7ujPlEs7GB1JREQMNDI8kGmRFbyX6smhnGLGxiXw8i29GdY10OhoIiIi0sj4uDtzU3QIN0WHcLyglC82H+GdVbs5XGiqXqzG9TMzI7oFMaZXKy7t0gJXJ4vRsUXqhaG3XYsY6R/LdrE7owB/Dxf+Pa4XFrPmeRQRaepae8CnUwbQL8yPwtJy7nkrkQUrD2huaREREamxFl6u3DmwLdO7V/D9tEuYPrIz7Vt4UFpeydfb07nvnST6Pvs9Mz7dxroDOVRW6rpDHIvhnY8iRvh2RwZvrzsMwIvjehLo7WZwIhERsRf+Hi68O7E/T325kw82pjL7m93sySjgH9d3x81ZHQkiIiJSc2393XloeCcevLwjO4/l88WWNL7ceozM/FI+TDzCh4lHCPZ245qeLRnTqzURrby1IKo0eio+SpNz9GQxj32yFYDJQ9tzWRfdTicijU9cXBxxcXFUVFQYHcUhuTiZ+cd1kXQN9uLpr5JZujmNg9lFLJwQpS+sRETqgcY1aWpMJhORrX2IbO3DjNHd2JCSw5dbjrFsezoZ+SW8tjqF11an0KGFB2N6tWZMr1a09fcwOrZIjei2a2lSyisq+dOHW8gvKadniC9/julidCQRkRrRqqD1z2Qyceegdrx9Tz98mjmz5Ugu176ylm1Hc42OJiLicDSuSVNmMZsY1CGAf97Qg8S/jGDhhCiu6t4SVyczB44XMeeXFbPHxq3lzbUpHC/QitnSuKjzUZqUud/vI+nwSbxcnXj55t64OKn+LiIiv29wxwC+iB3MpLc3sT+rkJteXcfzN/Xk2p6tjI4mIiIiDsbVyUJMRDAxEcEUlFhZvjOTz7eksXZ/NluO5LLlSC7PfJXM4I4BjOnVmiu0YrY0Aio+SpOxdn82cT/tB+Af13cn1N/d4EQiItJYtAvwYOnUQfzpg82s2HOchz7YzJ6MfP48sgtmLVgmIiIi9cDLzZkbotpwQ1QbjheU8tW2Y3yx5RhbjuRWr5j9xC8rZl/bqxWXacVssVMqPkqTkF1YyrQlW7DZ4JZ+IVyjbhUREblI3m7OLLqzL899t5sFKw8St+IAezML+ff4Xni66pJKRERE6k8LL1fuHhzG3YPDOJxTxJdbjvH5ljQOHC/i6+3pfL09HW83J67s3pJre7Wif5g/Fn1BKnZCV8ri8Corbfz5o60cLyilc5Anf706wuhIIiLSSFnMJmaO7kaXIC9mfLqd+ORMbpiXwKI7ownxU0e9iIiI1L+2/h48OLwTD2jFbGkkVHwUh7dozUFW7j2Om7OZV27tQzMXtaGLiEjtXN+nDe0CPJj8ThJ7MgsYE7eWebf1YUB7f6OjiYiISBPx2xWzN6ac4IstaeddMfvanq1oF6AVs6XhabUNcWhbjuTy3Ld7AHjqmgg6B3kZnEhEpG7ExcURHh5O3759jY7SZPUJbc6XDwyme2sfThSVcfuiDby/IdXoWCIijZLGNZHasZhNDOzg/7srZl/2wk+M+WXF7KyCEqMjSxOi4qM4rPwSKw9+8DPllTau6tGSm/uGGB1JRKTOxMbGkpycTGJiotFRmrSWPs34aPJArunZivJKG//32Xae+mIH1opKo6OJiDQqGtdE6s7pFbPjbuvDpr+M4MWbejKkUwBmE2w9ksus/yYz4B8/MOH1DXySdJSCEqvRkcXB6bZrcUg2m42Zn27nyIlTtGnejNnXd9ccFyIiUi+auVj4z8296BLkyQvL9/LWusPsyypk3m198HV3MTqeiIiINGG/XTH7623H+PwcK2YP7xbImF6ttWK21AsVH8UhfZh4hK+3p+NkNvHyLb3xdnM2OpKIiDgwk8nEA5d3onOQF9OWbCHhQA5j4tay6I5oOmnKDxEREbEDLbxcuWtwGHedY8XsZdszWLY9Ay83J66MbMmY3loxW+qObrsWh7M3s4C/fbkTgEev6ELv0OYGJxIRkaYiJiKYp
VMH0aZ5Mw7nFHPdvAR+3J1pdCwRERGRM5xeMfv76Zfy1YOXcN/Q9gR7u1FQUs6STUe49bUNDPrnDzz7VTI70vKw2WxGR5ZGTMVHcSinyip44P2fKS2vZGjnFtw7pL3RkUREpInpGuzNF7GD6RfmR2FpORPf2sSrKw/ool1ERETszukVs//vym6snXE5H9w7gFv6heDt5kRmfimL1qRw9ctrGD5nJS99v49D2UVGR5ZGSMVHcShPf5XM3sxCWni5MmdcT8xqERcREQP4e7ry7sT+3No/FJsN/vnNbv780VZKrBVGRxMRERE5p9MrZs++/lcrZveoWjH74PEi/v39/1bMfmONVsyWC6c5H8VhfLXtGB9sTMVkgn+P60WAp6vRkUREpAlzcTLz97GRdA32YtZ/k1m6OY0D2UW8NiGKQG83o+OJiIiInNfpFbNjIoIpKLGyfGcmX2w9xpp9x9l6JJetR3J59utkBrb3py0mhpRY8XPWWgtybup8FIdw5GQxMz/dDsDUyzpwSacAgxOJiNSvuLg4wsPD6du3r9FR5HeYTCbuGNiOt+/ph08zZ7YeyeWaV9aw7Wiu0dFEROyKxjUR+3V6xey37+nHhv8bwd+uCad3qC+VNlh7IIf3D1gY8K+VTH0viW93ZFBarjs95EwqPkqjV1EJ0z7aRkFpOVFtmzNtRGejI4mI1LvY2FiSk5NJTEw0OopcgMEdA/jygcF0DPQkM7+Um15dxxdb0oyOJSJiNzSuiTQOp1fM/mzqYFY+ehnThnckqJmNsvJKlm3PYMq7SUQ/+z2Pf7KNhP3ZVFRqzmvRbdfiAL4+YmbbsXy83Zx46eZeOFtUUxcREfvT1t+Dz6YO4k8fbuHH3Vn86cMt7M0s4M8ju2iOYhEREWl02vp7EHtZe9oV7aZd70tYtjOLL7ccIyO/hCWbjrBk0xGCvF25pkcrxvRqTWRrb0wmXfM0RSo+SqO2Zn8OPxyrKjY+d2MP2jR3NziRiIjI+Xm5OfPaHdE8991uFqw8SNyKA+zNLOTf43vh6arLMhEREWl8TCaIaOVNr7b+zBjVlQ0pJ/hyaxpfb0uvXjF70ZoU2rfwYEzP1lzbqxVhAR5Gx5YGpBYxabQqK23845vdANzarw2jIlsanEhEROSPWcwmZo7uxr/H98TFyUx8ciY3zEvgyIlio6OJiIiI1Ir5AlbMHvbCT4x5ZY1WzG5C9BW7NFpfb09nX1YRzSw2/jyik9FxRERELsp1vdvQzt+Dye8ksSezgGtfWcPLN/c0OpaIiIhInfj1itmFpeV8tyPjfytmH81j69E8nv06mUEdAhjTqxVXRAbj7aYVsx2Rio/SKFVU2pj7/V4AhrWqxLuZ/oESEZHGp3doc7584BLue2cT247mcdfiJK5vZ+JKo4OJiIiI1CFPVyduiGrDDVFtOF5QytfbjvHF1mNsTs1lzf5s1uzP5onPdzC8ayA3RbdhWJdAzQ/pQHTbtTRK/916jAPHi/Bp5sSlwVo9S0REGq9gHzc+mjyQa3u2orzSxkcHLTz5ZTJl5ZVGRxMRERGpc79dMfvPIzvToYUHZeWVfLMjg3sWb+KaV9YQn5yJzabP+45AxUdpdMorKnnph30ATBzcDjf174qISCPn5mzhpZt78cjITpiw8WHiUW59bb3mQRIRERGH1tbfgweHd+L76Zfy9UOXMPGSMNxdLOxIy+fetzdx9ctrWL4zQ0XIRk7FR2l0Pt9yjJTsIpq7OzNhQKjRcUREROqEyWRi8tAw7utaiZebE5sOn+Sal9ew5Uiu0dFERERE6pXJZCKilQ9PXh3Omscv5/7LOuDhYmHnsXzueydJRchGTsVHaVSsFZX855eux8mXdsDTVW2PItI0xcXFER4eTt++fY2OInUsvLmNpVP60zHQk8z8Usa9uo6PNh0xOpaISL3SuCYip/l5uPD4qK6sfvxypv6mCHnVf9bwnYqQjY6Kj9KoLP35KKknivH3cOGOgW2Nj
iMiYpjY2FiSk5NJTEw0OorUg3b+HnweO5iY8CDKKip57JNtPPXFDqwVmgdSRByTxjUR+S0/DxceG9WVNY9fTuywqiJkcno+k38pQn67I4PKShUhGwMVH6XRKCuv5D8/7Afg/ss64O6irkcREXFcnq5OvHp7FA+P6AzAW+sOc9uiDWQXlhqcTERERKThNPdw4dErzi5CTnk3iateVhGyMVDxURqNj5OOkJZ7ihZertzWX12PIiLi+MxmE38a0YnX7ojG09WJjSknuPblNWw/mmd0NBEREZEG9esi5APDOuLp6sSuX4qQV/5nNd/uSFcR0k6p+CiNQml5BXE/VnU9Tr2sA81cLAYnEhERaTgjw4P4PHYw7QM8OJZXwo2vJrD056NGxxIRERFpcM09XHjkii6seXwYD15eVYTcnVHAlHd/5sr/rOab7SpC2hsVH6VR+CjxCMfySgjyduWWflrhWkREmp6OgZ58/sBgLu8aSGl5JdM/2sozXyVTrnkgRUREpAnydXfhzzFVRciHflWEvP89FSHtjYqPYvdKrBW8sqKq6/GBYR1xc1bXo4iINE3ebs4suiOaBy/vCMDra1K4442NnCgqMziZiIiIiDF83V2Y/qsipNevipCjX1rNMhUhDWd48XHevHmEhYXh5uZGVFQUq1evPu9r77rrLkwm01l/IiIiGjCxNLQPNqaSmV9KKx83xvUNMTqOiIiIocxmE3+O6cKrt/fB3cVCwoEcrnl5DTuPaR5IERERabr+V4S8nIeGd8LL1Yk9mQVM/aUI+fU2FSGNYmjxccmSJUybNo0nnniCzZs3M2TIEEaPHk1qauo5X//SSy+Rnp5e/efIkSP4+flx0003NXByaSinyiqY99MBAB64vBOuTup6FBERARgV2ZLPpg6mrb87abmnuGF+Al9uPWZ0LBERERFD+bg7M31kZ9Y8fjl/+lURMvb9nxn10iq+2nZMRcgGZmjxcc6cOUycOJFJkybRrVs35s6dS0hICPPnzz/n6318fAgODq7+s2nTJk6ePMndd9/dwMmloby34TDHC0pp7duMG6PaGB1HRETErnQJ9uLL2Eu4tHMLSqyVPPTBZmZ/s4sKXVCLiIhIE+fj7szDvy5CujmxN7OQB97frCJkA3My6o3LyspISkpixowZZ2yPiYkhISHhgo7x+uuvM2LECNq2bXve15SWllJaWlr9OD8/HwCr1YrVaq1Bcqr3q+n+daEpZCguK2feT1VzPcZeFobJVoHVWtGgGS6UPeRQBmWwtwx1kcPo/CKNgY+7M2/c1ZcXlu9h/k8HWLDyIMnH8nn5lt74ursYHU9ERETEUKeLkPdcEsaba1N4fU1KdRGyU+A+HhreiSu7t8RiNhkd1WEZVnzMzs6moqKCoKCgM7YHBQWRkZHxh/unp6fzzTff8P777//u62bPns2sWbPO2r58+XLc3d0vLvRvxMfH12r/uuDIGX5IM3GiyIK/qw239G0sW7atwTNcLHvIoQzKYG8ZoOY5iouL6ziJiGOymE08PqorEa28efTjbazel821r6xl4R1RdA32NjqeiIiIiOF8mjkzbURn7h4cxuK1h3h9zUH2ZRXy4Aeb+c8P+3hweCeu6t7S6JgOybDi42km05mVZZvNdta2c1m8eDG+vr6MHTv2d183c+ZMpk+fXv04Pz+fkJAQYmJi8Pau2cW41WolPj6ekSNH4uzsXKNj1JajZygsLedvc1YDVh67KpJrerdu8AwXwx5yKIMy2FuGushxultdRC7M1T1a0T7Ak/ve2UTqiWKun5fACzf15EpdSIuIiIgAVUXIP43oxN2XtGPx2kMsWl1VhHzolyJk7KVhoLux65RhxceAgAAsFstZXY5ZWVlndUP+ls1m44033mDChAm4uPz+7USurq64urqetd3Z2bnWH8jr4hi15agZ3l9zmJPFVsICPLghKhQny+9PT2oPPwd7yaEMymBvGWqTwx6y26u4uDji4uKoqKj44
xdLkxLeypv/PnAJD3zwM2v35zD1vZ+JHdaB6SO76HYiEbFbGtdEpKF5uznz0PBO3DX4f0XI/VmFPPzxdoKaWbCFpDOmd4iun+qAYQvOuLi4EBUVddatePHx8QwaNOh39125ciX79+9n4sSJ9RlRDJJfYmXhqoMA/Gl4pz8sPIqINEWxsbEkJyeTmJhodBSxQ809XHjr7n7cOyQMgLgVB5j0ViJ5pzSPqojYJ41rImKU00XINTMu588jO+PTzInMUyamf7ydmH+v5IstaVrMr5YMrepMnz6dRYsW8cYbb7Br1y4efvhhUlNTmTJlClB1y/Qdd9xx1n6vv/46/fv3JzIysqEjSwN4c80h8k5Z6dDCg2t6tjI6joiISKPkZDHzxFXhzB3fC1cnMyv2HGds3Fr2ZxUYHU1ERETE7ni7OfPg8E6smD6Uq0Iq8GnmxIHjRfzpwy2MVBGyVgwtPo4fP565c+fy9NNP06tXL1atWsWyZcuqV69OT08nNTX1jH3y8vL49NNP1fXooPJOWVm0pqrrcdqIzmpvFhERqaWxvVvz6f2DaO3bjJTsIsbGJbB85x8v7iciIiLSFHm5ORHTxsaK6UN5JKYzvu7OHPxVEfLzzSpCXizD72edOnUqhw4dorS0lKSkJIYOHVr93OLFi/npp5/OeL2Pjw/FxcXce++9DZxUGsLra1IoKCmnc5CnVpkSERGpI5GtffjygcEMaO9HYWk5972TxL/j91KpC2cRERGRc/Jyc+KByzux+rFhPHpFl+oi5LQlWxg5ZyWfbT5KeUWl0TEbBcOLjyKn5RaX8caaFAAeHtEZs7oeRURE6oy/pyvvTOzPXYPaAfDSD/u4750kCko0D6SIiIjI+Xi5ORM7rCNrHr/8f0XI7CIeXrKVmH+vUhHyAqj4KHbjtdUHKSwtp1tLb66ICDY6joiIiMNxtpj527URPH9jD1yczHy/K5OxcWs5eLzQ6GgiIiIids3T1am6CPnYqC40/1URcuS/V7H0ZxUhz0fFR7ELJ4rKeHPtIQCmjeikrkcREZF6dFN0CB9NHkiwtxsHjhcx5pW1/Lg70+hYIiIiInbP09WJqZd1ZPWvipAp2UVM/2grI+as5NMkFSF/S8VHsQsLVh2guKyCiFbexIQHGR1HRETE4fUK8eXLBwfTt11zCkrLmfjWJl75cR82m+aBFBEREfkjp4uQax6/nMdHdaW5uzOHcor588dVRchPVISspuKjGO54QSlvJxwGYPrIzphM6noUERFpCIFebrw3aQC3DwjFZoMXlu/lwQ+3UlphdDIRERGRxsHD1Yn7L+vAmscvZ8borvh5uHAop5hHPt7KcBUhARUfxQ4sWHmAU9YKerbx4fKugUbHERERaVJcnMw8O7Y7s6/vjrPFxHfJWczZbiH1RLHR0UREREQaDQ9XJ6Zc2oHVjw2rLkIe/lURctvRXKMjGkbFRzFUVn4J76yv6np8WF2PIiIihrmlXygf3jeQQC9XMk6ZuHHBBjamnDA6loiIiEij8usi5MxfFSHveGMjezIKjI5nCBUfxVDzfjpAaXklfUJ9ubRzC6PjiIiINGlRbZuzdEp/QjxsnCy2ctui9Xy86YjRsUREREQaHQ9XJyZf2oGVj15G71Bfcout3P76Bg5lFxkdrcGp+CiGycgr4f2NqQBMH9lFXY8iIiJ2IMjbjYciKhgVEYS1wsajn2xj9rJdVFRqIRoRERGRi+Xl5sziu/rRNdiL4wWl3LZoA8dyTxkdq0Gp+CiGmffTfsrKK+nXzo/BHf2NjiMiIiK/cLHAS+N68NDwTgAsWHWQye8kUVhabnAyERERkcbHx92Zdyb2p32AB2m5p7j99Q1kF5YaHavBqPgohkjLPcWHG6tu49JcjyIiIvbHbDYxfWRnXrq5Fy5OZr7flcmN8xM4elIL0YiIiIhcrBZerrwzqT+tfZtx8HgRE17fSF6x1ehYDULFRzFE3Ir9lFVUMrC9PwM7qOtRRETEX
o3p1Zol9w0gwNOV3RkFjI1bS9Lhk0bHEhEREWl0Wvs2491J/QnwdGVXej53L95IURO4s0TFR2lwR04U81Hi/7oeRUTk4sXFxREeHk7fvn2NjiJNQO/Q5nz5wGC6tfQmu7CMWxau57PNR42OJSIOROOaiDQVYQEevDupHz7NnPk5NZf73tlEibXC6Fj1SsVHaXCv/Lif8kobl3QMoF+Yn9FxREQapdjYWJKTk0lMTDQ6ijQRrXyb8cmUgcSEB1FWUcnDS7by/He7qdRCNCJSBzSuiUhT0jXYm7fu6YeHi4W1+3N44P3NWCsqjY5Vb1R8lAZ1OKeIT36u6pR4eGQng9OIiIjIxfBwdeLV26O4/7IOAMStOMDU936muMzxbxcSERERqUu9QnxZdGdfXH+ZW/uRj7c67Je6Kj5Kg/rPD/upqLRxaecWRLVV16OIiEhjYzabeHxUV168qScuFjPf7szgplfXkZ53yuhoIiIiIo3KwA7+zL+9D05mE19sOcZfvtiBzeZ4BUgVH6XBHDxeWD0/lOZ6FBERadxuiGrD+/f2x9/DhZ3H8hnzylq2Hsk1OpaIiIhIo3J51yDm3twLswne35DK7G92O1wBUsVHaTD/+WEflTYY3jWQXiG+RscRERGRWopu58fnsYPpEuRFVkEp4xas46ttx4yOJSIiItKoXN2jFf+8vgcAC1cd5JUf9xucqG6p+CgNYn9WAV9srfowoq5HERERxxHi584n9w/k8q6BlJZX8sD7m5n7/V6H+8ZeREREpD6N6xvCk1eHA/Bi/F7eWJNicKK6o+KjNIi53+/DZoOY8CAiW/sYHUdERETqkJebM6/dEc29Q8KAqnH/wQ82U2KtMDiZiIiISOMx8ZIwHh5R1bD19FfJfLTpiMGJ6oaKj1Lv9mQU8PX2dACmjVDXo4iIiCOymE08cVU4/7qhO05mE19tS2f8gnVk5ZcYHU1ERESk0XhoeMfqL3RnfLqNr7elG5yo9lR8lHr30g97sdngyu7BhLfyNjqOiIiI1KPxfUN5d1J/fN2d2Xo0j2tfWcuOtDyjY4mIiIg0CiaTif+7shu39Auh0gbTlmxmxe4so2PVioqPUq+Sj+WzbHsGJhP8abi6HkVERJqCAe39+SJ2MB1aeJCRX8JNr67j2x2N/1t7ERERkYZgMpl4dmx3ru3ZCmuFjSnvJrH+YI7RsWpMxUepV3O/3wtUrdzUJdjL4DQiIiLSUNr6e/BZ7GCGdm7BKWsFU979mbgV+7UQjYiIiMgFsJhNvDiuJyO6VS3qN3FxIluP5Bodq0ZUfJR6s/1oHsuTM3/peuxodBwRERFpYN5uzrxxZzR3DWoHwPPf7WH6R1u1EI2IiIjIBXC2mHnl1j4M6uBPUVkFd765kT0ZBUbHumhOF/rC66+//oIPunTp0hqFEcdyuutxTM9WdAxU16OIiEhT5GQx87drI+gQ6MnfvtzJZ5vTOJxTxIIJ0bTwcjU6noiIiIhdc3O28Nod0dz++gY2p+Zy++sb+HjyQNoFeBgd7YJdcOejj4/PBf8R2XIklx92Z2E2wUPDOxkdR0RERAw2YUBb3rq7H95uTvycmsvYuLXsSs83OpaIiIiI3fNwdWLxXf3oGuzF8YJSblu0gWO5p4yOdcEuuPPxzTffrM8c4mD+HV/V9Xhd7za0b+FpcBoRERGxB5d0CuCz2MFMemsTKdlF3Dg/gZdu7s2I8CCjo4mIiIjYNR93Z96Z2J/xC9ZxMLuI2xdt4KMpAwnwtP87STTno9S5pMMnWLn3OBaziYc016OINBIrV65k2bJlnDx50ugoIg6tQwtPPps6qHruonvf2cTCVQe0EI1IHdKYJiLimFp4ufLupP609m3GwewiJry+kbxiq9Gx/tAFdz727t0bk8l0Qa/9+eefaxxIGr9/x+8D4MY+bWjr33jmIBCRpuH555+nsLCQWbNmAWCz2Rg9ejTLly8HIDAwkB9++IGIiAgjY4o4NF93F966p
x9PfbmT9zek8o9lu9mXWcjfr+uOi5O+Gxe5UBrTRESanla+zXh3Un9uenUdu9LzuXvxRt6Z2B8P1wsu8TW4C042duzYeowhjmLDwRzW7M/GyWzigcvV9Sgi9ueDDz7g8ccfr378ySefsGrVKlavXk23bt244447mDVrFh999JGBKUUcn7PFzN/HRtI50JOnv0rm46SjHM4p5tUJUXi5XNgX3iJNncY0EZGmKSzAg3cn9WP8gvX8nJrLfe9s4vU7++LmbDE62jldcPHxqaeeqs8c4iD+/csK1+P6hhDi525wGhGRs6WkpNCjR4/qx8uWLeOGG25g8ODBAPzlL3/hpptuMiqeSJNiMpm4a3AY7QI8ePD9zWw8dIIxcWtYcFtvo6OJNAoa00REmq6uwd68dU8/bnttPWv35/DA+5uZf3sfnC32dxeJ/SWSRivhQDbrD57AxWImdpi6HkXEPlmtVlxd/zcp87p16xg0aFD141atWpGdnW1ENJEm67IugSydOohQP3eOnDjFuIUbST6p7keRP6IxTUSkaesV4suiO/vi6mTm+12ZPPLxVioq7W8e7RoVHysqKnjhhRfo168fwcHB+Pn5nfFHmh6bzcbcX+Z6vLlfCK19mxmcSETk3Dp27MiqVasASE1NZe/evVx66aXVzx89ehR/f3+j4ok0WZ2CvPg8djD9wvwoLC1n4W4z721INTqWiF3TmCYiIgM7+DP/9j44mU18seUYL/+4z+hIZ6lR8XHWrFnMmTOHcePGkZeXx/Tp07n++usxm8387W9/u6hjzZs3j7CwMNzc3IiKimL16tW/+/rS0lKeeOIJ2rZti6urKx06dOCNN96oyWlIHVq7P4eNh07g4mRm6mXqehQR+3X//ffzwAMPMHHiREaPHs3AgQMJDw+vfv7HH3+kd2/d8iliBD8PF96d2J8b+7TGhom/fbWb2d/sotIOv8EXsQca00REBODyrkE8OzYSgI8Sj2Cz2de1U42Kj++99x6vvfYajzzyCE5OTtxyyy0sWrSIv/71r6xfv/6Cj7NkyRKmTZvGE088webNmxkyZAijR48mNfX833KPGzeOH374gddff509e/bwwQcf0LVr15qchtQRm83GnPg9ANzaL5RgHzeDE4mInN/kyZN56aWXOHHiBEOHDuXTTz894/ljx45xzz33GJRORFyczPxjbDhXhVQAsGDlQf60ZAul5RUGJxOxPxrTRETktLG9W9PM2cKxvBJ2Hss3Os4ZarQOd0ZGBt27dwfA09OTvLw8AK6++mqefPLJCz7OnDlzmDhxIpMmTQJg7ty5fPfdd8yfP5/Zs2ef9fpvv/2WlStXcvDgwerbu9u1a1eTU5A6tHp/Dj+n5uLqZGbqZR2MjiMi8ocmTpzIxIkTz/ncvHnzGjiNiPyWyWQipo2Ny/pFMvOznfx36zEy80t4bUI0Pu7ORscTsSsa00REBMDN2cLQzgF8tzOT5cmZRLb2MTpStRoVH9u0aUN6ejqhoaF07NiR5cuX06dPHxITE8+Y8Pj3lJWVkZSUxIwZM87YHhMTQ0JCwjn3+fLLL4mOjua5557jnXfewcPDg2uvvZZnnnmGZs3OPcdgaWkppaWl1Y/z86uqv1arFavVekFZf+v0fjXdvy7YSwabDeZ+XzWfwG39QmjezNKgmezh52AvOZRBGewtQ13kqI/8lZWVvPjii3z++edYrVZGjBjBX//6V9zcjOvadnJyIjKy6jaJ6OhoFi1aZFgWEXsytlcrWjX3YMo7SWxMOcENryaw+O6+tGnubnQ0Ebtgj2MaaFwTETHKyPBgvtuZSXxyJtNHdjY6TrUaFR+vu+46fvjhB/r378+f/vQnbrnlFl5//XVSU1N5+OGHL+gY2dnZVFRUEBQUdMb2oKAgMjIyzrnPwYMHWbNmDW5ubnz22WdkZ2czdepUTpw4cd55H2fPns2sWbPO2r58+XLc3Wt34RofH1+r/euC0Rl25prYfqwAF7ONs
NIDLFt2wJAcRv8cTrOHHMqgDPaWAWqeo7i4uI6TwL/+9S/+8pe/MHz4cJo1a8acOXPIzs5m4cKFdf5eF8rX15ctW7YY9v4i9mxwxwA+vn8gd72RyP6sQq6bl8Cbd/W1q2/zRYxij2MaaFwTETHK5V0DMZtgV3o+R04UE+JnH1/Y1qj4+M9//rP6v2+88UZCQkJYu3YtHTt25Nprr72oY5lMpjMe22y2s7adVllZiclk4r333sPHp+qCc86cOdx4443ExcWds/tx5syZTJ8+vfpxfn4+ISEhxMTE4O3tfVFZT7NarcTHxzNy5EicnY259cceMpSVlfH8nBUA3DkojJuvaPiquj38HOwlhzIog71lqIscp7vV69LixYt5+eWXmTp1KlA1pcfYsWNZsGDBeccfETFW12BvPosdxN1vJrI7o4BxC9YRd1sfhnUJNDqaiKE0pomIyK/5ebgQ3c6PjSkn+H5XJncPDjM6ElDD4uNv9e/fn/79+1/UPgEBAVgslrO6HLOyss7qhjytZcuWtG7durrwCNCtWzdsNhtHjx6lU6dOZ+3j6up6zlvBnZ2da/2BvC6OUVtGZohPzuJokQkPFwv3D+tk6M/CHv4u7CWHMiiDvWWoTY76yH748GGuvvrq6sdXXHEFNpuNY8eO0bp164s+3qpVq3j++edJSkoiPT2dzz77jLFjx57xmnnz5vH888+Tnp5OREQEc+fOZciQIdXP5+fnExUVRbNmzfj73//OpZdeWuPzE3FULX2a8dGUgUx992fW7M9m0lub+PvYSG7uF2p0NBHD1PWYBhrXREQau5jwIDamnCA+2X6KjzVa7Xr27NnnvM35jTfe4F//+tcFHcPFxYWoqKizbsWLj49n0KBB59xn8ODBHDt2jMLCwupte/fuxWw206ZNm4s4A6ktm81G3MqqW6zvGBCKn4eLwYlERC5MWVnZGZ3yJpMJFxeXM+YHvhhFRUX07NmTV1555ZzPL1myhGnTpvHEE0+wefNmhgwZwujRo0lNTa1+zaFDh0hKSuLVV1/ljjvuqJeOTxFH4O3mzBt39eX6Pq2pqLQxY+l2Xly+B5vNZnQ0EUPU9ZgGGtdERBq7keFVDX0bUk6QV2zsGgCn1ajzccGCBbz//vtnbY+IiODmm2/m8ccfv6DjTJ8+nQkTJhAdHc3AgQNZuHAhqampTJkyBai6ZTotLY23334bgFtvvZVnnnmGu+++m1mzZpGdnc2jjz7KPffcc94FZ6R+JBzIYeexApzNNu4a1NboOCIiF+XJJ588Y97fsrIy/v73v5/RWT9nzpwLOtbo0aMZPXr0eZ+fM2cOEydOZNKkSQDMnTuX7777jvnz5zN79mwAWrVqBUBkZCTh4eHs3buX6Ojocx5PC6kpQ1PPYAL+OTacVt6uvPLTQV7+cT9Hcor4+9gIXJxq9L16jXI0FGVwrAz1kb8uxzRwjHHtfOzh96g+OOJ56ZwaD0c8r8Z+Tq28XegU6MG+rCLik9MZ07NlvZzTxRyrRsXHjIwMWrZsedb2Fi1akJ6efsHHGT9+PDk5OTz99NOkp6cTGRnJsmXLaNu2qpiVnp5+xjdonp6exMfH8+CDDxIdHY2/vz/jxo3j2WefrclpSC28+kvX44BAm7oeRaRRGTp0KHv27Dlj26BBgzh48GD147qaJ6usrIykpCRmzJhxxvaYmBgSEhIAOHnyJO7u7ri6unL06FGSk5Np3779eY+phdSUQRmqdAJubm/io4NmPt+aTvKhNO7pXEmzOplU6MJzNBRlcIwMdb2QWkOOadD4xrXzsYffo/rgiOelc2o8HPG8GvM5tXM2sw8z767YinPa5urtdXlOFzOm1ejy7PQCM2FhZ947vnbt2upvuS7U1KlTqydI/q3Fixefta1r166N+hfAEew8lsfqfdmYTTCsZaXRcURELspPP/10xuPs7GxcXFxqvAjZ78nOzqaiouKsuYyDgoKq5zzetWsXkydPxmw2YzKZe
Omll/Dz8zvvMbWQmjIow/9cCYzcl81DH25lbx68merNaxP60NLHrUFz1CdlcKwMdX37cUOOaaeP3xjGtfOxh9+j+uCI56Vzajwc8bwc4ZxaH80jfsEG9hU4MzxmGGZbRZ2f08WMaTUqPk6aNIlp06ZhtVq5/PLLAfjhhx947LHH+POf/1yTQ0oj8tqqqm9SR0cE4+921OA0IiIXLzc3lyeeeIIlS5Zw8uRJoKp7/+677z7r9rW68NuuE5vNVr1t0KBBbN++/YKPpYXUlEEZzjQ8vCVLJrtz9+JE9mQWMm7hRt68uy/dWtZd0aKx/CyUwf4z1Ef2hh7ToPGMa+djD79H9cERz0vn1Hg44nk15nPq09afQC9XsgpK2ZSax+D2zYG6PaeLOU6Nio+PPfYYJ06cYOrUqZSVlQHg5ubG448/zsyZM2tySGkkjp4s5r/bqm6tn3RJO1K3qvgoIo3LiRMnGDhwIGlpadx2221069YNm83Grl27ePnll4mPj2fNmjVs3bqVDRs28NBDD9X4vQICArBYLNXdIKdlZWWd1TUiIjUX2dqHz6YO4q43E9mfVci4V9cx//YoLukUYHQ0kXrVkGMaaFwTEWkszGYTI8KDeH9DKvHJmdXFR8Py1GQnk8nEv/71L44fP8769evZunUrJ06c4K9//Wtd5xM78/qaFCoqbQzu6E9k6/q5nUNEpD49/fTTuLi4cODAARYsWMC0adN4+OGHWbhwIfv376esrIwJEyYQExNzxmT9NeHi4kJUVNRZ04XEx8czaNCgWh1bRM7Uprk7n04ZRP8wPwpKy7nrzY18mqQvScWxNeSYBhrXREQak9OrXn+/KxObzWZollpNyZ2RkcGJEycYOnQorq6uZ7Tbi+PJLS5jSeIRAO4b2sHgNCIiNfP555+zYMGCc3ZoBAcH89xzz3HllVfy1FNPceedd/7h8QoLC9m/f3/145SUFLZs2YKfnx+hoaFMnz6dCRMmEB0dzcCBA1m4cCGpqalMmTKlVucRFxdHXFwcFRUVtTqOiCPxcXfm7Yn9eOTjbfx36zH+/PFWjuWe4oHLO+oaVRxSXY9poHFNRMRRDOrgj4eLhcz8UnYcq9s5hy9WjYqPOTk5jBs3jhUrVmAymdi3bx/t27dn0qRJ+Pr68uKLL9Z1TrED764/THFZBV2DvRjaKYDy8nKjI4mIXLT09HQiIiLO+3xkZCRms5mnnnrqgo63adMmhg0bVv349KT5d955J4sXL2b8+PHk5OTw9NNPk56eTmRkJMuWLaNt27a1Oo/Y2FhiY2PJz8+vk24WEUfh6mThpfG9aO3bjFdXHuDF+L2k5Z7imbGROFtqdNOPiN2q6zENNK6JiDgKVycLl3ZpwbLtGXy/6zhdDMxSoyuwhx9+GGdnZ1JTU8+YwHj8+PF8++23dRZO7EeJtYLFCYcAmHJpB3UPiEijFRAQwKFDh877fEpKCoGBgRd8vMsuuwybzXbWn8WLF1e/ZurUqRw6dIjS0lKSkpIYOnRoLc5ARP6I2WxixuiuPDMmArMJPkw8wqS3NlFYqi9OxbHU9ZgGGtdERBzJ6Vuvf9idZWiOGhUfly9fzr/+9S/atGlzxvZOnTpx+PDhOgkm9mXpz2lkF5bRyseNq3q0NDqOiEiNjRo1iieeeKJ6wbRfKy0t5cknn2TUqFEGJBORujZhYDsWToimmbOFlXuPM37BOrLyS4yOJVJnNKaJiMjvGdYlEIvZxJ7MQnIMvASq0W3XRUVFZ3Q8npadnY2rq2utQ4l9qai08drqgwBMHNJetyyJSKM2a9YsoqOj6dSpE7GxsXTt2hWA5ORk5s2bR2lpKW+//bbBKUWkrowID+LD+wZwz+JEdh7L57p5CSy+uy+dgryMjiZSaxrTRETk9/i6u9CvnR/rDuaw/aRxd7DWqIo0dOjQMwYxk8lEZWUlzz///Bnzg4hjiE/OICW7CJ9mztzcN8ToOCIitdKmTRvWrVtHeHg4M2fOZ
OzYsYwdO5YnnniC8PBw1q5dS2hoqNEx/1BcXBzh4eH07dvX6Cgidq9niC+fTR1M+wAP0nJPccP8BNYfzDE6lkitOcqYBhrXRETqy+lbr7efMK74WKPOxxdeeIFLL72UTZs2UVZWxmOPPcbOnTs5ceIEa9eureuMYiCbzcarK6u6Hm8fEIqHa60WSBcRsQthYWF88803nDx5kn379gHQsWNH/Pz8DE524TQxv8jFCfV359P7BzHp7U0kHT7JHa9v5IVxPbm2Zyujo4nUiiOMaaBxTUSkvowMD+Lpr5I5mG8it9hKCx/nBs9w0Z2PVquVqVOn8uWXX9KvXz9GjhxJUVER119/PZs3b6ZDhw71kVMMknjoJFuO5OLiZObOQe2MjiMiUqeaN29Ov3796NevX6P7kCYiF6+5hwvvTerP6MhgyioqeeiDzby68gA2m83oaCK1pjFNRETOJcTPna5BnlRi4qe9xw3JcNFtbM7OzuzYsQN/f39mzZpVH5nEjixYeQCAG/q0IdDLzeA0IiIiIrXj5mwh7tY+/H3ZLl5fk8I/v9lN2slT/O3aCCxm425HEhEREakvw7sFsjuzkO93ZXFT37YN/v41mvPxjjvu4PXXX6/rLGJn9mUW8MPuLEwmuHdImNFxREREROqE2WziyavDefLqcEwmeGf9YSa/k8Spsgqjo4mIiIjUuRFdAwFYvT+HEmvDX+/UaAK/srIyFi1aRHx8PNHR0Xh4eJzx/Jw5c+oknBhr4aqquR5jwoNo38LT4DQiIiIidWviJWG08nFj2pItfL8rk5tfW8/rd0YT4OlqdDQRERGROhPRygtfFxu5ZRWsO5DDsF+KkQ2lRsXHHTt20KdPHwD27t17xnMmk25XcQQZeSV8viUNgMmXah5PERF7ExcXR1xcHBUV6tQSqY3R3VsS6O3KpLc2sfVILtfPS2Dx3X31xatIA9O4JiJSf0wmE5HNbazJNLE8ObNxFB9XrFhR1znEzryZkIK1wkbfds3pE9rc6DgiIvIbWhVUpO5EtfXj0/sHcdebiaSeKOaG+QksujOaHq28jI4m0mRoXBMRqV/d/WysyYTvd2Xy98pIzA0413WN5nwUx5ZfYuX99akATB6qrkcRERFxfO1beLJ06iB6tvHhZLGVW1/bwHc7M42OJSIiIlInOnrb8HR14nhBKVuP5jboe6v4KGf5YEMqBaXldAz05PIGbsUVERERMUqApysf3DeAEd0CKS2v5MElW/kpXVMKiYiISOPnZIZLOwUAEJ/csF+wqvgoZygrr+SNtSkA3DekfYO24YqIiIgYzd3FiQUTopkwoC02G3x2yMIzX++motJmdDQRERGRWunfvmpavT0ZBQ36vio+yhm+2JJGZn4pgV6ujOndyug4IiIiIg3OYjbx9JgIHruiEwBvr0/l/neTOFWmhTBERESk8Wru7gJA3ilrg76vio9SrbLSxsJVBwG455IwXJ0sBicSEZHziYuLIzw8nL59+xodRcQhmUwm7r0kjLs6VeDiZGZ5cia3vLae7MJSo6OJOCSNayIi9c+3mTMAuSo+ilF+2pvFvqxCPF2duLV/qNFxRETkd8TGxpKcnExiYqLRUUQcWu8AG2/dFYWvuzNbjuRy/bwEDhwvNDqWiMPRuCYiUv+8mzkB6nwUA726sqrr8db+oXi7ORucRkRERMQ+RLdtzqf3DyLUz53UE8XcMD+BxEMnjI4lIiIiclFOdz7mFVux2RpuPmsVHwWAzakn2ZhyAmeLibsHtzM6joiIiIhd6dDCk6VTB9ErxJfcYiu3LdrAV9uOGR1LRERE5IJ5/1J8LKuopMRa2WDvq+KjAFTP9Xhtz9a09GlmcBoRERER+xPg6coH9w4gJjyIsvJKHnh/M6+uPNCgnQMiIiIiNeXhYsHJbAIg91RZg72vio9CSnYR3+7MAOC+oe0NTiMiIiJiv5q5WJh/e1T1nSL//GY3T36xg/KKhuseEBEREakJk8mEz+lbr
xtw3kcVH4XXVh/EZoPLuwbSJdjL6DgiIiIids1iNvHUNRH89epwTCZ4d30q972TRFFpudHRRERERH6Xj/svK14Xq/goDeR4QSmfJB0F1PUoIiIicjHuuSSM+bdF4epk5sfdWYxfuI6sghKjY4mIiIicl686H6Whvb3uEGXllfQM8aV/mJ/RcURE5ALFxcURHh5O3759jY4i0qSNigzmg/sG4Ofhwo60fK6LS2BfZoHRsUQaHY1rIiINw+dXK143FBUfm7Ci0nLeXncYgClD22MymQxOJCIiFyo2Npbk5GQSExONjiLS5PUJbc7S+wcRFuBBWu4prp+fwLoDOUbHEmlUNK6JiDQMX3cXQAvOSAP5aNMR8k5ZaefvTkxEsNFxRERERBqtdgEefHr/IKLaNqegpJw73tjA55vTjI4lIiIicgYtOCMNpryikkWrUwCYNKQ9FrO6HkVERERqw8/Dhfcm9eeq7i2xVtiYtmQLr/y4D5vNZnQ0EREREeB/xUctOCP17uvt6aTlnsLfw4Ubo9oYHUdERETEIbg5W3j5lt7VC/m9sHwvM5dux1pRaXAyEREREXU+SgOx2WwsWHkQgDsHtcPN2WJwIhERERHHYTab+L8ru/HMmAjMJvgw8QgT39pEYWm50dFERESkifN1b4LFx3nz5hEWFoabmxtRUVGsXr36vK/96aefMJlMZ/3ZvXt3AyZu/NbszyY5PZ9mzhYmDGhrdBwRERERhzRhYDsWToimmbOFVXuPc9Or68jIKzE6loiIiDRhTa7zccmSJUybNo0nnniCzZs3M2TIEEaPHk1qaurv7rdnzx7S09Or/3Tq1KmBEjuGhauquh7H9w2huYeLwWlEREREHNeI8CCWTB5AgKcru9LzuW7eWnZn5BsdS0RERJqo052PTWbOxzlz5jBx4kQmTZpEt27dmDt3LiEhIcyfP/939wsMDCQ4OLj6j8Wi24Yv1I60PFbvy8ZiNjHxkjCj44iIiIg4vB5tfPls6iA6tPAgPa+Em+avY82+bKNjiYiISBNkROejU4O902+UlZWRlJTEjBkzztgeExNDQkLC7+7bu3dvSkpKCA8P5y9/+QvDhg0772tLS0spLS2tfpyfX/VNs9VqxWqt2Q/69H413b8u1DTDqz/tB2B0RBDBXs61OofG/HNwxBzKoAz2lqEuchidX0SkroT4ubP0/sHc+84mNqac4K43NzL7+u7cFB1idDQRERFpQnyaVd0Bm19ipbLShtlsqvf3NKz4mJ2dTUVFBUFBQWdsDwoKIiMj45z7tGzZkoULFxIVFUVpaSnvvPMOw4cP56effmLo0KHn3Gf27NnMmjXrrO3Lly/H3d29VucQHx9fq/3rwsVkyCmBZdstgIlupqMsW3a0wTPUF3vIAPaRQxmUwd4yQM1zFBcX13ESxxEXF0dcXBwVFRVGRxGRC+Tj7sw7E/vx2Cfb+GLLMR79ZBtHT55i2ohOmEz1f+EvYs80romINIzTnY82GxSUlOPzy23Y9cmw4uNpv73Qstls57346tKlC126dKl+PHDgQI4cOcILL7xw3uLjzJkzmT59evXj/Px8QkJCiImJwdvbu0aZrVYr8fHxjBw5Emfn+v9LqqsMz3y9m0pSGdTBj/tuijYkQ12zhwz2kkMZlMHeMtRFjtPd6nK22NhYYmNjyc/Px8fHx+g4InKBXJ0s/HtcL9o0b0bcigO89MM+jp48xezru+PiZPhakCKG0bgmItIwXJzMuLtYKC6rIPdUmWMXHwMCArBYLGd1OWZlZZ3VDfl7BgwYwLvvvnve511dXXF1dT1ru7Ozc60/kNfFMWrrQjOcLCrj46Q0AO6/rGOd5m5MP4emkEMZlMHeMtQmhz1kFxGpa2aziUev6EprX3ee/GIHn/58lPS8U7w6IQpvN/27JyIiIvXLt5lzVfGx2Epb//p/P8O+XnVxcSEqKuqsW/Hi4+MZNGjQBR9n8+bNtGzZsq7jOZx31x/mlLWC8
JbeXNIxwOg4IiIiIk3erf1DWXRnNB4uFhIO5HDj/ATSck8ZHUtEREQcnHcDLzpj6G3X06dPZ8KECURHRzNw4EAWLlxIamoqU6ZMAapumU5LS+Ptt98GYO7cubRr146IiAjKysp49913+fTTT/n000+NPA27V2KtYHHCIQAmX9pecwqJiIiI2IlhXQJZMnkg9yxOZG9mIdfFreWNu/oS2Vq3nYqIiEj98P3lVuvcplB8HD9+PDk5OTz99NOkp6cTGRnJsmXLaNu2LQDp6emkpqZWv76srIxHHnmEtLQ0mjVrRkREBF9//TVXXnmlUafQKHySdJScojJa+zbjyu7qEhURERGxJ5GtffgsdjB3v7mRvZmFjF+wjldu68OwLoFGRxMREREH5NOUOh8Bpk6dytSpU8/53OLFi894/Nhjj/HYY481QCrHUVFpY9HqgwBMGhKGs0UTmYuIiIjYm9a+zfh4yiDufzeJhAM5THprE8+MieSmPvriWEREROqWbzMXAPKKyxrk/VSJcnDLd2ZwKKcYn2bOjIsOMTqOiIiIiJyHTzNnFt/dj+v7tKai0sb/fbadF+P3UWkzOpmIiIg4ktMrXDdU56OKjw7MZrPx6qqqrsc7BrbFw9XwRlcRERER+R0uTmZevKknfxreCYBXV6Xw7n4zpeWVBicTERERR3H6tuvcYhUfpZY2ppxg65FcXJzM3DmondFxREREROQCmEwmHh7Zmedv7IGT2URStpmJbyc1WHeCiIiIODZ3FwsAp6wVDfJ+Kj46sAW/dD3eGNWGAE9Xg9OIiIiIyMW4KTqEhRN642qxsSHlJDfOTyAt95TRsURERKSRM5tMADTUzC4qPjqovZkF/Lg7C5MJ7h3S3ug4IiIiIlIDQzoG8KeICoK8XNmXVch1cWvZeSzP6FgiIiLSiJmrao/YbA1TflTx0UEt/KXrcVREMGEBHganEREREZGaau0BH0/uT+cgT7IKShn36jpW7T1udCwRERFppEy/dD5WNtCU0io+OqD0vFN8sSUNgPuGqutRRMQRxcXFER4eTt++fY2OIiINoKWPGx9PGcTA9v4UlVVwz+JEPt50xOhYInVG45qISMP5pfZIpTofpabeXHsIa4WNfmF+9A5tbnQcERGpB7GxsSQnJ5OYmGh0FBFpID7NnFl8T1/G9mpFeaWNRz/Zxtzv9zbYLVMi9UnjmohIwzk952NlA11CqPjoYPJLrLy/IRWAKZeq61FERETEkbg6WZgzrhdTL+sAwNzv9/H4p9uwVjTQfVMiIiLS6J2e87GhlpxR8dHBvL8hlcLScjoFenJZ50Cj44iIiIhIHTObTTw2qit/vy4Sswk+2nSUiW9torC03OhoIiIi0giY1PkoNVVaXsEba1KAqrkezf8rZYuIiIiIg7mtf1teuyOaZs4WVu09zrhX15GZX2J0LBEREbFz/7vtWp2PcpG+2HKMrIJSgrxdGdOrtdFxRERERKSeDe8WxIf3DSDA04Xk9Hyun5fA3swCo2OJiIiIHTNXLzjTQO/XMG8j9a2y0sbCVQcBuGdwGC5O+qsVERERaQp6hviy9P7BtA/wIC33FDfMT2DdgRyjY4mIiIidOr3adUMtWqcKlYP4cXcW+7MK8XJ14pb+oUbHEREREZEGFOrvzqf3DyKqbXMKSsq5842NfLn1mNGxRERExA6dvu26gWqPKj46itNdj7cOCMXbzdngNCIiIiLS0Jp7uPDepP6MjgymrKKShz7YzKsrDzRYV4OIiIg0DibN+SgXK+nwSTYeOoGzxcQ9g8OMjiMiIiIiBnFzthB3ax8mXlJ1TfjPb3bz5Bc7qGioSZ1ERETE7v1vzkcVH+UCLVx1AICxvVoT5O1mcBoRERERMZLZbOLJq8N58upwTCZ4d30qk99J4lRZhdHRRERExA78b7XrBnq/hnkbqS8p2UUsT84E4L6h7Q1OIyIiIiL2YuIlYcy7tQ8uTma+35XJza+tJ7uw1OhYIiIiY
rBfGh+14IxcmNfXHsZmg+FdA+kU5GV0HBERERGxI6O7t+T9Sf3xdXdm65Fcrp+XQEp2kdGxRERExEAmdT7Khcovg8+2VK1iOPnSDganERERERF7FN3Oj0/vH0SIXzNSTxRz/by1JB0+aXQsERERMcjpOR/V+Sh/aHWGmbLySnqH+tK3XXOj44iIiIiInerQwpOl9w+mRxsfThZbufW19Xy7I8PoWCIiImIAzfkoF6SotJzVGVW/LJOHtq9umRUREREROZcWXq58eN8AhncNpLS8kvvfS+LNtSlGxxIREZEGZv6lGqjOR/ldHyWlcarCRDt/d0aGBxsdR0REREQaAXcXJxZMiOK2/qHYbDDrv8k8+1UylQ3V+iAiIiKG05yP8odKyyt4M+EwABMHt8NiVtejiIiIiFwYJ4uZZ8dG8tioLgAsWpPCgx9spsRaYXAyERERaQinq0iV6nyU8/lo01HS80rwdrZxXa+WRscREREDxMXFER4eTt++fY2OIiKNkMlkYuplHZk7vhfOFhNfb09nwusbyC0uMzqaNFEa10REGs7pOR8bqPao4mNjU1pewbwV+wEY2boSV2eLwYlERMQIsbGxJCcnk5iYaHQUEWnExvZuzVv39MPLzYnEQye5YX4CR04UGx1LmiCNayIiDed/C86o81HO4aPEI6TnlRDk7crAIM3NIyIiIiK1M6hDAJ9MGUQrHzcOHC/iunkJbD+aZ3QsERERqSenZ+9T56OcpcRaQdyKAwBMGRqGs/72RERERKQOdAn2YunUwXQN9iK7sJRxC9axYneW0bFERESkHpjU+SjnsyTxCBn5JbT0ceOmqDZGxxERERERBxLs48bHUwYypFMAp6wVTHp7Ex9sTDU6loiIiNSxX2qPKj7KmUqsFcz7qWqux6nDOuLqpL86EREREalbXm7OvHFXX26MakNFpY2ZS7cz5/t9DXZbloiIiNQ/LTgj5/TBxlQy80tp5ePGuGh1PYqIiIhI/XC2mHn+xh78aXgnAOavTOHd/WZKyysNTiYiIiJ1oXrOx4Z6vwZ6H6mFqq7HqrkeYy/viKuTVrgWERERkfpjMpl4eGRnnruhBxaziU3ZZu5avImTRWVGRxMREZFa0m3Xcpb3NqRyvKCU1r7NuCkqxOg4IiIiItJEjOsbwmsTeuNmsbHpcC7XzVvLweOFRscSERGRWjE16Lup+GjnTpVVMP+XrscHLu+Ii+Z6FBEREZEGNKRjANMiK2jt68ahnGKum5fA+oM5RscSERGRRsLwSta8efMICwvDzc2NqKgoVq9efUH7rV27FicnJ3r16lW/AQ323obDZBeW0qZ5M27UCtciIiIiYoCW7vDJ5P70CvEl75SVCa9v4JOko0bHEhERkUbA0OLjkiVLmDZtGk888QSbN29myJAhjB49mtTU1N/dLy8vjzvuuIPhw4c3UFJjFJeV8+rKqq7HBy/viLPF8FqxiIiIiDRRAZ6ufHjfAK7q3hJrhY1HPt7KC9/tobJSS2GLiIjI+RlazZozZw4TJ05k0qRJdOvWjblz5xISEsL8+fN/d7/Jkydz6623MnDgwAZKaox31x8mu7CMUD93ru+jrkcRERERMZabs4WXb+lN7LAOALyyYj8PfbiZEmuFwclERETEXhlWfCwrKyMpKYmYmJgztsfExJCQkHDe/d58800OHDjAU089Vd8RDVVcVs6ClQeBqrke1fUoIiIiIvbAbDbx6BVdef7GHjhbTHy1LZ1bXltPdmGp0dFERETEDjkZ9cbZ2dlUVFQQFBR0xvagoCAyMjLOuc++ffuYMWMGq1evxsnpwqKXlpZSWvq/C6H8/HwArFYrVqu1RtlP71fT/S/Em2tSyCkqI9SvGddEBp71Xg2R4Y8og33lUAZlsLcMdZHD6PwiInJ+N0WH0Ka5O1PeTWJzai5j49by5l196RTkZXQ0ERERsSOGFR9PM5nOXN7bZrOdtQ2goqKCW2+9lVmzZtG5c+cLPv7s2
bOZNWvWWduXL1+Ou7v7xQf+lfj4+Frtfz6lFTDvZwtgYkjzQpZ/922DZ7gYyvA/9pBDGZTB3jJAzXMUFxfXcRIREalLAzv489nUQdyzOJFDOcVcPy+Bebf3YUinFkZHExERETthWPExICAAi8VyVpdjVlbWWd2QAAUFBWzatInNmzfzwAMPAFBZWYnNZsPJyYnly5dz+eWXn7XfzJkzmT59evXj/Px8QkJCiImJwdvbu0bZrVYr8fHxjBw5Emdn5xod4/csWJVCUfk+2vq585cJg3A6xy3X9Z3hQiiDfeVQBmWwtwx1keN0t7qIiNiv9i08WTp1MFPeSWLjoRPc9WYiz46N5JZ+oUZHExERETtgWPHRxcWFqKgo4uPjue6666q3x8fHM2bMmLNe7+3tzfbt28/YNm/ePH788Uc++eQTwsLCzvk+rq6uuLq6nrXd2dm51h/I6+IYv1VYWs6itYcAeGh4J5q5nZ29vjNcLGWwrxzKoAz2lqE2Oewhu4iI/DE/DxfemdSPGZ9u57PNacxcup2U7CJmjOqK2Xz2XU0iIiLSdBh62/X06dOZMGEC0dHRDBw4kIULF5KamsqUKVOAqq7FtLQ03n77bcxmM5GRkWfsHxgYiJub21nbG7O3Eg6RW2wlLMCDMb1aGR1HREREROSCuDpZmDOuJ2EBHsyJ38vCVQc5lF3E3Jt74e5i+GxPIiIiYhBDrwLGjx9PTk4OTz/9NOnp6URGRrJs2TLatm0LQHp6OqmpqUZGbFAFJVYWrqpa4fqh4R3Pebu1iIiIiIi9MplMPDS8E2393Xn0420sT85k/IL1LLozmiBvN6PjiYiIiAEMr25NnTqVQ4cOUVpaSlJSEkOHDq1+bvHixfz000/n3fdvf/sbW7Zsqf+QDeSthEPknbLSvoUH1/ZsbXQcEREREZEaGdOrNe/f2x8/Dxe2p+UxNm4tycc0j6+IiEhTZHjxUarkl1h5bXUKAH8a3gmL5sYRERERkUYsup0fn00dRIcWHqTnlXDTqwn8uDvT6FgiIiLyC5utYd5HxUc7sXhtVddjx0BPru6huR5FREREpPFr6+/B0vsHM6iDP0VlFUx6axOL16YYHUtEREQakIqPdiDvlJVFq0/P9aiuRxERERFxHD7uzrx1Tz/GR4dQaYO//TeZp77YQXlFpdHRREREpAGo+GgH3lybQn5JOZ0CPbmqe0uj44iIiIiI1Clni5l/3tCdGaO7AvDWusPc+/YmCkvLDU4mIiIi9U3FR4PlnbLy+ppf5nocoa5HEZGmqri4mLZt2/LII48YHUVEpF6YTCamXNqB+bf1wdXJzIo9x7lxfgLHck8ZHU3qgcY1ERE5TcVHg72+JoWCknK6BHlxZaS6HkVEmqq///3v9O/f3+gYIiL1bnT3liyZPJAAT1d2ZxQwJm4t247mGh1L6pjGNREROU3FRwPlFVt581ddj2Z1PYqINEn79u1j9+7dXHnllUZHERFpEL1CfPk8dhBdgrw4XlDKuAXr+G5nhtGxpI5oXBMRkV9T8dFAi9YcpKC0nK7BXoyKCDY6joiI1MCqVau45ppraNWqFSaTic8///ys18ybN4+wsDDc3NyIiopi9erVZzz/yCOPMHv27AZKLCJiH9o0d+eT+wdyaecWlFgrmfJuEgtXHcBmsxkdrUnTuCYiInXNyegATVVucRlvrj0EwDR1PYqINFpFRUX07NmTu+++mxtuuOGs55csWcK0adOYN28egwcPZsGCBYwePZrk5GRCQ0P54osv6Ny5M507dyYhIeEP36+0tJTS0tLqx/n5+QBYrVasVmuNzuH0fjXdvy4ogzLYWwZ7yeHoGdws8OqtPXlm2W7e33iUfyzbzYGsQp66uivOlv/1STjKz8Ho3+kL4Qjj2vnYw+9RfXDE89I5NR6OeF5N4ZwqyqsWfLPZbLX+DHEhVHw0yGurD1JYWk63lt7EhKvrUUSksRo9ejSjR48+7/Nz5sxh4sSJTJo0C
YC5c+fy3XffMX/+fGbPns369ev58MMP+fjjjyksLMRqteLt7c1f//rXcx5v9uzZzJo166zty5cvx93dvVbnEh8fX6v964IyKIO9ZQD7yOHoGfqZ4VQ7E58fMrNk01E270vl7s6VuP/m00pj/zkUFxfXYZL64Ujj2vnYw+9RfXDE89I5NR6OeF6OfE4pBQBOFBcXs2zZshod62LGNBUfDXCiqIzF6noUEXF4ZWVlJCUlMWPGjDO2x8TEVHeDzJ49u/rWtMWLF7Njx47zfkADmDlzJtOnT69+nJ+fT0hICDExMXh7e9cop9VqJT4+npEjR+Ls7FyjY9SWMiiDvWWwlxxNKcNVwBW7s5j+8Xb25sGiQ168NqE3Ic3dHebncLqrr7FqLOPa+djD71F9cMTz0jk1Ho54Xk3hnDan5jJ3x0bc3d258sohNTrmxYxpKj4a4LXVBykqqyCilTcx4UFGxxERkXqSnZ1NRUUFQUFn/lsfFBRERkbNFlZwdXXF1dX1rO3Ozs61vjiqi2PUljIog71lsJccTSXDqO6taePnyaS3NnHgeBE3LdjIwjui6dHKs8Ey/JHaZDA6e201tnHtfOzh96g+OOJ56ZwaD0c8L0c+J4tTVTnQZDI1yJim4mMDyyks5a2EQwBMG9EZk0ldjyIiju63/9bbbLZz/vt/1113NVAiERH7Fdnah89jBzPxrUR2HsvnltfW89z1keiq2X5oXBMRkYuh1a4b2MLVBykuq6B7ax9GdAs0Oo6IiNSjgIAALBbLWd0gWVlZZ3WNiIjI/wT7uPHR5IGM6BZEWXkl0z7axvKjJq2EbTCNayIiUhMqPjag7MJS3k44DFTN9aiuRxERx+bi4kJUVNRZk1XHx8czaNCgWh07Li6O8PBw+vbtW6vjiIjYKw9XJxZMiGLSJWEAfH3EwqyvdlNRqQKkUTSuiYg4FhsNM6bqtusGtHDVQU5ZK+jRxofLu6rrUUTEERQWFrJ///7qxykpKWzZsgU/Pz9CQ0OZPn06EyZMIDo6moEDB7Jw4UJSU1OZMmVKrd43NjaW2NhY8vPz8fHxqe1piIjYJYvZxF+uDqe1rytPf7WL9zYe4eQpK/8e3wtXJ4vR8RySxjUREcfX0L1wKj42kOMFpby97hCgrkcREUeyadMmhg0bVv349Iqdd955J4sXL2b8+PHk5OTw9NNPk56eTmRkJMuWLaNt27ZGRRYRaXRu7x/K4b07ee+AE8u2Z3CyKJGFd0Th5eZYCwHYA41rIiJS11R8bCALVx2gxFpJzxBfhnVR16OIiKO47LLL/nAOsqlTpzJ16tQGSiQi4ph6+9u4fHAfpr6/lXUHcxi/YD2L7+lLoJeb0dEcisY1ERGpa5rzsQFkFZTwznrN9SgiIiIiUhsD2/vz4X0DCPB0ITk9nxvnr+NwTpHRsUREROR3qPjYABasPEiJtZJeIb5c1rmF0XFERMQBaGJ+EWmqIlv78MmUQYT6uZN6opgb5iewIy3P6FhSSxrXREQcl4qP9Swrv4R3f+l6fHhkZ3U9iohInYiNjSU5OZnExESjo4iINLh2AR58cv9Awlt6k11Yxs0L15OwP9voWFILGtdERByXio/1bP7KA5SWV9In1JehnQKMjiMiIiIi4hACvdz4cPIABrT3o7C0nLveTOTrbelGxxIREZHfUPGxHmXml/DehlRAXY8iIiIiInXN282ZxXf3Y3RkMGUVlTzwwc+8s+6Q0bFERETkV1R8rEfzfzpAWXkl0W2bc0lHdT2KiIiIiNQ1N2cLr9zah9v6h2KzwZNf7GTO8j1/uGKziIiINAwVH+tJRl4J729U16OIiIiISH2zmE08OzaSaSM6AfCfH/fzf5/toKJSBUgRERGjqfhYT+b9tJ+y8kr6tfNjUAd/o+OIiIiD0aqgIiJnMplMTBvRmWfHRmIywQcbU5n6XhIl1gqjo8kF0LgmIuK4VHysB8dyT/HhxiMATBvZSV2PIiJS57QqqIjIu
d0+oC3zbu2Di8XMdzszueONjeSdshodS/6AxjUREcel4mM9mPfTfsoqKukf5segDprrUURERESkIY3u3pK37umHl6sTG1NOMH7BOrLyS4yOJSIi0iSp+FjH0nJPsSSxquvx4ZGdDU4jIiIiItI0Dezgz4eTBxDg6crujAKun59ASnaR0bFERESaHBUf61jciv1YK2wMbO/PgPaa61FERERExCgRrXxYev8g2vq7c/TkKW6cn8C2o7lGxxIREWlSVHysQ0dPFvPxJnU9ioiIiIjYi1B/dz6ZMojI1t7kFJVxy8L1rN533OhYIiIiTYaKj3XodNfj4I7+9AvzMzqOiIiIiIgALbxc+eDeAQzu6E9RWQX3LE7ky63HjI4lIiLSJKj4WEeOnCjm401HAXh4hLoeRUSkfsXFxREeHk7fvn2NjiIi0ih4uTnzxl19uapHS6wVNh76YDNvrk0xOpb8QuOaiIjjUvGxjrzy437KK20M6RRAdDt1PYqISP2KjY0lOTmZxMREo6OIiDQark4WXr65N3cObAvArP8m8/x3u7HZbAYnE41rIiKOy/Di47x58wgLC8PNzY2oqChWr1593teuWbOGwYMH4+/vT7NmzejatSv//ve/GzDtuaXmFPPJz1Vdj9PU9SgiIiIiYrfMZhN/uzaCR2KqrtvjVhxgxqfbKa+oNDiZiIiIY3Iy8s2XLFnCtGnTmDdvHoMHD2bBggWMHj2a5ORkQkNDz3q9h4cHDzzwAD169MDDw4M1a9YwefJkPDw8uO+++ww4gyqvrNhHRaWNoZ1bENW2uWE5RERERETkj5lMJh64vBP+nq488dl2lmw6Qk5RGa/c2hs3Z4vR8URERByKoZ2Pc+bMYeLEiUyaNIlu3boxd+5cQkJCmD9//jlf37t3b2655RYiIiJo164dt99+O1dcccXvdkvWt8M5RXz6cxoA00Z0MiyHiIiIiIhcnFv6hTL/9ihcnMx8vyuTCa9vIK/YanQsERERh2JY8bGsrIykpCRiYmLO2B4TE0NCQsIFHWPz5s0kJCRw6aWX1kfEC/Lyj/upqLRxaecW9AlV16OIiIiISGNyRUQw79zTDy83JxIPnWTcgnVk5pcYHUtERMRhGHbbdXZ2NhUVFQQFBZ2xPSgoiIyMjN/dt02bNhw/fpzy8nL+9re/MWnSpPO+trS0lNLS0urH+fn5AFitVqzWmn2reXq//Zl5fLa5quvxwWHta3y82mRoyPdUBvvOoQzKYG8Z6iKH0flFRKRp6N/en48mD+TONzayJ7OA6+cl8PbEfnRo4Wl0NBERkUbP0DkfoWq+lV+z2Wxnbfut1atXU1hYyPr165kxYwYdO3bklltuOedrZ8+ezaxZs87avnz5ctzd3WseHPjrknVUVJoJ960kbdta0rbV6nA1Eh8f3/BvqgznZQ85lEEZ7C0D1DxHcXFxHScRERE5t24tvfn0/kHc+cZGDmYXceP8BN68ux+9QnyNjiYiItKoGVZ8DAgIwGKxnNXlmJWVdVY35G+FhYUB0L17dzIzM/nb3/523uLjzJkzmT59evXj/Px8QkJCiImJwdvbu0bZrVYr730ZT1J21V3rz4wfSI82PjU6Vk1ZrVbi4+MZOXIkzs7ODfreymCfOZRBGewtQ13kON2tLmeLi4sjLi6OiooKo6OIiDiMED93Pp4ykLsXJ7LtaB63vrae+bdHcWnnFkZHc3ga10REHJdhxUcXFxeioqKIj4/nuuuuq94eHx/PmDFjLvg4NpvtjNuqf8vV1RVXV9eztjs7O9fqA/l3R81U2mB410CiwgJqfJzaqu15KIPj5VAGZbC3DLXJYQ/Z7VVsbCyxsbHk5+fj49OwX4CJiDgyf09XPrh3AFPeTWL1vmwmLk7khZt6clVkoNHRHJrGNRERx2XobdfTp09nwoQJREdHM3DgQBYuXEhqaipTpkwBqroW09LSePvtt4Gqb8NCQ0Pp2rUrAGvWrOGFF17gwQcfbNDcB
48XkZRddWv4tBGdG/S9RURERESkfnm4OvH6nX155OOtfLn1GNOWbCErvwu/f3+WiIiInIuhxcfx48eTk5PD008/TXp6OpGRkSxbtoy2bdsCkJ6eTmpqavXrKysrmTlzJikpKTg5OdGhQwf++c9/Mnny5AbN/cpPB7BhYnjXFnRv4NutRURERESk/rk4mZk7vhf+ni68ufYQ//hmD8NbmRltsxkdTUREpFExfMGZqVOnMnXq1HM+t3jx4jMeP/jggw3e5fhb+7MK+Gp71TyVDw7rYGgWERERERGpP2azib9eHU4LL1ee+3YPPxwzs2RTGhMGhRkdTUREpNEwvPjY2LTwciP20vZs3LmfiFY1W7BGREREREQaB5PJxNTLOtK8mRNvr9jO9b1bGR1JRESkVvw9XLihTxv8PV0a5P1UfLxIPs2c+dPwjiwr3Wt0FBERERERaSA39mlNs/StuDiZjY4iIiJSK239PXhxXM8Gez+NnCIiIiIiIhfAZDI6gYiISOOj4qOIiIiIiIiIiIjUCxUfRUREREREREREpF6o+CgiItIIxcXFER4eTt++fY2OIiIiUmsa10REHJeKjyIiIo1QbGwsycnJJCYmGh1FRESk1jSuiYg4LhUfRUREREREREREpF6o+CgiIiIiIiIiIiL1QsVHERERERERERERqRcqPoqIiIiIiIiIiEi9UPFRRERERERERERE6oWKjyIi8v/t3XlUlOUeB/DvyLAMKJgSApKIgqKGoGLIUnjCQLPUuBVaIqZ5pCDcQvAqgd7jFUxz144mlkUX08Br1zRJAUUTFUEJiEUx9YJSZg7KDVGe+0fHSWSZhZlh8fs5h3OceZfn+5tn3vk572xEREREREREOiFt6wD6JoQAAMjlco33UVdXh5qaGsjlchgaGmorGjN00AztJQczMEN7y6CNHA8eqx88dlNj7GvMwAydOwczdK4M7GvKaaOvNac93I90oTPWxZo6js5YF2tSjTo97bE7+VhdXQ0AeOqpp9o4CRERqaq6uhoWFhZtHaNdYl8jIup42Neax75GRNSxqNLTJOIxe9mtvr4eFRUV6NatGyQSSYNlI0eOxOnTpxtt8+j1crkcTz31FK5cuQJzc3OdZ26KLjI0V7+2Mqiy/5bWaWpZSxnUrac11LkttJ3rwf50lUGdeWsqg7pz2trbR1fHpzq5VMmgaZ2qbjdixAiUlZVpnEFb89ba+RBCoLq6Gra2tujShd8U0hT2taZpcoxpu69psrw99DVd9HdVadLT1M2gzryo09fUvV5VHaGv6bqnAbrra+pcr425YF9TrqW+1lrtod/pQmesizV1HJ2xLtakGnV62mP3zscuXbrAzs6uyWUGBgZNTkJz15ubm7f5HVGbGZqrU1sZVNl/S+u0tKypDJrW0xqq3BbazvXo/rSdQZN5eziDunOqrdtH28enJrlayqBpnapuZ2Bg0KoM2p631swH3xnSMva1prXmsURbfa01y9tDX9Nmf1eVJj1N3QyazIsqfU3d69XVnvuarnvag3VbyqBsf9qct9bOBftay1rqa9rSHvqdLnTGulhTx9EZ62JNyqna0/hy20PCwsLUur6z0XWdquy/pXXUzdde503buTTZnzrb6HLemlrGedPOdrNmzWrVvjrLvD3uHue+po8alY3R2uWtXV9ftJlL14+Nqqyr6fKOdrx1tHnTVV/raPNGRETUET12H7vWBrlcDgsLC9y6datNP57GDO0jQ3vJwQzM0N4ytKcc1LL2ME/MwAztLUN7ycEMzEDa01nnsDPWxZo6js5YF2vSPr7zUQPGxsaIjY2FsbExMzBDu8nBDMzQ3jK0pxzUsvYwT8zADO0tQ3vJwQzMQNrTWeewM9bFmjqOzlgXa9I+vvORiIiIiIiIiIiIdILvfCQiIiIiIiIiIiKd4MlHIiIiIiIiIiIi0gmefCQiIiIiIiIiIiKd4MlHIiIiIiIiIiIi0gmefFTD0aNH8fLLL8PW1
hYSiQR79+7Ve4YVK1Zg5MiR6NatG6ysrDBp0iQUFxfrPcejmSQSCebOnau3Me/du4clS5bAwcEBMpkM/fr1w7Jly1BfX6+zMVWZ/6KiIkyYMAEWFhbo1q0bRo0ahcuXL2stw5YtWzB06FCYm5vD3Nwcnp6eOHDgAACgrq4OUVFRcHFxgZmZGWxtbTFt2jRUVFRobfwH/vvf/2Lq1Kno2bMnTE1N4ebmhpycnCbXnT17NiQSCdauXavxeC3d9qrWfe3aNQQHB8Pa2hpmZmYYPnw49uzZo3IGVY696dOnQyKRNPgbNWpUo3398MMPeP7552FmZobu3btj9OjR+N///qc0Q1xcXKP9W1tbK5anpKQgICAAlpaWkEgkyMvLa7D9b7/9hvfeew8DBw6Eqakp+vTpg4iICNy6davZMZXd74UQiIuLg62tLWQyGUaPHo2CggKNx6ytrYWbm1uT+Un72rqvsaf9hX2Nfe0B9jX2NWqdzZs3w8HBASYmJhgxYgSOHTvW7LpZWVnw9vZGz549IZPJ4OzsjDVr1ugxrerUqethx48fh1QqhZubm24DakCdmjIyMho9XkgkEvz00096TKycuvNUW1uLxYsXw97eHsbGxujfvz8SExP1lFZ16tTVVO+QSCQYMmSIHhMrp+5cJSUlwdXVFaamprCxscFbb72FGzdu6CmtatStadOmTRg0aBBkMhkGDhyInTt36iwbTz6q4c6dO3B1dcXGjRvbLENmZibCwsJw8uRJpKWl4d69e/D398edO3faJM/p06exdetWDB06VK/jJiQk4OOPP8bGjRtRVFSElStX4sMPP8SGDRt0Nqay+b9w4QJ8fHzg7OyMjIwMnDt3DjExMTAxMdFaBjs7O8THx+PMmTM4c+YMnn/+eUycOBEFBQWoqanB2bNnERMTg7NnzyIlJQUlJSWYMGGC1sYHgJs3b8Lb2xuGhoY4cOAACgsLsXr1anTv3r3Runv37kV2djZsbW1bNWZLt72qdQcHB6O4uBj79u1Dfn4+AgMDERQUhNzcXJUyqHrsjR07FpWVlYq/b7/9tsHyH374AWPHjoW/vz9OnTqF06dPIzw8HF26qPZwPGTIkAb7z8/Pb3A7eXt7Iz4+vsltKyoqUFFRgVWrViE/Px+ffvopDh48iJkzZzY7nrL7/cqVK/HRRx9h48aNOH36NKytrfHCCy+gurpaozEXLlzY6vsLqa6t+xp72l/Y19jXHmBfY18jze3atQtz587F4sWLkZubi2effRbjxo1r9kUTMzMzhIeH4+jRoygqKsKSJUuwZMkSbN26Vc/JW6ZuXQ/cunUL06ZNg5+fn56Sqk7TmoqLixs8Zjg5OekpsXKa1PT666/j8OHD2L59O4qLi/Gvf/0Lzs7OekytnLp1rVu3rsEcXblyBT169MBrr72m5+TNU7emrKwsTJs2DTNnzkRBQQF2796N06dP4+2339Zz8uapW9OWLVuwaNEixMXFoaCgAEuXLkVYWBi++eYb3QQUpBEAIjU1ta1jiKqqKgFAZGZm6n3s6upq4eTkJNLS0oSvr6+YM2eO3sYeP368mDFjRoPrAgMDxdSpU/UyflPzHxQUpLfxH/bEE0+ITz75pMllp06dEgDEzz//rLXxoqKihI+Pj9L1rl69Knr37i1+/PFHYW9vL9asWaOV8VU59pqq28zMTOzcubPBej169Gj2tlOmqWMvJCRETJw4scXtPDw8xJIlSzQaMzY2Vri6uipdr7y8XAAQubm5Stf96quvhJGRkairq1O67qO3fX19vbC2thbx8fGK6/744w9hYWEhPv74Y7XH/Pbbb4Wzs7MoKChQOT9pT3voa49rTxOCfe1h7GuNsa+xr5FqnnnmGREaGtrgOmdnZxEdHa3yPl555ZU2eexriaZ1BQUFiSVLlqh8rOmTujWlp6cLAOLmzZt6SKcZdWs6cOCAsLCwEDdu3NBHPI219rhKTU0VEolEXLp0SRfxNKJuTR9++
KHo169fg+vWr18v7OzsdJZRXerW5OnpKd5///0G182ZM0d4e3vrJB/f+djBPfiIR48ePfQ+dlhYGMaPH48xY8bofWwfHx8cPnwYJSUlAIBz584hKysLL774ot6zAEB9fT3279+PAQMGICAgAFZWVvDw8NDpRxjv37+P5ORk3LlzB56enk2uc+vWLUgkkibfvaGpffv2wd3dHa+99hqsrKwwbNgwbNu2rcE69fX1CA4ORmRkZJu8vb6pun18fLBr1y789ttvqK+vR3JyMmprazF69GiNxwAaH3sZGRmwsrLCgAEDMGvWLFRVVSmWVVVVITs7G1ZWVvDy8kKvXr3g6+uLrKwslcctLS2Fra0tHBwcMHnyZFy8eFGj/A/XYW5uDqlUqva25eXluHbtGvz9/RXXGRsbw9fXFydOnFBrzOvXr2PWrFn4/PPPYWpqqnYW6hwe154GsK8B7GstYV9Trw72tcfT3bt3kZOT02D+AMDf37/F+XtYbm4uTpw4AV9fX11E1Iimde3YsQMXLlxAbGysriOqrTVzNWzYMNjY2MDPzw/p6em6jKkWTWp60H9WrlyJ3r17Y8CAAXj//fdV+toKfdHGcbV9+3aMGTMG9vb2uoioNk1q8vLywtWrV/Htt99CCIHr169jz549GD9+vD4iK6VJTbW1tY0+zSKTyXDq1CnU1dVpPSNPPnZgQgjMnz8fPj4+ePrpp/U6dnJyMs6ePYsVK1boddwHoqKiMGXKFDg7O8PQ0BDDhg3D3LlzMWXKlDbJU1VVhdu3byM+Ph5jx47FoUOH8MorryAwMBCZmZlaHSs/Px9du3aFsbExQkNDkZqaisGDBzda748//kB0dDTeeOMNmJuba238ixcvYsuWLXBycsJ3332H0NBQRERENPh+iISEBEilUkRERGhtXFU1V/euXbtw79499OzZE8bGxpg9ezZSU1PRv39/tcdo7tgbN24ckpKScOTIEaxevRqnT5/G888/j9raWgBQPKGKi4vDrFmzcPDgQQwfPhx+fn4oLS1VOq6Hhwd27tyJ7777Dtu2bcO1a9fg5eWl8XeN3LhxA//4xz8we/Zsjba/du0aAKBXr14Nru/Vq5dimSpjCiEwffp0hIaGwt3dXaMs1PE9zj0NYF9jX2se+5rq2Nceb7/++ivu37+v1vw9YGdnB2NjY7i7uyMsLKxdfZRSk7pKS0sRHR2NpKQkjU7E65omNdnY2GDr1q34+uuvkZKSgoEDB8LPzw9Hjx7VR2SlNKnp4sWLyMrKwo8//ojU1FSsXbsWe/bsQVhYmD4iq6Q1xxUAVFZW4sCBAx3+mPLy8kJSUhKCgoJgZGQEa2trdO/eXadfj6MOTWoKCAjAJ598gpycHAghcObMGSQmJqKurg6//vqr1jO2v0ciUll4eDjOnz+v1qvL2nDlyhXMmTMHhw4d0ur3Pqlj165d+OKLL/Dll19iyJAhyMvLw9y5c2Fra4uQkBC953nwgwATJ07EvHnzAABubm44ceIEPv74Y62+ejpw4EDk5eXh999/x9dff42QkBBkZmY2eKJWV1eHyZMno76+Hps3b9ba2MCftbq7u+Of//wngD9ffSwoKMCWLVswbdo05OTkYN26dTh79iwkEolWx1ampbqXLFmCmzdv4vvvv4elpSX27t2L1157DceOHYOLi4ta4zR37AUFBSn+/fTTT8Pd3R329vbYv38/AgMDFfeT2bNn46233gLw5+13+PBhJCYmKj3xMW7cOMW/XVxc4Onpif79++Ozzz7D/Pnz1apBLpdj/PjxGDx4cKtfEX90noUQTc59c2Nu2LABcrkcixYtalUO6tge554GsK+xrzWNfU117Gv0gKrz97Bjx47h9u3bOHnyJKKjo+Ho6NhmL/40R9W67t+/jzfeeANLly7FgAED9BVPI+rM1cCBAzFw4EDFZU9PT1y5cgWrVq3Cc889p9Oc6lCnpvr6ekgkEiQlJcHCwgIA8NFHH+HVV1/Fp
k2bIJPJdJ5XVZocVwDw6aefonv37pg0aZKOkmlOnZoKCwsRERGBDz74AAEBAaisrERkZCRCQ0Oxfft2fcRViTo1xcTE4Nq1axg1ahSEEOjVqxemT5+OlStXwsDAQOvZ+M7HDuq9997Dvn37kJ6eDjs7O72OnZOTg6qqKowYMQJSqRRSqRSZmZlYv349pFIp7t+/r/MMkZGRiI6OxuTJk+Hi4oLg4GDMmzevzd61YmlpCalU2uidGoMGDdLqr4ICgJGRERwdHeHu7o4VK1bA1dUV69atUyyvq6vD66+/jvLycqSlpWn13SHAn686tlTnsWPHUFVVhT59+ijuHz///DMWLFiAvn37ajXLw1qq+8KFC9i4cSMSExPh5+cHV1dXxMbGwt3dHZs2bVJrHHWOPRsbG9jb2yve/WFjYwMAWrufmJmZwcXFRaV3lzysuroaY8eORdeuXZGamgpDQ0O1xwag+EXSR19Nq6qqavSqW0tjHjlyBCdPnoSxsTGkUikcHR0BAO7u7m1y0oX073HvaQD7GvtaY+xrqmNfI+DPxy0DAwOV5u9RDg4OcHFxwaxZszBv3jzExcXpMKl61K2ruroaZ86cQXh4uOIxa9myZTh37hykUimOHDmir+jNas1cPWzUqFFqP17oiiY12djYoHfv3ooTj8Cfj59CCFy9elWneVXVmrkSQiAxMRHBwcEwMjLSZUy1aFLTihUr4O3tjcjISAwdOhQBAQHYvHkzEhMTUVlZqY/YLdKkJplMhsTERNTU1ODSpUu4fPky+vbti27dusHS0lLrGXnysYMRQiA8PBwpKSk4cuQIHBwc9J7Bz88P+fn5yMvLU/y5u7vjzTffRF5enk7Okj+qpqam0a8oGhgYKF6B1zcjIyOMHDkSxcXFDa4vKSnR+XdbCCEUH3968ESltLQU33//PXr27Kn18by9vVusMzg4GOfPn29w/7C1tUVkZCS+++47recBlNddU1MDAK26z2hy7N24cQNXrlxRPDnr27cvbG1ttXY/qa2tRVFRkWL/qpDL5fD394eRkRH27dvXqnd6OTg4wNraGmlpaYrr7t69i8zMTHh5eak85vr163Hu3DnF/eXBL6nu2rULy5cv1zgftX/saX9hX/sL+xr7GvsaacLIyAgjRoxoMH8AkJaW1mD+lHn4Mag9ULcuc3PzRn0tNDRU8S5zDw8PfUVvlrbmKjc3V63HC13SpCZvb29UVFTg9u3biutKSkrQpUsXvb8Y25zWzFVmZibKysowc+ZMXUZUmyY1Nff/NODPx4y21pp5MjQ0hJ2dHQwMDJCcnIyXXnqpUa1aoZOfsemkqqurRW5ursjNzRUAxEcffSRyc3O1+ouLyrzzzjvCwsJCZGRkiMrKSsVfTU2N3jI0Rd+/DBoSEiJ69+4t/vOf/4jy8nKRkpIiLC0txcKFC3U2prL5T0lJEYaGhmLr1q2itLRUbNiwQRgYGIhjx45pLcOiRYvE0aNHRXl5uTh//rz4+9//Lrp06SIOHTok6urqxIQJE4SdnZ3Iy8trcP+ora3VWoZTp04JqVQqli9fLkpLS0VSUpIwNTUVX3zxRbPbtPZXQVu67VWp++7du8LR0VE8++yzIjs7W5SVlYlVq1YJiUQi9u/fr1IGZcdedXW1WLBggThx4oQoLy8X6enpwtPTU/Tu3VvI5XLFftasWSPMzc3F7t27RWlpqViyZIkwMTERZWVlSjMsWLBAZGRkiIsXL4qTJ0+Kl156SXTr1k3xy3E3btwQubm5Yv/+/QKASE5OFrm5uaKyslIIIYRcLhceHh7CxcVFlJWVNajj3r17at/2QggRHx8vLCwsREpKisjPzxdTpkwRNjY2ipo1GVOdXzWl1mnrvsae9hf2NfY19jX2NWq95ORkYWhoKLZv3y4KCwvF3LlzhZmZmeI+FR0dLYKDgxXrb9y4Uezbt0+UlJSIkpISkZiYKMzNzcXixYvbqoQmqVvXo9rjr12rW9OaNWtEa
mqqKCkpET/++KOIjo4WAMTXX3/dViU0om5N1dXVws7OTrz66quioKBAZGZmCicnJ/H222+3VQlN0vT+N3XqVOHh4aHvuCpRt6YdO3YIqVQqNm/eLC5cuCCysrKEu7u7eOaZZ9qqhEbUram4uFh8/vnnoqSkRGRnZ4ugoCDRo0cPUV5erpN8PPmohvT0dAGg0V9ISIjeMjQ1PgCxY8cOvWVoir6fqMnlcjFnzhzRp08fYWJiIvr16ycWL16s1Scjj1Jl/rdv3y4cHR2FiYmJcHV1FXv37tVqhhkzZgh7e3thZGQknnzySeHn5ycOHTokhPjrP7ZN/aWnp2s1xzfffCOefvppYWxsLJydncXWrVtbXL+1T9Jauu1VrbukpEQEBgYKKysrYWpqKoYOHSp27typcgZlx15NTY3w9/cXTz75pDA0NBR9+vQRISEh4vLly432tWLFCmFnZydMTU2Fp6enyk/kg4KChI2NjTA0NBS2trYiMDBQFBQUKJbv2LGjyYyxsbEt3o4Amm0yyu739fX1IjY2VlhbWwtjY2Px3HPPifz8fKXbtzQmn6TpT1v3Nfa0v7Cvsa+xr7GvkXZs2rRJ8bgyfPhwkZmZqVgWEhIifH19FZfXr18vhgwZIkxNTYW5ubkYNmyY2Lx5s7h//34bJG+ZOnU9qj2efBRCvZoSEhJE//79hYmJiXjiiSeEj4+Pyi+26JO681RUVCTGjBkjZDKZsLOzE/Pnz2/zF2Gbom5dv//+u5DJZEr7aVtSt6b169eLwYMHC5lMJmxsbMSbb74prl69qufULVOnpsLCQuHm5iZkMpkwNzcXEydOFD/99JPOskmEaAfvESUiIiIiIiIiIqJOh9/5SERERERERERERDrBk49ERERERERERESkEzz5SERERERERERERDrBk49ERERERERERESkEzz5SERERERERERERDrBk49ERERERERERESkEzz5SERERERERERERDrBk49Ej4FLly5BIpEgLy+vraMQERG1GvsaERF1ZnFxcXBzc1Ncnj59OiZNmtRmeYhaiycfiYiIiIiIiIiISCd48pGog6urq2vrCERERFrDvkZERO3Z3bt32zoCUYfDk49EWjZ69GhERERg4cKF6NGjB6ytrREXF6fSthKJBFu2bMG4ceMgk8ng4OCA3bt3K5Y/+JjZV199hdGjR8PExARffPEF6uvrsWzZMtjZ2cHY2Bhubm44ePBgo/3/9NNP8PLygomJCYYMGYKMjIwGywsLC/Hiiy+ia9eu6NWrF4KDg/Hrr78qlu/ZswcuLi6QyWTo2bMnxowZgzt37mh0OxERUcfAvkZERI+z0aNHIzw8HPPnz4elpSVeeOEFpf2lvr4eCQkJcHR0hLGxMfr06YPly5crlkdFRWHAgAEwNTVFv379EBMTwxffqFPjyUciHfjss89gZmaG7OxsrFy5EsuWLUNaWppK28bExOBvf/sbzp07h6lTp2LKlCkoKipqsE5UVBQiIiJQVFSEgIAArFu3DqtXr8aqVatw/vx5BAQEYMKECSgtLW2wXWRkJBYsWIDc3Fx4eXlhwoQJuHHjBgCgsrISvr6+cHNzw5kzZ3Dw4EFcv34dr7/+umL5lClTMGPGDBQVFSEjIwOBgYEQQmjhFiMiovaMfY2IiB5nn332GaRSKY4fP474+PgW+wsALFq0CAkJCYiJiUFhYSG+/PJL9OrVS7G8W7du+PTTT1FYWIh169Zh27ZtWLNmTVuURqQfgoi0ytfXV/j4+DS4buTIkSIqKkrptgBEaGhog+s8PDzEO++8I4QQory8XAAQa9eubbCOra2tWL58eaMx33333QbbxcfHK5bX1dUJOzs7kZCQIIQQIiYmRvj7+zfYx5UrVwQAUVxcLHJycgQAcenSJaV1EBFR58G+RkREjzNfX1/h5uamuKysv8jlcmFsbCy2bdum8hgrV64UI0aMUFyOjY0Vrq6uisshISFi4sSJGtdA1Nakb
XTOk6hTGzp0aIPLNjY2qKqqUmlbT0/PRpcf/TVPd3d3xb/lcjkqKirg7e3dYB1vb2+cO3eu2X1LpVK4u7sr3n2Sk5OD9PR0dO3atVGmCxcuwN/fH35+fnBxcUFAQAD8/f3x6quv4oknnlCpLiIi6rjY14iI6HH2cJ9S1l9+//131NbWws/Pr9n97dmzB2vXrkVZWRlu376Ne/fuwdzcXCfZidoDnnwk0gFDQ8MGlyUSCerr6zXen0QiaXDZzMxM6TpCiEbXtbTv+vp6vPzyy0hISGi0jo2NDQwMDJCWloYTJ07g0KFD2LBhAxYvXozs7Gw4ODioUw4REXUw7GtERPQ4e7hPKesvFy9ebHFfJ0+exOTJk7F06VIEBATAwsICycnJWL16tdZzE7UX/M5Honbm5MmTjS47Ozs3u765uTlsbW2RlZXV4PoTJ05g0KBBze773r17yMnJUex7+PDhKCgoQN++feHo6Njg70GzlUgk8Pb2xtKlS5GbmwsjIyOkpqa2ql4iIurc2NeIiKgzUdZfnJycIJPJcPjw4Sa3P378OOzt7bF48WK4u7vDyckJP//8s56rINIvvvORqJ3ZvXs33N3d4ePjg6SkJJw6dQrbt29vcZvIyEjExsaif//+cHNzw44dO5CXl4ekpKQG623atAlOTk4YNGgQ1qxZg5s3b2LGjBkAgLCwMGzbtg1TpkxBZGQkLC0tUVZWhuTkZGzbtg1nzpzB4cOH4e/vDysrK2RnZ+OXX35p9ESQiIjoYexrRETUmSjrLyYmJoiKisLChQthZGQEb29v/PLLLygoKMDMmTPh6OiIy5cvIzk5GSNHjsT+/fv5whd1ejz5SNTOLF26FMnJyXj33XdhbW2NpKQkDB48uMVtIiIiIJfLsWDBAlRVVWHw4MHYt28fnJycGqwXHx+PhIQE5Obmon///vj3v/8NS0tLAICtrS2OHz+OqKgoBAQEoLa2Fvb29hg7diy6dOkCc3NzHD16FGvXroVcLoe9vT1Wr16NcePG6ey2ICKijo99jYiIOhNl/QUAYmJiIJVK8cEHH6CiogI2NjYIDQ0FAEycOBHz5s1DeHg4amtrMX78eMTExCAuLq4NqyLSLYkQQrR1CCL6k0QiQWpqKiZNmtTWUYiIiFqNfY2IiIiI+J2PREREREREREREpBM8+UikJ0lJSejatWuTf0OGDGnreERERGphXyMiIiIiVfBj10R6Ul1djevXrze5zNDQEPb29npOREREpDn2NSIiIiJSBU8+EhERERERERERkU7wY9dERERERERERESkEzz5SERERERERERERDrBk49ERERERERERESkEzz5SERERERERERERDrBk49ERERERERERESkEzz5SERERERERERERDrBk49ERERERERERESkEzz5SERERERERERERDrxf2Shktx2LDM0AAAAAElFTkSuQmCC", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "fig, ax = plt.subplots(1, 3, figsize=plt.figaspect(1/4))\n", "\n", @@ -475,9 +714,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 21, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "467 ms ± 1.14 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n", + "362 ms ± 1.91 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n", + "297 ms ± 2.25 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n", + "344 ms ± 1.71 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n", + "288 ms ± 1.12 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + ] + } + ], "source": [ "bench_qps_s1 = np.zeros((5,), dtype=np.float32)\n", "bench_recall_s1 = np.zeros((5,), dtype=np.float32)\n", @@ -492,20 +743,31 @@ "bench_names = ['32/32', '32/16', '32/8', '16/16', '16/8']\n", "\n", "for i, sp in enumerate(search_ps):\n", - " r = %timeit -o ivf_pq.search(sp, index, queries, k, handle=resources); resources.sync()\n", + " r = %timeit -o ivf_pq.search(sp, index, queries, k, resources=resources); resources.sync()\n", " bench_qps_s1[i] = (queries.shape[0] * r.loops / np.array(r.all_runs)).mean()\n", - " bench_recall_s1[i] = calc_recall(ivf_pq.search(sp, index, queries, k, handle=resources)[1], gt_neighbors)" + " bench_recall_s1[i] = calc_recall(ivf_pq.search(sp, index, queries, k, resources=resources)[1], gt_neighbors)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 22, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "fig, ax = plt.subplots(1, 1, figsize=plt.figaspect(1/2))\n", "fig.suptitle(\n", - " f'Effects of search parameters on QPS/recall trade-off ({DATASET_FILENAME})\\n' + \\\n", + " f'Effects of search parameters on QPS/recall trade-off ({DATASET_NAME})\\n' + \\\n", " f'k = {k}, n_probes = {n_probes}, pq_dim = {pq_dim}')\n", "ax.plot(bench_recall_s1, bench_qps_s1, 'o')\n", "ax.set_xlabel('recall')\n", @@ -547,14 +809,36 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 23, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "463 ms ± 2.33 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n", + "360 ms ± 2.12 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n", + "297 ms ± 2.74 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n", + "342 ms ± 1.37 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n", + "287 ms ± 1.79 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n", + "490 ms ± 3.19 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n", + "407 ms ± 3.57 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n", + "378 ms ± 1.97 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n", + "395 ms ± 1.73 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n", + "342 ms ± 2.51 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n", + "541 ms ± 1.61 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n", + "437 ms ± 1.09 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n", + "366 ms ± 1.56 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n", + "414 ms ± 1.27 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n", + "375 ms ± 1.89 ms per loop (mean ± std. dev. 
of 7 runs, 1 loop each)\n" + ] + } + ], "source": [ "def search_refine(ps, ratio):\n", " k_search = k * ratio\n", - " candidates = ivf_pq.search(ps, index, queries, k_search, handle=resources)[1]\n", - " return candidates if ratio == 1 else refine(dataset, queries, candidates, k, handle=resources)[1]\n", + " candidates = ivf_pq.search(ps, index, queries, k_search, resources=resources)[1]\n", + " return candidates if ratio == 1 else refine(dataset, queries, candidates, k, resources=resources)[1]\n", "\n", "ratios = [1, 2, 4]\n", "bench_qps_sr = np.zeros((len(ratios), len(search_ps)), dtype=np.float32)\n", @@ -569,13 +853,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 24, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "fig, ax = plt.subplots(1, 1, figsize=plt.figaspect(1/2))\n", "fig.suptitle(\n", - " f'Effects of search parameters on QPS/recall trade-off ({DATASET_FILENAME})\\n' + \\\n", + " f'Effects of search parameters on QPS/recall trade-off ({DATASET_NAME})\\n' + \\\n", " f'k = {k}, n_probes = {n_probes}, pq_dim = {pq_dim}')\n", "labels = []\n", "for j, ratio in enumerate(ratios):\n", @@ -619,7 +914,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 25, "metadata": {}, "outputs": [], "source": [ @@ -629,8 +924,8 @@ " n_probes=n_probes,\n", " internal_distance_dtype=internal_distance_dtype,\n", " lut_dtype=lut_dtype)\n", - " candidates = ivf_pq.search(ps, index, queries, k_search, handle=resources)[1]\n", - " return candidates if ratio == 1 else refine(dataset, queries, candidates, k, handle=resources)[1]\n", + " candidates = ivf_pq.search(ps, index, queries, k_search, resources=resources)[1]\n", + " return candidates if ratio == 1 else refine(dataset, queries, candidates, k, resources=resources)[1]\n", "\n", "search_configs = [\n", " lambda n_probes: search_refine(np.float16, np.float16, 1, n_probes),\n", @@ -688,9 +983,52 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 26, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "using ivf_pq::index_params nrows 1000000, dim 128, n_lits 100, pq_dim 64\n", + "5.41 ms ± 25 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n", + "5.41 ms ± 31.8 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n", + "5.41 ms ± 18.1 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n", + "9.76 ms ± 85.6 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n", + "37.8 ms ± 219 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", + "70.5 ms ± 78 μs per loop (mean ± std. dev. 
of 7 runs, 10 loops each)\n", + "using ivf_pq::index_params nrows 1000000, dim 128, n_lits 500, pq_dim 64\n", + "2.37 ms ± 12.3 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n", + "4.08 ms ± 19.5 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n", + "8.81 ms ± 18.8 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n", + "16.3 ms ± 38.6 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n", + "73.3 ms ± 176 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", + "142 ms ± 362 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", + "using ivf_pq::index_params nrows 1000000, dim 128, n_lits 1000, pq_dim 64\n", + "3.49 ms ± 20.3 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n", + "7.36 ms ± 7.32 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n", + "13.6 ms ± 29.1 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n", + "26.3 ms ± 1.21 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", + "120 ms ± 150 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n", + "233 ms ± 1.24 ms per loop (mean ± std. dev. 
of 7 runs, 1 loop each)\n" + ] + }, + { + "ename": "CuvsException", + "evalue": "std::bad_alloc: out_of_memory: RMM failure at:/home/cjnolet/software/miniconda3/envs/cuvs_062724_2408/include/rmm/mr/device/pool_memory_resource.hpp:255: Maximum pool size exceeded", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mCuvsException\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[26], line 12\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m i, n_lists \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28menumerate\u001b[39m(n_list_variants):\n\u001b[1;32m 11\u001b[0m index_params \u001b[38;5;241m=\u001b[39m ivf_pq\u001b[38;5;241m.\u001b[39mIndexParams(n_lists\u001b[38;5;241m=\u001b[39mn_lists, metric\u001b[38;5;241m=\u001b[39mmetric, pq_dim\u001b[38;5;241m=\u001b[39mpq_dim)\n\u001b[0;32m---> 12\u001b[0m index \u001b[38;5;241m=\u001b[39m \u001b[43mivf_pq\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbuild\u001b[49m\u001b[43m(\u001b[49m\u001b[43mindex_params\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdataset\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mresources\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mresources\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 13\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m j, pl_ratio \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28menumerate\u001b[39m(pl_ratio_variants):\n\u001b[1;32m 14\u001b[0m n_probes \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mmax\u001b[39m(\u001b[38;5;241m1\u001b[39m, n_lists \u001b[38;5;241m/\u001b[39m\u001b[38;5;241m/\u001b[39m pl_ratio)\n", + "File \u001b[0;32mresources.pyx:110\u001b[0m, in \u001b[0;36mcuvs.common.resources.auto_sync_resources.wrapper\u001b[0;34m()\u001b[0m\n", + "File \u001b[0;32mivf_pq.pyx:269\u001b[0m, in \u001b[0;36mcuvs.neighbors.ivf_pq.ivf_pq.build\u001b[0;34m()\u001b[0m\n", + "File 
\u001b[0;32mivf_pq.pyx:270\u001b[0m, in \u001b[0;36mcuvs.neighbors.ivf_pq.ivf_pq.build\u001b[0;34m()\u001b[0m\n", + "File \u001b[0;32mexceptions.pyx:37\u001b[0m, in \u001b[0;36mcuvs.common.exceptions.check_cuvs\u001b[0;34m()\u001b[0m\n", + "\u001b[0;31mCuvsException\u001b[0m: std::bad_alloc: out_of_memory: RMM failure at:/home/cjnolet/software/miniconda3/envs/cuvs_062724_2408/include/rmm/mr/device/pool_memory_resource.hpp:255: Maximum pool size exceeded" + ] + } + ], "source": [ "n_list_variants = [100, 500, 1000, 2000, 5000]\n", "pl_ratio_variants = [500, 200, 100, 50, 10, 5]\n", @@ -703,12 +1041,13 @@ "\n", "for i, n_lists in enumerate(n_list_variants):\n", " index_params = ivf_pq.IndexParams(n_lists=n_lists, metric=metric, pq_dim=pq_dim)\n", - " index = ivf_pq.build(index_params, dataset, handle=resources)\n", + " index = ivf_pq.build(index_params, dataset, resources=resources)\n", " for j, pl_ratio in enumerate(pl_ratio_variants):\n", " n_probes = max(1, n_lists // pl_ratio)\n", " r = %timeit -o search_fun(n_probes); resources.sync()\n", " bench_qps_nl[i, j] = (queries.shape[0] * r.loops / np.array(r.all_runs)).mean()\n", - " bench_recall_nl[i, j] = calc_recall(search_fun(n_probes), gt_neighbors)" + " bench_recall_nl[i, j] = calc_recall(search_fun(n_probes), gt_neighbors)\n", + " del index" ] }, { @@ -719,7 +1058,7 @@ "source": [ "fig, ax = plt.subplots(1, 1, figsize=plt.figaspect(1/2))\n", "fig.suptitle(\n", - " f'Effects of n_list on QPS/recall trade-off ({DATASET_FILENAME})\\n' + \\\n", + " f'Effects of n_list on QPS/recall trade-off ({DATASET_NAME})\\n' + \\\n", " f'k = {k}, pq_dim = {pq_dim}, search = {search_label}')\n", "labels = []\n", "for i, n_lists in enumerate(n_list_variants):\n", @@ -875,7 +1214,7 @@ "bench_recall_ip = np.zeros_like(bench_qps_ip, dtype=np.float32)\n", "\n", "for i, index_params in enumerate(build_configs.values()):\n", - " index = ivf_pq.build(index_params, dataset, handle=resources)\n", + " index = ivf_pq.build(index_params, 
dataset, resources=resources)\n", " for l, search_fun in enumerate(search_configs):\n", " for j, n_probes in enumerate(n_probes_variants):\n", " r = %timeit -o search_fun(n_probes); resources.sync()\n", @@ -891,7 +1230,7 @@ "source": [ "fig, ax = plt.subplots(len(search_config_names), 1, figsize=(16, len(search_config_names)*8))\n", "fig.suptitle(\n", - " f'Effects of index parameters on QPS/recall trade-off ({DATASET_FILENAME})\\n' + \\\n", + " f'Effects of index parameters on QPS/recall trade-off ({DATASET_NAME})\\n' + \\\n", " f'k = {k}, n_lists = {n_lists}')\n", "\n", "for j, search_label in enumerate(search_config_names):\n", @@ -932,7 +1271,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.13" + "version": "3.11.9" }, "vscode": { "interpreter": { From a4418d779947fddaccd4060b1f761ba7f45d6b03 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Thu, 11 Jul 2024 18:29:19 -0400 Subject: [PATCH 2/9] A couple fixes --- cpp/CMakeLists.txt | 3 +-- cpp/src/neighbors/ball_cover/ball_cover.cuh | 16 +++++++------- cpp/src/neighbors/faiss_select/Select.cuh | 22 ++++++++++---------- cpp/src/neighbors/faiss_select/StaticUtils.h | 2 +- 4 files changed, 21 insertions(+), 22 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 7c035b9df..484e1f6a9 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -456,8 +456,7 @@ add_library( ) target_compile_options( - cuvs INTERFACE $<$:--expt-extended-lambda - --expt-relaxed-constexpr> + cuvs PUBLIC $<$:--expt-extended-lambda --expt-relaxed-constexpr> ) add_library(cuvs::cuvs ALIAS cuvs) diff --git a/cpp/src/neighbors/ball_cover/ball_cover.cuh b/cpp/src/neighbors/ball_cover/ball_cover.cuh index 8b03a18e6..643417a01 100644 --- a/cpp/src/neighbors/ball_cover/ball_cover.cuh +++ b/cpp/src/neighbors/ball_cover/ball_cover.cuh @@ -644,14 +644,14 @@ void compute_landmark_dists( RAFT_EXPECTS(n_query_pts * static_cast(index.n_landmarks) < 
static_cast(std::numeric_limits::max()), "Too large input for pairwise_distance with `int` index."); - cuvs::distance::pairwise_distance(handle, - query_pts, - index.get_R().data_handle(), - R_dists, - n_query_pts, - index.n_landmarks, - index.n, - index.get_metric()); + cuvs::distance::pairwise_distance(handle, + query_pts, + index.get_R().data_handle(), + R_dists, + n_query_pts, + index.n_landmarks, + index.n, + index.get_metric()); } /** diff --git a/cpp/src/neighbors/faiss_select/Select.cuh b/cpp/src/neighbors/faiss_select/Select.cuh index ccd2a110c..873688418 100644 --- a/cpp/src/neighbors/faiss_select/Select.cuh +++ b/cpp/src/neighbors/faiss_select/Select.cuh @@ -126,7 +126,7 @@ struct BlockSelect { warpV[i] = initV; } - warpFence(); + raft::warpFence(); } __device__ inline void addThreadQ(K k, V v) @@ -160,7 +160,7 @@ struct BlockSelect { return; } - // This has a trailing warpFence + // This has a trailing raft::warpFence mergeWarpQ(); // Any top-k elements have been merged into the warp queue; we're @@ -176,7 +176,7 @@ struct BlockSelect { // We have to beat at least this element warpKTop = warpK[kMinus1]; - warpFence(); + raft::warpFence(); } /// This function handles sorting and merging together the @@ -199,7 +199,7 @@ struct BlockSelect { warpVRegisters[i] = warpV[i * raft::WarpSize + laneId]; } - warpFence(); + raft::warpFence(); // The warp queue is already sorted, and now that we've sorted the // per-thread queue, merge both sorted lists together, producing @@ -214,7 +214,7 @@ struct BlockSelect { warpV[i * raft::WarpSize + laneId] = warpVRegisters[i]; } - warpFence(); + raft::warpFence(); } /// WARNING: all threads in a warp must participate in this. 
@@ -300,12 +300,12 @@ struct BlockSelect { __device__ inline void reduce() { // Reduce within the warp - KeyValuePair pair(threadK, threadV); + raft::KeyValuePair pair(threadK, threadV); if (Dir) { - pair = warpReduce(pair, max_op{}); + pair = raft::warpReduce(pair, raft::max_op{}); } else { - pair = warpReduce(pair, min_op{}); + pair = raft::warpReduce(pair, raft::min_op{}); } // Each warp writes out a single value @@ -540,12 +540,12 @@ struct WarpSelect { __device__ inline void reduce() { // Reduce within the warp - KeyValuePair pair(threadK, threadV); + raft::KeyValuePair pair(threadK, threadV); if (Dir) { - pair = warpReduce(pair, max_op{}); + pair = raft::warpReduce(pair, raft::max_op{}); } else { - pair = warpReduce(pair, min_op{}); + pair = raft::warpReduce(pair, raft::min_op{}); } threadK = pair.key; diff --git a/cpp/src/neighbors/faiss_select/StaticUtils.h b/cpp/src/neighbors/faiss_select/StaticUtils.h index 198c28b60..05ee3c0a3 100644 --- a/cpp/src/neighbors/faiss_select/StaticUtils.h +++ b/cpp/src/neighbors/faiss_select/StaticUtils.h @@ -29,7 +29,7 @@ static_assert(!isPowerOf2(3333), "isPowerOf2"); template constexpr __host__ __device__ T nextHighestPowerOf2(T v) { - return (isPowerOf2(v) ? (T)2 * v : ((T)1 << (log2(v) + (T)1))); + return (isPowerOf2(v) ? (T)2 * v : ((T)1 << (raft::log2(v) + (T)1))); } static_assert(nextHighestPowerOf2(1) == 2, "nextHighestPowerOf2"); From 211fc79b40dab4859fc460f70b6435c176da4fd7 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Thu, 11 Jul 2024 18:45:07 -0400 Subject: [PATCH 3/9] Fixing linker issuex --- cpp/src/neighbors/ball_cover.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/neighbors/ball_cover.cu b/cpp/src/neighbors/ball_cover.cu index 84402bb4e..6726a9731 100644 --- a/cpp/src/neighbors/ball_cover.cu +++ b/cpp/src/neighbors/ball_cover.cu @@ -29,7 +29,7 @@ void all_knn_query(raft::resources const& handle, cuvs::neighbors::ball_cover::index& index, raft::device_matrix_view inds, raft::device_matrix_view dists, - int64_t k, + int k, bool perform_post_filtering, float weight) { @@ -65,7 +65,7 @@ void knn_query(raft::resources const& handle, raft::device_matrix_view query, raft::device_matrix_view inds, raft::device_matrix_view dists, - int64_t k, + int k, bool perform_post_filtering, float weight) { From bbd9a5717e2fa6c9c348196f8484b005ac341b90 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Thu, 15 Aug 2024 14:59:04 -0400 Subject: [PATCH 4/9] Adding source file for template instnatiations --- cpp/CMakeLists.txt | 1 + cpp/src/neighbors/ball_cover/ball_cover.cu | 85 +++++++++++++++++++ .../neighbors/ball_cover/registers-ext.cuh | 4 - 3 files changed, 86 insertions(+), 4 deletions(-) create mode 100644 cpp/src/neighbors/ball_cover/ball_cover.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 484e1f6a9..f6c9f5b82 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -240,6 +240,7 @@ add_library( src/distance/distance.cu src/distance/pairwise_distance.cu src/neighbors/ball_cover.cu + src/neighbors/ball_cover/ball_cover.cu src/neighbors/brute_force.cu src/neighbors/cagra_build_float.cu src/neighbors/cagra_build_int8.cu diff --git a/cpp/src/neighbors/ball_cover/ball_cover.cu b/cpp/src/neighbors/ball_cover/ball_cover.cu new file mode 100644 index 000000000..ef898f444 --- /dev/null +++ b/cpp/src/neighbors/ball_cover/ball_cover.cu @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2021-2024, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +#define instantiate_raft_neighbors_ball_cover(idx_t, value_t, int_t, matrix_idx_t) \ + template void raft::neighbors::ball_cover::build_index( \ + raft::resources const& handle, \ + raft::neighbors::ball_cover::BallCoverIndex& index); \ + \ + template void raft::neighbors::ball_cover::eps_nn( \ + raft::resources const& handle, \ + const raft::neighbors::ball_cover::BallCoverIndex& index, \ + raft::device_matrix_view adj, \ + raft::device_vector_view vd, \ + raft::device_matrix_view query, \ + value_t eps); \ + \ + template void raft::neighbors::ball_cover::eps_nn( \ + raft::resources const& handle, \ + const raft::neighbors::ball_cover::BallCoverIndex& index, \ + raft::device_vector_view ia, \ + raft::device_vector_view ja, \ + raft::device_vector_view vd, \ + raft::device_matrix_view query, \ + value_t eps, \ + std::optional> max_k); \ + \ + template void raft::neighbors::ball_cover::all_knn_query( \ + raft::resources const& handle, \ + raft::neighbors::ball_cover::BallCoverIndex& index, \ + int_t k, \ + idx_t* inds, \ + value_t* dists, \ + bool perform_post_filtering, \ + float weight); \ + \ + template void raft::neighbors::ball_cover::all_knn_query( \ + raft::resources const& handle, \ + raft::neighbors::ball_cover::BallCoverIndex& index, \ + raft::device_matrix_view inds, \ + raft::device_matrix_view dists, \ + int_t k, \ + bool perform_post_filtering, \ + float weight); \ 
+ \ + template void raft::neighbors::ball_cover::knn_query( \ + raft::resources const& handle, \ + const raft::neighbors::ball_cover::BallCoverIndex& index, \ + int_t k, \ + const value_t* query, \ + int_t n_query_pts, \ + idx_t* inds, \ + value_t* dists, \ + bool perform_post_filtering, \ + float weight); \ + \ + template void raft::neighbors::ball_cover::knn_query( \ + raft::resources const& handle, \ + const raft::neighbors::ball_cover::BallCoverIndex& index, \ + raft::device_matrix_view query, \ + raft::device_matrix_view inds, \ + raft::device_matrix_view dists, \ + int_t k, \ + bool perform_post_filtering, \ + float weight); + +instantiate_raft_neighbors_ball_cover(int64_t, float, int64_t, int64_t); + +#undef instantiate_raft_neighbors_ball_cover diff --git a/cpp/src/neighbors/ball_cover/registers-ext.cuh b/cpp/src/neighbors/ball_cover/registers-ext.cuh index 10ff30a1f..6b8782a7e 100644 --- a/cpp/src/neighbors/ball_cover/registers-ext.cuh +++ b/cpp/src/neighbors/ball_cover/registers-ext.cuh @@ -23,8 +23,6 @@ #include // uint32_t -#if defined(RAFT_EXPLICIT_INSTANTIATE_ONLY) - namespace cuvs::neighbors::detail { template Date: Mon, 26 Aug 2024 17:50:38 -0400 Subject: [PATCH 5/9] It's building, but likely not the most efficiently --- cpp/CMakeLists.txt | 14 +- cpp/src/neighbors/ball_cover.cuh | 66 +++---- cpp/src/neighbors/ball_cover/ball_cover.cu | 85 --------- cpp/src/neighbors/ball_cover/ball_cover.cuh | 7 +- cpp/src/neighbors/ball_cover/common.cuh | 4 +- .../ball_cover/registers_00_generate.py | 165 ++++++++++++++++++ .../registers_eps_pass_euclidean.cu | 66 +++++++ .../ball_cover/registers_pass_one_2d_dist.cu | 55 ++++++ .../registers_pass_one_2d_euclidean.cu | 55 ++++++ .../registers_pass_one_2d_haversine.cu | 55 ++++++ .../ball_cover/registers_pass_one_3d_dist.cu | 55 ++++++ .../registers_pass_one_3d_euclidean.cu | 55 ++++++ .../registers_pass_one_3d_haversine.cu | 55 ++++++ .../ball_cover/registers_pass_two_2d_dist.cu | 55 ++++++ 
.../registers_pass_two_2d_euclidean.cu | 55 ++++++ .../registers_pass_two_2d_haversine.cu | 55 ++++++ .../ball_cover/registers_pass_two_3d_dist.cu | 55 ++++++ .../registers_pass_two_3d_euclidean.cu | 55 ++++++ .../registers_pass_two_3d_haversine.cu | 55 ++++++ .../neighbors/ball_cover/registers-ext.cuh | 156 ++++++++++++----- .../neighbors/ball_cover/registers-inl.cuh | 8 +- cpp/src/neighbors/ball_cover/registers.cuh | 4 +- .../neighbors/ball_cover/registers_types.cuh | 6 +- .../neighbors/faiss_select/Comparators.cuh | 4 +- .../neighbors/faiss_select/DistanceUtils.h | 4 +- .../faiss_select/MergeNetworkBlock.cuh | 4 +- .../faiss_select/MergeNetworkUtils.cuh | 4 +- .../faiss_select/MergeNetworkWarp.cuh | 4 +- cpp/src/neighbors/faiss_select/Select.cuh | 4 +- cpp/src/neighbors/faiss_select/StaticUtils.h | 4 +- .../faiss_select/key_value_block_select.cuh | 4 +- 31 files changed, 1078 insertions(+), 195 deletions(-) delete mode 100644 cpp/src/neighbors/ball_cover/ball_cover.cu create mode 100644 cpp/src/neighbors/ball_cover/detail/ball_cover/registers_00_generate.py create mode 100644 cpp/src/neighbors/ball_cover/detail/ball_cover/registers_eps_pass_euclidean.cu create mode 100644 cpp/src/neighbors/ball_cover/detail/ball_cover/registers_pass_one_2d_dist.cu create mode 100644 cpp/src/neighbors/ball_cover/detail/ball_cover/registers_pass_one_2d_euclidean.cu create mode 100644 cpp/src/neighbors/ball_cover/detail/ball_cover/registers_pass_one_2d_haversine.cu create mode 100644 cpp/src/neighbors/ball_cover/detail/ball_cover/registers_pass_one_3d_dist.cu create mode 100644 cpp/src/neighbors/ball_cover/detail/ball_cover/registers_pass_one_3d_euclidean.cu create mode 100644 cpp/src/neighbors/ball_cover/detail/ball_cover/registers_pass_one_3d_haversine.cu create mode 100644 cpp/src/neighbors/ball_cover/detail/ball_cover/registers_pass_two_2d_dist.cu create mode 100644 cpp/src/neighbors/ball_cover/detail/ball_cover/registers_pass_two_2d_euclidean.cu create mode 100644 
cpp/src/neighbors/ball_cover/detail/ball_cover/registers_pass_two_2d_haversine.cu create mode 100644 cpp/src/neighbors/ball_cover/detail/ball_cover/registers_pass_two_3d_dist.cu create mode 100644 cpp/src/neighbors/ball_cover/detail/ball_cover/registers_pass_two_3d_euclidean.cu create mode 100644 cpp/src/neighbors/ball_cover/detail/ball_cover/registers_pass_two_3d_haversine.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index d6198910f..eabd262ee 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -259,7 +259,19 @@ add_library( src/distance/distance.cu src/distance/pairwise_distance.cu src/neighbors/ball_cover.cu - src/neighbors/ball_cover/ball_cover.cu + src/neighbors/ball_cover/detail/ball_cover/registers_eps_pass_euclidean.cu + src/neighbors/ball_cover/detail/ball_cover/registers_pass_one_2d_dist.cu + src/neighbors/ball_cover/detail/ball_cover/registers_pass_one_2d_euclidean.cu + src/neighbors/ball_cover/detail/ball_cover/registers_pass_one_2d_haversine.cu + src/neighbors/ball_cover/detail/ball_cover/registers_pass_one_3d_dist.cu + src/neighbors/ball_cover/detail/ball_cover/registers_pass_one_3d_euclidean.cu + src/neighbors/ball_cover/detail/ball_cover/registers_pass_one_3d_haversine.cu + src/neighbors/ball_cover/detail/ball_cover/registers_pass_two_2d_dist.cu + src/neighbors/ball_cover/detail/ball_cover/registers_pass_two_2d_euclidean.cu + src/neighbors/ball_cover/detail/ball_cover/registers_pass_two_2d_haversine.cu + src/neighbors/ball_cover/detail/ball_cover/registers_pass_two_3d_dist.cu + src/neighbors/ball_cover/detail/ball_cover/registers_pass_two_3d_euclidean.cu + src/neighbors/ball_cover/detail/ball_cover/registers_pass_two_3d_haversine.cu src/neighbors/brute_force.cu src/neighbors/cagra_build_float.cu src/neighbors/cagra_build_int8.cu diff --git a/cpp/src/neighbors/ball_cover.cuh b/cpp/src/neighbors/ball_cover.cuh index 4e06881a4..40a34bd71 100644 --- a/cpp/src/neighbors/ball_cover.cuh +++ b/cpp/src/neighbors/ball_cover.cuh @@ -63,12 
+63,12 @@ void build_index(raft::resources const& handle, cuvs::neighbors::ball_cover::index& index) { if (index.metric == cuvs::distance::DistanceType::Haversine) { - cuvs::neighbors::detail::rbc_build_index( - handle, index, cuvs::neighbors::detail::HaversineFunc()); + cuvs::neighbors::ball_cover::detail::rbc_build_index( + handle, index, cuvs::neighbors::ball_cover::detail::HaversineFunc()); } else if (index.metric == cuvs::distance::DistanceType::L2SqrtExpanded || index.metric == cuvs::distance::DistanceType::L2SqrtUnexpanded) { - cuvs::neighbors::detail::rbc_build_index( - handle, index, cuvs::neighbors::detail::EuclideanFunc()); + cuvs::neighbors::ball_cover::detail::rbc_build_index( + handle, index, cuvs::neighbors::ball_cover::detail::EuclideanFunc()); } else { RAFT_FAIL("Metric not support"); } @@ -117,24 +117,24 @@ void all_knn_query(raft::resources const& handle, { ASSERT(index.n <= 3, "only 2d and 3d vectors are supported in current implementation"); if (index.metric == cuvs::distance::DistanceType::Haversine) { - cuvs::neighbors::detail::rbc_all_knn_query( + cuvs::neighbors::ball_cover::detail::rbc_all_knn_query( handle, index, k, inds, dists, - cuvs::neighbors::detail::HaversineFunc(), + cuvs::neighbors::ball_cover::detail::HaversineFunc(), perform_post_filtering, weight); } else if (index.metric == cuvs::distance::DistanceType::L2SqrtExpanded || index.metric == cuvs::distance::DistanceType::L2SqrtUnexpanded) { - cuvs::neighbors::detail::rbc_all_knn_query( + cuvs::neighbors::ball_cover::detail::rbc_all_knn_query( handle, index, k, inds, dists, - cuvs::neighbors::detail::EuclideanFunc(), + cuvs::neighbors::ball_cover::detail::EuclideanFunc(), perform_post_filtering, weight); } else { @@ -266,28 +266,30 @@ void knn_query(raft::resources const& handle, { ASSERT(index.n <= 3, "only 2d and 3d vectors are supported in current implementation"); if (index.metric == cuvs::distance::DistanceType::Haversine) { - cuvs::neighbors::detail::rbc_knn_query(handle, - 
index, - k, - query, - n_query_pts, - inds, - dists, - cuvs::neighbors::detail::HaversineFunc(), - perform_post_filtering, - weight); + cuvs::neighbors::ball_cover::detail::rbc_knn_query( + handle, + index, + k, + query, + n_query_pts, + inds, + dists, + cuvs::neighbors::ball_cover::detail::HaversineFunc(), + perform_post_filtering, + weight); } else if (index.metric == cuvs::distance::DistanceType::L2SqrtExpanded || index.metric == cuvs::distance::DistanceType::L2SqrtUnexpanded) { - cuvs::neighbors::detail::rbc_knn_query(handle, - index, - k, - query, - n_query_pts, - inds, - dists, - cuvs::neighbors::detail::EuclideanFunc(), - perform_post_filtering, - weight); + cuvs::neighbors::ball_cover::detail::rbc_knn_query( + handle, + index, + k, + query, + n_query_pts, + inds, + dists, + cuvs::neighbors::ball_cover::detail::EuclideanFunc(), + perform_post_filtering, + weight); } else { RAFT_FAIL("Metric not supported"); } @@ -323,7 +325,7 @@ void eps_nn(raft::resources const& handle, ASSERT(index.is_index_trained(), "index must be previously trained"); // run query - cuvs::neighbors::detail::rbc_eps_nn_query( + cuvs::neighbors::ball_cover::detail::rbc_eps_nn_query( handle, index, eps, @@ -331,7 +333,7 @@ void eps_nn(raft::resources const& handle, query.extent(0), adj.data_handle(), vd.data_handle(), - cuvs::neighbors::detail::EuclideanSqFunc()); + cuvs::neighbors::ball_cover::detail::EuclideanSqFunc()); } /** @@ -380,7 +382,7 @@ void eps_nn(raft::resources const& handle, if (max_k.has_value()) { max_k_ptr = max_k.value().data_handle(); } // run query - cuvs::neighbors::detail::rbc_eps_nn_query( + cuvs::neighbors::ball_cover::detail::rbc_eps_nn_query( handle, index, eps, @@ -390,7 +392,7 @@ void eps_nn(raft::resources const& handle, adj_ia.data_handle(), adj_ja.data_handle(), vd.data_handle(), - cuvs::neighbors::detail::EuclideanSqFunc()); + cuvs::neighbors::ball_cover::detail::EuclideanSqFunc()); } /** diff --git a/cpp/src/neighbors/ball_cover/ball_cover.cu 
b/cpp/src/neighbors/ball_cover/ball_cover.cu deleted file mode 100644 index ef898f444..000000000 --- a/cpp/src/neighbors/ball_cover/ball_cover.cu +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include - -#include - -#define instantiate_raft_neighbors_ball_cover(idx_t, value_t, int_t, matrix_idx_t) \ - template void raft::neighbors::ball_cover::build_index( \ - raft::resources const& handle, \ - raft::neighbors::ball_cover::BallCoverIndex& index); \ - \ - template void raft::neighbors::ball_cover::eps_nn( \ - raft::resources const& handle, \ - const raft::neighbors::ball_cover::BallCoverIndex& index, \ - raft::device_matrix_view adj, \ - raft::device_vector_view vd, \ - raft::device_matrix_view query, \ - value_t eps); \ - \ - template void raft::neighbors::ball_cover::eps_nn( \ - raft::resources const& handle, \ - const raft::neighbors::ball_cover::BallCoverIndex& index, \ - raft::device_vector_view ia, \ - raft::device_vector_view ja, \ - raft::device_vector_view vd, \ - raft::device_matrix_view query, \ - value_t eps, \ - std::optional> max_k); \ - \ - template void raft::neighbors::ball_cover::all_knn_query( \ - raft::resources const& handle, \ - raft::neighbors::ball_cover::BallCoverIndex& index, \ - int_t k, \ - idx_t* inds, \ - value_t* dists, \ - bool perform_post_filtering, \ - float weight); \ - \ - template void raft::neighbors::ball_cover::all_knn_query( 
\ - raft::resources const& handle, \ - raft::neighbors::ball_cover::BallCoverIndex& index, \ - raft::device_matrix_view inds, \ - raft::device_matrix_view dists, \ - int_t k, \ - bool perform_post_filtering, \ - float weight); \ - \ - template void raft::neighbors::ball_cover::knn_query( \ - raft::resources const& handle, \ - const raft::neighbors::ball_cover::BallCoverIndex& index, \ - int_t k, \ - const value_t* query, \ - int_t n_query_pts, \ - idx_t* inds, \ - value_t* dists, \ - bool perform_post_filtering, \ - float weight); \ - \ - template void raft::neighbors::ball_cover::knn_query( \ - raft::resources const& handle, \ - const raft::neighbors::ball_cover::BallCoverIndex& index, \ - raft::device_matrix_view query, \ - raft::device_matrix_view inds, \ - raft::device_matrix_view dists, \ - int_t k, \ - bool perform_post_filtering, \ - float weight); - -instantiate_raft_neighbors_ball_cover(int64_t, float, int64_t, int64_t); - -#undef instantiate_raft_neighbors_ball_cover diff --git a/cpp/src/neighbors/ball_cover/ball_cover.cuh b/cpp/src/neighbors/ball_cover/ball_cover.cuh index 643417a01..fa6f1902d 100644 --- a/cpp/src/neighbors/ball_cover/ball_cover.cuh +++ b/cpp/src/neighbors/ball_cover/ball_cover.cuh @@ -50,7 +50,7 @@ #include -namespace cuvs::neighbors::detail { +namespace cuvs::neighbors::ball_cover::detail { /** * Given a set of points in row-major order which are to be @@ -208,7 +208,8 @@ void k_closest_landmarks( bfknn, raft::make_device_matrix_view(query_pts, n_query_pts, inputs.extent(1)), raft::make_device_matrix_view(R_knn_inds, n_query_pts, k), - raft::make_device_matrix_view(R_knn_dists, n_query_pts, k)); + raft::make_device_matrix_view(R_knn_dists, n_query_pts, k), + std::nullopt); } /** @@ -715,4 +716,4 @@ void rbc_eps_nn_query( vd); } -}; // namespace cuvs::neighbors::detail +}; // namespace cuvs::neighbors::ball_cover::detail diff --git a/cpp/src/neighbors/ball_cover/common.cuh b/cpp/src/neighbors/ball_cover/common.cuh index 
505c58a11..d0008c2ad 100644 --- a/cpp/src/neighbors/ball_cover/common.cuh +++ b/cpp/src/neighbors/ball_cover/common.cuh @@ -24,7 +24,7 @@ #include -namespace cuvs::neighbors::detail { +namespace cuvs::neighbors::ball_cover::detail { struct NNComp { template @@ -66,4 +66,4 @@ __device__ inline bool _get_val(std::uint32_t* arr, std::uint32_t h) return (arr[idx] & (1 << bit)) > 0; } -}; // namespace cuvs::neighbors::detail +}; // namespace cuvs::neighbors::ball_cover::detail diff --git a/cpp/src/neighbors/ball_cover/detail/ball_cover/registers_00_generate.py b/cpp/src/neighbors/ball_cover/detail/ball_cover/registers_00_generate.py new file mode 100644 index 000000000..254e0e250 --- /dev/null +++ b/cpp/src/neighbors/ball_cover/detail/ball_cover/registers_00_generate.py @@ -0,0 +1,165 @@ +# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +header = """/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by registers_00_generate.py + * + * Make changes there and run in this directory: + * + * > python registers_00_generate.py + * + */ + +#include // int64_t +#include +#include "../../registers-inl.cuh" + +""" + + +macro_pass_one = """ +#define instantiate_cuvs_neighbors_detail_rbc_low_dim_pass_one( \\ + Mvalue_idx, Mvalue_t, Mvalue_int, Mmatrix_idx, Mdims, Mdist_func) \\ + template void \\ + cuvs::neighbors::ball_cover::detail::rbc_low_dim_pass_one( \\ + raft::resources const& handle, \\ + const cuvs::neighbors::ball_cover::index& index, \\ + const Mvalue_t* query, \\ + const Mvalue_int n_query_rows, \\ + Mvalue_int k, \\ + const Mvalue_idx* R_knn_inds, \\ + const Mvalue_t* R_knn_dists, \\ + Mdist_func& dfunc, \\ + Mvalue_idx* inds, \\ + Mvalue_t* dists, \\ + float weight, \\ + Mvalue_int* dists_counter) + +""" + +macro_pass_two = """ +#define instantiate_cuvs_neighbors_detail_rbc_low_dim_pass_two( \\ + Mvalue_idx, Mvalue_t, Mvalue_int, Mmatrix_idx, Mdims, Mdist_func) \\ + template void \\ + cuvs::neighbors::ball_cover::detail::rbc_low_dim_pass_two( \\ + raft::resources const& handle, \\ + const cuvs::neighbors::ball_cover::index& index, \\ + const Mvalue_t* query, \\ + const Mvalue_int n_query_rows, \\ + Mvalue_int k, \\ + const Mvalue_idx* R_knn_inds, \\ + const Mvalue_t* R_knn_dists, \\ + Mdist_func& dfunc, \\ + Mvalue_idx* inds, \\ + Mvalue_t* dists, \\ + float weight, \\ + Mvalue_int* dists_counter) + +""" + +macro_pass_eps = """ +#define instantiate_cuvs_neighbors_detail_rbc_eps_pass( \\ + Mvalue_idx, Mvalue_t, Mvalue_int, Mmatrix_idx, Mdist_func) \\ + template void \\ + cuvs::neighbors::ball_cover::detail::rbc_eps_pass( \\ + raft::resources const& handle, \\ + const cuvs::neighbors::ball_cover::index& index, \\ + const Mvalue_t* query, \\ + const Mvalue_int n_query_rows, \\ + Mvalue_t eps, \\ + const Mvalue_t* 
R_dists, \\ + Mdist_func& dfunc, \\ + bool* adj, \\ + Mvalue_idx* vd); \\ + \\ + template void \\ + cuvs::neighbors::ball_cover::detail::rbc_eps_pass( \\ + raft::resources const& handle, \\ + const cuvs::neighbors::ball_cover::index& index, \\ + const Mvalue_t* query, \\ + const Mvalue_int n_query_rows, \\ + Mvalue_t eps, \\ + Mvalue_int* max_k, \\ + const Mvalue_t* R_dists, \\ + Mdist_func& dfunc, \\ + Mvalue_idx* adj_ia, \\ + Mvalue_idx* adj_ja, \\ + Mvalue_idx* vd) + +""" + + +distances = dict( + haversine="cuvs::neighbors::ball_cover::detail::HaversineFunc", + euclidean="cuvs::neighbors::ball_cover::detail::EuclideanFunc", + dist="cuvs::neighbors::ball_cover::detail::DistFunc", +) + +euclideanSq="cuvs::neighbors::ball_cover::detail::EuclideanSqFunc", + +types = dict( + int64_float=("std::int64_t", "float"), + #int64_double=("std::int64_t", "double"), +) + +for k, v in distances.items(): + for dim in [2, 3]: + path = f"registers_pass_one_{dim}d_{k}.cu" + with open(path, "w") as f: + f.write(header) + f.write(macro_pass_one) + for type_path, (int_t, data_t) in types.items(): + f.write(f"instantiate_cuvs_neighbors_detail_rbc_low_dim_pass_one(\n") + f.write(f" {int_t}, {data_t}, {int_t}, {int_t}, {dim}, {v});\n") + f.write("#undef instantiate_cuvs_neighbors_detail_rbc_low_dim_pass_one\n") + print(f"src/neighbors/ball_cover/detail/ball_cover/{path}") + +for k, v in distances.items(): + for dim in [2, 3]: + path = f"registers_pass_two_{dim}d_{k}.cu" + with open(path, "w") as f: + f.write(header) + f.write(macro_pass_two) + for type_path, (int_t, data_t) in types.items(): + f.write(f"instantiate_cuvs_neighbors_detail_rbc_low_dim_pass_two(\n") + f.write(f" {int_t}, {data_t}, {int_t}, {int_t}, {dim}, {v});\n") + f.write("#undef instantiate_cuvs_neighbors_detail_rbc_low_dim_pass_two\n") + print(f"src/neighbors/ball_cover/detail/ball_cover/{path}") + +path="registers_eps_pass_euclidean.cu" +with open(path, "w") as f: + f.write(header) + f.write(macro_pass_eps) + for 
type_path, (int_t, data_t) in types.items(): + f.write(f"instantiate_cuvs_neighbors_detail_rbc_eps_pass(\n") + f.write(f" {int_t}, {data_t}, {int_t}, {int_t}, {euclideanSq});\n") + f.write("#undef instantiate_cuvs_neighbors_detail_rbc_eps_pass\n") + print(f"src/neighbors/ball_cover/detail/ball_cover/{path}") + diff --git a/cpp/src/neighbors/ball_cover/detail/ball_cover/registers_eps_pass_euclidean.cu b/cpp/src/neighbors/ball_cover/detail/ball_cover/registers_eps_pass_euclidean.cu new file mode 100644 index 000000000..4a0f9850c --- /dev/null +++ b/cpp/src/neighbors/ball_cover/detail/ball_cover/registers_eps_pass_euclidean.cu @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * NOTE: this file is generated by registers_00_generate.py + * + * Make changes there and run in this directory: + * + * > python registers_00_generate.py + * + */ + +#include "../../registers-inl.cuh" +#include // int64_t +#include + +#define instantiate_cuvs_neighbors_detail_rbc_eps_pass( \ + Mvalue_idx, Mvalue_t, Mvalue_int, Mmatrix_idx, Mdist_func) \ + template void cuvs::neighbors::ball_cover::detail:: \ + rbc_eps_pass( \ + raft::resources const& handle, \ + const cuvs::neighbors::ball_cover::index& \ + index, \ + const Mvalue_t* query, \ + const Mvalue_int n_query_rows, \ + Mvalue_t eps, \ + const Mvalue_t* R_dists, \ + Mdist_func& dfunc, \ + bool* adj, \ + Mvalue_idx* vd); \ + \ + template void cuvs::neighbors::ball_cover::detail:: \ + rbc_eps_pass( \ + raft::resources const& handle, \ + const cuvs::neighbors::ball_cover::index& \ + index, \ + const Mvalue_t* query, \ + const Mvalue_int n_query_rows, \ + Mvalue_t eps, \ + Mvalue_int* max_k, \ + const Mvalue_t* R_dists, \ + Mdist_func& dfunc, \ + Mvalue_idx* adj_ia, \ + Mvalue_idx* adj_ja, \ + Mvalue_idx* vd) + +instantiate_cuvs_neighbors_detail_rbc_eps_pass( + std::int64_t, + float, + std::int64_t, + std::int64_t, + cuvs::neighbors::ball_cover::detail::EuclideanSqFunc); +#undef instantiate_cuvs_neighbors_detail_rbc_eps_pass diff --git a/cpp/src/neighbors/ball_cover/detail/ball_cover/registers_pass_one_2d_dist.cu b/cpp/src/neighbors/ball_cover/detail/ball_cover/registers_pass_one_2d_dist.cu new file mode 100644 index 000000000..d36daf7c5 --- /dev/null +++ b/cpp/src/neighbors/ball_cover/detail/ball_cover/registers_pass_one_2d_dist.cu @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by registers_00_generate.py + * + * Make changes there and run in this directory: + * + * > python registers_00_generate.py + * + */ + +#include "../../registers-inl.cuh" +#include // int64_t +#include + +#define instantiate_cuvs_neighbors_detail_rbc_low_dim_pass_one( \ + Mvalue_idx, Mvalue_t, Mvalue_int, Mmatrix_idx, Mdims, Mdist_func) \ + template void cuvs::neighbors::ball_cover::detail:: \ + rbc_low_dim_pass_one( \ + raft::resources const& handle, \ + const cuvs::neighbors::ball_cover::index& \ + index, \ + const Mvalue_t* query, \ + const Mvalue_int n_query_rows, \ + Mvalue_int k, \ + const Mvalue_idx* R_knn_inds, \ + const Mvalue_t* R_knn_dists, \ + Mdist_func& dfunc, \ + Mvalue_idx* inds, \ + Mvalue_t* dists, \ + float weight, \ + Mvalue_int* dists_counter) + +instantiate_cuvs_neighbors_detail_rbc_low_dim_pass_one( + std::int64_t, + float, + std::int64_t, + std::int64_t, + 2, + cuvs::neighbors::ball_cover::detail::DistFunc); +#undef instantiate_cuvs_neighbors_detail_rbc_low_dim_pass_one diff --git a/cpp/src/neighbors/ball_cover/detail/ball_cover/registers_pass_one_2d_euclidean.cu b/cpp/src/neighbors/ball_cover/detail/ball_cover/registers_pass_one_2d_euclidean.cu new file mode 100644 index 000000000..650d1e285 --- /dev/null +++ b/cpp/src/neighbors/ball_cover/detail/ball_cover/registers_pass_one_2d_euclidean.cu @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by registers_00_generate.py + * + * Make changes there and run in this directory: + * + * > python registers_00_generate.py + * + */ + +#include "../../registers-inl.cuh" +#include // int64_t +#include + +#define instantiate_cuvs_neighbors_detail_rbc_low_dim_pass_one( \ + Mvalue_idx, Mvalue_t, Mvalue_int, Mmatrix_idx, Mdims, Mdist_func) \ + template void cuvs::neighbors::ball_cover::detail:: \ + rbc_low_dim_pass_one( \ + raft::resources const& handle, \ + const cuvs::neighbors::ball_cover::index& \ + index, \ + const Mvalue_t* query, \ + const Mvalue_int n_query_rows, \ + Mvalue_int k, \ + const Mvalue_idx* R_knn_inds, \ + const Mvalue_t* R_knn_dists, \ + Mdist_func& dfunc, \ + Mvalue_idx* inds, \ + Mvalue_t* dists, \ + float weight, \ + Mvalue_int* dists_counter) + +instantiate_cuvs_neighbors_detail_rbc_low_dim_pass_one( + std::int64_t, + float, + std::int64_t, + std::int64_t, + 2, + cuvs::neighbors::ball_cover::detail::EuclideanFunc); +#undef instantiate_cuvs_neighbors_detail_rbc_low_dim_pass_one diff --git a/cpp/src/neighbors/ball_cover/detail/ball_cover/registers_pass_one_2d_haversine.cu b/cpp/src/neighbors/ball_cover/detail/ball_cover/registers_pass_one_2d_haversine.cu new file mode 100644 index 000000000..1ed575055 --- /dev/null +++ b/cpp/src/neighbors/ball_cover/detail/ball_cover/registers_pass_one_2d_haversine.cu @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2024, 
NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by registers_00_generate.py + * + * Make changes there and run in this directory: + * + * > python registers_00_generate.py + * + */ + +#include "../../registers-inl.cuh" +#include // int64_t +#include + +#define instantiate_cuvs_neighbors_detail_rbc_low_dim_pass_one( \ + Mvalue_idx, Mvalue_t, Mvalue_int, Mmatrix_idx, Mdims, Mdist_func) \ + template void cuvs::neighbors::ball_cover::detail:: \ + rbc_low_dim_pass_one( \ + raft::resources const& handle, \ + const cuvs::neighbors::ball_cover::index& \ + index, \ + const Mvalue_t* query, \ + const Mvalue_int n_query_rows, \ + Mvalue_int k, \ + const Mvalue_idx* R_knn_inds, \ + const Mvalue_t* R_knn_dists, \ + Mdist_func& dfunc, \ + Mvalue_idx* inds, \ + Mvalue_t* dists, \ + float weight, \ + Mvalue_int* dists_counter) + +instantiate_cuvs_neighbors_detail_rbc_low_dim_pass_one( + std::int64_t, + float, + std::int64_t, + std::int64_t, + 2, + cuvs::neighbors::ball_cover::detail::HaversineFunc); +#undef instantiate_cuvs_neighbors_detail_rbc_low_dim_pass_one diff --git a/cpp/src/neighbors/ball_cover/detail/ball_cover/registers_pass_one_3d_dist.cu b/cpp/src/neighbors/ball_cover/detail/ball_cover/registers_pass_one_3d_dist.cu new file mode 100644 index 000000000..2600b8d0b --- /dev/null +++ b/cpp/src/neighbors/ball_cover/detail/ball_cover/registers_pass_one_3d_dist.cu @@ -0,0 +1,55 @@ +/* + * Copyright (c) 
2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by registers_00_generate.py + * + * Make changes there and run in this directory: + * + * > python registers_00_generate.py + * + */ + +#include "../../registers-inl.cuh" +#include // int64_t +#include + +#define instantiate_cuvs_neighbors_detail_rbc_low_dim_pass_one( \ + Mvalue_idx, Mvalue_t, Mvalue_int, Mmatrix_idx, Mdims, Mdist_func) \ + template void cuvs::neighbors::ball_cover::detail:: \ + rbc_low_dim_pass_one( \ + raft::resources const& handle, \ + const cuvs::neighbors::ball_cover::index& \ + index, \ + const Mvalue_t* query, \ + const Mvalue_int n_query_rows, \ + Mvalue_int k, \ + const Mvalue_idx* R_knn_inds, \ + const Mvalue_t* R_knn_dists, \ + Mdist_func& dfunc, \ + Mvalue_idx* inds, \ + Mvalue_t* dists, \ + float weight, \ + Mvalue_int* dists_counter) + +instantiate_cuvs_neighbors_detail_rbc_low_dim_pass_one( + std::int64_t, + float, + std::int64_t, + std::int64_t, + 3, + cuvs::neighbors::ball_cover::detail::DistFunc); +#undef instantiate_cuvs_neighbors_detail_rbc_low_dim_pass_one diff --git a/cpp/src/neighbors/ball_cover/detail/ball_cover/registers_pass_one_3d_euclidean.cu b/cpp/src/neighbors/ball_cover/detail/ball_cover/registers_pass_one_3d_euclidean.cu new file mode 100644 index 000000000..a93acbce4 --- /dev/null +++ b/cpp/src/neighbors/ball_cover/detail/ball_cover/registers_pass_one_3d_euclidean.cu @@ -0,0 +1,55 @@ +/* + * 
Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by registers_00_generate.py + * + * Make changes there and run in this directory: + * + * > python registers_00_generate.py + * + */ + +#include "../../registers-inl.cuh" +#include // int64_t +#include + +#define instantiate_cuvs_neighbors_detail_rbc_low_dim_pass_one( \ + Mvalue_idx, Mvalue_t, Mvalue_int, Mmatrix_idx, Mdims, Mdist_func) \ + template void cuvs::neighbors::ball_cover::detail:: \ + rbc_low_dim_pass_one( \ + raft::resources const& handle, \ + const cuvs::neighbors::ball_cover::index& \ + index, \ + const Mvalue_t* query, \ + const Mvalue_int n_query_rows, \ + Mvalue_int k, \ + const Mvalue_idx* R_knn_inds, \ + const Mvalue_t* R_knn_dists, \ + Mdist_func& dfunc, \ + Mvalue_idx* inds, \ + Mvalue_t* dists, \ + float weight, \ + Mvalue_int* dists_counter) + +instantiate_cuvs_neighbors_detail_rbc_low_dim_pass_one( + std::int64_t, + float, + std::int64_t, + std::int64_t, + 3, + cuvs::neighbors::ball_cover::detail::EuclideanFunc); +#undef instantiate_cuvs_neighbors_detail_rbc_low_dim_pass_one diff --git a/cpp/src/neighbors/ball_cover/detail/ball_cover/registers_pass_one_3d_haversine.cu b/cpp/src/neighbors/ball_cover/detail/ball_cover/registers_pass_one_3d_haversine.cu new file mode 100644 index 000000000..fd3d01feb --- /dev/null +++ b/cpp/src/neighbors/ball_cover/detail/ball_cover/registers_pass_one_3d_haversine.cu @@ 
-0,0 +1,55 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by registers_00_generate.py + * + * Make changes there and run in this directory: + * + * > python registers_00_generate.py + * + */ + +#include "../../registers-inl.cuh" +#include // int64_t +#include + +#define instantiate_cuvs_neighbors_detail_rbc_low_dim_pass_one( \ + Mvalue_idx, Mvalue_t, Mvalue_int, Mmatrix_idx, Mdims, Mdist_func) \ + template void cuvs::neighbors::ball_cover::detail:: \ + rbc_low_dim_pass_one( \ + raft::resources const& handle, \ + const cuvs::neighbors::ball_cover::index& \ + index, \ + const Mvalue_t* query, \ + const Mvalue_int n_query_rows, \ + Mvalue_int k, \ + const Mvalue_idx* R_knn_inds, \ + const Mvalue_t* R_knn_dists, \ + Mdist_func& dfunc, \ + Mvalue_idx* inds, \ + Mvalue_t* dists, \ + float weight, \ + Mvalue_int* dists_counter) + +instantiate_cuvs_neighbors_detail_rbc_low_dim_pass_one( + std::int64_t, + float, + std::int64_t, + std::int64_t, + 3, + cuvs::neighbors::ball_cover::detail::HaversineFunc); +#undef instantiate_cuvs_neighbors_detail_rbc_low_dim_pass_one diff --git a/cpp/src/neighbors/ball_cover/detail/ball_cover/registers_pass_two_2d_dist.cu b/cpp/src/neighbors/ball_cover/detail/ball_cover/registers_pass_two_2d_dist.cu new file mode 100644 index 000000000..c30a55991 --- /dev/null +++ b/cpp/src/neighbors/ball_cover/detail/ball_cover/registers_pass_two_2d_dist.cu 
@@ -0,0 +1,55 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by registers_00_generate.py + * + * Make changes there and run in this directory: + * + * > python registers_00_generate.py + * + */ + +#include "../../registers-inl.cuh" +#include // int64_t +#include + +#define instantiate_cuvs_neighbors_detail_rbc_low_dim_pass_two( \ + Mvalue_idx, Mvalue_t, Mvalue_int, Mmatrix_idx, Mdims, Mdist_func) \ + template void cuvs::neighbors::ball_cover::detail:: \ + rbc_low_dim_pass_two( \ + raft::resources const& handle, \ + const cuvs::neighbors::ball_cover::index& \ + index, \ + const Mvalue_t* query, \ + const Mvalue_int n_query_rows, \ + Mvalue_int k, \ + const Mvalue_idx* R_knn_inds, \ + const Mvalue_t* R_knn_dists, \ + Mdist_func& dfunc, \ + Mvalue_idx* inds, \ + Mvalue_t* dists, \ + float weight, \ + Mvalue_int* dists_counter) + +instantiate_cuvs_neighbors_detail_rbc_low_dim_pass_two( + std::int64_t, + float, + std::int64_t, + std::int64_t, + 2, + cuvs::neighbors::ball_cover::detail::DistFunc); +#undef instantiate_cuvs_neighbors_detail_rbc_low_dim_pass_two diff --git a/cpp/src/neighbors/ball_cover/detail/ball_cover/registers_pass_two_2d_euclidean.cu b/cpp/src/neighbors/ball_cover/detail/ball_cover/registers_pass_two_2d_euclidean.cu new file mode 100644 index 000000000..49cc8404c --- /dev/null +++ 
b/cpp/src/neighbors/ball_cover/detail/ball_cover/registers_pass_two_2d_euclidean.cu @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by registers_00_generate.py + * + * Make changes there and run in this directory: + * + * > python registers_00_generate.py + * + */ + +#include "../../registers-inl.cuh" +#include // int64_t +#include + +#define instantiate_cuvs_neighbors_detail_rbc_low_dim_pass_two( \ + Mvalue_idx, Mvalue_t, Mvalue_int, Mmatrix_idx, Mdims, Mdist_func) \ + template void cuvs::neighbors::ball_cover::detail:: \ + rbc_low_dim_pass_two( \ + raft::resources const& handle, \ + const cuvs::neighbors::ball_cover::index& \ + index, \ + const Mvalue_t* query, \ + const Mvalue_int n_query_rows, \ + Mvalue_int k, \ + const Mvalue_idx* R_knn_inds, \ + const Mvalue_t* R_knn_dists, \ + Mdist_func& dfunc, \ + Mvalue_idx* inds, \ + Mvalue_t* dists, \ + float weight, \ + Mvalue_int* dists_counter) + +instantiate_cuvs_neighbors_detail_rbc_low_dim_pass_two( + std::int64_t, + float, + std::int64_t, + std::int64_t, + 2, + cuvs::neighbors::ball_cover::detail::EuclideanFunc); +#undef instantiate_cuvs_neighbors_detail_rbc_low_dim_pass_two diff --git a/cpp/src/neighbors/ball_cover/detail/ball_cover/registers_pass_two_2d_haversine.cu b/cpp/src/neighbors/ball_cover/detail/ball_cover/registers_pass_two_2d_haversine.cu new file mode 100644 index 000000000..4cc9ec992 
--- /dev/null +++ b/cpp/src/neighbors/ball_cover/detail/ball_cover/registers_pass_two_2d_haversine.cu @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by registers_00_generate.py + * + * Make changes there and run in this directory: + * + * > python registers_00_generate.py + * + */ + +#include "../../registers-inl.cuh" +#include // int64_t +#include + +#define instantiate_cuvs_neighbors_detail_rbc_low_dim_pass_two( \ + Mvalue_idx, Mvalue_t, Mvalue_int, Mmatrix_idx, Mdims, Mdist_func) \ + template void cuvs::neighbors::ball_cover::detail:: \ + rbc_low_dim_pass_two( \ + raft::resources const& handle, \ + const cuvs::neighbors::ball_cover::index& \ + index, \ + const Mvalue_t* query, \ + const Mvalue_int n_query_rows, \ + Mvalue_int k, \ + const Mvalue_idx* R_knn_inds, \ + const Mvalue_t* R_knn_dists, \ + Mdist_func& dfunc, \ + Mvalue_idx* inds, \ + Mvalue_t* dists, \ + float weight, \ + Mvalue_int* dists_counter) + +instantiate_cuvs_neighbors_detail_rbc_low_dim_pass_two( + std::int64_t, + float, + std::int64_t, + std::int64_t, + 2, + cuvs::neighbors::ball_cover::detail::HaversineFunc); +#undef instantiate_cuvs_neighbors_detail_rbc_low_dim_pass_two diff --git a/cpp/src/neighbors/ball_cover/detail/ball_cover/registers_pass_two_3d_dist.cu b/cpp/src/neighbors/ball_cover/detail/ball_cover/registers_pass_two_3d_dist.cu new file mode 100644 index 
000000000..abc51994d --- /dev/null +++ b/cpp/src/neighbors/ball_cover/detail/ball_cover/registers_pass_two_3d_dist.cu @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by registers_00_generate.py + * + * Make changes there and run in this directory: + * + * > python registers_00_generate.py + * + */ + +#include "../../registers-inl.cuh" +#include // int64_t +#include + +#define instantiate_cuvs_neighbors_detail_rbc_low_dim_pass_two( \ + Mvalue_idx, Mvalue_t, Mvalue_int, Mmatrix_idx, Mdims, Mdist_func) \ + template void cuvs::neighbors::ball_cover::detail:: \ + rbc_low_dim_pass_two( \ + raft::resources const& handle, \ + const cuvs::neighbors::ball_cover::index& \ + index, \ + const Mvalue_t* query, \ + const Mvalue_int n_query_rows, \ + Mvalue_int k, \ + const Mvalue_idx* R_knn_inds, \ + const Mvalue_t* R_knn_dists, \ + Mdist_func& dfunc, \ + Mvalue_idx* inds, \ + Mvalue_t* dists, \ + float weight, \ + Mvalue_int* dists_counter) + +instantiate_cuvs_neighbors_detail_rbc_low_dim_pass_two( + std::int64_t, + float, + std::int64_t, + std::int64_t, + 3, + cuvs::neighbors::ball_cover::detail::DistFunc); +#undef instantiate_cuvs_neighbors_detail_rbc_low_dim_pass_two diff --git a/cpp/src/neighbors/ball_cover/detail/ball_cover/registers_pass_two_3d_euclidean.cu b/cpp/src/neighbors/ball_cover/detail/ball_cover/registers_pass_two_3d_euclidean.cu new file mode 
100644 index 000000000..a24ce0dd6 --- /dev/null +++ b/cpp/src/neighbors/ball_cover/detail/ball_cover/registers_pass_two_3d_euclidean.cu @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by registers_00_generate.py + * + * Make changes there and run in this directory: + * + * > python registers_00_generate.py + * + */ + +#include "../../registers-inl.cuh" +#include // int64_t +#include + +#define instantiate_cuvs_neighbors_detail_rbc_low_dim_pass_two( \ + Mvalue_idx, Mvalue_t, Mvalue_int, Mmatrix_idx, Mdims, Mdist_func) \ + template void cuvs::neighbors::ball_cover::detail:: \ + rbc_low_dim_pass_two( \ + raft::resources const& handle, \ + const cuvs::neighbors::ball_cover::index& \ + index, \ + const Mvalue_t* query, \ + const Mvalue_int n_query_rows, \ + Mvalue_int k, \ + const Mvalue_idx* R_knn_inds, \ + const Mvalue_t* R_knn_dists, \ + Mdist_func& dfunc, \ + Mvalue_idx* inds, \ + Mvalue_t* dists, \ + float weight, \ + Mvalue_int* dists_counter) + +instantiate_cuvs_neighbors_detail_rbc_low_dim_pass_two( + std::int64_t, + float, + std::int64_t, + std::int64_t, + 3, + cuvs::neighbors::ball_cover::detail::EuclideanFunc); +#undef instantiate_cuvs_neighbors_detail_rbc_low_dim_pass_two diff --git a/cpp/src/neighbors/ball_cover/detail/ball_cover/registers_pass_two_3d_haversine.cu 
b/cpp/src/neighbors/ball_cover/detail/ball_cover/registers_pass_two_3d_haversine.cu new file mode 100644 index 000000000..954753b63 --- /dev/null +++ b/cpp/src/neighbors/ball_cover/detail/ball_cover/registers_pass_two_3d_haversine.cu @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * NOTE: this file is generated by registers_00_generate.py + * + * Make changes there and run in this directory: + * + * > python registers_00_generate.py + * + */ + +#include "../../registers-inl.cuh" +#include // int64_t +#include + +#define instantiate_cuvs_neighbors_detail_rbc_low_dim_pass_two( \ + Mvalue_idx, Mvalue_t, Mvalue_int, Mmatrix_idx, Mdims, Mdist_func) \ + template void cuvs::neighbors::ball_cover::detail:: \ + rbc_low_dim_pass_two( \ + raft::resources const& handle, \ + const cuvs::neighbors::ball_cover::index& \ + index, \ + const Mvalue_t* query, \ + const Mvalue_int n_query_rows, \ + Mvalue_int k, \ + const Mvalue_idx* R_knn_inds, \ + const Mvalue_t* R_knn_dists, \ + Mdist_func& dfunc, \ + Mvalue_idx* inds, \ + Mvalue_t* dists, \ + float weight, \ + Mvalue_int* dists_counter) + +instantiate_cuvs_neighbors_detail_rbc_low_dim_pass_two( + std::int64_t, + float, + std::int64_t, + std::int64_t, + 3, + cuvs::neighbors::ball_cover::detail::HaversineFunc); +#undef instantiate_cuvs_neighbors_detail_rbc_low_dim_pass_two diff --git a/cpp/src/neighbors/ball_cover/registers-ext.cuh 
b/cpp/src/neighbors/ball_cover/registers-ext.cuh index 6b8782a7e..7de9e11ce 100644 --- a/cpp/src/neighbors/ball_cover/registers-ext.cuh +++ b/cpp/src/neighbors/ball_cover/registers-ext.cuh @@ -23,7 +23,7 @@ #include // uint32_t -namespace cuvs::neighbors::detail { +namespace cuvs::neighbors::ball_cover::detail { template ( \ raft::resources const& handle, \ const cuvs::neighbors::ball_cover::index& \ @@ -121,7 +121,7 @@ void rbc_eps_pass( #define instantiate_cuvs_neighbors_detail_rbc_low_dim_pass_two( \ Mvalue_idx, Mvalue_t, Mvalue_int, Mmatrix_idx, Mdims, Mdist_func) \ - extern template void cuvs::neighbors::detail:: \ + extern template void cuvs::neighbors::ball_cover::detail:: \ rbc_low_dim_pass_two( \ raft::resources const& handle, \ const cuvs::neighbors::ball_cover::index& \ @@ -137,64 +137,128 @@ void rbc_eps_pass( float weight, \ Mvalue_int* dists_counter) -#define instantiate_cuvs_neighbors_detail_rbc_eps_pass( \ - Mvalue_idx, Mvalue_t, Mvalue_int, Mmatrix_idx, Mdist_func) \ - extern template void \ - cuvs::neighbors::detail::rbc_eps_pass( \ - raft::resources const& handle, \ - const cuvs::neighbors::ball_cover::index& \ - index, \ - const Mvalue_t* query, \ - const Mvalue_int n_query_rows, \ - Mvalue_t eps, \ - const Mvalue_t* R_dists, \ - Mdist_func& dfunc, \ - bool* adj, \ - Mvalue_idx* vd); \ - \ - extern template void \ - cuvs::neighbors::detail::rbc_eps_pass( \ - raft::resources const& handle, \ - const cuvs::neighbors::ball_cover::index& \ - index, \ - const Mvalue_t* query, \ - const Mvalue_int n_query_rows, \ - Mvalue_t eps, \ - Mvalue_int* max_k, \ - const Mvalue_t* R_dists, \ - Mdist_func& dfunc, \ - Mvalue_idx* adj_ia, \ - Mvalue_idx* adj_ja, \ - Mvalue_idx* vd); +#define instantiate_cuvs_neighbors_detail_rbc_eps_pass( \ + Mvalue_idx, Mvalue_t, Mvalue_int, Mmatrix_idx, Mdist_func) \ + extern template void cuvs::neighbors::ball_cover::detail:: \ + rbc_eps_pass( \ + raft::resources const& handle, \ + const cuvs::neighbors::ball_cover::index& \ + 
index, \ + const Mvalue_t* query, \ + const Mvalue_int n_query_rows, \ + Mvalue_t eps, \ + const Mvalue_t* R_dists, \ + Mdist_func& dfunc, \ + bool* adj, \ + Mvalue_idx* vd); \ + \ + extern template void cuvs::neighbors::ball_cover::detail:: \ + rbc_eps_pass( \ + raft::resources const& handle, \ + const cuvs::neighbors::ball_cover::index& \ + index, \ + const Mvalue_t* query, \ + const Mvalue_int n_query_rows, \ + Mvalue_t eps, \ + Mvalue_int* max_k, \ + const Mvalue_t* R_dists, \ + Mdist_func& dfunc, \ + Mvalue_idx* adj_ia, \ + Mvalue_idx* adj_ja, \ + Mvalue_idx* vd); instantiate_cuvs_neighbors_detail_rbc_low_dim_pass_one( - std::int64_t, float, std::int64_t, std::int64_t, 2, cuvs::neighbors::detail::HaversineFunc); + std::int64_t, + float, + std::int64_t, + std::int64_t, + 2, + cuvs::neighbors::ball_cover::detail::HaversineFunc); instantiate_cuvs_neighbors_detail_rbc_low_dim_pass_one( - std::int64_t, float, std::int64_t, std::int64_t, 3, cuvs::neighbors::detail::HaversineFunc); + std::int64_t, + float, + std::int64_t, + std::int64_t, + 3, + cuvs::neighbors::ball_cover::detail::HaversineFunc); instantiate_cuvs_neighbors_detail_rbc_low_dim_pass_one( - std::int64_t, float, std::int64_t, std::int64_t, 2, cuvs::neighbors::detail::EuclideanFunc); + std::int64_t, + float, + std::int64_t, + std::int64_t, + 2, + cuvs::neighbors::ball_cover::detail::EuclideanFunc); instantiate_cuvs_neighbors_detail_rbc_low_dim_pass_one( - std::int64_t, float, std::int64_t, std::int64_t, 3, cuvs::neighbors::detail::EuclideanFunc); + std::int64_t, + float, + std::int64_t, + std::int64_t, + 3, + cuvs::neighbors::ball_cover::detail::EuclideanFunc); instantiate_cuvs_neighbors_detail_rbc_low_dim_pass_one( - std::int64_t, float, std::int64_t, std::int64_t, 2, cuvs::neighbors::detail::DistFunc); + std::int64_t, + float, + std::int64_t, + std::int64_t, + 2, + cuvs::neighbors::ball_cover::detail::DistFunc); instantiate_cuvs_neighbors_detail_rbc_low_dim_pass_one( - std::int64_t, float, std::int64_t, 
std::int64_t, 3, cuvs::neighbors::detail::DistFunc); + std::int64_t, + float, + std::int64_t, + std::int64_t, + 3, + cuvs::neighbors::ball_cover::detail::DistFunc); instantiate_cuvs_neighbors_detail_rbc_low_dim_pass_two( - std::int64_t, float, std::int64_t, std::int64_t, 2, cuvs::neighbors::detail::HaversineFunc); + std::int64_t, + float, + std::int64_t, + std::int64_t, + 2, + cuvs::neighbors::ball_cover::detail::HaversineFunc); instantiate_cuvs_neighbors_detail_rbc_low_dim_pass_two( - std::int64_t, float, std::int64_t, std::int64_t, 3, cuvs::neighbors::detail::HaversineFunc); + std::int64_t, + float, + std::int64_t, + std::int64_t, + 3, + cuvs::neighbors::ball_cover::detail::HaversineFunc); instantiate_cuvs_neighbors_detail_rbc_low_dim_pass_two( - std::int64_t, float, std::int64_t, std::int64_t, 2, cuvs::neighbors::detail::EuclideanFunc); + std::int64_t, + float, + std::int64_t, + std::int64_t, + 2, + cuvs::neighbors::ball_cover::detail::EuclideanFunc); instantiate_cuvs_neighbors_detail_rbc_low_dim_pass_two( - std::int64_t, float, std::int64_t, std::int64_t, 3, cuvs::neighbors::detail::EuclideanFunc); + std::int64_t, + float, + std::int64_t, + std::int64_t, + 3, + cuvs::neighbors::ball_cover::detail::EuclideanFunc); instantiate_cuvs_neighbors_detail_rbc_low_dim_pass_two( - std::int64_t, float, std::int64_t, std::int64_t, 2, cuvs::neighbors::detail::DistFunc); + std::int64_t, + float, + std::int64_t, + std::int64_t, + 2, + cuvs::neighbors::ball_cover::detail::DistFunc); instantiate_cuvs_neighbors_detail_rbc_low_dim_pass_two( - std::int64_t, float, std::int64_t, std::int64_t, 3, cuvs::neighbors::detail::DistFunc); + std::int64_t, + float, + std::int64_t, + std::int64_t, + 3, + cuvs::neighbors::ball_cover::detail::DistFunc); instantiate_cuvs_neighbors_detail_rbc_eps_pass( - std::int64_t, float, std::int64_t, std::int64_t, cuvs::neighbors::detail::EuclideanSqFunc); + std::int64_t, + float, + std::int64_t, + std::int64_t, + 
cuvs::neighbors::ball_cover::detail::EuclideanSqFunc); #undef instantiate_cuvs_neighbors_detail_rbc_low_dim_pass_two #undef instantiate_cuvs_neighbors_detail_rbc_low_dim_pass_one diff --git a/cpp/src/neighbors/ball_cover/registers-inl.cuh b/cpp/src/neighbors/ball_cover/registers-inl.cuh index 2565a48fc..a94c21ab2 100644 --- a/cpp/src/neighbors/ball_cover/registers-inl.cuh +++ b/cpp/src/neighbors/ball_cover/registers-inl.cuh @@ -34,7 +34,7 @@ #include -namespace cuvs::neighbors::detail { +namespace cuvs::neighbors::ball_cover::detail { /** * To find exact neighbors, we perform a post-processing stage @@ -181,7 +181,7 @@ RAFT_KERNEL compute_final_dists_registers(const value_t* X_reordered, local_x_ptr[j] = x_ptr[j]; } - using namespace cuvs::neighbors::detail::faiss_select; + using namespace cuvs::neighbors::ball_cover::detail::faiss_select; KeyValueBlockSelect, warp_q, thread_q, tpb> heap( std::numeric_limits::max(), std::numeric_limits::max(), @@ -345,7 +345,7 @@ RAFT_KERNEL block_rbc_kernel_registers(const value_t* X_reordered, } // Each warp works on 1 R - using namespace cuvs::neighbors::detail::faiss_select; + using namespace cuvs::neighbors::ball_cover::detail::faiss_select; KeyValueBlockSelect, warp_q, thread_q, tpb> heap( std::numeric_limits::max(), std::numeric_limits::max(), @@ -1627,4 +1627,4 @@ void rbc_eps_pass( raft::resource::sync_stream(handle); } -}; // namespace cuvs::neighbors::detail +}; // namespace cuvs::neighbors::ball_cover::detail diff --git a/cpp/src/neighbors/ball_cover/registers.cuh b/cpp/src/neighbors/ball_cover/registers.cuh index 1cd32ba00..6fe0cfd27 100644 --- a/cpp/src/neighbors/ball_cover/registers.cuh +++ b/cpp/src/neighbors/ball_cover/registers.cuh @@ -15,10 +15,8 @@ */ #pragma once -#ifndef RAFT_EXPLICIT_INSTANTIATE_ONLY +#ifndef CUVS_EXPLICIT_INSTANTIATE_ONLY #include "registers-inl.cuh" #endif -#ifdef RAFT_COMPILED #include "registers-ext.cuh" -#endif diff --git a/cpp/src/neighbors/ball_cover/registers_types.cuh 
b/cpp/src/neighbors/ball_cover/registers_types.cuh index bf9d21452..3777932a7 100644 --- a/cpp/src/neighbors/ball_cover/registers_types.cuh +++ b/cpp/src/neighbors/ball_cover/registers_types.cuh @@ -20,7 +20,7 @@ #include // uint32_t -namespace cuvs::neighbors::detail { +namespace cuvs::neighbors::ball_cover::detail { template struct DistFunc { @@ -38,7 +38,7 @@ struct HaversineFunc : public DistFunc { const value_t* b, const value_int n_dims) override { - return cuvs::neighbors::detail::compute_haversine(a[0], b[0], a[1], b[1]); + return cuvs::neighbors::detail::compute_haversine(a[0], b[0], a[1], b[1]); } }; @@ -73,4 +73,4 @@ struct EuclideanSqFunc : public DistFunc { } }; -}; // namespace cuvs::neighbors::detail +}; // namespace cuvs::neighbors::ball_cover::detail diff --git a/cpp/src/neighbors/faiss_select/Comparators.cuh b/cpp/src/neighbors/faiss_select/Comparators.cuh index 9ced61e13..3983cc9ba 100644 --- a/cpp/src/neighbors/faiss_select/Comparators.cuh +++ b/cpp/src/neighbors/faiss_select/Comparators.cuh @@ -10,7 +10,7 @@ #include #include -namespace cuvs::neighbors::detail::faiss_select { +namespace cuvs::neighbors::ball_cover::detail::faiss_select { template struct Comparator { @@ -26,4 +26,4 @@ struct Comparator { __device__ static inline bool gt(half a, half b) { return __hgt(a, b); } }; -} // namespace cuvs::neighbors::detail::faiss_select +} // namespace cuvs::neighbors::ball_cover::detail::faiss_select diff --git a/cpp/src/neighbors/faiss_select/DistanceUtils.h b/cpp/src/neighbors/faiss_select/DistanceUtils.h index e8a41c1aa..71fdbf0cf 100644 --- a/cpp/src/neighbors/faiss_select/DistanceUtils.h +++ b/cpp/src/neighbors/faiss_select/DistanceUtils.h @@ -7,7 +7,7 @@ #pragma once -namespace cuvs::neighbors::detail::faiss_select { +namespace cuvs::neighbors::ball_cover::detail::faiss_select { // If the inner size (dim) of the vectors is small, we want a larger query tile // size, like 1024 inline void chooseTileSize(size_t numQueries, @@ -49,4 +49,4 @@ 
inline void chooseTileSize(size_t numQueries, // tileCols is the remainder size tileCols = std::min(targetUsage / preferredTileRows, numCentroids); } -} // namespace cuvs::neighbors::detail::faiss_select +} // namespace cuvs::neighbors::ball_cover::detail::faiss_select diff --git a/cpp/src/neighbors/faiss_select/MergeNetworkBlock.cuh b/cpp/src/neighbors/faiss_select/MergeNetworkBlock.cuh index 345b9186a..0258183b0 100644 --- a/cpp/src/neighbors/faiss_select/MergeNetworkBlock.cuh +++ b/cpp/src/neighbors/faiss_select/MergeNetworkBlock.cuh @@ -12,7 +12,7 @@ #include -namespace cuvs::neighbors::detail::faiss_select { +namespace cuvs::neighbors::ball_cover::detail::faiss_select { // Merge pairs of lists smaller than blockDim.x (NumThreads) template ::merge(listK, listV); } -} // namespace cuvs::neighbors::detail::faiss_select +} // namespace cuvs::neighbors::ball_cover::detail::faiss_select diff --git a/cpp/src/neighbors/faiss_select/MergeNetworkUtils.cuh b/cpp/src/neighbors/faiss_select/MergeNetworkUtils.cuh index 7f7796fad..4406c3545 100644 --- a/cpp/src/neighbors/faiss_select/MergeNetworkUtils.cuh +++ b/cpp/src/neighbors/faiss_select/MergeNetworkUtils.cuh @@ -7,7 +7,7 @@ #pragma once -namespace cuvs::neighbors::detail::faiss_select { +namespace cuvs::neighbors::ball_cover::detail::faiss_select { template inline __device__ void swap(bool swap, T& x, T& y) @@ -22,4 +22,4 @@ inline __device__ void assign(bool assign, T& x, T y) { x = assign ? 
y : x; } -} // namespace cuvs::neighbors::detail::faiss_select +} // namespace cuvs::neighbors::ball_cover::detail::faiss_select diff --git a/cpp/src/neighbors/faiss_select/MergeNetworkWarp.cuh b/cpp/src/neighbors/faiss_select/MergeNetworkWarp.cuh index 0a9226e77..b6039accc 100644 --- a/cpp/src/neighbors/faiss_select/MergeNetworkWarp.cuh +++ b/cpp/src/neighbors/faiss_select/MergeNetworkWarp.cuh @@ -11,7 +11,7 @@ #include "StaticUtils.h" #include -namespace cuvs::neighbors::detail::faiss_select { +namespace cuvs::neighbors::ball_cover::detail::faiss_select { // // This file contains functions to: @@ -516,4 +516,4 @@ inline __device__ void warpSortAnyRegisters(K k[N], V v[N]) BitonicSortStep::sort(k, v); } -} // namespace cuvs::neighbors::detail::faiss_select +} // namespace cuvs::neighbors::ball_cover::detail::faiss_select diff --git a/cpp/src/neighbors/faiss_select/Select.cuh b/cpp/src/neighbors/faiss_select/Select.cuh index 873688418..17f682523 100644 --- a/cpp/src/neighbors/faiss_select/Select.cuh +++ b/cpp/src/neighbors/faiss_select/Select.cuh @@ -13,7 +13,7 @@ #include #include -namespace cuvs::neighbors::detail::faiss_select { +namespace cuvs::neighbors::ball_cover::detail::faiss_select { // Specialization for block-wide monotonic merges producing a merge sort // since what we really want is a constexpr loop expansion @@ -566,4 +566,4 @@ struct WarpSelect { V threadV; }; -} // namespace cuvs::neighbors::detail::faiss_select +} // namespace cuvs::neighbors::ball_cover::detail::faiss_select diff --git a/cpp/src/neighbors/faiss_select/StaticUtils.h b/cpp/src/neighbors/faiss_select/StaticUtils.h index 05ee3c0a3..87124ffe0 100644 --- a/cpp/src/neighbors/faiss_select/StaticUtils.h +++ b/cpp/src/neighbors/faiss_select/StaticUtils.h @@ -15,7 +15,7 @@ #define __device__ #endif -namespace cuvs::neighbors::detail::faiss_select::utils { +namespace cuvs::neighbors::ball_cover::detail::faiss_select::utils { template constexpr __host__ __device__ bool isPowerOf2(T v) @@ 
-45,4 +45,4 @@ static_assert(nextHighestPowerOf2(1536000000u) == 2147483648u, "nextHighestPower static_assert(nextHighestPowerOf2((size_t)2147483648ULL) == (size_t)4294967296ULL, "nextHighestPowerOf2"); -} // namespace cuvs::neighbors::detail::faiss_select::utils +} // namespace cuvs::neighbors::ball_cover::detail::faiss_select::utils diff --git a/cpp/src/neighbors/faiss_select/key_value_block_select.cuh b/cpp/src/neighbors/faiss_select/key_value_block_select.cuh index 2bb5f84cc..67882a308 100644 --- a/cpp/src/neighbors/faiss_select/key_value_block_select.cuh +++ b/cpp/src/neighbors/faiss_select/key_value_block_select.cuh @@ -14,7 +14,7 @@ // because this will change the max k that can be processed. One solution might be to break // up k into multiple batches for larger k. -namespace cuvs::neighbors::detail::faiss_select { +namespace cuvs::neighbors::ball_cover::detail::faiss_select { // `Dir` true, produce largest values. // `Dir` false, produce smallest values. @@ -226,4 +226,4 @@ struct KeyValueBlockSelect { int kMinus1; }; -} // namespace cuvs::neighbors::detail::faiss_select +} // namespace cuvs::neighbors::ball_cover::detail::faiss_select From 7d7283c4ceaf1bf43c903d6eb29c6e6856744dac Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Tue, 27 Aug 2024 12:23:59 -0400 Subject: [PATCH 6/9] Adding ball cover gtest --- cpp/test/CMakeLists.txt | 13 ++++--------- cpp/test/neighbors/ball_cover.cu | 3 ++- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index 53ead3e1b..780bdd7f8 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -130,15 +130,8 @@ if(BUILD_TESTS) ) ConfigureTest( - NAME - NEIGHBORS_ANN_BRUTE_FORCE_TEST - PATH - neighbors/ann_brute_force/test_float.cu - neighbors/ann_brute_force/test_half.cu - GPUS - 1 - PERCENT - 100 + NAME NEIGHBORS_ANN_BRUTE_FORCE_TEST PATH neighbors/ann_brute_force/test_float.cu + neighbors/ann_brute_force/test_half.cu GPUS 1 PERCENT 100 ) ConfigureTest( @@ -167,6 +160,8 @@ if(BUILD_TESTS) 100 ) + ConfigureTest(NAME NEIGHBORS_BALL_COVER_TEST PATH neighbors/ball_cover.cu GPUS 1 PERCENT 100) + if(BUILD_CAGRA_HNSWLIB) ConfigureTest(NAME NEIGHBORS_HNSW_TEST PATH neighbors/hnsw.cu GPUS 1 PERCENT 100) endif() diff --git a/cpp/test/neighbors/ball_cover.cu b/cpp/test/neighbors/ball_cover.cu index 1545982f5..9a2f76059 100644 --- a/cpp/test/neighbors/ball_cover.cu +++ b/cpp/test/neighbors/ball_cover.cu @@ -121,7 +121,8 @@ void compute_bfknn(const raft::resources& handle, bfindex, raft::make_device_matrix_view(X2, n_query_rows, d), raft::make_device_matrix_view(inds, n_query_rows, k), - raft::make_device_matrix_view(dists, n_query_rows, k)); + raft::make_device_matrix_view(dists, n_query_rows, k), + std::nullopt); } struct ToRadians { From 6e26ee970679c0cfb2a59e23f269841acce616a0 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Wed, 28 Aug 2024 14:06:18 -0400 Subject: [PATCH 7/9] Updating usage examples --- cpp/include/cuvs/neighbors/ball_cover.hpp | 25 +++++++++++------------ 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/cpp/include/cuvs/neighbors/ball_cover.hpp b/cpp/include/cuvs/neighbors/ball_cover.hpp index 1ca588aa2..148e099f1 100644 --- a/cpp/include/cuvs/neighbors/ball_cover.hpp +++ b/cpp/include/cuvs/neighbors/ball_cover.hpp @@ -168,15 +168,14 @@ struct index : cuvs::neighbors::index { * @code{.cpp} * * #include - * #include - * #include - * using namespace raft::neighbors; + * #include + * #include + * using namespace cuvs::neighbors; * * raft::resources handle; * ... * auto metric = cuvs::distance::DistanceType::L2Expanded; - * cuvs::neighbors::ball_cover::index index(handle, X, metric); - * + * ball_cover::index index(handle, X, metric); * ball_cover::build_index(handle, index); * @endcode * @@ -206,16 +205,16 @@ void build(raft::resources const& handle, index - * #include - * #include - * using namespace raft::neighbors; + * #include + * #include + * using namespace cuvs::neighbors; * * raft::resources handle; * ... * auto metric = cuvs::distance::DistanceType::L2Expanded; * * // Construct a ball cover index - * cuvs::neighbors::ball_cover::index index(handle, X, metric); + * ball_cover::index index(handle, X, metric); * * // Perform all neighbors knn query * ball_cover::all_knn_query(handle, index, inds, dists, k); @@ -315,16 +314,16 @@ void eps_nn(raft::resources const& handle, * @code{.cpp} * * #include - * #include - * #include - * using namespace raft::neighbors; + * #include + * #include + * using namespace cuvs::neighbors; * * raft::resources handle; * ...
* auto metric = cuvs::distance::DistanceType::L2Expanded; * * // Build a ball cover index - * cuvs::neighbors::ball_cover::index index(handle, X, metric); + * ball_cover::index index(handle, X, metric); * ball_cover::build_index(handle, index); * * // Perform all neighbors knn query From c25f35ada37ae7211053095f6668de6ccb618f5e Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Wed, 28 Aug 2024 14:18:48 -0400 Subject: [PATCH 8/9] updating copyrights --- cpp/include/cuvs/neighbors/ball_cover.hpp | 2 +- cpp/src/neighbors/ball_cover/ball_cover.cuh | 3 +-- cpp/src/neighbors/ball_cover/registers.cuh | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/cpp/include/cuvs/neighbors/ball_cover.hpp b/cpp/include/cuvs/neighbors/ball_cover.hpp index 148e099f1..97365eb78 100644 --- a/cpp/include/cuvs/neighbors/ball_cover.hpp +++ b/cpp/include/cuvs/neighbors/ball_cover.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/neighbors/ball_cover/ball_cover.cuh b/cpp/src/neighbors/ball_cover/ball_cover.cuh index fa6f1902d..d8a1410a6 100644 --- a/cpp/src/neighbors/ball_cover/ball_cover.cuh +++ b/cpp/src/neighbors/ball_cover/ball_cover.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -22,7 +22,6 @@ #include "registers_types.cuh" #include -#include "../faiss_select/key_value_block_select.cuh" #include #include #include diff --git a/cpp/src/neighbors/ball_cover/registers.cuh b/cpp/src/neighbors/ball_cover/registers.cuh index 6fe0cfd27..1dc4a0bc9 100644 --- a/cpp/src/neighbors/ball_cover/registers.cuh +++ b/cpp/src/neighbors/ball_cover/registers.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. From 30aedf6cb63e8b98247b08a51e7e4ed0becfe59d Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Wed, 28 Aug 2024 14:29:15 -0400 Subject: [PATCH 9/9] Removing faiss_select --- .../neighbors/ball_cover/registers-inl.cuh | 6 +- .../neighbors/faiss_select/Comparators.cuh | 29 - .../neighbors/faiss_select/DistanceUtils.h | 52 -- .../faiss_select/MergeNetworkBlock.cuh | 277 --------- .../faiss_select/MergeNetworkUtils.cuh | 25 - .../faiss_select/MergeNetworkWarp.cuh | 519 ---------------- cpp/src/neighbors/faiss_select/Select.cuh | 569 ------------------ cpp/src/neighbors/faiss_select/StaticUtils.h | 48 -- .../faiss_select/key_value_block_select.cuh | 229 ------- notebooks/rmm_log.txt | 2 - 10 files changed, 3 insertions(+), 1753 deletions(-) delete mode 100644 cpp/src/neighbors/faiss_select/Comparators.cuh delete mode 100644 cpp/src/neighbors/faiss_select/DistanceUtils.h delete mode 100644 cpp/src/neighbors/faiss_select/MergeNetworkBlock.cuh delete mode 100644 cpp/src/neighbors/faiss_select/MergeNetworkUtils.cuh delete mode 100644 cpp/src/neighbors/faiss_select/MergeNetworkWarp.cuh delete mode 100644 cpp/src/neighbors/faiss_select/Select.cuh delete mode 100644 cpp/src/neighbors/faiss_select/StaticUtils.h delete mode 100644 cpp/src/neighbors/faiss_select/key_value_block_select.cuh delete mode 100644 notebooks/rmm_log.txt diff --git 
a/cpp/src/neighbors/ball_cover/registers-inl.cuh b/cpp/src/neighbors/ball_cover/registers-inl.cuh index a94c21ab2..07a723e85 100644 --- a/cpp/src/neighbors/ball_cover/registers-inl.cuh +++ b/cpp/src/neighbors/ball_cover/registers-inl.cuh @@ -21,9 +21,9 @@ #include "registers_types.cuh" // DistFunc #include -#include "../faiss_select/key_value_block_select.cuh" #include #include +#include #include #include @@ -181,7 +181,7 @@ RAFT_KERNEL compute_final_dists_registers(const value_t* X_reordered, local_x_ptr[j] = x_ptr[j]; } - using namespace cuvs::neighbors::ball_cover::detail::faiss_select; + using namespace raft::neighbors::detail::faiss_select; KeyValueBlockSelect, warp_q, thread_q, tpb> heap( std::numeric_limits::max(), std::numeric_limits::max(), @@ -345,7 +345,7 @@ RAFT_KERNEL block_rbc_kernel_registers(const value_t* X_reordered, } // Each warp works on 1 R - using namespace cuvs::neighbors::ball_cover::detail::faiss_select; + using namespace raft::neighbors::detail::faiss_select; KeyValueBlockSelect, warp_q, thread_q, tpb> heap( std::numeric_limits::max(), std::numeric_limits::max(), diff --git a/cpp/src/neighbors/faiss_select/Comparators.cuh b/cpp/src/neighbors/faiss_select/Comparators.cuh deleted file mode 100644 index 3983cc9ba..000000000 --- a/cpp/src/neighbors/faiss_select/Comparators.cuh +++ /dev/null @@ -1,29 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. 
- * - * This source code is licensed under the MIT license found in the - * LICENSE file thirdparty/LICENSES/LICENSE.faiss - */ - -#pragma once - -#include -#include - -namespace cuvs::neighbors::ball_cover::detail::faiss_select { - -template -struct Comparator { - __device__ static inline bool lt(T a, T b) { return a < b; } - - __device__ static inline bool gt(T a, T b) { return a > b; } -}; - -template <> -struct Comparator { - __device__ static inline bool lt(half a, half b) { return __hlt(a, b); } - - __device__ static inline bool gt(half a, half b) { return __hgt(a, b); } -}; - -} // namespace cuvs::neighbors::ball_cover::detail::faiss_select diff --git a/cpp/src/neighbors/faiss_select/DistanceUtils.h b/cpp/src/neighbors/faiss_select/DistanceUtils.h deleted file mode 100644 index 71fdbf0cf..000000000 --- a/cpp/src/neighbors/faiss_select/DistanceUtils.h +++ /dev/null @@ -1,52 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file thirdparty/LICENSES/LICENSE.faiss - */ - -#pragma once - -namespace cuvs::neighbors::ball_cover::detail::faiss_select { -// If the inner size (dim) of the vectors is small, we want a larger query tile -// size, like 1024 -inline void chooseTileSize(size_t numQueries, - size_t numCentroids, - size_t dim, - size_t elementSize, - size_t totalMem, - size_t& tileRows, - size_t& tileCols) -{ - // The matrix multiplication should be large enough to be efficient, but if - // it is too large, we seem to lose efficiency as opposed to - // double-streaming. Each tile size here defines 1/2 of the memory use due - // to double streaming. We ignore available temporary memory, as that is - // adjusted independently by the user and can thus meet these requirements - // (or not). For <= 4 GB GPUs, prefer 512 MB of usage. For <= 8 GB GPUs, - // prefer 768 MB of usage. Otherwise, prefer 1 GB of usage. 
- size_t targetUsage = 0; - - if (totalMem <= ((size_t)4) * 1024 * 1024 * 1024) { - targetUsage = 512 * 1024 * 1024; - } else if (totalMem <= ((size_t)8) * 1024 * 1024 * 1024) { - targetUsage = 768 * 1024 * 1024; - } else { - targetUsage = 1024 * 1024 * 1024; - } - - targetUsage /= 2 * elementSize; - - // 512 seems to be a batch size sweetspot for float32. - // If we are on float16, increase to 512. - // If the k size (vec dim) of the matrix multiplication is small (<= 32), - // increase to 1024. - size_t preferredTileRows = 512; - if (dim <= 32) { preferredTileRows = 1024; } - - tileRows = std::min(preferredTileRows, numQueries); - - // tileCols is the remainder size - tileCols = std::min(targetUsage / preferredTileRows, numCentroids); -} -} // namespace cuvs::neighbors::ball_cover::detail::faiss_select diff --git a/cpp/src/neighbors/faiss_select/MergeNetworkBlock.cuh b/cpp/src/neighbors/faiss_select/MergeNetworkBlock.cuh deleted file mode 100644 index 0258183b0..000000000 --- a/cpp/src/neighbors/faiss_select/MergeNetworkBlock.cuh +++ /dev/null @@ -1,277 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. 
- * - * This source code is licensed under the MIT license found in the - * LICENSE file thirdparty/LICENSES/LICENSE.faiss - */ - -#pragma once - -#include "MergeNetworkUtils.cuh" -#include "StaticUtils.h" - -#include - -namespace cuvs::neighbors::ball_cover::detail::faiss_select { - -// Merge pairs of lists smaller than blockDim.x (NumThreads) -template -inline __device__ void blockMergeSmall(K* listK, V* listV) -{ - static_assert(utils::isPowerOf2(L), "L must be a power-of-2"); - static_assert(utils::isPowerOf2(NumThreads), "NumThreads must be a power-of-2"); - static_assert(L <= NumThreads, "merge list size must be <= NumThreads"); - - // Which pair of lists we are merging - int mergeId = threadIdx.x / L; - - // Which thread we are within the merge - int tid = threadIdx.x % L; - - // listK points to a region of size N * 2 * L - listK += 2 * L * mergeId; - listV += 2 * L * mergeId; - - // It's not a bitonic merge, both lists are in the same direction, - // so handle the first swap assuming the second list is reversed - int pos = L - 1 - tid; - int stride = 2 * tid + 1; - - if (AllThreads || (threadIdx.x < N * L)) { - K ka = listK[pos]; - K kb = listK[pos + stride]; - - bool swap = Dir ? Comp::gt(ka, kb) : Comp::lt(ka, kb); - listK[pos] = swap ? kb : ka; - listK[pos + stride] = swap ? ka : kb; - - V va = listV[pos]; - V vb = listV[pos + stride]; - listV[pos] = swap ? vb : va; - listV[pos + stride] = swap ? va : vb; - - // FIXME: is this a CUDA 9 compiler bug? - // K& ka = listK[pos]; - // K& kb = listK[pos + stride]; - - // bool s = Dir ? Comp::gt(ka, kb) : Comp::lt(ka, kb); - // swap(s, ka, kb); - - // V& va = listV[pos]; - // V& vb = listV[pos + stride]; - // swap(s, va, vb); - } - - __syncthreads(); - -#pragma unroll - for (int stride = L / 2; stride > 0; stride /= 2) { - int pos = 2 * tid - (tid & (stride - 1)); - - if (AllThreads || (threadIdx.x < N * L)) { - K ka = listK[pos]; - K kb = listK[pos + stride]; - - bool swap = Dir ? 
Comp::gt(ka, kb) : Comp::lt(ka, kb); - listK[pos] = swap ? kb : ka; - listK[pos + stride] = swap ? ka : kb; - - V va = listV[pos]; - V vb = listV[pos + stride]; - listV[pos] = swap ? vb : va; - listV[pos + stride] = swap ? va : vb; - - // FIXME: is this a CUDA 9 compiler bug? - // K& ka = listK[pos]; - // K& kb = listK[pos + stride]; - - // bool s = Dir ? Comp::gt(ka, kb) : Comp::lt(ka, kb); - // swap(s, ka, kb); - - // V& va = listV[pos]; - // V& vb = listV[pos + stride]; - // swap(s, va, vb); - } - - __syncthreads(); - } -} - -// Merge pairs of sorted lists larger than blockDim.x (NumThreads) -template -inline __device__ void blockMergeLarge(K* listK, V* listV) -{ - static_assert(utils::isPowerOf2(L), "L must be a power-of-2"); - static_assert(L >= raft::WarpSize, "merge list size must be >= 32"); - static_assert(utils::isPowerOf2(NumThreads), "NumThreads must be a power-of-2"); - static_assert(L >= NumThreads, "merge list size must be >= NumThreads"); - - // For L > NumThreads, each thread has to perform more work - // per each stride. - constexpr int kLoopPerThread = L / NumThreads; - - // It's not a bitonic merge, both lists are in the same direction, - // so handle the first swap assuming the second list is reversed -#pragma unroll - for (int loop = 0; loop < kLoopPerThread; ++loop) { - int tid = loop * NumThreads + threadIdx.x; - int pos = L - 1 - tid; - int stride = 2 * tid + 1; - - K ka = listK[pos]; - K kb = listK[pos + stride]; - - bool swap = Dir ? Comp::gt(ka, kb) : Comp::lt(ka, kb); - listK[pos] = swap ? kb : ka; - listK[pos + stride] = swap ? ka : kb; - - V va = listV[pos]; - V vb = listV[pos + stride]; - listV[pos] = swap ? vb : va; - listV[pos + stride] = swap ? va : vb; - - // FIXME: is this a CUDA 9 compiler bug? - // K& ka = listK[pos]; - // K& kb = listK[pos + stride]; - - // bool s = Dir ? 
Comp::gt(ka, kb) : Comp::lt(ka, kb); - // swap(s, ka, kb); - - // V& va = listV[pos]; - // V& vb = listV[pos + stride]; - // swap(s, va, vb); - } - - __syncthreads(); - - constexpr int kSecondLoopPerThread = FullMerge ? kLoopPerThread : kLoopPerThread / 2; - -#pragma unroll - for (int stride = L / 2; stride > 0; stride /= 2) { -#pragma unroll - for (int loop = 0; loop < kSecondLoopPerThread; ++loop) { - int tid = loop * NumThreads + threadIdx.x; - int pos = 2 * tid - (tid & (stride - 1)); - - K ka = listK[pos]; - K kb = listK[pos + stride]; - - bool swap = Dir ? Comp::gt(ka, kb) : Comp::lt(ka, kb); - listK[pos] = swap ? kb : ka; - listK[pos + stride] = swap ? ka : kb; - - V va = listV[pos]; - V vb = listV[pos + stride]; - listV[pos] = swap ? vb : va; - listV[pos + stride] = swap ? va : vb; - - // FIXME: is this a CUDA 9 compiler bug? - // K& ka = listK[pos]; - // K& kb = listK[pos + stride]; - - // bool s = Dir ? Comp::gt(ka, kb) : Comp::lt(ka, kb); - // swap(s, ka, kb); - - // V& va = listV[pos]; - // V& vb = listV[pos + stride]; - // swap(s, va, vb); - } - - __syncthreads(); - } -} - -/// Class template to prevent static_assert from firing for -/// mixing smaller/larger than block cases -template -struct BlockMerge {}; - -/// Merging lists smaller than a block -template -struct BlockMerge { - static inline __device__ void merge(K* listK, V* listV) - { - constexpr int kNumParallelMerges = NumThreads / L; - constexpr int kNumIterations = N / kNumParallelMerges; - - static_assert(L <= NumThreads, "list must be <= NumThreads"); - static_assert((N < kNumParallelMerges) || (kNumIterations * kNumParallelMerges == N), - "improper selection of N and L"); - - if (N < kNumParallelMerges) { - // We only need L threads per each list to perform the merge - blockMergeSmall(listK, listV); - } else { - // All threads participate -#pragma unroll - for (int i = 0; i < kNumIterations; ++i) { - int start = i * kNumParallelMerges * 2 * L; - - blockMergeSmall(listK + start, - listV + 
start); - } - } - } -}; - -/// Merging lists larger than a block -template -struct BlockMerge { - static inline __device__ void merge(K* listK, V* listV) - { - // Each pair of lists is merged sequentially -#pragma unroll - for (int i = 0; i < N; ++i) { - int start = i * 2 * L; - - blockMergeLarge(listK + start, listV + start); - } - } -}; - -template -inline __device__ void blockMerge(K* listK, V* listV) -{ - constexpr bool kSmallerThanBlock = (L <= NumThreads); - - BlockMerge::merge(listK, listV); -} - -} // namespace cuvs::neighbors::ball_cover::detail::faiss_select diff --git a/cpp/src/neighbors/faiss_select/MergeNetworkUtils.cuh b/cpp/src/neighbors/faiss_select/MergeNetworkUtils.cuh deleted file mode 100644 index 4406c3545..000000000 --- a/cpp/src/neighbors/faiss_select/MergeNetworkUtils.cuh +++ /dev/null @@ -1,25 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file thirdparty/LICENSES/LICENSE.faiss - */ - -#pragma once - -namespace cuvs::neighbors::ball_cover::detail::faiss_select { - -template -inline __device__ void swap(bool swap, T& x, T& y) -{ - T tmp = x; - x = swap ? y : x; - y = swap ? tmp : y; -} - -template -inline __device__ void assign(bool assign, T& x, T y) -{ - x = assign ? y : x; -} -} // namespace cuvs::neighbors::ball_cover::detail::faiss_select diff --git a/cpp/src/neighbors/faiss_select/MergeNetworkWarp.cuh b/cpp/src/neighbors/faiss_select/MergeNetworkWarp.cuh deleted file mode 100644 index b6039accc..000000000 --- a/cpp/src/neighbors/faiss_select/MergeNetworkWarp.cuh +++ /dev/null @@ -1,519 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. 
- * - * This source code is licensed under the MIT license found in the - * LICENSE file thirdparty/LICENSES/LICENSE.faiss - */ - -#pragma once - -#include "MergeNetworkUtils.cuh" -#include "StaticUtils.h" -#include - -namespace cuvs::neighbors::ball_cover::detail::faiss_select { - -// -// This file contains functions to: -// -// -perform bitonic merges on pairs of sorted lists, held in -// registers. Each list contains N * raft::WarpSize (multiple of 32) -// elements for some N. -// The bitonic merge is implemented for arbitrary sizes; -// sorted list A of size N1 * raft::WarpSize registers -// sorted list B of size N2 * raft::WarpSize registers => -// sorted list C if size (N1 + N2) * raft::WarpSize registers. N1 and N2 -// are >= 1 and don't have to be powers of 2. -// -// -perform bitonic sorts on a set of N * raft::WarpSize key/value pairs -// held in registers, by using the above bitonic merge as a -// primitive. -// N can be an arbitrary N >= 1; i.e., the bitonic sort here supports -// odd sizes and doesn't require the input to be a power of 2. -// -// The sort or merge network is completely statically instantiated via -// template specialization / expansion and constexpr, and it uses warp -// shuffles to exchange values between warp lanes. -// -// A note about comparisons: -// -// For a sorting network of keys only, we only need one -// comparison (a < b). However, what we really need to know is -// if one lane chooses to exchange a value, then the -// corresponding lane should also do the exchange. -// Thus, if one just uses the negation !(x < y) in the higher -// lane, this will also include the case where (x == y). Thus, one -// lane in fact performs an exchange and the other doesn't, but -// because the only value being exchanged is equivalent, nothing has -// changed. -// So, you can get away with just one comparison and its negation. 
-// -// If we're sorting keys and values, where equivalent keys can -// exist, then this is a problem, since we want to treat (x, v1) -// as not equivalent to (x, v2). -// -// To remedy this, you can either compare with a lexicographic -// ordering (a.k < b.k || (a.k == b.k && a.v < b.v)), which since -// we're predicating all of the choices results in 3 comparisons -// being executed, or we can invert the selection so that there is no -// middle choice of equality; the other lane will likewise -// check that (b.k > a.k) (the higher lane has the values -// swapped). Then, the first lane swaps if and only if the -// second lane swaps; if both lanes have equivalent keys, no -// swap will be performed. This results in only two comparisons -// being executed. -// -// If you don't consider values as well, then this does not produce a -// consistent ordering among (k, v) pairs with equivalent keys but -// different values; for us, we don't really care about ordering or -// stability here. -// -// I have tried both re-arranging the order in the higher lane to get -// away with one comparison or adding the value to the check; both -// result in greater register consumption or lower speed than just -// performing both < and > comparisons with the variables, so I just -// stick with this. - -// This function merges raft::WarpSize / 2L lists in parallel using warp -// shuffles. -// It works on at most size-16 lists, as we need 32 threads for this -// shuffle merge. -// -// If IsBitonic is false, the first stage is reversed, so we don't -// need to sort directionally. It's still technically a bitonic sort. -template -inline __device__ void warpBitonicMergeLE16(K& k, V& v) -{ - static_assert(utils::isPowerOf2(L), "L must be a power-of-2"); - static_assert(L <= raft::WarpSize / 2, "merge list size must be <= 16"); - - int laneId = raft::laneId(); - - if (!IsBitonic) { - // Reverse the first comparison stage. 
- // For example, merging a list of size 8 has the exchanges: - // 0 <-> 15, 1 <-> 14, ... - K otherK = raft::shfl_xor(k, 2 * L - 1); - V otherV = raft::shfl_xor(v, 2 * L - 1); - - // Whether we are the lesser thread in the exchange - bool small = !(laneId & L); - - if (Dir) { - // See the comment above how performing both of these - // comparisons in the warp seems to win out over the - // alternatives in practice - bool s = small ? Comp::gt(k, otherK) : Comp::lt(k, otherK); - assign(s, k, otherK); - assign(s, v, otherV); - - } else { - bool s = small ? Comp::lt(k, otherK) : Comp::gt(k, otherK); - assign(s, k, otherK); - assign(s, v, otherV); - } - } - -#pragma unroll - for (int stride = IsBitonic ? L : L / 2; stride > 0; stride /= 2) { - K otherK = raft::shfl_xor(k, stride); - V otherV = raft::shfl_xor(v, stride); - - // Whether we are the lesser thread in the exchange - bool small = !(laneId & stride); - - if (Dir) { - bool s = small ? Comp::gt(k, otherK) : Comp::lt(k, otherK); - assign(s, k, otherK); - assign(s, v, otherV); - - } else { - bool s = small ? Comp::lt(k, otherK) : Comp::gt(k, otherK); - assign(s, k, otherK); - assign(s, v, otherV); - } - } -} - -// Template for performing a bitonic merge of an arbitrary set of -// registers -template -struct BitonicMergeStep {}; - -// -// Power-of-2 merge specialization -// - -// All merges eventually call this -template -struct BitonicMergeStep { - static inline __device__ void merge(K k[1], V v[1]) - { - // Use warp shuffles - warpBitonicMergeLE16(k[0], v[0]); - } -}; - -template -struct BitonicMergeStep { - static inline __device__ void merge(K k[N], V v[N]) - { - static_assert(utils::isPowerOf2(N), "must be power of 2"); - static_assert(N > 1, "must be N > 1"); - -#pragma unroll - for (int i = 0; i < N / 2; ++i) { - K& ka = k[i]; - V& va = v[i]; - - K& kb = k[i + N / 2]; - V& vb = v[i + N / 2]; - - bool s = Dir ? 
Comp::gt(ka, kb) : Comp::lt(ka, kb); - swap(s, ka, kb); - swap(s, va, vb); - } - - { - K newK[N / 2]; - V newV[N / 2]; - -#pragma unroll - for (int i = 0; i < N / 2; ++i) { - newK[i] = k[i]; - newV[i] = v[i]; - } - - BitonicMergeStep::merge(newK, newV); - -#pragma unroll - for (int i = 0; i < N / 2; ++i) { - k[i] = newK[i]; - v[i] = newV[i]; - } - } - - { - K newK[N / 2]; - V newV[N / 2]; - -#pragma unroll - for (int i = 0; i < N / 2; ++i) { - newK[i] = k[i + N / 2]; - newV[i] = v[i + N / 2]; - } - - BitonicMergeStep::merge(newK, newV); - -#pragma unroll - for (int i = 0; i < N / 2; ++i) { - k[i + N / 2] = newK[i]; - v[i + N / 2] = newV[i]; - } - } - } -}; - -// -// Non-power-of-2 merge specialization -// - -// Low recursion -template -struct BitonicMergeStep { - static inline __device__ void merge(K k[N], V v[N]) - { - static_assert(!utils::isPowerOf2(N), "must be non-power-of-2"); - static_assert(N >= 3, "must be N >= 3"); - - constexpr int kNextHighestPowerOf2 = utils::nextHighestPowerOf2(N); - -#pragma unroll - for (int i = 0; i < N - kNextHighestPowerOf2 / 2; ++i) { - K& ka = k[i]; - V& va = v[i]; - - K& kb = k[i + kNextHighestPowerOf2 / 2]; - V& vb = v[i + kNextHighestPowerOf2 / 2]; - - bool s = Dir ? Comp::gt(ka, kb) : Comp::lt(ka, kb); - swap(s, ka, kb); - swap(s, va, vb); - } - - constexpr int kLowSize = N - kNextHighestPowerOf2 / 2; - constexpr int kHighSize = kNextHighestPowerOf2 / 2; - { - K newK[kLowSize]; - V newV[kLowSize]; - -#pragma unroll - for (int i = 0; i < kLowSize; ++i) { - newK[i] = k[i]; - newV[i] = v[i]; - } - - constexpr bool kLowIsPowerOf2 = utils::isPowerOf2(N - kNextHighestPowerOf2 / 2); - // FIXME: compiler doesn't like this expression? compiler bug? 
- // constexpr bool kLowIsPowerOf2 = utils::isPowerOf2(kLowSize); - BitonicMergeStep::merge(newK, newV); - -#pragma unroll - for (int i = 0; i < kLowSize; ++i) { - k[i] = newK[i]; - v[i] = newV[i]; - } - } - - { - K newK[kHighSize]; - V newV[kHighSize]; - -#pragma unroll - for (int i = 0; i < kHighSize; ++i) { - newK[i] = k[i + kLowSize]; - newV[i] = v[i + kLowSize]; - } - - constexpr bool kHighIsPowerOf2 = utils::isPowerOf2(kNextHighestPowerOf2 / 2); - // FIXME: compiler doesn't like this expression? compiler bug? - // constexpr bool kHighIsPowerOf2 = - // utils::isPowerOf2(kHighSize); - BitonicMergeStep::merge(newK, newV); - -#pragma unroll - for (int i = 0; i < kHighSize; ++i) { - k[i + kLowSize] = newK[i]; - v[i + kLowSize] = newV[i]; - } - } - } -}; - -// High recursion -template -struct BitonicMergeStep { - static inline __device__ void merge(K k[N], V v[N]) - { - static_assert(!utils::isPowerOf2(N), "must be non-power-of-2"); - static_assert(N >= 3, "must be N >= 3"); - - constexpr int kNextHighestPowerOf2 = utils::nextHighestPowerOf2(N); - -#pragma unroll - for (int i = 0; i < N - kNextHighestPowerOf2 / 2; ++i) { - K& ka = k[i]; - V& va = v[i]; - - K& kb = k[i + kNextHighestPowerOf2 / 2]; - V& vb = v[i + kNextHighestPowerOf2 / 2]; - - bool s = Dir ? Comp::gt(ka, kb) : Comp::lt(ka, kb); - swap(s, ka, kb); - swap(s, va, vb); - } - - constexpr int kLowSize = kNextHighestPowerOf2 / 2; - constexpr int kHighSize = N - kNextHighestPowerOf2 / 2; - { - K newK[kLowSize]; - V newV[kLowSize]; - -#pragma unroll - for (int i = 0; i < kLowSize; ++i) { - newK[i] = k[i]; - newV[i] = v[i]; - } - - constexpr bool kLowIsPowerOf2 = utils::isPowerOf2(kNextHighestPowerOf2 / 2); - // FIXME: compiler doesn't like this expression? compiler bug? 
- // constexpr bool kLowIsPowerOf2 = utils::isPowerOf2(kLowSize); - BitonicMergeStep::merge(newK, newV); - -#pragma unroll - for (int i = 0; i < kLowSize; ++i) { - k[i] = newK[i]; - v[i] = newV[i]; - } - } - - { - K newK[kHighSize]; - V newV[kHighSize]; - -#pragma unroll - for (int i = 0; i < kHighSize; ++i) { - newK[i] = k[i + kLowSize]; - newV[i] = v[i + kLowSize]; - } - - constexpr bool kHighIsPowerOf2 = utils::isPowerOf2(N - kNextHighestPowerOf2 / 2); - // FIXME: compiler doesn't like this expression? compiler bug? - // constexpr bool kHighIsPowerOf2 = - // utils::isPowerOf2(kHighSize); - BitonicMergeStep::merge(newK, newV); - -#pragma unroll - for (int i = 0; i < kHighSize; ++i) { - k[i + kLowSize] = newK[i]; - v[i + kLowSize] = newV[i]; - } - } - } -}; - -/// Merges two sets of registers across the warp of any size; -/// i.e., merges a sorted k/v list of size raft::WarpSize * N1 with a -/// sorted k/v list of size raft::WarpSize * N2, where N1 and N2 are any -/// value >= 1 -template -inline __device__ void warpMergeAnyRegisters(K k1[N1], V v1[N1], K k2[N2], V v2[N2]) -{ - constexpr int kSmallestN = N1 < N2 ? N1 : N2; - -#pragma unroll - for (int i = 0; i < kSmallestN; ++i) { - K& ka = k1[N1 - 1 - i]; - V& va = v1[N1 - 1 - i]; - - K& kb = k2[i]; - V& vb = v2[i]; - - K otherKa; - V otherVa; - - if (FullMerge) { - // We need the other values - otherKa = raft::shfl_xor(ka, raft::WarpSize - 1); - otherVa = raft::shfl_xor(va, raft::WarpSize - 1); - } - - K otherKb = raft::shfl_xor(kb, raft::WarpSize - 1); - V otherVb = raft::shfl_xor(vb, raft::WarpSize - 1); - - // ka is always first in the list, so we needn't use our lane - // in this comparison - bool swapa = Dir ? Comp::gt(ka, otherKb) : Comp::lt(ka, otherKb); - assign(swapa, ka, otherKb); - assign(swapa, va, otherVb); - - // kb is always second in the list, so we needn't use our lane - // in this comparison - if (FullMerge) { - bool swapb = Dir ? 
Comp::lt(kb, otherKa) : Comp::gt(kb, otherKa); - assign(swapb, kb, otherKa); - assign(swapb, vb, otherVa); - - } else { - // We don't care about updating elements in the second list - } - } - - BitonicMergeStep::merge(k1, v1); - if (FullMerge) { - // Only if we care about N2 do we need to bother merging it fully - BitonicMergeStep::merge(k2, v2); - } -} - -// Recursive template that uses the above bitonic merge to perform a -// bitonic sort -template -struct BitonicSortStep { - static inline __device__ void sort(K k[N], V v[N]) - { - static_assert(N > 1, "did not hit specialized case"); - - // Sort recursively - constexpr int kSizeA = N / 2; - constexpr int kSizeB = N - kSizeA; - - K aK[kSizeA]; - V aV[kSizeA]; - -#pragma unroll - for (int i = 0; i < kSizeA; ++i) { - aK[i] = k[i]; - aV[i] = v[i]; - } - - BitonicSortStep::sort(aK, aV); - - K bK[kSizeB]; - V bV[kSizeB]; - -#pragma unroll - for (int i = 0; i < kSizeB; ++i) { - bK[i] = k[i + kSizeA]; - bV[i] = v[i + kSizeA]; - } - - BitonicSortStep::sort(bK, bV); - - // Merge halves - warpMergeAnyRegisters(aK, aV, bK, bV); - -#pragma unroll - for (int i = 0; i < kSizeA; ++i) { - k[i] = aK[i]; - v[i] = aV[i]; - } - -#pragma unroll - for (int i = 0; i < kSizeB; ++i) { - k[i + kSizeA] = bK[i]; - v[i + kSizeA] = bV[i]; - } - } -}; - -// Single warp (N == 1) sorting specialization -template -struct BitonicSortStep { - static inline __device__ void sort(K k[1], V v[1]) - { - // Update this code if this changes - // should go from 1 -> raft::WarpSize in multiples of 2 - static_assert(raft::WarpSize == 32, "unexpected warp size"); - - warpBitonicMergeLE16(k[0], v[0]); - warpBitonicMergeLE16(k[0], v[0]); - warpBitonicMergeLE16(k[0], v[0]); - warpBitonicMergeLE16(k[0], v[0]); - warpBitonicMergeLE16(k[0], v[0]); - } -}; - -/// Sort a list of raft::WarpSize * N elements in registers, where N is an -/// arbitrary >= 1 -template -inline __device__ void warpSortAnyRegisters(K k[N], V v[N]) -{ - BitonicSortStep::sort(k, v); -} - -} // 
namespace cuvs::neighbors::ball_cover::detail::faiss_select diff --git a/cpp/src/neighbors/faiss_select/Select.cuh b/cpp/src/neighbors/faiss_select/Select.cuh deleted file mode 100644 index 17f682523..000000000 --- a/cpp/src/neighbors/faiss_select/Select.cuh +++ /dev/null @@ -1,569 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file thirdparty/LICENSES/LICENSE.faiss - */ - -#pragma once - -#include "Comparators.cuh" -#include "MergeNetworkBlock.cuh" -#include "MergeNetworkWarp.cuh" -#include -#include - -namespace cuvs::neighbors::ball_cover::detail::faiss_select { - -// Specialization for block-wide monotonic merges producing a merge sort -// since what we really want is a constexpr loop expansion -template -struct FinalBlockMerge {}; - -template -struct FinalBlockMerge<1, NumThreads, K, V, NumWarpQ, Dir, Comp> { - static inline __device__ void merge(K* sharedK, V* sharedV) - { - // no merge required; single warp - } -}; - -template -struct FinalBlockMerge<2, NumThreads, K, V, NumWarpQ, Dir, Comp> { - static inline __device__ void merge(K* sharedK, V* sharedV) - { - // Final merge doesn't need to fully merge the second list - blockMerge( - sharedK, sharedV); - } -}; - -template -struct FinalBlockMerge<4, NumThreads, K, V, NumWarpQ, Dir, Comp> { - static inline __device__ void merge(K* sharedK, V* sharedV) - { - blockMerge(sharedK, - sharedV); - // Final merge doesn't need to fully merge the second list - blockMerge(sharedK, sharedV); - } -}; - -template -struct FinalBlockMerge<8, NumThreads, K, V, NumWarpQ, Dir, Comp> { - static inline __device__ void merge(K* sharedK, V* sharedV) - { - blockMerge(sharedK, - sharedV); - blockMerge( - sharedK, sharedV); - // Final merge doesn't need to fully merge the second list - blockMerge(sharedK, sharedV); - } -}; - -// `Dir` true, produce largest values. -// `Dir` false, produce smallest values. 
-template -struct BlockSelect { - static constexpr int kNumWarps = ThreadsPerBlock / raft::WarpSize; - static constexpr int kTotalWarpSortSize = NumWarpQ; - - __device__ inline BlockSelect(K initKVal, V initVVal, K* smemK, V* smemV, int k) - : initK(initKVal), - initV(initVVal), - numVals(0), - warpKTop(initKVal), - sharedK(smemK), - sharedV(smemV), - kMinus1(k - 1) - { - static_assert(utils::isPowerOf2(ThreadsPerBlock), "threads must be a power-of-2"); - static_assert(utils::isPowerOf2(NumWarpQ), "warp queue must be power-of-2"); - - // Fill the per-thread queue keys with the default value -#pragma unroll - for (int i = 0; i < NumThreadQ; ++i) { - threadK[i] = initK; - threadV[i] = initV; - } - - int laneId = raft::laneId(); - int warpId = threadIdx.x / raft::WarpSize; - warpK = sharedK + warpId * kTotalWarpSortSize; - warpV = sharedV + warpId * kTotalWarpSortSize; - - // Fill warp queue (only the actual queue space is fine, not where - // we write the per-thread queues for merging) - for (int i = laneId; i < NumWarpQ; i += raft::WarpSize) { - warpK[i] = initK; - warpV[i] = initV; - } - - raft::warpFence(); - } - - __device__ inline void addThreadQ(K k, V v) - { - if (Dir ? 
Comp::gt(k, warpKTop) : Comp::lt(k, warpKTop)) { - // Rotate right -#pragma unroll - for (int i = NumThreadQ - 1; i > 0; --i) { - threadK[i] = threadK[i - 1]; - threadV[i] = threadV[i - 1]; - } - - threadK[0] = k; - threadV[0] = v; - ++numVals; - } - } - - __device__ inline void checkThreadQ() - { - bool needSort = (numVals == NumThreadQ); - -#if CUDA_VERSION >= 9000 - needSort = __any_sync(0xffffffff, needSort); -#else - needSort = __any(needSort); -#endif - - if (!needSort) { - // no lanes have triggered a sort - return; - } - - // This has a trailing raft::warpFence - mergeWarpQ(); - - // Any top-k elements have been merged into the warp queue; we're - // free to reset the thread queues - numVals = 0; - -#pragma unroll - for (int i = 0; i < NumThreadQ; ++i) { - threadK[i] = initK; - threadV[i] = initV; - } - - // We have to beat at least this element - warpKTop = warpK[kMinus1]; - - raft::warpFence(); - } - - /// This function handles sorting and merging together the - /// per-thread queues with the warp-wide queue, creating a sorted - /// list across both - __device__ inline void mergeWarpQ() - { - int laneId = raft::laneId(); - - // Sort all of the per-thread queues - warpSortAnyRegisters(threadK, threadV); - - constexpr int kNumWarpQRegisters = NumWarpQ / raft::WarpSize; - K warpKRegisters[kNumWarpQRegisters]; - V warpVRegisters[kNumWarpQRegisters]; - -#pragma unroll - for (int i = 0; i < kNumWarpQRegisters; ++i) { - warpKRegisters[i] = warpK[i * raft::WarpSize + laneId]; - warpVRegisters[i] = warpV[i * raft::WarpSize + laneId]; - } - - raft::warpFence(); - - // The warp queue is already sorted, and now that we've sorted the - // per-thread queue, merge both sorted lists together, producing - // one sorted list - warpMergeAnyRegisters( - warpKRegisters, warpVRegisters, threadK, threadV); - - // Write back out the warp queue -#pragma unroll - for (int i = 0; i < kNumWarpQRegisters; ++i) { - warpK[i * raft::WarpSize + laneId] = warpKRegisters[i]; - warpV[i * 
raft::WarpSize + laneId] = warpVRegisters[i]; - } - - raft::warpFence(); - } - - /// WARNING: all threads in a warp must participate in this. - /// Otherwise, you must call the constituent parts separately. - __device__ inline void add(K k, V v) - { - addThreadQ(k, v); - checkThreadQ(); - } - - __device__ inline void reduce() - { - // Have all warps dump and merge their queues; this will produce - // the final per-warp results - mergeWarpQ(); - - // block-wide dep; thus far, all warps have been completely - // independent - __syncthreads(); - - // All warp queues are contiguous in smem. - // Now, we have kNumWarps lists of NumWarpQ elements. - // This is a power of 2. - FinalBlockMerge::merge(sharedK, sharedV); - - // The block-wide merge has a trailing syncthreads - } - - // Default element key - const K initK; - - // Default element value - const V initV; - - // Number of valid elements in our thread queue - int numVals; - - // The k-th highest (Dir) or lowest (!Dir) element - K warpKTop; - - // Thread queue values - K threadK[NumThreadQ]; - V threadV[NumThreadQ]; - - // Queues for all warps - K* sharedK; - V* sharedV; - - // Our warp's queue (points into sharedK/sharedV) - // warpK[0] is highest (Dir) or lowest (!Dir) - K* warpK; - V* warpV; - - // This is a cached k-1 value - int kMinus1; -}; - -/// Specialization for k == 1 (NumWarpQ == 1) -template -struct BlockSelect { - static constexpr int kNumWarps = ThreadsPerBlock / raft::WarpSize; - - __device__ inline BlockSelect(K initK, V initV, K* smemK, V* smemV, int k) - : threadK(initK), threadV(initV), sharedK(smemK), sharedV(smemV) - { - } - - __device__ inline void addThreadQ(K k, V v) - { - bool swap = Dir ? Comp::gt(k, threadK) : Comp::lt(k, threadK); - threadK = swap ? k : threadK; - threadV = swap ? 
v : threadV; - } - - __device__ inline void checkThreadQ() - { - // We don't need to do anything here, since the warp doesn't - // cooperate until the end - } - - __device__ inline void add(K k, V v) { addThreadQ(k, v); } - - __device__ inline void reduce() - { - // Reduce within the warp - raft::KeyValuePair pair(threadK, threadV); - - if (Dir) { - pair = raft::warpReduce(pair, raft::max_op{}); - } else { - pair = raft::warpReduce(pair, raft::min_op{}); - } - - // Each warp writes out a single value - int laneId = raft::laneId(); - int warpId = threadIdx.x / raft::WarpSize; - - if (laneId == 0) { - sharedK[warpId] = pair.key; - sharedV[warpId] = pair.value; - } - - __syncthreads(); - - // We typically use this for small blocks (<= 128), just having the - // first thread in the block perform the reduction across warps is - // faster - if (threadIdx.x == 0) { - threadK = sharedK[0]; - threadV = sharedV[0]; - -#pragma unroll - for (int i = 1; i < kNumWarps; ++i) { - K k = sharedK[i]; - V v = sharedV[i]; - - bool swap = Dir ? Comp::gt(k, threadK) : Comp::lt(k, threadK); - threadK = swap ? k : threadK; - threadV = swap ? v : threadV; - } - - // Hopefully a thread's smem reads/writes are ordered wrt - // itself, so no barrier needed :) - sharedK[0] = threadK; - sharedV[0] = threadV; - } - - // In case other threads wish to read this value - __syncthreads(); - } - - // threadK is lowest (Dir) or highest (!Dir) - K threadK; - V threadV; - - // Where we reduce in smem - K* sharedK; - V* sharedV; -}; - -// -// per-warp WarpSelect -// - -// `Dir` true, produce largest values. -// `Dir` false, produce smallest values. 
-template -struct WarpSelect { - static constexpr int kNumWarpQRegisters = NumWarpQ / raft::WarpSize; - - __device__ inline WarpSelect(K initKVal, V initVVal, int k) - : initK(initKVal), - initV(initVVal), - numVals(0), - warpKTop(initKVal), - kLane((k - 1) % raft::WarpSize) - { - static_assert(utils::isPowerOf2(ThreadsPerBlock), "threads must be a power-of-2"); - static_assert(utils::isPowerOf2(NumWarpQ), "warp queue must be power-of-2"); - - // Fill the per-thread queue keys with the default value -#pragma unroll - for (int i = 0; i < NumThreadQ; ++i) { - threadK[i] = initK; - threadV[i] = initV; - } - - // Fill the warp queue with the default value -#pragma unroll - for (int i = 0; i < kNumWarpQRegisters; ++i) { - warpK[i] = initK; - warpV[i] = initV; - } - } - - __device__ inline void addThreadQ(K k, V v) - { - if (Dir ? Comp::gt(k, warpKTop) : Comp::lt(k, warpKTop)) { - // Rotate right -#pragma unroll - for (int i = NumThreadQ - 1; i > 0; --i) { - threadK[i] = threadK[i - 1]; - threadV[i] = threadV[i - 1]; - } - - threadK[0] = k; - threadV[0] = v; - ++numVals; - } - } - - __device__ inline void checkThreadQ() - { - bool needSort = (numVals == NumThreadQ); - -#if CUDA_VERSION >= 9000 - needSort = __any_sync(0xffffffff, needSort); -#else - needSort = __any(needSort); -#endif - - if (!needSort) { - // no lanes have triggered a sort - return; - } - - mergeWarpQ(); - - // Any top-k elements have been merged into the warp queue; we're - // free to reset the thread queues - numVals = 0; - -#pragma unroll - for (int i = 0; i < NumThreadQ; ++i) { - threadK[i] = initK; - threadV[i] = initV; - } - - // We have to beat at least this element - warpKTop = shfl(warpK[kNumWarpQRegisters - 1], kLane); - } - - /// This function handles sorting and merging together the - /// per-thread queues with the warp-wide queue, creating a sorted - /// list across both - __device__ inline void mergeWarpQ() - { - // Sort all of the per-thread queues - warpSortAnyRegisters(threadK, threadV); 
- - // The warp queue is already sorted, and now that we've sorted the - // per-thread queue, merge both sorted lists together, producing - // one sorted list - warpMergeAnyRegisters( - warpK, warpV, threadK, threadV); - } - - /// WARNING: all threads in a warp must participate in this. - /// Otherwise, you must call the constituent parts separately. - __device__ inline void add(K k, V v) - { - addThreadQ(k, v); - checkThreadQ(); - } - - __device__ inline void reduce() - { - // Have all warps dump and merge their queues; this will produce - // the final per-warp results - mergeWarpQ(); - } - - /// Dump final k selected values for this warp out - __device__ inline void writeOut(K* outK, V* outV, int k) - { - int laneId = raft::laneId(); - -#pragma unroll - for (int i = 0; i < kNumWarpQRegisters; ++i) { - int idx = i * raft::WarpSize + laneId; - - if (idx < k) { - outK[idx] = warpK[i]; - outV[idx] = warpV[i]; - } - } - } - - // Default element key - const K initK; - - // Default element value - const V initV; - - // Number of valid elements in our thread queue - int numVals; - - // The k-th highest (Dir) or lowest (!Dir) element - K warpKTop; - - // Thread queue values - K threadK[NumThreadQ]; - V threadV[NumThreadQ]; - - // warpK[0] is highest (Dir) or lowest (!Dir) - K warpK[kNumWarpQRegisters]; - V warpV[kNumWarpQRegisters]; - - // This is what lane we should load an approximation (>=k) to the - // kth element from the last register in the warp queue (i.e., - // warpK[kNumWarpQRegisters - 1]). - int kLane; -}; - -/// Specialization for k == 1 (NumWarpQ == 1) -template -struct WarpSelect { - static constexpr int kNumWarps = ThreadsPerBlock / raft::WarpSize; - - __device__ inline WarpSelect(K initK, V initV, int k) : threadK(initK), threadV(initV) {} - - __device__ inline void addThreadQ(K k, V v) - { - bool swap = Dir ? Comp::gt(k, threadK) : Comp::lt(k, threadK); - threadK = swap ? k : threadK; - threadV = swap ? 
v : threadV; - } - - __device__ inline void checkThreadQ() - { - // We don't need to do anything here, since the warp doesn't - // cooperate until the end - } - - __device__ inline void add(K k, V v) { addThreadQ(k, v); } - - __device__ inline void reduce() - { - // Reduce within the warp - raft::KeyValuePair pair(threadK, threadV); - - if (Dir) { - pair = raft::warpReduce(pair, raft::max_op{}); - } else { - pair = raft::warpReduce(pair, raft::min_op{}); - } - - threadK = pair.key; - threadV = pair.value; - } - - /// Dump final k selected values for this warp out - __device__ inline void writeOut(K* outK, V* outV, int k) - { - if (raft::laneId() == 0) { - *outK = threadK; - *outV = threadV; - } - } - - // threadK is lowest (Dir) or highest (!Dir) - K threadK; - V threadV; -}; - -} // namespace cuvs::neighbors::ball_cover::detail::faiss_select diff --git a/cpp/src/neighbors/faiss_select/StaticUtils.h b/cpp/src/neighbors/faiss_select/StaticUtils.h deleted file mode 100644 index 87124ffe0..000000000 --- a/cpp/src/neighbors/faiss_select/StaticUtils.h +++ /dev/null @@ -1,48 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file thirdparty/LICENSES/LICENSE.faiss - */ - -#pragma once - -#include - -// allow usage for non-CUDA files -#ifndef __host__ -#define __host__ -#define __device__ -#endif - -namespace cuvs::neighbors::ball_cover::detail::faiss_select::utils { - -template -constexpr __host__ __device__ bool isPowerOf2(T v) -{ - return (v && !(v & (v - 1))); -} - -static_assert(isPowerOf2(2048), "isPowerOf2"); -static_assert(!isPowerOf2(3333), "isPowerOf2"); - -template -constexpr __host__ __device__ T nextHighestPowerOf2(T v) -{ - return (isPowerOf2(v) ? 
(T)2 * v : ((T)1 << (raft::log2(v) + (T)1))); -} - -static_assert(nextHighestPowerOf2(1) == 2, "nextHighestPowerOf2"); -static_assert(nextHighestPowerOf2(2) == 4, "nextHighestPowerOf2"); -static_assert(nextHighestPowerOf2(3) == 4, "nextHighestPowerOf2"); -static_assert(nextHighestPowerOf2(4) == 8, "nextHighestPowerOf2"); - -static_assert(nextHighestPowerOf2(15) == 16, "nextHighestPowerOf2"); -static_assert(nextHighestPowerOf2(16) == 32, "nextHighestPowerOf2"); -static_assert(nextHighestPowerOf2(17) == 32, "nextHighestPowerOf2"); - -static_assert(nextHighestPowerOf2(1536000000u) == 2147483648u, "nextHighestPowerOf2"); -static_assert(nextHighestPowerOf2((size_t)2147483648ULL) == (size_t)4294967296ULL, - "nextHighestPowerOf2"); - -} // namespace cuvs::neighbors::ball_cover::detail::faiss_select::utils diff --git a/cpp/src/neighbors/faiss_select/key_value_block_select.cuh b/cpp/src/neighbors/faiss_select/key_value_block_select.cuh deleted file mode 100644 index 67882a308..000000000 --- a/cpp/src/neighbors/faiss_select/key_value_block_select.cuh +++ /dev/null @@ -1,229 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file thirdparty/LICENSES/LICENSE.faiss - */ - -#pragma once - -#include "MergeNetworkUtils.cuh" -#include "Select.cuh" - -// TODO: Need to think further about the impact (and new boundaries created) on the registers -// because this will change the max k that can be processed. One solution might be to break -// up k into multiple batches for larger k. - -namespace cuvs::neighbors::ball_cover::detail::faiss_select { - -// `Dir` true, produce largest values. -// `Dir` false, produce smallest values. 
-template -struct KeyValueBlockSelect { - static constexpr int kNumWarps = ThreadsPerBlock / raft::WarpSize; - static constexpr int kTotalWarpSortSize = NumWarpQ; - - __device__ inline KeyValueBlockSelect( - K initKVal, K initVKey, V initVVal, K* smemK, raft::KeyValuePair* smemV, int k) - : initK(initKVal), - initVk(initVKey), - initVv(initVVal), - numVals(0), - warpKTop(initKVal), - warpKTopRDist(initKVal), - sharedK(smemK), - sharedV(smemV), - kMinus1(k - 1) - { - static_assert(utils::isPowerOf2(ThreadsPerBlock), "threads must be a power-of-2"); - static_assert(utils::isPowerOf2(NumWarpQ), "warp queue must be power-of-2"); - - // Fill the per-thread queue keys with the default value -#pragma unroll - for (int i = 0; i < NumThreadQ; ++i) { - threadK[i] = initK; - threadV[i].key = initVk; - threadV[i].value = initVv; - } - - int laneId = raft::laneId(); - int warpId = threadIdx.x / raft::WarpSize; - warpK = sharedK + warpId * kTotalWarpSortSize; - warpV = sharedV + warpId * kTotalWarpSortSize; - - // Fill warp queue (only the actual queue space is fine, not where - // we write the per-thread queues for merging) - for (int i = laneId; i < NumWarpQ; i += raft::WarpSize) { - warpK[i] = initK; - warpV[i].key = initVk; - warpV[i].value = initVv; - } - - raft::warpFence(); - } - - __device__ inline void addThreadQ(K k, K vk, V vv) - { - if (Dir ? 
Comp::gt(k, warpKTop) : Comp::lt(k, warpKTop)) { - // Rotate right -#pragma unroll - for (int i = NumThreadQ - 1; i > 0; --i) { - threadK[i] = threadK[i - 1]; - threadV[i].key = threadV[i - 1].key; - threadV[i].value = threadV[i - 1].value; - } - - threadK[0] = k; - threadV[0].key = vk; - threadV[0].value = vv; - ++numVals; - } - } - - __device__ inline void checkThreadQ() - { - bool needSort = (numVals == NumThreadQ); - -#if CUDA_VERSION >= 9000 - needSort = __any_sync(0xffffffff, needSort); -#else - needSort = __any(needSort); -#endif - - if (!needSort) { - // no lanes have triggered a sort - return; - } - - // This has a trailing raft::warpFence - mergeWarpQ(); - - // Any top-k elements have been merged into the warp queue; we're - // free to reset the thread queues - numVals = 0; - -#pragma unroll - for (int i = 0; i < NumThreadQ; ++i) { - threadK[i] = initK; - threadV[i].key = initVk; - threadV[i].value = initVv; - } - - // We have to beat at least this element - warpKTop = warpK[kMinus1]; - warpKTopRDist = warpV[kMinus1].key; - - raft::warpFence(); - } - - /// This function handles sorting and merging together the - /// per-thread queues with the warp-wide queue, creating a sorted - /// list across both - __device__ inline void mergeWarpQ() - { - int laneId = raft::laneId(); - - // Sort all of the per-thread queues - warpSortAnyRegisters, NumThreadQ, !Dir, Comp>(threadK, threadV); - - constexpr int kNumWarpQRegisters = NumWarpQ / raft::WarpSize; - K warpKRegisters[kNumWarpQRegisters]; - raft::KeyValuePair warpVRegisters[kNumWarpQRegisters]; - -#pragma unroll - for (int i = 0; i < kNumWarpQRegisters; ++i) { - warpKRegisters[i] = warpK[i * raft::WarpSize + laneId]; - warpVRegisters[i].key = warpV[i * raft::WarpSize + laneId].key; - warpVRegisters[i].value = warpV[i * raft::WarpSize + laneId].value; - } - - raft::warpFence(); - - // The warp queue is already sorted, and now that we've sorted the - // per-thread queue, merge both sorted lists together, producing 
- // one sorted list - warpMergeAnyRegisters, - kNumWarpQRegisters, - NumThreadQ, - !Dir, - Comp, - false>(warpKRegisters, warpVRegisters, threadK, threadV); - - // Write back out the warp queue -#pragma unroll - for (int i = 0; i < kNumWarpQRegisters; ++i) { - warpK[i * raft::WarpSize + laneId] = warpKRegisters[i]; - warpV[i * raft::WarpSize + laneId].key = warpVRegisters[i].key; - warpV[i * raft::WarpSize + laneId].value = warpVRegisters[i].value; - } - - raft::warpFence(); - } - - /// WARNING: all threads in a warp must participate in this. - /// Otherwise, you must call the constituent parts separately. - __device__ inline void add(K k, K vk, V vv) - { - addThreadQ(k, vk, vv); - checkThreadQ(); - } - - __device__ inline void reduce() - { - // Have all warps dump and merge their queues; this will produce - // the final per-warp results - mergeWarpQ(); - - // block-wide dep; thus far, all warps have been completely - // independent - __syncthreads(); - - // All warp queues are contiguous in smem. - // Now, we have kNumWarps lists of NumWarpQ elements. - // This is a power of 2. 
- FinalBlockMerge, NumWarpQ, Dir, Comp>:: - merge(sharedK, sharedV); - - // The block-wide merge has a trailing syncthreads - } - - // Default element key - const K initK; - - // Default element value - const K initVk; - const V initVv; - - // Number of valid elements in our thread queue - int numVals; - - // The k-th highest (Dir) or lowest (!Dir) element - K warpKTop; - - K warpKTopRDist; - - // Thread queue values - K threadK[NumThreadQ]; - raft::KeyValuePair threadV[NumThreadQ]; - - // Queues for all warps - K* sharedK; - raft::KeyValuePair* sharedV; - - // Our warp's queue (points into sharedK/sharedV) - // warpK[0] is highest (Dir) or lowest (!Dir) - K* warpK; - raft::KeyValuePair* warpV; - - // This is a cached k-1 value - int kMinus1; -}; - -} // namespace cuvs::neighbors::ball_cover::detail::faiss_select diff --git a/notebooks/rmm_log.txt b/notebooks/rmm_log.txt deleted file mode 100644 index 681eba61a..000000000 --- a/notebooks/rmm_log.txt +++ /dev/null @@ -1,2 +0,0 @@ -[266514][18:28:55:663533][info ] ----- RMM LOG BEGIN [PTDS DISABLED] ----- -[266514][18:40:02:947176][error ] [A][Stream 0x2][Upstream 14270349312B][FAILURE maximum pool size exceeded]