From 6f2db2069dc9be70e3b393b07babc0134b43d71d Mon Sep 17 00:00:00 2001
From: Ray Douglass <ray@raydouglass.com>
Date: Fri, 15 Nov 2024 09:31:54 -0500
Subject: [PATCH 01/39] DOC v25.02 Updates [skip ci]

---
 .../cuda11.8-conda/devcontainer.json          |  6 ++---
 .devcontainer/cuda11.8-pip/devcontainer.json  |  8 +++---
 .../cuda12.5-conda/devcontainer.json          |  6 ++---
 .devcontainer/cuda12.5-pip/devcontainer.json  |  8 +++---
 .github/workflows/build.yaml                  | 14 +++++-----
 .github/workflows/pr.yaml                     | 26 +++++++++----------
 .github/workflows/test.yaml                   |  8 +++---
 README.md                                     |  2 +-
 VERSION                                       |  2 +-
 .../all_cuda-118_arch-aarch64.yaml            |  4 +--
 .../all_cuda-118_arch-x86_64.yaml             |  4 +--
 .../all_cuda-125_arch-aarch64.yaml            |  4 +--
 .../all_cuda-125_arch-x86_64.yaml             |  4 +--
 .../bench_ann_cuda-118_arch-aarch64.yaml      |  4 +--
 .../bench_ann_cuda-118_arch-x86_64.yaml       |  4 +--
 .../bench_ann_cuda-125_arch-aarch64.yaml      |  4 +--
 .../bench_ann_cuda-125_arch-x86_64.yaml       |  4 +--
 dependencies.yaml                             | 12 ++++-----
 docs/source/developer_guide.md                |  6 ++---
 examples/cmake/thirdparty/fetch_rapids.cmake  |  2 +-
 python/cuvs/pyproject.toml                    |  2 +-
 rust/Cargo.toml                               |  2 +-
 rust/cuvs/Cargo.toml                          |  2 +-
 23 files changed, 69 insertions(+), 69 deletions(-)

diff --git a/.devcontainer/cuda11.8-conda/devcontainer.json b/.devcontainer/cuda11.8-conda/devcontainer.json
index 05f11c005..f03ec7b19 100644
--- a/.devcontainer/cuda11.8-conda/devcontainer.json
+++ b/.devcontainer/cuda11.8-conda/devcontainer.json
@@ -5,17 +5,17 @@
     "args": {
       "CUDA": "11.8",
       "PYTHON_PACKAGE_MANAGER": "conda",
-      "BASE": "rapidsai/devcontainers:24.12-cpp-cuda11.8-mambaforge-ubuntu22.04"
+      "BASE": "rapidsai/devcontainers:25.02-cpp-cuda11.8-mambaforge-ubuntu22.04"
     }
   },
   "runArgs": [
     "--rm",
     "--name",
-    "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.12-cuda11.8-conda"
+    "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.02-cuda11.8-conda"
   ],
   "hostRequirements": {"gpu": "optional"},
   "features": {
-    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.12": {}
+    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.2": {}
   },
   "overrideFeatureInstallOrder": [
     "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils"
diff --git a/.devcontainer/cuda11.8-pip/devcontainer.json b/.devcontainer/cuda11.8-pip/devcontainer.json
index b4c507f86..a59c499d3 100644
--- a/.devcontainer/cuda11.8-pip/devcontainer.json
+++ b/.devcontainer/cuda11.8-pip/devcontainer.json
@@ -5,24 +5,24 @@
     "args": {
       "CUDA": "11.8",
       "PYTHON_PACKAGE_MANAGER": "pip",
-      "BASE": "rapidsai/devcontainers:24.12-cpp-cuda11.8-ucx1.17.0-openmpi-ubuntu22.04"
+      "BASE": "rapidsai/devcontainers:25.02-cpp-cuda11.8-ucx1.17.0-openmpi-ubuntu22.04"
     }
   },
   "runArgs": [
     "--rm",
     "--name",
-    "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.12-cuda11.8-pip"
+    "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.02-cuda11.8-pip"
   ],
   "hostRequirements": {"gpu": "optional"},
   "features": {
-    "ghcr.io/rapidsai/devcontainers/features/cuda:24.12": {
+    "ghcr.io/rapidsai/devcontainers/features/cuda:25.2": {
       "version": "11.8",
       "installcuBLAS": true,
       "installcuSOLVER": true,
       "installcuRAND": true,
       "installcuSPARSE": true
     },
-    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.12": {}
+    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.2": {}
   },
   "overrideFeatureInstallOrder": [
     "ghcr.io/rapidsai/devcontainers/features/ucx",
diff --git a/.devcontainer/cuda12.5-conda/devcontainer.json b/.devcontainer/cuda12.5-conda/devcontainer.json
index 4f8d628c2..39852cec1 100644
--- a/.devcontainer/cuda12.5-conda/devcontainer.json
+++ b/.devcontainer/cuda12.5-conda/devcontainer.json
@@ -5,17 +5,17 @@
     "args": {
       "CUDA": "12.5",
       "PYTHON_PACKAGE_MANAGER": "conda",
-      "BASE": "rapidsai/devcontainers:24.12-cpp-mambaforge-ubuntu22.04"
+      "BASE": "rapidsai/devcontainers:25.02-cpp-mambaforge-ubuntu22.04"
     }
   },
   "runArgs": [
     "--rm",
     "--name",
-    "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.12-cuda12.5-conda"
+    "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.02-cuda12.5-conda"
   ],
   "hostRequirements": {"gpu": "optional"},
   "features": {
-    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.12": {}
+    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.2": {}
   },
   "overrideFeatureInstallOrder": [
     "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils"
diff --git a/.devcontainer/cuda12.5-pip/devcontainer.json b/.devcontainer/cuda12.5-pip/devcontainer.json
index 8e6ba4de8..d84966656 100644
--- a/.devcontainer/cuda12.5-pip/devcontainer.json
+++ b/.devcontainer/cuda12.5-pip/devcontainer.json
@@ -5,24 +5,24 @@
     "args": {
       "CUDA": "12.5",
       "PYTHON_PACKAGE_MANAGER": "pip",
-      "BASE": "rapidsai/devcontainers:24.12-cpp-cuda12.5-ucx1.17.0-openmpi-ubuntu22.04"
+      "BASE": "rapidsai/devcontainers:25.02-cpp-cuda12.5-ucx1.17.0-openmpi-ubuntu22.04"
     }
   },
   "runArgs": [
     "--rm",
     "--name",
-    "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.12-cuda12.5-pip"
+    "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.02-cuda12.5-pip"
   ],
   "hostRequirements": {"gpu": "optional"},
   "features": {
-    "ghcr.io/rapidsai/devcontainers/features/cuda:24.12": {
+    "ghcr.io/rapidsai/devcontainers/features/cuda:25.2": {
       "version": "12.5",
       "installcuBLAS": true,
       "installcuSOLVER": true,
       "installcuRAND": true,
       "installcuSPARSE": true
     },
-    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.12": {}
+    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:25.2": {}
   },
   "overrideFeatureInstallOrder": [
     "ghcr.io/rapidsai/devcontainers/features/ucx",
diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index 7ac02e365..e93b7a694 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -28,7 +28,7 @@ concurrency:
 jobs:
   cpp-build:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-25.02
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -37,7 +37,7 @@ jobs:
   rust-build:
     needs: cpp-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.02
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -50,7 +50,7 @@ jobs:
   python-build:
     needs: [cpp-build]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-25.02
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -59,7 +59,7 @@ jobs:
   upload-conda:
     needs: [cpp-build, python-build]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-25.02
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -70,7 +70,7 @@ jobs:
     if: github.ref_type == 'branch'
     needs: python-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.02
     with:
       arch: "amd64"
       branch: ${{ inputs.branch }}
@@ -82,7 +82,7 @@ jobs:
       sha: ${{ inputs.sha }}
   wheel-build-cuvs:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.02
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -92,7 +92,7 @@ jobs:
   wheel-publish-cuvs:
     needs: wheel-build-cuvs
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.02
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index e18e82df0..a62b4e00a 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -25,13 +25,13 @@ jobs:
       - wheel-tests-cuvs
       - devcontainer
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-25.02
     if: always()
     with:
       needs: ${{ toJSON(needs) }}
   changed-files:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@branch-25.02
     with:
       files_yaml: |
         test_cpp:
@@ -64,27 +64,27 @@ jobs:
           - '!thirdparty/LICENSES/**'
   checks:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-25.02
     with:
       enable_check_generated_files: false
   conda-cpp-build:
     needs: checks
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-25.02
     with:
       build_type: pull-request
       node_type: cpu16
   conda-cpp-tests:
     needs: [conda-cpp-build, changed-files]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-25.02
     if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_cpp
     with:
       build_type: pull-request
   conda-cpp-checks:
     needs: conda-cpp-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-25.02
     with:
       build_type: pull-request
       enable_check_symbols: true
@@ -92,20 +92,20 @@ jobs:
   conda-python-build:
     needs: conda-cpp-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-25.02
     with:
       build_type: pull-request
   conda-python-tests:
     needs: [conda-python-build, changed-files]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-25.02
     if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python
     with:
       build_type: pull-request
   docs-build:
     needs: conda-python-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.02
     with:
       build_type: pull-request
       node_type: "gpu-v100-latest-1"
@@ -115,7 +115,7 @@ jobs:
   rust-build:
     needs: conda-cpp-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.02
     with:
       build_type: pull-request
       node_type: "gpu-v100-latest-1"
@@ -125,21 +125,21 @@ jobs:
   wheel-build-cuvs:
     needs: checks
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.02
     with:
       build_type: pull-request
       script: ci/build_wheel_cuvs.sh
   wheel-tests-cuvs:
     needs: [wheel-build-cuvs, changed-files]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.02
     if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python
     with:
       build_type: pull-request
       script: ci/test_wheel_cuvs.sh
   devcontainer:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-25.02
     with:
       arch: '["amd64"]'
       cuda: '["12.5"]'
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 5f60c0a34..2645e5d5d 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -16,7 +16,7 @@ on:
 jobs:
   conda-cpp-checks:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-25.02
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -26,7 +26,7 @@ jobs:
       symbol_exclusions: (void (thrust::|cub::)|raft_cutlass)
   conda-cpp-tests:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-25.02
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -34,7 +34,7 @@ jobs:
       sha: ${{ inputs.sha }}
   conda-python-tests:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-25.02
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -42,7 +42,7 @@ jobs:
       sha: ${{ inputs.sha }}
   wheel-tests-cuvs:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-25.02
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
diff --git a/README.md b/README.md
index 572e8d098..8b7e529af 100755
--- a/README.md
+++ b/README.md
@@ -109,7 +109,7 @@ pip install cuvs-cu12 --extra-index-url=https://pypi.nvidia.com
 If installing a version that has not yet been released, the `rapidsai` channel can be replaced with `rapidsai-nightly`:
 
 ```bash
-conda install -c conda-forge -c nvidia -c rapidsai-nightly cuvs=24.12
+conda install -c conda-forge -c nvidia -c rapidsai-nightly cuvs=25.02
 ```
 
 cuVS also has `pip` wheel packages that can be installed. Please see the [Build and Install Guide](https://docs.rapids.ai/api/cuvs/nightly/build/) for more information on installing the available cuVS packages and building from source.
diff --git a/VERSION b/VERSION
index af28c42b5..72eefaf7c 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-24.12.00
+25.02.00
diff --git a/conda/environments/all_cuda-118_arch-aarch64.yaml b/conda/environments/all_cuda-118_arch-aarch64.yaml
index 80bfb0c24..1daf668b1 100644
--- a/conda/environments/all_cuda-118_arch-aarch64.yaml
+++ b/conda/environments/all_cuda-118_arch-aarch64.yaml
@@ -35,7 +35,7 @@ dependencies:
 - libcusolver=11.4.1.48
 - libcusparse-dev=11.7.5.86
 - libcusparse=11.7.5.86
-- librmm==24.12.*,>=0.0.0a0
+- librmm==25.2.*,>=0.0.0a0
 - make
 - nccl>=2.19
 - ninja
@@ -45,7 +45,7 @@ dependencies:
 - openblas
 - pre-commit
 - pydata-sphinx-theme
-- pylibraft==24.12.*,>=0.0.0a0
+- pylibraft==25.2.*,>=0.0.0a0
 - pytest-cov
 - pytest==7.*
 - rapids-build-backend>=0.3.0,<0.4.0.dev0
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index 07937726c..098156397 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -35,7 +35,7 @@ dependencies:
 - libcusolver=11.4.1.48
 - libcusparse-dev=11.7.5.86
 - libcusparse=11.7.5.86
-- librmm==24.12.*,>=0.0.0a0
+- librmm==25.2.*,>=0.0.0a0
 - make
 - nccl>=2.19
 - ninja
@@ -45,7 +45,7 @@ dependencies:
 - openblas
 - pre-commit
 - pydata-sphinx-theme
-- pylibraft==24.12.*,>=0.0.0a0
+- pylibraft==25.2.*,>=0.0.0a0
 - pytest-cov
 - pytest==7.*
 - rapids-build-backend>=0.3.0,<0.4.0.dev0
diff --git a/conda/environments/all_cuda-125_arch-aarch64.yaml b/conda/environments/all_cuda-125_arch-aarch64.yaml
index b7fd6fcfa..b94b44749 100644
--- a/conda/environments/all_cuda-125_arch-aarch64.yaml
+++ b/conda/environments/all_cuda-125_arch-aarch64.yaml
@@ -32,7 +32,7 @@ dependencies:
 - libcurand-dev
 - libcusolver-dev
 - libcusparse-dev
-- librmm==24.12.*,>=0.0.0a0
+- librmm==25.2.*,>=0.0.0a0
 - make
 - nccl>=2.19
 - ninja
@@ -41,7 +41,7 @@ dependencies:
 - openblas
 - pre-commit
 - pydata-sphinx-theme
-- pylibraft==24.12.*,>=0.0.0a0
+- pylibraft==25.2.*,>=0.0.0a0
 - pytest-cov
 - pytest==7.*
 - rapids-build-backend>=0.3.0,<0.4.0.dev0
diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml
index 83a457465..10e30a8c2 100644
--- a/conda/environments/all_cuda-125_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-125_arch-x86_64.yaml
@@ -32,7 +32,7 @@ dependencies:
 - libcurand-dev
 - libcusolver-dev
 - libcusparse-dev
-- librmm==24.12.*,>=0.0.0a0
+- librmm==25.2.*,>=0.0.0a0
 - make
 - nccl>=2.19
 - ninja
@@ -41,7 +41,7 @@ dependencies:
 - openblas
 - pre-commit
 - pydata-sphinx-theme
-- pylibraft==24.12.*,>=0.0.0a0
+- pylibraft==25.2.*,>=0.0.0a0
 - pytest-cov
 - pytest==7.*
 - rapids-build-backend>=0.3.0,<0.4.0.dev0
diff --git a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml
index 21cb98180..2a1d80aaa 100644
--- a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml
+++ b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml
@@ -33,7 +33,7 @@ dependencies:
 - libcusolver=11.4.1.48
 - libcusparse-dev=11.7.5.86
 - libcusparse=11.7.5.86
-- librmm==24.12.*,>=0.0.0a0
+- librmm==25.2.*,>=0.0.0a0
 - matplotlib
 - nccl>=2.19
 - ninja
@@ -41,7 +41,7 @@ dependencies:
 - nvcc_linux-aarch64=11.8
 - openblas
 - pandas
-- pylibraft==24.12.*,>=0.0.0a0
+- pylibraft==25.2.*,>=0.0.0a0
 - pyyaml
 - rapids-build-backend>=0.3.0,<0.4.0.dev0
 - setuptools
diff --git a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
index 432509bcb..6507f55cc 100644
--- a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
@@ -33,7 +33,7 @@ dependencies:
 - libcusolver=11.4.1.48
 - libcusparse-dev=11.7.5.86
 - libcusparse=11.7.5.86
-- librmm==24.12.*,>=0.0.0a0
+- librmm==25.2.*,>=0.0.0a0
 - matplotlib
 - nccl>=2.19
 - ninja
@@ -41,7 +41,7 @@ dependencies:
 - nvcc_linux-64=11.8
 - openblas
 - pandas
-- pylibraft==24.12.*,>=0.0.0a0
+- pylibraft==25.2.*,>=0.0.0a0
 - pyyaml
 - rapids-build-backend>=0.3.0,<0.4.0.dev0
 - setuptools
diff --git a/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml
index 0c5043ac2..e53606a06 100644
--- a/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml
+++ b/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml
@@ -30,14 +30,14 @@ dependencies:
 - libcurand-dev
 - libcusolver-dev
 - libcusparse-dev
-- librmm==24.12.*,>=0.0.0a0
+- librmm==25.2.*,>=0.0.0a0
 - matplotlib
 - nccl>=2.19
 - ninja
 - nlohmann_json>=3.11.2
 - openblas
 - pandas
-- pylibraft==24.12.*,>=0.0.0a0
+- pylibraft==25.2.*,>=0.0.0a0
 - pyyaml
 - rapids-build-backend>=0.3.0,<0.4.0.dev0
 - setuptools
diff --git a/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml
index cbb22333c..e37c507c7 100644
--- a/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml
+++ b/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml
@@ -30,14 +30,14 @@ dependencies:
 - libcurand-dev
 - libcusolver-dev
 - libcusparse-dev
-- librmm==24.12.*,>=0.0.0a0
+- librmm==25.2.*,>=0.0.0a0
 - matplotlib
 - nccl>=2.19
 - ninja
 - nlohmann_json>=3.11.2
 - openblas
 - pandas
-- pylibraft==24.12.*,>=0.0.0a0
+- pylibraft==25.2.*,>=0.0.0a0
 - pyyaml
 - rapids-build-backend>=0.3.0,<0.4.0.dev0
 - setuptools
diff --git a/dependencies.yaml b/dependencies.yaml
index e909ad0dc..a7be191d6 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -488,7 +488,7 @@ dependencies:
     common:
       - output_types: conda
         packages:
-          - &librmm_unsuffixed librmm==24.12.*,>=0.0.0a0
+          - &librmm_unsuffixed librmm==25.2.*,>=0.0.0a0
       - output_types: requirements
         packages:
           # pip recognizes the index as a global option for the requirements.txt file
@@ -501,18 +501,18 @@ dependencies:
               cuda: "12.*"
               cuda_suffixed: "true"
             packages:
-              - librmm-cu12==24.12.*,>=0.0.0a0
+              - librmm-cu12==25.2.*,>=0.0.0a0
           - matrix:
               cuda: "11.*"
               cuda_suffixed: "true"
             packages:
-              - librmm-cu11==24.12.*,>=0.0.0a0
+              - librmm-cu11==25.2.*,>=0.0.0a0
           - {matrix: null, packages: [*librmm_unsuffixed]}
   depends_on_pylibraft:
     common:
       - output_types: conda
         packages:
-          - &pylibraft_unsuffixed pylibraft==24.12.*,>=0.0.0a0
+          - &pylibraft_unsuffixed pylibraft==25.2.*,>=0.0.0a0
       - output_types: requirements
         packages:
           # pip recognizes the index as a global option for the requirements.txt file
@@ -525,10 +525,10 @@ dependencies:
               cuda: "12.*"
               cuda_suffixed: "true"
             packages:
-              - pylibraft-cu12==24.12.*,>=0.0.0a0
+              - pylibraft-cu12==25.2.*,>=0.0.0a0
           - matrix:
               cuda: "11.*"
               cuda_suffixed: "true"
             packages:
-              - pylibraft-cu11==24.12.*,>=0.0.0a0
+              - pylibraft-cu11==25.2.*,>=0.0.0a0
           - {matrix: null, packages: [*pylibraft_unsuffixed]}
diff --git a/docs/source/developer_guide.md b/docs/source/developer_guide.md
index 7702f80b3..4fdd6405e 100644
--- a/docs/source/developer_guide.md
+++ b/docs/source/developer_guide.md
@@ -187,7 +187,7 @@ RAFT relies on `clang-format` to enforce code style across all C++ and CUDA sour
 1. Do not split empty functions/records/namespaces.
 2. Two-space indentation everywhere, including the line continuations.
 3. Disable reflowing of comments.
-   The reasons behind these deviations from the Google style guide are given in comments [here](https://github.com/rapidsai/raft/blob/branch-24.12/cpp/.clang-format).
+   The reasons behind these deviations from the Google style guide are given in comments [here](https://github.com/rapidsai/raft/blob/branch-25.02/cpp/.clang-format).
 
 [`doxygen`](https://doxygen.nl/) is used as documentation generator and also as a documentation linter.
 In order to run doxygen as a linter on C++/CUDA code, run
@@ -205,7 +205,7 @@ you can run  `codespell -i 3 -w .` from the repository root directory.
 This will bring up an interactive prompt to select which spelling fixes to apply.
 
 ### #include style
-[include_checker.py](https://github.com/rapidsai/raft/blob/branch-24.12/cpp/scripts/include_checker.py) is used to enforce the include style as follows:
+[include_checker.py](https://github.com/rapidsai/raft/blob/branch-25.02/cpp/scripts/include_checker.py) is used to enforce the include style as follows:
 1. `#include "..."` should be used for referencing local files only. It is acceptable to be used for referencing files in a sub-folder/parent-folder of the same algorithm, but should never be used to include files in other algorithms or between algorithms and the primitives or other dependencies.
 2. `#include <...>` should be used for referencing everything else
 
@@ -230,7 +230,7 @@ Call CUDA APIs via the provided helper macros `RAFT_CUDA_TRY`, `RAFT_CUBLAS_TRY`
 ## Logging
 
 ### Introduction
-Anything and everything about logging is defined inside [logger.hpp](https://github.com/rapidsai/raft/blob/branch-24.12/cpp/include/raft/core/logger.hpp). It uses [spdlog](https://github.com/gabime/spdlog) underneath, but this information is transparent to all.
+Anything and everything about logging is defined inside [logger.hpp](https://github.com/rapidsai/raft/blob/branch-25.02/cpp/include/raft/core/logger.hpp). It uses [spdlog](https://github.com/gabime/spdlog) underneath, but this information is transparent to all.
 
 ### Usage
 ```cpp
diff --git a/examples/cmake/thirdparty/fetch_rapids.cmake b/examples/cmake/thirdparty/fetch_rapids.cmake
index 6f4c627ed..23c8490b4 100644
--- a/examples/cmake/thirdparty/fetch_rapids.cmake
+++ b/examples/cmake/thirdparty/fetch_rapids.cmake
@@ -12,7 +12,7 @@
 # the License.
 
 # Use this variable to update RAPIDS and RAFT versions
-set(RAPIDS_VERSION "24.12")
+set(RAPIDS_VERSION "25.02")
 
 if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/RAFT_RAPIDS.cmake)
     file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-${RAPIDS_VERSION}/RAPIDS.cmake
diff --git a/python/cuvs/pyproject.toml b/python/cuvs/pyproject.toml
index d40026776..894b8820f 100644
--- a/python/cuvs/pyproject.toml
+++ b/python/cuvs/pyproject.toml
@@ -37,7 +37,7 @@ dependencies = [
     "nvidia-curand",
     "nvidia-cusolver",
     "nvidia-cusparse",
-    "pylibraft==24.12.*,>=0.0.0a0",
+    "pylibraft==25.2.*,>=0.0.0a0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [
     "Intended Audience :: Developers",
diff --git a/rust/Cargo.toml b/rust/Cargo.toml
index 79aa5756a..ddb8b32cd 100644
--- a/rust/Cargo.toml
+++ b/rust/Cargo.toml
@@ -6,7 +6,7 @@ members = [
 resolver = "2"
 
 [workspace.package]
-version = "24.12.0"
+version = "25.2.0"
 edition = "2021"
 repository = "https://github.com/rapidsai/cuvs"
 homepage = "https://github.com/rapidsai/cuvs"
diff --git a/rust/cuvs/Cargo.toml b/rust/cuvs/Cargo.toml
index 13cc658e3..1095b1fea 100644
--- a/rust/cuvs/Cargo.toml
+++ b/rust/cuvs/Cargo.toml
@@ -9,7 +9,7 @@ authors.workspace = true
 license.workspace = true
 
 [dependencies]
-ffi = { package = "cuvs-sys", path = "../cuvs-sys", version = "24.12.0" }
+ffi = { package = "cuvs-sys", path = "../cuvs-sys", version = "25.2.0" }
 ndarray = "0.15"
 
 [dev-dependencies]

From aba3fa7bb38690d394082aff65531c5858836450 Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Tue, 26 Nov 2024 08:55:39 -0600
Subject: [PATCH 02/39] Update example code fetching rapids-cmake to use CUVS
 instead of RAFT (#493)

Small update to CMake example code to use cuVS instead of RAFT.

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Micka (https://github.com/lowener)

URL: https://github.com/rapidsai/cuvs/pull/493
---
 examples/cmake/thirdparty/fetch_rapids.cmake | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/examples/cmake/thirdparty/fetch_rapids.cmake b/examples/cmake/thirdparty/fetch_rapids.cmake
index 23c8490b4..3c5510b8b 100644
--- a/examples/cmake/thirdparty/fetch_rapids.cmake
+++ b/examples/cmake/thirdparty/fetch_rapids.cmake
@@ -11,11 +11,11 @@
 # or implied. See the License for the specific language governing permissions and limitations under
 # the License.
 
-# Use this variable to update RAPIDS and RAFT versions
+# Use this variable to update RAPIDS and cuVS versions
 set(RAPIDS_VERSION "25.02")
 
-if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/RAFT_RAPIDS.cmake)
+if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/CUVS_RAPIDS.cmake)
     file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-${RAPIDS_VERSION}/RAPIDS.cmake
-            ${CMAKE_CURRENT_BINARY_DIR}/RAFT_RAPIDS.cmake)
+            ${CMAKE_CURRENT_BINARY_DIR}/CUVS_RAPIDS.cmake)
 endif()
-include(${CMAKE_CURRENT_BINARY_DIR}/RAFT_RAPIDS.cmake)
+include(${CMAKE_CURRENT_BINARY_DIR}/CUVS_RAPIDS.cmake)

From 6e5c0c8b0ce57b4b2069cbe5255619f210420792 Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Tue, 26 Nov 2024 11:24:01 -0600
Subject: [PATCH 03/39] Remove RAFT BUILD_ANN_BENCH option (#497)

This cleans up a reference to RAFT's `BUILD_ANN_BENCH` CMake option which no longer exists.

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Ben Frederickson (https://github.com/benfred)

URL: https://github.com/rapidsai/cuvs/pull/497
---
 cpp/cmake/thirdparty/get_raft.cmake | 1 -
 1 file changed, 1 deletion(-)

diff --git a/cpp/cmake/thirdparty/get_raft.cmake b/cpp/cmake/thirdparty/get_raft.cmake
index 7640fbfa6..5def74f4b 100644
--- a/cpp/cmake/thirdparty/get_raft.cmake
+++ b/cpp/cmake/thirdparty/get_raft.cmake
@@ -50,7 +50,6 @@ function(find_and_configure_raft)
               OPTIONS
               "BUILD_TESTS OFF"
               "BUILD_PRIMS_BENCH OFF"
-              "BUILD_ANN_BENCH OFF"
               "RAFT_NVTX ${PKG_ENABLE_NVTX}"
               "RAFT_COMPILE_LIBRARY OFF"
             )

From bd0620df0c143711352c9f5e312268aaa1801cbd Mon Sep 17 00:00:00 2001
From: Jake Awe <50372925+AyodeAwe@users.noreply.github.com>
Date: Tue, 26 Nov 2024 15:28:58 -0600
Subject: [PATCH 04/39] Add breaking change workflow trigger (#442)

Adds a workflow that triggers a second workflow, which sends a
notification to a designated Slack channel for every PR carrying the
`breaking` label whenever any of the following events occurs on the PR:

- closed
- reopened
- labeled
- unlabeled

Depends on https://github.com/rapidsai/shared-workflows/pull/257
---
 .../trigger-breaking-change-alert.yaml        | 26 +++++++++++++++++++
 1 file changed, 26 insertions(+)
 create mode 100644 .github/workflows/trigger-breaking-change-alert.yaml

diff --git a/.github/workflows/trigger-breaking-change-alert.yaml b/.github/workflows/trigger-breaking-change-alert.yaml
new file mode 100644
index 000000000..3b972f31c
--- /dev/null
+++ b/.github/workflows/trigger-breaking-change-alert.yaml
@@ -0,0 +1,26 @@
+name: Trigger Breaking Change Notifications
+
+on:
+  pull_request_target:
+    types:
+      - closed
+      - reopened
+      - labeled
+      - unlabeled
+
+jobs:
+  trigger-notifier:
+    if: contains(github.event.pull_request.labels.*.name, 'breaking')
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/breaking-change-alert.yaml@branch-24.12
+    with:
+      sender_login: ${{ github.event.sender.login }}
+      sender_avatar: ${{ github.event.sender.avatar_url }}
+      repo: ${{ github.repository }}
+      pr_number: ${{ github.event.pull_request.number }}
+      pr_title: "${{ github.event.pull_request.title }}"
+      pr_body: "${{ github.event.pull_request.body || '_Empty PR description_' }}"
+      pr_base_ref: ${{ github.event.pull_request.base.ref }}
+      pr_author: ${{ github.event.pull_request.user.login }}
+      event_action: ${{ github.event.action }}
+      pr_merged: ${{ github.event.pull_request.merged }}

From ae6816c9be622fe18f1d80f3b32b70ac9f5566fe Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Tue, 26 Nov 2024 16:32:11 -0600
Subject: [PATCH 05/39] Require approval to run CI on draft PRs (#498)

By default, CI runs on draft PRs. This leads to many CI runs that may be unnecessary.

With this PR's change to `.github/copy-pr-bot.yaml`, an `/ok to test` comment from a trusted user is required to trigger CI on draft PRs. Non-draft PRs will run CI by default, assuming that all commits are signed by trusted users. Otherwise an `/ok to test` is required (as before) -- see the `copy-pr-bot` docs at https://docs.gha-runners.nvidia.com/apps/copy-pr-bot/ for more information.

Part of https://github.com/rapidsai/build-planning/issues/123.

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - James Lamb (https://github.com/jameslamb)

URL: https://github.com/rapidsai/cuvs/pull/498
---
 .github/copy-pr-bot.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/copy-pr-bot.yaml b/.github/copy-pr-bot.yaml
index 895ba83ee..e0ea775aa 100644
--- a/.github/copy-pr-bot.yaml
+++ b/.github/copy-pr-bot.yaml
@@ -2,3 +2,4 @@
 # https://docs.gha-runners.nvidia.com/apps/copy-pr-bot/
 
 enabled: true
+auto_sync_draft: false

From 31c59ce0cae2505c89e8e4cdd8d77fd29256df4a Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Fri, 29 Nov 2024 16:42:44 -0800
Subject: [PATCH 06/39] Adapt to rmm logger changes (#499)

This PR adapts to breaking changes in rmm in https://github.com/rapidsai/rmm/pull/1722.

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Ben Frederickson (https://github.com/benfred)

URL: https://github.com/rapidsai/cuvs/pull/499
---
 cpp/CMakeLists.txt                                  |  9 +++++----
 cpp/bench/ann/CMakeLists.txt                        |  8 +++++++-
 cpp/test/CMakeLists.txt                             |  4 ++++
 examples/cpp/CMakeLists.txt                         | 13 ++++++++-----
 python/cuvs/CMakeLists.txt                          |  3 +++
 python/cuvs/cuvs/common/CMakeLists.txt              |  4 ++++
 python/cuvs/cuvs/distance/CMakeLists.txt            |  4 ++++
 python/cuvs/cuvs/neighbors/CMakeLists.txt           |  4 ++++
 .../cuvs/cuvs/neighbors/brute_force/CMakeLists.txt  |  4 ++++
 python/cuvs/cuvs/neighbors/cagra/CMakeLists.txt     |  4 ++++
 python/cuvs/cuvs/neighbors/filters/CMakeLists.txt   |  4 ++++
 python/cuvs/cuvs/neighbors/hnsw/CMakeLists.txt      |  4 ++++
 python/cuvs/cuvs/neighbors/ivf_flat/CMakeLists.txt  |  4 ++++
 python/cuvs/cuvs/neighbors/ivf_pq/CMakeLists.txt    |  4 ++++
 python/cuvs/cuvs/test/conftest.py                   |  5 +++++
 15 files changed, 68 insertions(+), 10 deletions(-)
 create mode 100644 python/cuvs/cuvs/test/conftest.py

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index eb2e7c7a4..3c8ef69fd 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -480,12 +480,13 @@ if(BUILD_SHARED_LIBS)
                       "$<$<COMPILE_LANGUAGE:CUDA>:${CUVS_CUDA_FLAGS}>"
   )
   target_link_libraries(
-    cuvs_objs PUBLIC raft::raft rmm::rmm ${CUVS_CTK_MATH_DEPENDENCIES}
+    cuvs_objs PUBLIC raft::raft rmm::rmm rmm::rmm_logger ${CUVS_CTK_MATH_DEPENDENCIES}
                      $<TARGET_NAME_IF_EXISTS:OpenMP::OpenMP_CXX>
+              PRIVATE rmm::rmm_logger_impl
   )
 
-  add_library(cuvs SHARED $<TARGET_OBJECTS:cuvs_objs>)
-  add_library(cuvs_static STATIC $<TARGET_OBJECTS:cuvs_objs>)
+  add_library(cuvs SHARED $<FILTER:$<TARGET_OBJECTS:cuvs_objs>,EXCLUDE,rmm.*logger>)
+  add_library(cuvs_static STATIC  $<FILTER:$<TARGET_OBJECTS:cuvs_objs>,EXCLUDE,rmm.*logger>)
 
   target_compile_options(
     cuvs INTERFACE $<$<COMPILE_LANG_AND_ID:CUDA,NVIDIA>:--expt-extended-lambda
@@ -696,7 +697,7 @@ target_compile_definitions(cuvs::cuvs INTERFACE $<$<BOOL:${CUVS_NVTX}>:NVTX_ENAB
     target_link_libraries(
       cuvs_c
       PUBLIC cuvs::cuvs ${CUVS_CTK_MATH_DEPENDENCIES}
-      PRIVATE raft::raft
+      PRIVATE raft::raft rmm::rmm_logger_impl
     )
 
     # ensure CUDA symbols aren't relocated to the middle of the debug build binaries
diff --git a/cpp/bench/ann/CMakeLists.txt b/cpp/bench/ann/CMakeLists.txt
index 0f6b42ae9..572b792a7 100644
--- a/cpp/bench/ann/CMakeLists.txt
+++ b/cpp/bench/ann/CMakeLists.txt
@@ -129,6 +129,7 @@ function(ConfigureAnnBench)
             $<$<BOOL:${GPU_BUILD}>:CUDA::cudart_static>
             $<TARGET_NAME_IF_EXISTS:OpenMP::OpenMP_CXX>
             $<TARGET_NAME_IF_EXISTS:conda_env>
+            $<TARGET_NAME_IF_EXISTS:cuvs_bench_rmm_logger>
   )
 
   set_target_properties(
@@ -174,6 +175,11 @@ function(ConfigureAnnBench)
   add_dependencies(CUVS_ANN_BENCH_ALL ${BENCH_NAME})
 endfunction()
 
+if(CUVS_FAISS_ENABLE_GPU)
+  add_library(cuvs_bench_rmm_logger OBJECT)
+  target_link_libraries(cuvs_bench_rmm_logger PRIVATE rmm::rmm_logger_impl)
+endif()
+
 # ##################################################################################################
 # * Configure benchmark targets -------------------------------------------------------------
 
@@ -300,7 +306,7 @@ if(CUVS_ANN_BENCH_SINGLE_EXE)
   target_link_libraries(
     ANN_BENCH
     PRIVATE raft::raft nlohmann_json::nlohmann_json benchmark::benchmark dl fmt::fmt-header-only
-            spdlog::spdlog_header_only $<$<TARGET_EXISTS:CUDA::nvtx3>:CUDA::nvtx3>
+            spdlog::spdlog_header_only $<$<TARGET_EXISTS:CUDA::nvtx3>:CUDA::nvtx3> rmm::rmm_logger_impl
   )
   set_target_properties(
     ANN_BENCH
diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt
index 286d721d7..16663ba08 100644
--- a/cpp/test/CMakeLists.txt
+++ b/cpp/test/CMakeLists.txt
@@ -49,6 +49,7 @@ function(ConfigureTest)
     PRIVATE cuvs
             cuvs::cuvs
             raft::raft
+            test_rmm_logger
             GTest::gtest
             GTest::gtest_main
             Threads::Threads
@@ -87,6 +88,9 @@ function(ConfigureTest)
   )
 endfunction()
 
+add_library(test_rmm_logger OBJECT)
+target_link_libraries(test_rmm_logger PRIVATE rmm::rmm_logger_impl)
+
 # ##################################################################################################
 # test sources ##################################################################################
 # ##################################################################################################
diff --git a/examples/cpp/CMakeLists.txt b/examples/cpp/CMakeLists.txt
index 092b65ed9..48815b870 100644
--- a/examples/cpp/CMakeLists.txt
+++ b/examples/cpp/CMakeLists.txt
@@ -44,10 +44,13 @@ add_executable(VAMANA_EXAMPLE src/vamana_example.cu)
 
 # `$<TARGET_NAME_IF_EXISTS:conda_env>` is a generator expression that ensures that targets are
 # installed in a conda environment, if one exists
-target_link_libraries(CAGRA_EXAMPLE PRIVATE cuvs::cuvs $<TARGET_NAME_IF_EXISTS:conda_env>)
+add_library(rmm_logger OBJECT)
+target_link_libraries(rmm_logger PRIVATE rmm::rmm_logger_impl)
+
+target_link_libraries(CAGRA_EXAMPLE PRIVATE cuvs::cuvs $<TARGET_NAME_IF_EXISTS:conda_env> rmm_logger)
 target_link_libraries(
-  CAGRA_PERSISTENT_EXAMPLE PRIVATE cuvs::cuvs $<TARGET_NAME_IF_EXISTS:conda_env> Threads::Threads
+  CAGRA_PERSISTENT_EXAMPLE PRIVATE cuvs::cuvs $<TARGET_NAME_IF_EXISTS:conda_env> Threads::Threads rmm_logger
 )
-target_link_libraries(IVF_PQ_EXAMPLE PRIVATE cuvs::cuvs $<TARGET_NAME_IF_EXISTS:conda_env>)
-target_link_libraries(IVF_FLAT_EXAMPLE PRIVATE cuvs::cuvs $<TARGET_NAME_IF_EXISTS:conda_env>)
-target_link_libraries(VAMANA_EXAMPLE PRIVATE cuvs::cuvs $<TARGET_NAME_IF_EXISTS:conda_env>)
+target_link_libraries(IVF_PQ_EXAMPLE PRIVATE cuvs::cuvs $<TARGET_NAME_IF_EXISTS:conda_env> rmm_logger)
+target_link_libraries(IVF_FLAT_EXAMPLE PRIVATE cuvs::cuvs $<TARGET_NAME_IF_EXISTS:conda_env> rmm_logger)
+target_link_libraries(VAMANA_EXAMPLE PRIVATE cuvs::cuvs $<TARGET_NAME_IF_EXISTS:conda_env> rmm_logger)
diff --git a/python/cuvs/CMakeLists.txt b/python/cuvs/CMakeLists.txt
index feb3bd58c..c0990995f 100644
--- a/python/cuvs/CMakeLists.txt
+++ b/python/cuvs/CMakeLists.txt
@@ -110,6 +110,9 @@ endif()
 
 rapids_cython_init()
 
+add_library(cuvs_rmm_logger OBJECT)
+target_link_libraries(cuvs_rmm_logger PRIVATE rmm::rmm_logger_impl)
+
 add_subdirectory(cuvs/common)
 add_subdirectory(cuvs/distance)
 add_subdirectory(cuvs/neighbors)
diff --git a/python/cuvs/cuvs/common/CMakeLists.txt b/python/cuvs/cuvs/common/CMakeLists.txt
index 202919e01..361f2fafc 100644
--- a/python/cuvs/cuvs/common/CMakeLists.txt
+++ b/python/cuvs/cuvs/common/CMakeLists.txt
@@ -22,3 +22,7 @@ rapids_cython_create_modules(
   SOURCE_FILES "${cython_sources}"
   LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS cuvs MODULE_PREFIX common_
 )
+
+foreach(tgt IN LISTS RAPIDS_CYTHON_CREATED_TARGETS)
+  target_link_libraries(${tgt} PRIVATE cuvs_rmm_logger)
+endforeach()
diff --git a/python/cuvs/cuvs/distance/CMakeLists.txt b/python/cuvs/cuvs/distance/CMakeLists.txt
index 363778a9c..514b08c43 100644
--- a/python/cuvs/cuvs/distance/CMakeLists.txt
+++ b/python/cuvs/cuvs/distance/CMakeLists.txt
@@ -22,3 +22,7 @@ rapids_cython_create_modules(
   SOURCE_FILES "${cython_sources}"
   LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS cuvs MODULE_PREFIX distance_
 )
+
+foreach(tgt IN LISTS RAPIDS_CYTHON_CREATED_TARGETS)
+  target_link_libraries(${tgt} PRIVATE cuvs_rmm_logger)
+endforeach()
diff --git a/python/cuvs/cuvs/neighbors/CMakeLists.txt b/python/cuvs/cuvs/neighbors/CMakeLists.txt
index f68bbea53..031fd485e 100644
--- a/python/cuvs/cuvs/neighbors/CMakeLists.txt
+++ b/python/cuvs/cuvs/neighbors/CMakeLists.txt
@@ -29,3 +29,7 @@ rapids_cython_create_modules(
   SOURCE_FILES "${cython_sources}"
   LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS cuvs MODULE_PREFIX neighbors_refine_
 )
+
+foreach(tgt IN LISTS RAPIDS_CYTHON_CREATED_TARGETS)
+  target_link_libraries(${tgt} PRIVATE cuvs_rmm_logger)
+endforeach()
diff --git a/python/cuvs/cuvs/neighbors/brute_force/CMakeLists.txt b/python/cuvs/cuvs/neighbors/brute_force/CMakeLists.txt
index 4806fb9fc..61eda649c 100644
--- a/python/cuvs/cuvs/neighbors/brute_force/CMakeLists.txt
+++ b/python/cuvs/cuvs/neighbors/brute_force/CMakeLists.txt
@@ -23,3 +23,7 @@ rapids_cython_create_modules(
   LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS cuvs MODULE_PREFIX
                    neighbors_brute_force_
 )
+
+foreach(tgt IN LISTS RAPIDS_CYTHON_CREATED_TARGETS)
+  target_link_libraries(${tgt} PRIVATE cuvs_rmm_logger)
+endforeach()
diff --git a/python/cuvs/cuvs/neighbors/cagra/CMakeLists.txt b/python/cuvs/cuvs/neighbors/cagra/CMakeLists.txt
index 87e6597fe..1f40daab2 100644
--- a/python/cuvs/cuvs/neighbors/cagra/CMakeLists.txt
+++ b/python/cuvs/cuvs/neighbors/cagra/CMakeLists.txt
@@ -22,3 +22,7 @@ rapids_cython_create_modules(
   SOURCE_FILES "${cython_sources}"
   LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS cuvs MODULE_PREFIX neighbors_cagra_
 )
+
+foreach(tgt IN LISTS RAPIDS_CYTHON_CREATED_TARGETS)
+  target_link_libraries(${tgt} PRIVATE cuvs_rmm_logger)
+endforeach()
diff --git a/python/cuvs/cuvs/neighbors/filters/CMakeLists.txt b/python/cuvs/cuvs/neighbors/filters/CMakeLists.txt
index c90615feb..a678852d9 100644
--- a/python/cuvs/cuvs/neighbors/filters/CMakeLists.txt
+++ b/python/cuvs/cuvs/neighbors/filters/CMakeLists.txt
@@ -22,3 +22,7 @@ rapids_cython_create_modules(
   SOURCE_FILES "${cython_sources}"
   LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS cuvs MODULE_PREFIX neighbors_prefilter_
 )
+
+foreach(tgt IN LISTS RAPIDS_CYTHON_CREATED_TARGETS)
+  target_link_libraries(${tgt} PRIVATE cuvs_rmm_logger)
+endforeach()
diff --git a/python/cuvs/cuvs/neighbors/hnsw/CMakeLists.txt b/python/cuvs/cuvs/neighbors/hnsw/CMakeLists.txt
index 1f9c422ca..8351916e6 100644
--- a/python/cuvs/cuvs/neighbors/hnsw/CMakeLists.txt
+++ b/python/cuvs/cuvs/neighbors/hnsw/CMakeLists.txt
@@ -22,3 +22,7 @@ rapids_cython_create_modules(
   SOURCE_FILES "${cython_sources}"
   LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS cuvs MODULE_PREFIX neighbors_hnsw_
 )
+
+foreach(tgt IN LISTS RAPIDS_CYTHON_CREATED_TARGETS)
+  target_link_libraries(${tgt} PRIVATE cuvs_rmm_logger)
+endforeach()
diff --git a/python/cuvs/cuvs/neighbors/ivf_flat/CMakeLists.txt b/python/cuvs/cuvs/neighbors/ivf_flat/CMakeLists.txt
index 09bd8f422..f5663cdaa 100644
--- a/python/cuvs/cuvs/neighbors/ivf_flat/CMakeLists.txt
+++ b/python/cuvs/cuvs/neighbors/ivf_flat/CMakeLists.txt
@@ -22,3 +22,7 @@ rapids_cython_create_modules(
   SOURCE_FILES "${cython_sources}"
   LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS cuvs MODULE_PREFIX neighbors_ivf_flat_
 )
+
+foreach(tgt IN LISTS RAPIDS_CYTHON_CREATED_TARGETS)
+  target_link_libraries(${tgt} PRIVATE cuvs_rmm_logger)
+endforeach()
diff --git a/python/cuvs/cuvs/neighbors/ivf_pq/CMakeLists.txt b/python/cuvs/cuvs/neighbors/ivf_pq/CMakeLists.txt
index 97c3a1824..a24320ded 100644
--- a/python/cuvs/cuvs/neighbors/ivf_pq/CMakeLists.txt
+++ b/python/cuvs/cuvs/neighbors/ivf_pq/CMakeLists.txt
@@ -22,3 +22,7 @@ rapids_cython_create_modules(
   SOURCE_FILES "${cython_sources}"
   LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS cuvs MODULE_PREFIX neighbors_pq_
 )
+
+foreach(tgt IN LISTS RAPIDS_CYTHON_CREATED_TARGETS)
+  target_link_libraries(${tgt} PRIVATE cuvs_rmm_logger)
+endforeach()
diff --git a/python/cuvs/cuvs/test/conftest.py b/python/cuvs/cuvs/test/conftest.py
new file mode 100644
index 000000000..d84de5d21
--- /dev/null
+++ b/python/cuvs/cuvs/test/conftest.py
@@ -0,0 +1,5 @@
+# arm tests sporadically run into
+# https://bugzilla.redhat.com/show_bug.cgi?id=1722181.
+# Importing sklearn here is a workaround to ensure that OpenMP gets the TLS that it needs.
+
+import sklearn  # noqa: F401

From 121588259b3f48381c6c154556e78e46b3119eb4 Mon Sep 17 00:00:00 2001
From: James Lamb <jlamb@nvidia.com>
Date: Mon, 2 Dec 2024 14:18:23 -0600
Subject: [PATCH 07/39] prefer system install of UCX in devcontainers (#501)

Contributes to https://github.com/rapidsai/build-planning/issues/118

Proposes the following changes for pip devcontainers:

* prefer system installation of ucx to the one provided by the `libucx-cu{11,12}` wheels (ref: https://github.com/rapidsai/devcontainers/pull/421#issuecomment-2502324982)

And some other related changes noticed while doing that:

* update lingering `24.*` references to `25.02`

## Notes for Reviewers

### How I tested this

Relying on CI for most things. Double-checked that `update-version.sh` would have caught the one lingering `24.12` reference like this:

```shell
./ci/release/update-version.sh '25.02.00'
git grep -E '24\.'
```

Similar to https://github.com/rapidsai/cuml/pull/6149

Authors:
  - James Lamb (https://github.com/jameslamb)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cuvs/pull/501
---
 .devcontainer/Dockerfile                             | 1 +
 .github/workflows/trigger-breaking-change-alert.yaml | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile
index 594ba8c3c..77b90fa20 100644
--- a/.devcontainer/Dockerfile
+++ b/.devcontainer/Dockerfile
@@ -13,6 +13,7 @@ RUN apt update -y \
  && rm -rf /tmp/* /var/tmp/* /var/cache/apt/* /var/lib/apt/lists/*;
 
 ENV DEFAULT_VIRTUAL_ENV=rapids
+ENV RAPIDS_LIBUCX_PREFER_SYSTEM_LIBRARY=true
 
 FROM ${BASE} as conda-base
 
diff --git a/.github/workflows/trigger-breaking-change-alert.yaml b/.github/workflows/trigger-breaking-change-alert.yaml
index 3b972f31c..01dd2436b 100644
--- a/.github/workflows/trigger-breaking-change-alert.yaml
+++ b/.github/workflows/trigger-breaking-change-alert.yaml
@@ -12,7 +12,7 @@ jobs:
   trigger-notifier:
     if: contains(github.event.pull_request.labels.*.name, 'breaking')
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/breaking-change-alert.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/breaking-change-alert.yaml@branch-25.02
     with:
       sender_login: ${{ github.event.sender.login }}
       sender_avatar: ${{ github.event.sender.avatar_url }}

From 69199c2297e5a6012dd0f26a491550066cecdd4b Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Wed, 4 Dec 2024 18:19:19 -0600
Subject: [PATCH 08/39] Remove upper bounds on cuda-python to allow 12.6.2 and
 11.8.5 (#508)

Now that some upstream bugs have been fixed, we can allow cuda-python 12.6.2 and 11.8.5.

See https://github.com/NVIDIA/cuda-python/issues/226#issuecomment-2472355738 for more information.

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - James Lamb (https://github.com/jameslamb)

URL: https://github.com/rapidsai/cuvs/pull/508
---
 conda/environments/all_cuda-118_arch-aarch64.yaml       | 2 +-
 conda/environments/all_cuda-118_arch-x86_64.yaml        | 2 +-
 conda/environments/all_cuda-125_arch-aarch64.yaml       | 2 +-
 conda/environments/all_cuda-125_arch-x86_64.yaml        | 2 +-
 conda/environments/bench_ann_cuda-118_arch-aarch64.yaml | 2 +-
 conda/environments/bench_ann_cuda-118_arch-x86_64.yaml  | 2 +-
 conda/environments/bench_ann_cuda-125_arch-aarch64.yaml | 2 +-
 conda/environments/bench_ann_cuda-125_arch-x86_64.yaml  | 2 +-
 conda/recipes/cuvs/meta.yaml                            | 8 ++++----
 dependencies.yaml                                       | 4 ++--
 10 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/conda/environments/all_cuda-118_arch-aarch64.yaml b/conda/environments/all_cuda-118_arch-aarch64.yaml
index 1daf668b1..b27e9d341 100644
--- a/conda/environments/all_cuda-118_arch-aarch64.yaml
+++ b/conda/environments/all_cuda-118_arch-aarch64.yaml
@@ -15,7 +15,7 @@ dependencies:
 - cmake>=3.26.4,!=3.30.0
 - cuda-nvtx=11.8
 - cuda-profiler-api=11.8.86
-- cuda-python>=11.7.1,<12.0a0,<=11.8.3
+- cuda-python>=11.7.1,<12.0a0
 - cuda-version=11.8
 - cudatoolkit
 - cupy>=12.0.0
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index 098156397..2a2791824 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -15,7 +15,7 @@ dependencies:
 - cmake>=3.26.4,!=3.30.0
 - cuda-nvtx=11.8
 - cuda-profiler-api=11.8.86
-- cuda-python>=11.7.1,<12.0a0,<=11.8.3
+- cuda-python>=11.7.1,<12.0a0
 - cuda-version=11.8
 - cudatoolkit
 - cupy>=12.0.0
diff --git a/conda/environments/all_cuda-125_arch-aarch64.yaml b/conda/environments/all_cuda-125_arch-aarch64.yaml
index b94b44749..800d8c5cc 100644
--- a/conda/environments/all_cuda-125_arch-aarch64.yaml
+++ b/conda/environments/all_cuda-125_arch-aarch64.yaml
@@ -17,7 +17,7 @@ dependencies:
 - cuda-nvcc
 - cuda-nvtx-dev
 - cuda-profiler-api
-- cuda-python>=12.0,<13.0a0,<=12.6.0
+- cuda-python>=12.0,<13.0a0
 - cuda-version=12.5
 - cupy>=12.0.0
 - cxx-compiler
diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml
index 10e30a8c2..15addf9da 100644
--- a/conda/environments/all_cuda-125_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-125_arch-x86_64.yaml
@@ -17,7 +17,7 @@ dependencies:
 - cuda-nvcc
 - cuda-nvtx-dev
 - cuda-profiler-api
-- cuda-python>=12.0,<13.0a0,<=12.6.0
+- cuda-python>=12.0,<13.0a0
 - cuda-version=12.5
 - cupy>=12.0.0
 - cxx-compiler
diff --git a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml
index dd7499c78..ced5176e9 100644
--- a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml
+++ b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml
@@ -15,7 +15,7 @@ dependencies:
 - cmake>=3.26.4,!=3.30.0
 - cuda-nvtx=11.8
 - cuda-profiler-api=11.8.86
-- cuda-python>=11.7.1,<12.0a0,<=11.8.3
+- cuda-python>=11.7.1,<12.0a0
 - cuda-version=11.8
 - cudatoolkit
 - cxx-compiler
diff --git a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
index f12e01c60..d8e49519f 100644
--- a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
@@ -15,7 +15,7 @@ dependencies:
 - cmake>=3.26.4,!=3.30.0
 - cuda-nvtx=11.8
 - cuda-profiler-api=11.8.86
-- cuda-python>=11.7.1,<12.0a0,<=11.8.3
+- cuda-python>=11.7.1,<12.0a0
 - cuda-version=11.8
 - cudatoolkit
 - cxx-compiler
diff --git a/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml
index 89134093c..5e6373ad1 100644
--- a/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml
+++ b/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml
@@ -17,7 +17,7 @@ dependencies:
 - cuda-nvcc
 - cuda-nvtx-dev
 - cuda-profiler-api
-- cuda-python>=12.0,<13.0a0,<=12.6.0
+- cuda-python>=12.0,<13.0a0
 - cuda-version=12.5
 - cxx-compiler
 - cython>=3.0.0
diff --git a/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml
index 88b4c859a..fece589d5 100644
--- a/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml
+++ b/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml
@@ -17,7 +17,7 @@ dependencies:
 - cuda-nvcc
 - cuda-nvtx-dev
 - cuda-profiler-api
-- cuda-python>=12.0,<13.0a0,<=12.6.0
+- cuda-python>=12.0,<13.0a0
 - cuda-version=12.5
 - cxx-compiler
 - cython>=3.0.0
diff --git a/conda/recipes/cuvs/meta.yaml b/conda/recipes/cuvs/meta.yaml
index 560c95feb..f799d9b0b 100644
--- a/conda/recipes/cuvs/meta.yaml
+++ b/conda/recipes/cuvs/meta.yaml
@@ -43,10 +43,10 @@ requirements:
     - {{ stdlib("c") }}
   host:
     {% if cuda_major == "11" %}
-    - cuda-python >=11.7.1,<12.0a0,<=11.8.3
+    - cuda-python >=11.7.1,<12.0a0
     - cudatoolkit
     {% else %}
-    - cuda-python >=12.0,<13.0a0,<=12.6.0
+    - cuda-python >=12.0,<13.0a0
     - cuda-cudart-dev
     {% endif %}
     - cuda-version ={{ cuda_version }}
@@ -61,10 +61,10 @@ requirements:
     - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }}
     {% if cuda_major == "11" %}
     - cudatoolkit
-    - cuda-python >=11.7.1,<12.0a0,<=11.8.3
+    - cuda-python >=11.7.1,<12.0a0
     {% else %}
     - cuda-cudart
-    - cuda-python >=12.0,<13.0a0,<=12.6.0
+    - cuda-python >=12.0,<13.0a0
     {% endif %}
     - pylibraft {{ minor_version }}
     - libcuvs {{ version }}
diff --git a/dependencies.yaml b/dependencies.yaml
index ee5155489..28ded3671 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -213,11 +213,11 @@ dependencies:
           - matrix:
               cuda: "12.*"
             packages:
-              - &cuda_python12 cuda-python>=12.0,<13.0a0,<=12.6.0
+              - &cuda_python12 cuda-python>=12.0,<13.0a0
           - matrix:
               cuda: "11.*"
             packages:
-              - &cuda_python11 cuda-python>=11.7.1,<12.0a0,<=11.8.3
+              - &cuda_python11 cuda-python>=11.7.1,<12.0a0
           - matrix:
             packages:
               - &cuda_python cuda-python

From e1a57084a61aeffdd5b45e35dcff11e418527c1d Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Sat, 7 Dec 2024 00:34:59 -0600
Subject: [PATCH 09/39] Update cuda-python lower bounds to 12.6.2 / 11.8.5
 (#524)

We require a newer cuda-python lower bound for new features and to use the new layout.
This will fix a number of errors observed when the runtime version of cuda-python is older than the version used to build packages using Cython features from cuda-python.

See https://github.com/rapidsai/build-planning/issues/117#issuecomment-2524250915 for details.

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - James Lamb (https://github.com/jameslamb)

URL: https://github.com/rapidsai/cuvs/pull/524
---
 conda/environments/all_cuda-118_arch-aarch64.yaml       | 2 +-
 conda/environments/all_cuda-118_arch-x86_64.yaml        | 2 +-
 conda/environments/all_cuda-125_arch-aarch64.yaml       | 2 +-
 conda/environments/all_cuda-125_arch-x86_64.yaml        | 2 +-
 conda/environments/bench_ann_cuda-118_arch-aarch64.yaml | 2 +-
 conda/environments/bench_ann_cuda-118_arch-x86_64.yaml  | 2 +-
 conda/environments/bench_ann_cuda-125_arch-aarch64.yaml | 2 +-
 conda/environments/bench_ann_cuda-125_arch-x86_64.yaml  | 2 +-
 conda/recipes/cuvs/meta.yaml                            | 8 ++++----
 dependencies.yaml                                       | 4 ++--
 10 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/conda/environments/all_cuda-118_arch-aarch64.yaml b/conda/environments/all_cuda-118_arch-aarch64.yaml
index b27e9d341..50aa3fe7e 100644
--- a/conda/environments/all_cuda-118_arch-aarch64.yaml
+++ b/conda/environments/all_cuda-118_arch-aarch64.yaml
@@ -15,7 +15,7 @@ dependencies:
 - cmake>=3.26.4,!=3.30.0
 - cuda-nvtx=11.8
 - cuda-profiler-api=11.8.86
-- cuda-python>=11.7.1,<12.0a0
+- cuda-python>=11.8.5,<12.0a0
 - cuda-version=11.8
 - cudatoolkit
 - cupy>=12.0.0
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index 2a2791824..8f15b6164 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -15,7 +15,7 @@ dependencies:
 - cmake>=3.26.4,!=3.30.0
 - cuda-nvtx=11.8
 - cuda-profiler-api=11.8.86
-- cuda-python>=11.7.1,<12.0a0
+- cuda-python>=11.8.5,<12.0a0
 - cuda-version=11.8
 - cudatoolkit
 - cupy>=12.0.0
diff --git a/conda/environments/all_cuda-125_arch-aarch64.yaml b/conda/environments/all_cuda-125_arch-aarch64.yaml
index 800d8c5cc..f194c01a3 100644
--- a/conda/environments/all_cuda-125_arch-aarch64.yaml
+++ b/conda/environments/all_cuda-125_arch-aarch64.yaml
@@ -17,7 +17,7 @@ dependencies:
 - cuda-nvcc
 - cuda-nvtx-dev
 - cuda-profiler-api
-- cuda-python>=12.0,<13.0a0
+- cuda-python>=12.6.2,<13.0a0
 - cuda-version=12.5
 - cupy>=12.0.0
 - cxx-compiler
diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml
index 15addf9da..912d1629b 100644
--- a/conda/environments/all_cuda-125_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-125_arch-x86_64.yaml
@@ -17,7 +17,7 @@ dependencies:
 - cuda-nvcc
 - cuda-nvtx-dev
 - cuda-profiler-api
-- cuda-python>=12.0,<13.0a0
+- cuda-python>=12.6.2,<13.0a0
 - cuda-version=12.5
 - cupy>=12.0.0
 - cxx-compiler
diff --git a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml
index ced5176e9..bb85af66f 100644
--- a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml
+++ b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml
@@ -15,7 +15,7 @@ dependencies:
 - cmake>=3.26.4,!=3.30.0
 - cuda-nvtx=11.8
 - cuda-profiler-api=11.8.86
-- cuda-python>=11.7.1,<12.0a0
+- cuda-python>=11.8.5,<12.0a0
 - cuda-version=11.8
 - cudatoolkit
 - cxx-compiler
diff --git a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
index d8e49519f..225340fbb 100644
--- a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
@@ -15,7 +15,7 @@ dependencies:
 - cmake>=3.26.4,!=3.30.0
 - cuda-nvtx=11.8
 - cuda-profiler-api=11.8.86
-- cuda-python>=11.7.1,<12.0a0
+- cuda-python>=11.8.5,<12.0a0
 - cuda-version=11.8
 - cudatoolkit
 - cxx-compiler
diff --git a/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml
index 5e6373ad1..a541db0ca 100644
--- a/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml
+++ b/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml
@@ -17,7 +17,7 @@ dependencies:
 - cuda-nvcc
 - cuda-nvtx-dev
 - cuda-profiler-api
-- cuda-python>=12.0,<13.0a0
+- cuda-python>=12.6.2,<13.0a0
 - cuda-version=12.5
 - cxx-compiler
 - cython>=3.0.0
diff --git a/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml
index fece589d5..25de38443 100644
--- a/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml
+++ b/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml
@@ -17,7 +17,7 @@ dependencies:
 - cuda-nvcc
 - cuda-nvtx-dev
 - cuda-profiler-api
-- cuda-python>=12.0,<13.0a0
+- cuda-python>=12.6.2,<13.0a0
 - cuda-version=12.5
 - cxx-compiler
 - cython>=3.0.0
diff --git a/conda/recipes/cuvs/meta.yaml b/conda/recipes/cuvs/meta.yaml
index f799d9b0b..ad7ffe756 100644
--- a/conda/recipes/cuvs/meta.yaml
+++ b/conda/recipes/cuvs/meta.yaml
@@ -43,10 +43,10 @@ requirements:
     - {{ stdlib("c") }}
   host:
     {% if cuda_major == "11" %}
-    - cuda-python >=11.7.1,<12.0a0
+    - cuda-python >=11.8.5,<12.0a0
     - cudatoolkit
     {% else %}
-    - cuda-python >=12.0,<13.0a0
+    - cuda-python >=12.6.2,<13.0a0
     - cuda-cudart-dev
     {% endif %}
     - cuda-version ={{ cuda_version }}
@@ -61,10 +61,10 @@ requirements:
     - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }}
     {% if cuda_major == "11" %}
     - cudatoolkit
-    - cuda-python >=11.7.1,<12.0a0
+    - cuda-python >=11.8.5,<12.0a0
     {% else %}
     - cuda-cudart
-    - cuda-python >=12.0,<13.0a0
+    - cuda-python >=12.6.2,<13.0a0
     {% endif %}
     - pylibraft {{ minor_version }}
     - libcuvs {{ version }}
diff --git a/dependencies.yaml b/dependencies.yaml
index 28ded3671..7adb38d2a 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -213,11 +213,11 @@ dependencies:
           - matrix:
               cuda: "12.*"
             packages:
-              - &cuda_python12 cuda-python>=12.0,<13.0a0
+              - &cuda_python12 cuda-python>=12.6.2,<13.0a0
           - matrix:
               cuda: "11.*"
             packages:
-              - &cuda_python11 cuda-python>=11.7.1,<12.0a0
+              - &cuda_python11 cuda-python>=11.8.5,<12.0a0
           - matrix:
             packages:
               - &cuda_python cuda-python

From ef16a9e7fa7af418019e8cc7bcdd33828aee9f67 Mon Sep 17 00:00:00 2001
From: "Artem M. Chirkin" <9253178+achirkin@users.noreply.github.com>
Date: Thu, 12 Dec 2024 00:04:37 +0100
Subject: [PATCH 10/39] Fix Grace-specific issues in CAGRA (#527)

Fix Grace-specific test failures:
  1. Add stream syncs at the places where host-allocated memory may be destroyed while still being accessed by the GPU, to avoid relying on the stream-ordered semantics of memory allocations.
  2. A bug in tests: the CAGRA index produced by `cagra::build` is not guaranteed to be owning. The tests assumed otherwise; when a host dataset is passed and it is accessible on the device (as is the case on Grace), the created index ends up non-owning. The lifetime of the host dataset in the tests was shorter than that of the index, which led to invalid host accesses from the device.
  3. A bug in `dataset_serialize.hpp`: the `deserialize_strided` function constructed a non-owning strided dataset, because the host data was accessible by the GPU. The current fix is to add a move-semantics overload of `make_strided_dataset` that always owns the passed data (either by moving the mdarray or by copying the data).

Authors:
  - Artem M. Chirkin (https://github.com/achirkin)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)

URL: https://github.com/rapidsai/cuvs/pull/527
---
 cpp/include/cuvs/neighbors/common.hpp         | 78 ++++++++++++++++++-
 cpp/src/neighbors/detail/ann_utils.cuh        | 11 +++
 cpp/src/neighbors/detail/cagra/utils.hpp      | 16 +++-
 .../neighbors/detail/dataset_serialize.hpp    |  2 +-
 cpp/test/neighbors/ann_cagra.cuh              | 23 +++---
 5 files changed, 115 insertions(+), 15 deletions(-)

diff --git a/cpp/include/cuvs/neighbors/common.hpp b/cpp/include/cuvs/neighbors/common.hpp
index 60b8cc122..bd9ea4834 100644
--- a/cpp/include/cuvs/neighbors/common.hpp
+++ b/cpp/include/cuvs/neighbors/common.hpp
@@ -264,6 +264,77 @@ auto make_strided_dataset(const raft::resources& res, const SrcT& src, uint32_t
   return std::make_unique<out_owning_type>(std::move(out_array), out_layout);
 }
 
+/**
+ * @brief Contstruct a strided matrix from any mdarray.
+ *
+ * This function constructs an owning device matrix and copies the data.
+ * When the data is copied, padding elements are filled with zeroes.
+ *
+ * @tparam DataT
+ * @tparam IdxT
+ * @tparam LayoutPolicy
+ * @tparam ContainerPolicy
+ *
+ * @param[in] res raft resources handle
+ * @param[in] src the source mdarray or mdspan
+ * @param[in] required_stride the leading dimension (in elements)
+ * @return owning current-device-accessible strided matrix
+ */
+template <typename DataT, typename IdxT, typename LayoutPolicy, typename ContainerPolicy>
+auto make_strided_dataset(
+  const raft::resources& res,
+  raft::mdarray<DataT, raft::matrix_extent<IdxT>, LayoutPolicy, ContainerPolicy>&& src,
+  uint32_t required_stride) -> std::unique_ptr<strided_dataset<DataT, IdxT>>
+{
+  using value_type            = DataT;
+  using index_type            = IdxT;
+  using layout_type           = LayoutPolicy;
+  using container_policy_type = ContainerPolicy;
+  static_assert(std::is_same_v<layout_type, raft::layout_right> ||
+                  std::is_same_v<layout_type, raft::layout_right_padded<value_type>> ||
+                  std::is_same_v<layout_type, raft::layout_stride>,
+                "The input must be row-major");
+  RAFT_EXPECTS(src.extent(1) <= required_stride,
+               "The input row length must be not larger than the desired stride.");
+  const uint32_t src_stride = src.stride(0) > 0 ? src.stride(0) : src.extent(1);
+  const bool stride_matches = required_stride == src_stride;
+
+  auto out_layout =
+    raft::make_strided_layout(src.extents(), std::array<index_type, 2>{required_stride, 1});
+
+  using out_mdarray_type          = raft::device_matrix<value_type, index_type>;
+  using out_layout_type           = typename out_mdarray_type::layout_type;
+  using out_container_policy_type = typename out_mdarray_type::container_policy_type;
+  using out_owning_type =
+    owning_dataset<value_type, index_type, out_layout_type, out_container_policy_type>;
+
+  if constexpr (std::is_same_v<layout_type, out_layout_type> &&
+                std::is_same_v<container_policy_type, out_container_policy_type>) {
+    if (stride_matches) {
+      // Everything matches, we can own the mdarray
+      return std::make_unique<out_owning_type>(std::move(src), out_layout);
+    }
+  }
+  // Something is wrong: have to make a copy and produce an owning dataset
+  auto out_array =
+    raft::make_device_matrix<value_type, index_type>(res, src.extent(0), required_stride);
+
+  RAFT_CUDA_TRY(cudaMemsetAsync(out_array.data_handle(),
+                                0,
+                                out_array.size() * sizeof(value_type),
+                                raft::resource::get_cuda_stream(res)));
+  RAFT_CUDA_TRY(cudaMemcpy2DAsync(out_array.data_handle(),
+                                  sizeof(value_type) * required_stride,
+                                  src.data_handle(),
+                                  sizeof(value_type) * src_stride,
+                                  sizeof(value_type) * src.extent(1),
+                                  src.extent(0),
+                                  cudaMemcpyDefault,
+                                  raft::resource::get_cuda_stream(res)));
+
+  return std::make_unique<out_owning_type>(std::move(out_array), out_layout);
+}
+
 /**
  * @brief Contstruct a strided matrix from any mdarray or mdspan.
  *
@@ -278,14 +349,15 @@ auto make_strided_dataset(const raft::resources& res, const SrcT& src, uint32_t
  * @return maybe owning current-device-accessible strided matrix
  */
 template <typename SrcT>
-auto make_aligned_dataset(const raft::resources& res, const SrcT& src, uint32_t align_bytes = 16)
+auto make_aligned_dataset(const raft::resources& res, SrcT src, uint32_t align_bytes = 16)
   -> std::unique_ptr<strided_dataset<typename SrcT::value_type, typename SrcT::index_type>>
 {
-  using value_type       = typename SrcT::value_type;
+  using source_type      = std::remove_cv_t<std::remove_reference_t<SrcT>>;
+  using value_type       = typename source_type::value_type;
   constexpr size_t kSize = sizeof(value_type);
   uint32_t required_stride =
     raft::round_up_safe<size_t>(src.extent(1) * kSize, std::lcm(align_bytes, kSize)) / kSize;
-  return make_strided_dataset(res, src, required_stride);
+  return make_strided_dataset(res, std::forward<SrcT>(src), required_stride);
 }
 /**
  * @brief VPQ compressed dataset.
diff --git a/cpp/src/neighbors/detail/ann_utils.cuh b/cpp/src/neighbors/detail/ann_utils.cuh
index 652d41c85..529356351 100644
--- a/cpp/src/neighbors/detail/ann_utils.cuh
+++ b/cpp/src/neighbors/detail/ann_utils.cuh
@@ -403,6 +403,17 @@ struct batch_load_iterator {
 
   /** A single batch of data residing in device memory. */
   struct batch {
+    ~batch() noexcept
+    {
+      /*
+      If there's no copy, there's no allocation owned by the batch.
+      If there's no allocation, there's no guarantee that the device pointer is stream-ordered.
+      If there's no stream order guarantee, we must synchronize with the stream before the batch is
+      destroyed to make sure all GPU operations in that stream finish earlier.
+      */
+      if (!does_copy()) { RAFT_CUDA_TRY_NO_THROW(cudaStreamSynchronize(stream_)); }
+    }
+
     /** Logical width of a single row in a batch, in elements of type `T`. */
     [[nodiscard]] auto row_width() const -> size_type { return row_width_; }
     /** Logical offset of the batch, in rows (`row_width()`) */
diff --git a/cpp/src/neighbors/detail/cagra/utils.hpp b/cpp/src/neighbors/detail/cagra/utils.hpp
index 0f8309328..9f95c5b1c 100644
--- a/cpp/src/neighbors/detail/cagra/utils.hpp
+++ b/cpp/src/neighbors/detail/cagra/utils.hpp
@@ -179,7 +179,7 @@ class device_matrix_view_from_host {
  public:
   device_matrix_view_from_host(raft::resources const& res,
                                raft::host_matrix_view<T, IdxT> host_view)
-    : host_view_(host_view)
+    : res_(res), host_view_(host_view)
   {
     cudaPointerAttributes attr;
     RAFT_CUDA_TRY(cudaPointerGetAttributes(&attr, host_view.data_handle()));
@@ -199,6 +199,17 @@ class device_matrix_view_from_host {
     }
   }
 
+  ~device_matrix_view_from_host() noexcept
+  {
+    /*
+    If there's no copy, there's no allocation owned by this struct.
+    If there's no allocation, there's no guarantee that the device pointer is stream-ordered.
+    If there's no stream order guarantee, we must synchronize with the stream before the struct is
+    destroyed to make sure all GPU operations in that stream finish earlier.
+    */
+    if (!allocated_memory()) { raft::resource::sync_stream(res_); }
+  }
+
   raft::device_matrix_view<T, IdxT> view()
   {
     return raft::make_device_matrix_view<T, IdxT>(
@@ -207,9 +218,10 @@ class device_matrix_view_from_host {
 
   T* data_handle() { return device_ptr; }
 
-  bool allocated_memory() const { return device_mem_.has_value(); }
+  [[nodiscard]] bool allocated_memory() const { return device_mem_.has_value(); }
 
  private:
+  const raft::resources& res_;
   std::optional<raft::device_matrix<T, IdxT>> device_mem_;
   raft::host_matrix_view<T, IdxT> host_view_;
   T* device_ptr;
diff --git a/cpp/src/neighbors/detail/dataset_serialize.hpp b/cpp/src/neighbors/detail/dataset_serialize.hpp
index 40d9df930..0ecc2cf5d 100644
--- a/cpp/src/neighbors/detail/dataset_serialize.hpp
+++ b/cpp/src/neighbors/detail/dataset_serialize.hpp
@@ -140,7 +140,7 @@ auto deserialize_strided(raft::resources const& res, std::istream& is)
   auto stride     = raft::deserialize_scalar<uint32_t>(res, is);
   auto host_array = raft::make_host_matrix<DataT, IdxT>(n_rows, dim);
   raft::deserialize_mdspan(res, is, host_array.view());
-  return make_strided_dataset(res, host_array, stride);
+  return make_strided_dataset(res, std::move(host_array), stride);
 }
 
 template <typename MathT, typename IdxT>
diff --git a/cpp/test/neighbors/ann_cagra.cuh b/cpp/test/neighbors/ann_cagra.cuh
index 8d5701439..c1cd3ca09 100644
--- a/cpp/test/neighbors/ann_cagra.cuh
+++ b/cpp/test/neighbors/ann_cagra.cuh
@@ -389,12 +389,13 @@ class AnnCagraTest : public ::testing::TestWithParam<AnnCagraInputs> {
           (const DataT*)database.data(), ps.n_rows, ps.dim);
 
         {
+          std::optional<raft::host_matrix<DataT, int64_t>> database_host{std::nullopt};
           cagra::index<DataT, IdxT> index(handle_, index_params.metric);
           if (ps.host_dataset) {
-            auto database_host = raft::make_host_matrix<DataT, int64_t>(ps.n_rows, ps.dim);
-            raft::copy(database_host.data_handle(), database.data(), database.size(), stream_);
+            database_host = raft::make_host_matrix<DataT, int64_t>(ps.n_rows, ps.dim);
+            raft::copy(database_host->data_handle(), database.data(), database.size(), stream_);
             auto database_host_view = raft::make_host_matrix_view<const DataT, int64_t>(
-              (const DataT*)database_host.data_handle(), ps.n_rows, ps.dim);
+              (const DataT*)database_host->data_handle(), ps.n_rows, ps.dim);
 
             index = cagra::build(handle_, index_params, database_host_view);
           } else {
@@ -567,13 +568,16 @@ class AnnCagraAddNodesTest : public ::testing::TestWithParam<AnnCagraInputs> {
         auto initial_database_view = raft::make_device_matrix_view<const DataT, int64_t>(
           (const DataT*)database.data(), initial_database_size, ps.dim);
 
+        std::optional<raft::host_matrix<DataT, int64_t>> database_host{std::nullopt};
         cagra::index<DataT, IdxT> index(handle_);
         if (ps.host_dataset) {
-          auto database_host = raft::make_host_matrix<DataT, int64_t>(ps.n_rows, ps.dim);
+          database_host = raft::make_host_matrix<DataT, int64_t>(ps.n_rows, ps.dim);
           raft::copy(
-            database_host.data_handle(), database.data(), initial_database_view.size(), stream_);
+            database_host->data_handle(), database.data(), initial_database_view.size(), stream_);
           auto database_host_view = raft::make_host_matrix_view<const DataT, int64_t>(
-            (const DataT*)database_host.data_handle(), initial_database_size, ps.dim);
+            (const DataT*)database_host->data_handle(), initial_database_size, ps.dim);
+          // NB: database_host must live no less than the index, because the index _may_be_
+          //     non-onwning
           index = cagra::build(handle_, index_params, database_host_view);
         } else {
           index = cagra::build(handle_, index_params, initial_database_view);
@@ -763,12 +767,13 @@ class AnnCagraFilterTest : public ::testing::TestWithParam<AnnCagraInputs> {
         auto database_view = raft::make_device_matrix_view<const DataT, int64_t>(
           (const DataT*)database.data(), ps.n_rows, ps.dim);
 
+        std::optional<raft::host_matrix<DataT, int64_t>> database_host{std::nullopt};
         cagra::index<DataT, IdxT> index(handle_);
         if (ps.host_dataset) {
-          auto database_host = raft::make_host_matrix<DataT, int64_t>(ps.n_rows, ps.dim);
-          raft::copy(database_host.data_handle(), database.data(), database.size(), stream_);
+          database_host = raft::make_host_matrix<DataT, int64_t>(ps.n_rows, ps.dim);
+          raft::copy(database_host->data_handle(), database.data(), database.size(), stream_);
           auto database_host_view = raft::make_host_matrix_view<const DataT, int64_t>(
-            (const DataT*)database_host.data_handle(), ps.n_rows, ps.dim);
+            (const DataT*)database_host->data_handle(), ps.n_rows, ps.dim);
           index = cagra::build(handle_, index_params, database_host_view);
         } else {
           index = cagra::build(handle_, index_params, database_view);

From b859bc5edd41009604219416356cb6c96b189c2f Mon Sep 17 00:00:00 2001
From: tsuki <12711693+enp1s0@users.noreply.github.com>
Date: Fri, 13 Dec 2024 15:30:32 +0900
Subject: [PATCH 11/39] Fix cagra::extend error message (#532)

When extending a CAGRA index that was built with `index_param.attach_dataset_on_build = false`, the error message "Only uncompressed dataset is supported" is displayed even if the dataset used to build the graph is not compressed. This happens because the extend function does not check whether the dataset is empty. This PR adds that check and reports a dedicated error message for the empty-dataset case.

Authors:
  - tsuki (https://github.com/enp1s0)

Approvers:
  - Artem M. Chirkin (https://github.com/achirkin)

URL: https://github.com/rapidsai/cuvs/pull/532
---
 cpp/src/neighbors/detail/cagra/add_nodes.cuh | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/cpp/src/neighbors/detail/cagra/add_nodes.cuh b/cpp/src/neighbors/detail/cagra/add_nodes.cuh
index b03b8214b..952039130 100644
--- a/cpp/src/neighbors/detail/cagra/add_nodes.cuh
+++ b/cpp/src/neighbors/detail/cagra/add_nodes.cuh
@@ -432,8 +432,14 @@ void extend_core(
     } else {
       index.update_graph(handle, raft::make_const_mdspan(updated_graph.view()));
     }
+  } else if (dynamic_cast<const cuvs::neighbors::empty_dataset<int64_t>*>(&index.data()) !=
+             nullptr) {
+    RAFT_FAIL(
+      "cagra::extend only supports an index to which the dataset is attached. Please check if the "
+      "index was built with index_param.attach_dataset_on_build = true, or if a dataset was "
+      "attached after the build.");
   } else {
-    RAFT_FAIL("Only uncompressed dataset is supported");
+    RAFT_FAIL("cagra::extend only supports an uncompressed dataset index");
   }
 }
 }  // namespace cuvs::neighbors::cagra

From b3ce774d39e149d4e34c401068f24136eac44e13 Mon Sep 17 00:00:00 2001
From: Ben Frederickson <ben@benfrederickson.com>
Date: Tue, 17 Dec 2024 17:45:35 -0800
Subject: [PATCH 12/39] Fix CI for python cuvs_bench (#541)

I'm seeing CI failures due to a missing 'setuptools' on a recent PR https://github.com/rapidsai/cuvs/actions/runs/12363479650/job/34545223389?pr=536 in building the cuvs_bench code.

This seems to be because we were missing some requirements in dependencies.yaml. Fix this by adding `setuptools` to the requirements of the cuvs-bench conda recipe.

Authors:
  - Ben Frederickson (https://github.com/benfred)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cuvs/pull/541
---
 conda/recipes/cuvs-bench/meta.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/conda/recipes/cuvs-bench/meta.yaml b/conda/recipes/cuvs-bench/meta.yaml
index 0681a1038..d77aee8ce 100644
--- a/conda/recipes/cuvs-bench/meta.yaml
+++ b/conda/recipes/cuvs-bench/meta.yaml
@@ -79,6 +79,7 @@ requirements:
     - python
     - rapids-build-backend>=0.3.0,<0.4.0.dev0
     - rmm ={{ minor_version }}
+    - setuptools>=64.0.0
 
   run:
     - benchmark

From 660a2caa64f864e38e0e7bd19df86556d25aa7db Mon Sep 17 00:00:00 2001
From: Tarang Jain <40517122+tarang-jain@users.noreply.github.com>
Date: Thu, 19 Dec 2024 13:29:55 -0800
Subject: [PATCH 13/39] Additional Distances for CAGRA C and Python API (#546)

Add InnerProduct metric to CAGRA C and Python API + updates to CAGRA pytests.
Closes https://github.com/rapidsai/cuvs/issues/545

Authors:
  - Tarang Jain (https://github.com/tarang-jain)

Approvers:
  - Divye Gala (https://github.com/divyegala)

URL: https://github.com/rapidsai/cuvs/pull/546
---
 cpp/include/cuvs/neighbors/cagra.h         |  3 +++
 cpp/src/neighbors/cagra_c.cpp              |  6 ++++--
 python/cuvs/cuvs/neighbors/cagra/cagra.pxd |  2 ++
 python/cuvs/cuvs/neighbors/cagra/cagra.pyx | 20 ++++++++++++--------
 python/cuvs/cuvs/test/test_cagra.py        |  8 +++++---
 5 files changed, 26 insertions(+), 13 deletions(-)

diff --git a/cpp/include/cuvs/neighbors/cagra.h b/cpp/include/cuvs/neighbors/cagra.h
index 14331ebbc..f7f58a19c 100644
--- a/cpp/include/cuvs/neighbors/cagra.h
+++ b/cpp/include/cuvs/neighbors/cagra.h
@@ -17,6 +17,7 @@
 #pragma once
 
 #include <cuvs/core/c_api.h>
+#include <cuvs/distance/distance.h>
 #include <dlpack/dlpack.h>
 #include <stdbool.h>
 #include <stdint.h>
@@ -87,6 +88,8 @@ typedef struct cuvsCagraCompressionParams* cuvsCagraCompressionParams_t;
  *
  */
 struct cuvsCagraIndexParams {
+  /** Distance type. */
+  cuvsDistanceType metric;
   /** Degree of input graph for pruning. */
   size_t intermediate_graph_degree;
   /** Degree of output graph. */
diff --git a/cpp/src/neighbors/cagra_c.cpp b/cpp/src/neighbors/cagra_c.cpp
index 326a89665..02b7a566e 100644
--- a/cpp/src/neighbors/cagra_c.cpp
+++ b/cpp/src/neighbors/cagra_c.cpp
@@ -41,7 +41,8 @@ void* _build(cuvsResources_t res, cuvsCagraIndexParams params, DLManagedTensor*
   auto res_ptr = reinterpret_cast<raft::resources*>(res);
   auto index   = new cuvs::neighbors::cagra::index<T, uint32_t>(*res_ptr);
 
-  auto index_params                      = cuvs::neighbors::cagra::index_params();
+  auto index_params   = cuvs::neighbors::cagra::index_params();
+  index_params.metric = static_cast<cuvs::distance::DistanceType>((int)params.metric),
   index_params.intermediate_graph_degree = params.intermediate_graph_degree;
   index_params.graph_degree              = params.graph_degree;
 
@@ -252,7 +253,8 @@ extern "C" cuvsError_t cuvsCagraSearch(cuvsResources_t res,
 extern "C" cuvsError_t cuvsCagraIndexParamsCreate(cuvsCagraIndexParams_t* params)
 {
   return cuvs::core::translate_exceptions([=] {
-    *params = new cuvsCagraIndexParams{.intermediate_graph_degree = 128,
+    *params = new cuvsCagraIndexParams{.metric                    = L2Expanded,
+                                       .intermediate_graph_degree = 128,
                                        .graph_degree              = 64,
                                        .build_algo                = IVF_PQ,
                                        .nn_descent_niter          = 20};
diff --git a/python/cuvs/cuvs/neighbors/cagra/cagra.pxd b/python/cuvs/cuvs/neighbors/cagra/cagra.pxd
index bba5a91a8..a0f811480 100644
--- a/python/cuvs/cuvs/neighbors/cagra/cagra.pxd
+++ b/python/cuvs/cuvs/neighbors/cagra/cagra.pxd
@@ -28,6 +28,7 @@ from libcpp cimport bool
 
 from cuvs.common.c_api cimport cuvsError_t, cuvsResources_t
 from cuvs.common.cydlpack cimport DLDataType, DLManagedTensor
+from cuvs.distance_type cimport cuvsDistanceType
 
 
 cdef extern from "cuvs/neighbors/cagra.h" nogil:
@@ -47,6 +48,7 @@ cdef extern from "cuvs/neighbors/cagra.h" nogil:
     ctypedef cuvsCagraCompressionParams* cuvsCagraCompressionParams_t
 
     ctypedef struct cuvsCagraIndexParams:
+        cuvsDistanceType metric
         size_t intermediate_graph_degree
         size_t graph_degree
         cuvsCagraGraphBuildAlgo build_algo
diff --git a/python/cuvs/cuvs/neighbors/cagra/cagra.pyx b/python/cuvs/cuvs/neighbors/cagra/cagra.pyx
index 752aef741..fd55905cf 100644
--- a/python/cuvs/cuvs/neighbors/cagra/cagra.pyx
+++ b/python/cuvs/cuvs/neighbors/cagra/cagra.pyx
@@ -28,11 +28,13 @@ from libcpp cimport bool, cast
 from libcpp.string cimport string
 
 from cuvs.common cimport cydlpack
+from cuvs.distance_type cimport cuvsDistanceType
 
 from pylibraft.common import auto_convert_output, cai_wrapper, device_ndarray
 from pylibraft.common.cai_wrapper import wrap_array
 from pylibraft.common.interruptible import cuda_interruptible
 
+from cuvs.distance import DISTANCE_TYPES
 from cuvs.neighbors.common import _check_input_array
 
 from libc.stdint cimport (
@@ -131,9 +133,11 @@ cdef class IndexParams:
     Parameters
     ----------
     metric : string denoting the metric type, default="sqeuclidean"
-        Valid values for metric: ["sqeuclidean"], where
+        Valid values for metric: ["sqeuclidean", "inner_product"], where
             - sqeuclidean is the euclidean distance without the square root
               operation, i.e.: distance(a,b) = \\sum_i (a_i - b_i)^2
+            - inner_product distance is defined as
+              distance(a, b) = \\sum_i a_i * b_i.
     intermediate_graph_degree : int, default = 128
 
     graph_degree : int, default = 64
@@ -151,6 +155,7 @@ cdef class IndexParams:
     """
 
     cdef cuvsCagraIndexParams* params
+    cdef object _metric
 
     # hold on to a reference to the compression, to keep from being GC'ed
     cdef public object compression
@@ -170,10 +175,8 @@ cdef class IndexParams:
                  nn_descent_niter=20,
                  compression=None):
 
-        # todo (dgd): enable once other metrics are present
-        # and exposed in cuVS C API
-        # self.params.metric = _get_metric(metric)
-        # self.params.metric_arg = 0
+        self._metric = metric
+        self.params.metric = <cuvsDistanceType>DISTANCE_TYPES[metric]
         self.params.intermediate_graph_degree = intermediate_graph_degree
         self.params.graph_degree = graph_degree
         if build_algo == "ivf_pq":
@@ -186,9 +189,9 @@ cdef class IndexParams:
             self.params.compression = \
                 <cuvsCagraCompressionParams_t><size_t>compression.get_handle()
 
-    # @property
-    # def metric(self):
-        # return self.params.metric
+    @property
+    def metric(self):
+        return self._metric
 
     @property
     def intermediate_graph_degree(self):
@@ -247,6 +250,7 @@ def build(IndexParams index_params, dataset, resources=None):
 
     The following distance metrics are supported:
         - L2
+        - InnerProduct
 
     Parameters
     ----------
diff --git a/python/cuvs/cuvs/test/test_cagra.py b/python/cuvs/cuvs/test/test_cagra.py
index 56e132c23..d3b03a5d0 100644
--- a/python/cuvs/cuvs/test/test_cagra.py
+++ b/python/cuvs/cuvs/test/test_cagra.py
@@ -29,7 +29,7 @@ def run_cagra_build_search_test(
     n_queries=100,
     k=10,
     dtype=np.float32,
-    metric="euclidean",
+    metric="sqeuclidean",
     intermediate_graph_degree=128,
     graph_degree=64,
     build_algo="ivf_pq",
@@ -42,6 +42,8 @@ def run_cagra_build_search_test(
 ):
     dataset = generate_data((n_rows, n_cols), dtype)
     if metric == "inner_product":
+        if dtype in [np.int8, np.uint8]:
+            pytest.skip("skip normalization for int8/uint8 data")
         dataset = normalize(dataset, norm="l2", axis=1)
     dataset_device = device_ndarray(dataset)
 
@@ -122,7 +124,7 @@ def run_cagra_build_search_test(
 @pytest.mark.parametrize("dtype", [np.float32, np.int8, np.uint8])
 @pytest.mark.parametrize("array_type", ["device", "host"])
 @pytest.mark.parametrize("build_algo", ["ivf_pq", "nn_descent"])
-@pytest.mark.parametrize("metric", ["euclidean"])
+@pytest.mark.parametrize("metric", ["sqeuclidean", "inner_product"])
 def test_cagra_dataset_dtype_host_device(
     dtype, array_type, inplace, build_algo, metric
 ):
@@ -145,7 +147,7 @@ def test_cagra_dataset_dtype_host_device(
             "graph_degree": 32,
             "add_data_on_build": True,
             "k": 1,
-            "metric": "euclidean",
+            "metric": "sqeuclidean",
             "build_algo": "ivf_pq",
         },
         {

From 89ebf15150223f4bce4a08bf3a6a4089380a1d0a Mon Sep 17 00:00:00 2001
From: Ben Frederickson <ben@benfrederickson.com>
Date: Thu, 19 Dec 2024 19:34:09 -0800
Subject: [PATCH 14/39] Use nvidia-sphinx-theme for docs (#528)

Authors:
  - Ben Frederickson (https://github.com/benfred)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)
  - James Lamb (https://github.com/jameslamb)

URL: https://github.com/rapidsai/cuvs/pull/528
---
 conda/environments/all_cuda-118_arch-aarch64.yaml | 6 ++++--
 conda/environments/all_cuda-118_arch-x86_64.yaml  | 6 ++++--
 conda/environments/all_cuda-125_arch-aarch64.yaml | 6 ++++--
 conda/environments/all_cuda-125_arch-x86_64.yaml  | 6 ++++--
 dependencies.yaml                                 | 8 +++++---
 docs/source/conf.py                               | 2 +-
 6 files changed, 22 insertions(+), 12 deletions(-)

diff --git a/conda/environments/all_cuda-118_arch-aarch64.yaml b/conda/environments/all_cuda-118_arch-aarch64.yaml
index 50aa3fe7e..a6d98ea3b 100644
--- a/conda/environments/all_cuda-118_arch-aarch64.yaml
+++ b/conda/environments/all_cuda-118_arch-aarch64.yaml
@@ -7,7 +7,6 @@ channels:
 - conda-forge
 - nvidia
 dependencies:
-- breathe
 - c-compiler
 - clang
 - clang-tools=16.0.6
@@ -44,7 +43,6 @@ dependencies:
 - nvcc_linux-aarch64=11.8
 - openblas
 - pre-commit
-- pydata-sphinx-theme
 - pylibraft==25.2.*,>=0.0.0a0
 - pytest-cov
 - pytest==7.*
@@ -55,5 +53,9 @@ dependencies:
 - scikit-learn
 - sphinx-copybutton
 - sphinx-markdown-tables
+- sphinx>=8.0.0
 - sysroot_linux-aarch64==2.17
+- pip:
+  - breathe>=4.35.0
+  - nvidia-sphinx-theme
 name: all_cuda-118_arch-aarch64
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index 8f15b6164..1063e4d6c 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -7,7 +7,6 @@ channels:
 - conda-forge
 - nvidia
 dependencies:
-- breathe
 - c-compiler
 - clang
 - clang-tools=16.0.6
@@ -44,7 +43,6 @@ dependencies:
 - nvcc_linux-64=11.8
 - openblas
 - pre-commit
-- pydata-sphinx-theme
 - pylibraft==25.2.*,>=0.0.0a0
 - pytest-cov
 - pytest==7.*
@@ -55,5 +53,9 @@ dependencies:
 - scikit-learn
 - sphinx-copybutton
 - sphinx-markdown-tables
+- sphinx>=8.0.0
 - sysroot_linux-64==2.17
+- pip:
+  - breathe>=4.35.0
+  - nvidia-sphinx-theme
 name: all_cuda-118_arch-x86_64
diff --git a/conda/environments/all_cuda-125_arch-aarch64.yaml b/conda/environments/all_cuda-125_arch-aarch64.yaml
index f194c01a3..ee7b37695 100644
--- a/conda/environments/all_cuda-125_arch-aarch64.yaml
+++ b/conda/environments/all_cuda-125_arch-aarch64.yaml
@@ -7,7 +7,6 @@ channels:
 - conda-forge
 - nvidia
 dependencies:
-- breathe
 - c-compiler
 - clang
 - clang-tools=16.0.6
@@ -40,7 +39,6 @@ dependencies:
 - numpydoc
 - openblas
 - pre-commit
-- pydata-sphinx-theme
 - pylibraft==25.2.*,>=0.0.0a0
 - pytest-cov
 - pytest==7.*
@@ -51,5 +49,9 @@ dependencies:
 - scikit-learn
 - sphinx-copybutton
 - sphinx-markdown-tables
+- sphinx>=8.0.0
 - sysroot_linux-aarch64==2.17
+- pip:
+  - breathe>=4.35.0
+  - nvidia-sphinx-theme
 name: all_cuda-125_arch-aarch64
diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml
index 912d1629b..7c8e1fd99 100644
--- a/conda/environments/all_cuda-125_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-125_arch-x86_64.yaml
@@ -7,7 +7,6 @@ channels:
 - conda-forge
 - nvidia
 dependencies:
-- breathe
 - c-compiler
 - clang
 - clang-tools=16.0.6
@@ -40,7 +39,6 @@ dependencies:
 - numpydoc
 - openblas
 - pre-commit
-- pydata-sphinx-theme
 - pylibraft==25.2.*,>=0.0.0a0
 - pytest-cov
 - pytest==7.*
@@ -51,5 +49,9 @@ dependencies:
 - scikit-learn
 - sphinx-copybutton
 - sphinx-markdown-tables
+- sphinx>=8.0.0
 - sysroot_linux-64==2.17
+- pip:
+  - breathe>=4.35.0
+  - nvidia-sphinx-theme
 name: all_cuda-125_arch-x86_64
diff --git a/dependencies.yaml b/dependencies.yaml
index eca97d2f5..a73fe7b8f 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -394,22 +394,24 @@ dependencies:
     common:
       - output_types: [conda]
         packages:
-          - breathe
           - doxygen>=1.8.20
           - graphviz
           - ipython
           - numpydoc
-          - pydata-sphinx-theme
           - recommonmark
+          - sphinx>=8.0.0
           - sphinx-copybutton
           - sphinx-markdown-tables
+          - pip:
+            - nvidia-sphinx-theme
+            - breathe>=4.35.0
   rust:
     common:
       - output_types: [conda]
         packages:
           - make
           - rust
-          # clang/liblclang only needed for bindgen support
+          # clang/libclang only needed for bindgen support
           - clang
           - libclang
   build_wheels:
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 0d667833a..c14919568 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -99,7 +99,7 @@
 # a list of builtin themes.
 #
 
-html_theme = "pydata_sphinx_theme"
+html_theme = "nvidia_sphinx_theme"
 
 
 # Theme options are theme-specific and customize the look and feel of a theme

From f48e9aab593232b72f74fd79ad256ed51b997b43 Mon Sep 17 00:00:00 2001
From: Ben Frederickson <ben@benfrederickson.com>
Date: Thu, 19 Dec 2024 19:39:29 -0800
Subject: [PATCH 15/39] Add support for float16 to the python pairwise distance
 api (#547)

Authors:
  - Ben Frederickson (https://github.com/benfred)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)

URL: https://github.com/rapidsai/cuvs/pull/547
---
 cpp/src/distance/pairwise_distance_c.cpp | 13 +++++++++----
 python/cuvs/cuvs/distance/distance.pyx   |  7 +++++--
 python/cuvs/cuvs/test/test_distance.py   | 13 ++++++++++---
 3 files changed, 24 insertions(+), 9 deletions(-)

diff --git a/cpp/src/distance/pairwise_distance_c.cpp b/cpp/src/distance/pairwise_distance_c.cpp
index d457198a2..061adaa2c 100644
--- a/cpp/src/distance/pairwise_distance_c.cpp
+++ b/cpp/src/distance/pairwise_distance_c.cpp
@@ -29,7 +29,7 @@
 
 namespace {
 
-template <typename T>
+template <typename T, typename DistT>
 void _pairwise_distance(cuvsResources_t res,
                         DLManagedTensor* x_tensor,
                         DLManagedTensor* y_tensor,
@@ -40,7 +40,7 @@ void _pairwise_distance(cuvsResources_t res,
   auto res_ptr = reinterpret_cast<raft::resources*>(res);
 
   using mdspan_type           = raft::device_matrix_view<T const, int64_t, raft::row_major>;
-  using distances_mdspan_type = raft::device_matrix_view<T, int64_t, raft::row_major>;
+  using distances_mdspan_type = raft::device_matrix_view<DistT, int64_t, raft::row_major>;
 
   auto x_mds         = cuvs::core::from_dlpack<mdspan_type>(x_tensor);
   auto y_mds         = cuvs::core::from_dlpack<mdspan_type>(y_tensor);
@@ -71,9 +71,14 @@ extern "C" cuvsError_t cuvsPairwiseDistance(cuvsResources_t res,
     }
 
     if (x_dt.bits == 32) {
-      _pairwise_distance<float>(res, x_tensor, y_tensor, distances_tensor, metric, metric_arg);
+      _pairwise_distance<float, float>(
+        res, x_tensor, y_tensor, distances_tensor, metric, metric_arg);
+    } else if (x_dt.bits == 16) {
+      _pairwise_distance<half, float>(
+        res, x_tensor, y_tensor, distances_tensor, metric, metric_arg);
     } else if (x_dt.bits == 64) {
-      _pairwise_distance<double>(res, x_tensor, y_tensor, distances_tensor, metric, metric_arg);
+      _pairwise_distance<double, double>(
+        res, x_tensor, y_tensor, distances_tensor, metric, metric_arg);
     } else {
       RAFT_FAIL("Unsupported DLtensor dtype: %d and bits: %d", x_dt.code, x_dt.bits);
     }
diff --git a/python/cuvs/cuvs/distance/distance.pyx b/python/cuvs/cuvs/distance/distance.pyx
index eb34366e4..187532bfe 100644
--- a/python/cuvs/cuvs/distance/distance.pyx
+++ b/python/cuvs/cuvs/distance/distance.pyx
@@ -100,7 +100,10 @@ def pairwise_distance(X, Y, out=None, metric="euclidean", metric_arg=2.0,
     n = y_cai.shape[0]
 
     if out is None:
-        out = device_ndarray.empty((m, n), dtype=y_cai.dtype)
+        output_dtype = y_cai.dtype
+        if np.issubdtype(y_cai.dtype, np.float16):
+            output_dtype = np.float32
+        out = device_ndarray.empty((m, n), dtype=output_dtype)
     out_cai = wrap_array(out)
 
     x_k = x_cai.shape[1]
@@ -119,7 +122,7 @@ def pairwise_distance(X, Y, out=None, metric="euclidean", metric_arg=2.0,
     y_dt = y_cai.dtype
     d_dt = out_cai.dtype
 
-    if x_dt != y_dt or x_dt != d_dt:
+    if x_dt != y_dt:
         raise ValueError("Inputs must have the same dtypes")
 
     cdef cydlpack.DLManagedTensor* x_dlpack = \
diff --git a/python/cuvs/cuvs/test/test_distance.py b/python/cuvs/cuvs/test/test_distance.py
index 681217fc8..f466c2743 100644
--- a/python/cuvs/cuvs/test/test_distance.py
+++ b/python/cuvs/cuvs/test/test_distance.py
@@ -40,7 +40,7 @@
     ],
 )
 @pytest.mark.parametrize("inplace", [True, False])
-@pytest.mark.parametrize("dtype", [np.float32, np.float64])
+@pytest.mark.parametrize("dtype", [np.float32, np.float64, np.float16])
 def test_distance(n_rows, n_cols, inplace, metric, dtype):
     input1 = np.random.random_sample((n_rows, n_cols))
     input1 = np.asarray(input1).astype(dtype)
@@ -55,7 +55,10 @@ def test_distance(n_rows, n_cols, inplace, metric, dtype):
         norm = np.sum(input1, axis=1)
         input1 = (input1.T / norm).T
 
-    output = np.zeros((n_rows, n_rows), dtype=dtype)
+    output_dtype = dtype
+    if np.issubdtype(dtype, np.float16):
+        output_dtype = np.float32
+    output = np.zeros((n_rows, n_rows), dtype=output_dtype)
 
     if metric == "inner_product":
         expected = np.matmul(input1, input1.T)
@@ -76,4 +79,8 @@ def test_distance(n_rows, n_cols, inplace, metric, dtype):
 
     actual = output_device.copy_to_host()
 
-    assert np.allclose(expected, actual, atol=1e-3, rtol=1e-3)
+    tol = 1e-3
+    if np.issubdtype(dtype, np.float16):
+        tol = 1e-1
+
+    assert np.allclose(expected, actual, atol=tol, rtol=tol)

From ac49c414254cb448efce02d7a7b08190e43584e8 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Mon, 30 Dec 2024 11:44:25 -0800
Subject: [PATCH 16/39] Check if nightlies have succeeded recently enough
 (#548)

Contributes to https://github.com/rapidsai/build-planning/issues/127

This PR cannot be merged unless nightly CI has passed within the past 7 days, so if it remains unmerged that will itself be an indication that nightly CI needs fixing.

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - James Lamb (https://github.com/jameslamb)

URL: https://github.com/rapidsai/cuvs/pull/548
---
 .github/workflows/pr.yaml | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index 4c3b4d06a..91f51bd90 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -12,6 +12,7 @@ concurrency:
 jobs:
   pr-builder:
     needs:
+      - check-nightly-ci
       - changed-files
       - checks
       - conda-cpp-build
@@ -29,6 +30,18 @@ jobs:
     if: always()
     with:
       needs: ${{ toJSON(needs) }}
+  check-nightly-ci:
+    # Switch to ubuntu-latest once it defaults to a version of Ubuntu that
+    # provides at least Python 3.11 (see
+    # https://docs.python.org/3/library/datetime.html#datetime.date.fromisoformat)
+    runs-on: ubuntu-24.04
+    env:
+      RAPIDS_GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+    steps:
+      - name: Check if nightly CI is passing
+        uses: rapidsai/shared-actions/check_nightly_success/dispatch@main
+        with:
+          repo: cuvs
   changed-files:
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@branch-25.02

From a57227310a54b42481e20aaece72d0879f4c5b96 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Mon, 30 Dec 2024 16:09:03 -0800
Subject: [PATCH 17/39] Update for raft logger changes (#540)

This PR updates cuvs to use raft's updated logger implementation, which is built on [rapids-logger](https://github.com/rapidsai/rapids-logger). It is a breaking change because it changes the type of the `verbosity` field of the kmeans `params` struct (which derives from `base_params`) from an `int` to a `raft::level_enum`.

This PR requires https://github.com/rapidsai/raft/pull/2530.

Contributes to https://github.com/rapidsai/build-planning/issues/104

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Corey J. Nolet (https://github.com/cjnolet)

URL: https://github.com/rapidsai/cuvs/pull/540
---
 cpp/CMakeLists.txt                            | 11 ++++---
 cpp/bench/ann/CMakeLists.txt                  | 21 +++++++++----
 cpp/bench/ann/src/common/benchmark.hpp        | 31 +++++++------------
 cpp/include/cuvs/cluster/kmeans.hpp           |  3 +-
 cpp/src/cluster/detail/kmeans.cuh             | 12 +++----
 cpp/src/cluster/detail/kmeans_auto_find_k.cuh |  4 +--
 cpp/src/cluster/detail/kmeans_balanced.cuh    |  4 +--
 cpp/src/cluster/detail/kmeans_common.cuh      |  2 +-
 .../detail/sparse/coo_spmv_kernel.cuh         |  2 ++
 cpp/src/neighbors/detail/ann_utils.cuh        |  2 +-
 cpp/src/neighbors/detail/cagra/add_nodes.cuh  |  2 --
 .../neighbors/detail/cagra/cagra_build.cuh    |  4 +--
 .../detail/cagra/cagra_serialize.cuh          |  4 +--
 .../detail/cagra/compute_distance.hpp         |  2 +-
 .../detail/cagra/search_multi_cta.cuh         |  2 +-
 .../cagra/search_multi_cta_kernel-inl.cuh     |  2 +-
 .../detail/cagra/search_multi_kernel.cuh      |  2 +-
 .../detail/cagra/search_single_cta.cuh        |  2 +-
 .../cagra/search_single_cta_kernel-inl.cuh    |  3 +-
 .../neighbors/detail/dataset_serialize.hpp    |  2 +-
 cpp/src/neighbors/detail/dynamic_batching.cuh |  2 --
 .../neighbors/detail/vamana/vamana_build.cuh  |  4 +--
 .../detail/vamana/vamana_serialize.cuh        |  2 +-
 .../detail/vamana/vamana_structs.cuh          |  2 +-
 cpp/src/neighbors/detail/vpq_dataset.cuh      |  2 +-
 cpp/src/neighbors/ivf_flat/ivf_flat_build.cuh |  3 +-
 .../ivf_flat/ivf_flat_interleaved_scan.cuh    |  2 +-
 .../neighbors/ivf_flat/ivf_flat_search.cuh    |  3 +-
 cpp/src/neighbors/ivf_pq/ivf_pq_build.cuh     |  3 +-
 cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh   |  2 +-
 cpp/src/neighbors/ivf_pq/ivf_pq_search.cuh    |  2 +-
 cpp/src/neighbors/ivf_pq/ivf_pq_serialize.cuh |  2 +-
 cpp/src/neighbors/mg/omp_checks.cpp           |  1 -
 cpp/test/CMakeLists.txt                       |  4 +--
 cpp/test/neighbors/ann_ivf_pq.cuh             |  2 --
 cpp/test/neighbors/ann_utils.cuh              |  2 --
 cpp/test/neighbors/brute_force.cu             |  2 --
 37 files changed, 71 insertions(+), 86 deletions(-)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 79e50c1c1..26c0b82d3 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -486,13 +486,14 @@ if(BUILD_SHARED_LIBS)
                       "$<$<COMPILE_LANGUAGE:CUDA>:${CUVS_CUDA_FLAGS}>"
   )
   target_link_libraries(
-    cuvs_objs PUBLIC raft::raft rmm::rmm rmm::rmm_logger ${CUVS_CTK_MATH_DEPENDENCIES}
-                     $<TARGET_NAME_IF_EXISTS:OpenMP::OpenMP_CXX>
-              PRIVATE rmm::rmm_logger_impl
+    cuvs_objs
+    PUBLIC raft::raft rmm::rmm rmm::rmm_logger ${CUVS_CTK_MATH_DEPENDENCIES}
+           $<TARGET_NAME_IF_EXISTS:OpenMP::OpenMP_CXX>
+    PRIVATE rmm::rmm_logger_impl raft::raft_logger_impl
   )
 
   add_library(cuvs SHARED $<FILTER:$<TARGET_OBJECTS:cuvs_objs>,EXCLUDE,rmm.*logger>)
-  add_library(cuvs_static STATIC  $<FILTER:$<TARGET_OBJECTS:cuvs_objs>,EXCLUDE,rmm.*logger>)
+  add_library(cuvs_static STATIC $<FILTER:$<TARGET_OBJECTS:cuvs_objs>,EXCLUDE,rmm.*logger>)
 
   target_compile_options(
     cuvs INTERFACE $<$<COMPILE_LANG_AND_ID:CUDA,NVIDIA>:--expt-extended-lambda
@@ -704,7 +705,7 @@ target_compile_definitions(cuvs::cuvs INTERFACE $<$<BOOL:${CUVS_NVTX}>:NVTX_ENAB
     target_link_libraries(
       cuvs_c
       PUBLIC cuvs::cuvs ${CUVS_CTK_MATH_DEPENDENCIES}
-      PRIVATE raft::raft rmm::rmm_logger_impl
+      PRIVATE raft::raft rmm::rmm_logger_impl raft::raft_logger_impl
     )
 
     # ensure CUDA symbols aren't relocated to the middle of the debug build binaries
diff --git a/cpp/bench/ann/CMakeLists.txt b/cpp/bench/ann/CMakeLists.txt
index 144cd3048..200b52ab3 100644
--- a/cpp/bench/ann/CMakeLists.txt
+++ b/cpp/bench/ann/CMakeLists.txt
@@ -126,10 +126,11 @@ function(ConfigureAnnBench)
     PRIVATE ${ConfigureAnnBench_LINKS}
             nlohmann_json::nlohmann_json
             Threads::Threads
+            $<TARGET_NAME_IF_EXISTS:raft::raft_logger>
             $<$<BOOL:${GPU_BUILD}>:CUDA::cudart_static>
             $<TARGET_NAME_IF_EXISTS:OpenMP::OpenMP_CXX>
             $<TARGET_NAME_IF_EXISTS:conda_env>
-            $<TARGET_NAME_IF_EXISTS:cuvs_bench_rmm_logger>
+            $<TARGET_NAME_IF_EXISTS:cuvs_bench_logger>
   )
 
   set_target_properties(
@@ -175,9 +176,11 @@ function(ConfigureAnnBench)
   add_dependencies(CUVS_ANN_BENCH_ALL ${BENCH_NAME})
 endfunction()
 
-if(CUVS_FAISS_ENABLE_GPU)
-  add_library(cuvs_bench_rmm_logger OBJECT)
-  target_link_libraries(cuvs_bench_rmm_logger PRIVATE rmm::rmm_logger_impl)
+if(CUVS_FAISS_ENABLE_GPU OR CUVS_ANN_BENCH_SINGLE_EXE)
+  add_library(cuvs_bench_logger OBJECT)
+  target_link_libraries(
+    cuvs_bench_logger PRIVATE rmm::rmm_logger_impl $<TARGET_NAME_IF_EXISTS:raft::raft_logger>
+  )
 endif()
 
 # ##################################################################################################
@@ -303,8 +306,14 @@ if(CUVS_ANN_BENCH_SINGLE_EXE)
 
   target_link_libraries(
     ANN_BENCH
-    PRIVATE raft::raft nlohmann_json::nlohmann_json benchmark::benchmark dl fmt::fmt-header-only
-            spdlog::spdlog_header_only $<$<TARGET_EXISTS:CUDA::nvtx3>:CUDA::nvtx3> rmm::rmm_logger_impl
+    PRIVATE raft::raft
+            nlohmann_json::nlohmann_json
+            benchmark::benchmark
+            dl
+            fmt::fmt-header-only
+            spdlog::spdlog_header_only
+            $<$<TARGET_EXISTS:CUDA::nvtx3>:CUDA::nvtx3>
+            cuvs_bench_logger
   )
   set_target_properties(
     ANN_BENCH
diff --git a/cpp/bench/ann/src/common/benchmark.hpp b/cpp/bench/ann/src/common/benchmark.hpp
index 06e1e27af..49be78673 100644
--- a/cpp/bench/ann/src/common/benchmark.hpp
+++ b/cpp/bench/ann/src/common/benchmark.hpp
@@ -597,18 +597,16 @@ inline auto parse_string_flag(const char* arg, const char* pat, std::string& res
 
 inline auto run_main(int argc, char** argv) -> int
 {
-  bool force_overwrite                = false;
-  bool build_mode                     = false;
-  bool search_mode                    = false;
-  bool no_lap_sync                    = false;
-  std::string data_prefix             = "data";
-  std::string index_prefix            = "index";
-  std::string new_override_kv         = "";
-  std::string mode                    = "latency";
-  std::string threads_arg_txt         = "";
-  std::vector<int> threads            = {1, -1};  // min_thread, max_thread
-  std::string log_level_str           = "";
-  [[maybe_unused]] int raft_log_level = 0;  // raft::logger::get(RAFT_NAME).get_level();
+  bool force_overwrite        = false;
+  bool build_mode             = false;
+  bool search_mode            = false;
+  bool no_lap_sync            = false;
+  std::string data_prefix     = "data";
+  std::string index_prefix    = "index";
+  std::string new_override_kv = "";
+  std::string mode            = "latency";
+  std::string threads_arg_txt = "";
+  std::vector<int> threads    = {1, -1};  // min_thread, max_thread
   kv_series override_kv{};
 
   char arg0_default[] = "benchmark";  // NOLINT
@@ -639,12 +637,7 @@ inline auto run_main(int argc, char** argv) -> int
         parse_string_flag(argv[i], "--index_prefix", index_prefix) ||
         parse_string_flag(argv[i], "--mode", mode) ||
         parse_string_flag(argv[i], "--override_kv", new_override_kv) ||
-        parse_string_flag(argv[i], "--threads", threads_arg_txt) ||
-        parse_string_flag(argv[i], "--raft_log_level", log_level_str)) {
-      if (!log_level_str.empty()) {
-        raft_log_level = std::stoi(log_level_str);
-        log_level_str  = "";
-      }
+        parse_string_flag(argv[i], "--threads", threads_arg_txt)) {
       if (!threads_arg_txt.empty()) {
         auto threads_arg = split(threads_arg_txt, ':');
         threads[0]       = std::stoi(threads_arg[0]);
@@ -673,8 +666,6 @@ inline auto run_main(int argc, char** argv) -> int
     }
   }
 
-  // raft::logger::get(RAFT_NAME).set_level(raft_log_level);
-
   Mode metric_objective = Mode::kLatency;
   if (mode == "throughput") { metric_objective = Mode::kThroughput; }
 
diff --git a/cpp/include/cuvs/cluster/kmeans.hpp b/cpp/include/cuvs/cluster/kmeans.hpp
index 89b3acc24..cb8d36b10 100644
--- a/cpp/include/cuvs/cluster/kmeans.hpp
+++ b/cpp/include/cuvs/cluster/kmeans.hpp
@@ -17,6 +17,7 @@
 #include <cuvs/distance/distance.hpp>
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/host_mdspan.hpp>
+#include <raft/core/logger.hpp>
 #include <raft/core/resources.hpp>
 #include <raft/random/rng_state.hpp>
 
@@ -85,7 +86,7 @@ struct params : base_params {
   /**
    * verbosity level.
    */
-  int verbosity = RAFT_LEVEL_INFO;
+  raft::level_enum verbosity = raft::level_enum::info;
 
   /**
    * Seed to the random number generator.
diff --git a/cpp/src/cluster/detail/kmeans.cuh b/cpp/src/cluster/detail/kmeans.cuh
index 3d054f0fd..e943b8afc 100644
--- a/cpp/src/cluster/detail/kmeans.cuh
+++ b/cpp/src/cluster/detail/kmeans.cuh
@@ -25,7 +25,7 @@
 #include <raft/core/device_mdarray.hpp>
 #include <raft/core/host_mdarray.hpp>
 #include <raft/core/kvp.hpp>
-#include <raft/core/logger-ext.hpp>
+#include <raft/core/logger.hpp>
 #include <raft/core/mdarray.hpp>
 #include <raft/core/operators.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
@@ -56,8 +56,6 @@
 
 namespace cuvs::cluster::kmeans::detail {
 
-// TODO(cjnolet): RAFT_NAME needs to be removed and the raft::logger fixed to not require it
-static const std::string RAFT_NAME = "raft";
 static const std::string CUVS_NAME = "cuvs";
 
 // =========================================================
@@ -373,7 +371,7 @@ void kmeans_fit_main(raft::resources const& handle,
                      rmm::device_uvector<char>& workspace)
 {
   raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> fun_scope("kmeans_fit_main");
-  raft::logger::get(RAFT_NAME).set_level(params.verbosity);
+  raft::default_logger().set_level(params.verbosity);
   cudaStream_t stream = raft::resource::get_cuda_stream(handle);
   auto n_samples      = X.extent(0);
   auto n_features     = X.extent(1);
@@ -879,7 +877,7 @@ void kmeans_fit(raft::resources const& handle,
       pams.n_clusters);
   }
 
-  raft::logger::get(RAFT_NAME).set_level(pams.verbosity);
+  raft::default_logger().set_level(pams.verbosity);
 
   // Allocate memory
   rmm::device_uvector<char> workspace(0, stream);
@@ -1025,7 +1023,7 @@ void kmeans_predict(raft::resources const& handle,
   RAFT_EXPECTS(centroids.extent(1) == n_features,
                "invalid parameter (centroids.extent(1) != n_features)");
 
-  raft::logger::get(RAFT_NAME).set_level(pams.verbosity);
+  raft::default_logger().set_level(pams.verbosity);
   auto metric = pams.metric;
 
   // Allocate memory
@@ -1218,7 +1216,7 @@ void kmeans_transform(raft::resources const& handle,
                       raft::device_matrix_view<DataT> X_new)
 {
   raft::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> fun_scope("kmeans_transform");
-  raft::logger::get(RAFT_NAME).set_level(pams.verbosity);
+  raft::default_logger().set_level(pams.verbosity);
   cudaStream_t stream = raft::resource::get_cuda_stream(handle);
   auto n_samples      = X.extent(0);
   auto n_features     = X.extent(1);
diff --git a/cpp/src/cluster/detail/kmeans_auto_find_k.cuh b/cpp/src/cluster/detail/kmeans_auto_find_k.cuh
index 6441f7ad5..797b33bca 100644
--- a/cpp/src/cluster/detail/kmeans_auto_find_k.cuh
+++ b/cpp/src/cluster/detail/kmeans_auto_find_k.cuh
@@ -21,7 +21,7 @@
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/error.hpp>
 #include <raft/core/host_mdarray.hpp>
-#include <raft/core/logger-ext.hpp>
+#include <raft/core/logger.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/core/resources.hpp>
 #include <raft/stats/dispersion.cuh>
@@ -230,4 +230,4 @@ void find_k(raft::resources const& handle,
       n_iter);
   }
 }
-}  // namespace  cuvs::cluster::kmeans::detail
\ No newline at end of file
+}  // namespace  cuvs::cluster::kmeans::detail
diff --git a/cpp/src/cluster/detail/kmeans_balanced.cuh b/cpp/src/cluster/detail/kmeans_balanced.cuh
index 3f1ad2334..ba4cabbde 100644
--- a/cpp/src/cluster/detail/kmeans_balanced.cuh
+++ b/cpp/src/cluster/detail/kmeans_balanced.cuh
@@ -25,7 +25,8 @@
 
 #include <cuvs/distance/distance.hpp>
 #include <raft/core/cudart_utils.hpp>
-#include <raft/core/logger-ext.hpp>
+#include <raft/core/logger-macros.hpp>
+#include <raft/core/logger.hpp>
 #include <raft/core/operators.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/core/resource/device_memory_resource.hpp>
@@ -59,7 +60,6 @@
 
 namespace cuvs::cluster::kmeans::detail {
 
-static const std::string RAFT_NAME                 = "raft";
 constexpr static inline float kAdjustCentersWeight = 7.0f;
 
 /**
diff --git a/cpp/src/cluster/detail/kmeans_common.cuh b/cpp/src/cluster/detail/kmeans_common.cuh
index eec71b5d2..03db08bd1 100644
--- a/cpp/src/cluster/detail/kmeans_common.cuh
+++ b/cpp/src/cluster/detail/kmeans_common.cuh
@@ -24,7 +24,7 @@
 #include <raft/core/device_mdarray.hpp>
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/kvp.hpp>
-#include <raft/core/logger-ext.hpp>
+#include <raft/core/logger.hpp>
 #include <raft/core/mdarray.hpp>
 #include <raft/core/operators.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
diff --git a/cpp/src/distance/detail/sparse/coo_spmv_kernel.cuh b/cpp/src/distance/detail/sparse/coo_spmv_kernel.cuh
index 1f4b19af4..e44edc68a 100644
--- a/cpp/src/distance/detail/sparse/coo_spmv_kernel.cuh
+++ b/cpp/src/distance/detail/sparse/coo_spmv_kernel.cuh
@@ -16,6 +16,8 @@
 
 #pragma once
 
+#include <raft/core/detail/macros.hpp>
+
 #include <cub/block/block_load.cuh>
 #include <cub/block/block_radix_sort.cuh>
 #include <cub/block/block_store.cuh>
diff --git a/cpp/src/neighbors/detail/ann_utils.cuh b/cpp/src/neighbors/detail/ann_utils.cuh
index 529356351..149eea3f1 100644
--- a/cpp/src/neighbors/detail/ann_utils.cuh
+++ b/cpp/src/neighbors/detail/ann_utils.cuh
@@ -18,7 +18,7 @@
 
 #include <cuvs/distance/distance.hpp>
 #include <raft/common/nvtx.hpp>
-#include <raft/core/logger-ext.hpp>
+#include <raft/core/logger.hpp>
 #include <raft/util/cuda_utils.cuh>
 #include <raft/util/cudart_utils.hpp>
 #include <raft/util/integer_utils.hpp>
diff --git a/cpp/src/neighbors/detail/cagra/add_nodes.cuh b/cpp/src/neighbors/detail/cagra/add_nodes.cuh
index 952039130..358b7643e 100644
--- a/cpp/src/neighbors/detail/cagra/add_nodes.cuh
+++ b/cpp/src/neighbors/detail/cagra/add_nodes.cuh
@@ -31,8 +31,6 @@
 
 namespace cuvs::neighbors::cagra {
 
-static const std::string RAFT_NAME = "raft";
-
 template <class T, class IdxT, class Accessor>
 void add_node_core(
   raft::resources const& handle,
diff --git a/cpp/src/neighbors/detail/cagra/cagra_build.cuh b/cpp/src/neighbors/detail/cagra/cagra_build.cuh
index b7fec724b..340986448 100644
--- a/cpp/src/neighbors/detail/cagra/cagra_build.cuh
+++ b/cpp/src/neighbors/detail/cagra/cagra_build.cuh
@@ -26,7 +26,7 @@
 #include <raft/core/host_device_accessor.hpp>
 #include <raft/core/host_mdarray.hpp>
 #include <raft/core/host_mdspan.hpp>
-#include <raft/core/logger-ext.hpp>
+#include <raft/core/logger.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
 
 #include <cuvs/distance/distance.hpp>
@@ -46,8 +46,6 @@
 
 namespace cuvs::neighbors::cagra::detail {
 
-static const std::string RAFT_NAME = "raft";
-
 template <typename IdxT>
 void write_to_graph(raft::host_matrix_view<IdxT, int64_t, raft::row_major> knn_graph,
                     raft::host_matrix_view<int64_t, int64_t, raft::row_major> neighbors_host_view,
diff --git a/cpp/src/neighbors/detail/cagra/cagra_serialize.cuh b/cpp/src/neighbors/detail/cagra/cagra_serialize.cuh
index 0f6cf852f..c83da7bb1 100644
--- a/cpp/src/neighbors/detail/cagra/cagra_serialize.cuh
+++ b/cpp/src/neighbors/detail/cagra/cagra_serialize.cuh
@@ -18,7 +18,7 @@
 
 #include <cuvs/neighbors/cagra.hpp>
 #include <raft/core/host_mdarray.hpp>
-#include <raft/core/logger-ext.hpp>
+#include <raft/core/logger.hpp>
 #include <raft/core/mdarray.hpp>
 #include <raft/core/mdspan_types.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
@@ -34,8 +34,6 @@
 
 namespace cuvs::neighbors::cagra::detail {
 
-static const std::string RAFT_NAME = "raft";
-
 constexpr int serialization_version = 4;
 
 /**
diff --git a/cpp/src/neighbors/detail/cagra/compute_distance.hpp b/cpp/src/neighbors/detail/cagra/compute_distance.hpp
index 7eb798459..2227e4f9e 100644
--- a/cpp/src/neighbors/detail/cagra/compute_distance.hpp
+++ b/cpp/src/neighbors/detail/cagra/compute_distance.hpp
@@ -22,7 +22,7 @@
 #include <cuvs/distance/distance.hpp>
 #include <cuvs/neighbors/cagra.hpp>
 #include <cuvs/neighbors/common.hpp>
-#include <raft/core/logger-macros.hpp>
+#include <raft/core/logger.hpp>
 #include <raft/core/operators.hpp>
 
 // TODO: This shouldn't be invoking spatial/knn
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta.cuh b/cpp/src/neighbors/detail/cagra/search_multi_cta.cuh
index ecfd856f1..9cb432bcb 100644
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta.cuh
+++ b/cpp/src/neighbors/detail/cagra/search_multi_cta.cuh
@@ -26,7 +26,7 @@
 
 #include <raft/core/detail/macros.hpp>
 #include <raft/core/device_mdspan.hpp>
-#include <raft/core/logger-ext.hpp>
+#include <raft/core/logger.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/core/resource/device_properties.hpp>
 #include <raft/core/resources.hpp>
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh b/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh
index 9fa9d5894..7535ff217 100644
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh
+++ b/cpp/src/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh
@@ -26,7 +26,7 @@
 #include "utils.hpp"
 
 #include <raft/core/device_mdspan.hpp>
-#include <raft/core/logger-ext.hpp>
+#include <raft/core/logger.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/core/resource/device_properties.hpp>
 #include <raft/core/resources.hpp>
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh b/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh
index c6fe21642..469c80a08 100644
--- a/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh
+++ b/cpp/src/neighbors/detail/cagra/search_multi_kernel.cuh
@@ -23,7 +23,7 @@
 #include "utils.hpp"
 
 #include <raft/core/device_mdspan.hpp>
-#include <raft/core/logger-ext.hpp>
+#include <raft/core/logger.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/core/resources.hpp>
 
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta.cuh b/cpp/src/neighbors/detail/cagra/search_single_cta.cuh
index fa71dbaf9..161aa8c4a 100644
--- a/cpp/src/neighbors/detail/cagra/search_single_cta.cuh
+++ b/cpp/src/neighbors/detail/cagra/search_single_cta.cuh
@@ -26,7 +26,7 @@
 #include "utils.hpp"
 
 #include <raft/core/device_mdspan.hpp>
-#include <raft/core/logger-ext.hpp>
+#include <raft/core/logger.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/core/resource/device_properties.hpp>
 #include <raft/core/resources.hpp>
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh
index 678ed0cb4..188862fbb 100644
--- a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh
+++ b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh
@@ -28,7 +28,7 @@
 
 #include <cuvs/distance/distance.hpp>
 #include <raft/core/device_mdspan.hpp>
-#include <raft/core/logger-ext.hpp>
+#include <raft/core/logger.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/core/resource/device_properties.hpp>
 #include <raft/core/resources.hpp>
@@ -64,7 +64,6 @@
 
 namespace cuvs::neighbors::cagra::detail {
 namespace single_cta_search {
-using raft::RAFT_NAME;  // TODO: this is required for RAFT_LOG_XXX messages.
 
 // #define _CLK_BREAKDOWN
 
diff --git a/cpp/src/neighbors/detail/dataset_serialize.hpp b/cpp/src/neighbors/detail/dataset_serialize.hpp
index 0ecc2cf5d..ba3090b59 100644
--- a/cpp/src/neighbors/detail/dataset_serialize.hpp
+++ b/cpp/src/neighbors/detail/dataset_serialize.hpp
@@ -21,7 +21,7 @@
 #include <raft/core/resources.hpp>
 #include <raft/core/serialize.hpp>
 
-#include <raft/core/logger-ext.hpp>
+#include <raft/core/logger.hpp>
 
 #include <cuda_fp16.h>
 
diff --git a/cpp/src/neighbors/detail/dynamic_batching.cuh b/cpp/src/neighbors/detail/dynamic_batching.cuh
index 5c6b1654e..cb8e08ef5 100644
--- a/cpp/src/neighbors/detail/dynamic_batching.cuh
+++ b/cpp/src/neighbors/detail/dynamic_batching.cuh
@@ -50,8 +50,6 @@
 
 namespace cuvs::neighbors::dynamic_batching::detail {
 
-using raft::RAFT_NAME;  // TODO: a workaround for RAFT_LOG_XXX macros
-
 /**
  * A helper to make the requester threads more cooperative when busy-spinning.
  * It is used in the wait loops across this file to reduce the CPU usage.
diff --git a/cpp/src/neighbors/detail/vamana/vamana_build.cuh b/cpp/src/neighbors/detail/vamana/vamana_build.cuh
index da24decb3..ec75c99c1 100644
--- a/cpp/src/neighbors/detail/vamana/vamana_build.cuh
+++ b/cpp/src/neighbors/detail/vamana/vamana_build.cuh
@@ -29,7 +29,7 @@
 #include <raft/core/host_device_accessor.hpp>
 #include <raft/core/host_mdarray.hpp>
 #include <raft/core/host_mdspan.hpp>
-#include <raft/core/logger-ext.hpp>
+#include <raft/core/logger.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/matrix/copy.cuh>
 #include <raft/matrix/init.cuh>
@@ -52,8 +52,6 @@ namespace cuvs::neighbors::experimental::vamana::detail {
  * @{
  */
 
-static const std::string RAFT_NAME = "raft";
-
 static const int blockD    = 32;
 static const int maxBlocks = 10000;
 
diff --git a/cpp/src/neighbors/detail/vamana/vamana_serialize.cuh b/cpp/src/neighbors/detail/vamana/vamana_serialize.cuh
index a554464f6..c360ae19a 100644
--- a/cpp/src/neighbors/detail/vamana/vamana_serialize.cuh
+++ b/cpp/src/neighbors/detail/vamana/vamana_serialize.cuh
@@ -20,7 +20,7 @@
 
 #include <cuvs/neighbors/vamana.hpp>
 #include <raft/core/host_mdarray.hpp>
-#include <raft/core/logger-ext.hpp>
+#include <raft/core/logger.hpp>
 #include <raft/core/mdarray.hpp>
 #include <raft/core/mdspan_types.hpp>
 #include <raft/core/nvtx.hpp>
diff --git a/cpp/src/neighbors/detail/vamana/vamana_structs.cuh b/cpp/src/neighbors/detail/vamana/vamana_structs.cuh
index 86cb4e1f8..f6f0279f7 100644
--- a/cpp/src/neighbors/detail/vamana/vamana_structs.cuh
+++ b/cpp/src/neighbors/detail/vamana/vamana_structs.cuh
@@ -29,7 +29,7 @@
 #include <raft/core/host_device_accessor.hpp>
 #include <raft/core/host_mdarray.hpp>
 #include <raft/core/host_mdspan.hpp>
-#include <raft/core/logger-ext.hpp>
+#include <raft/core/logger.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
 
 #include <cuvs/distance/distance.hpp>
diff --git a/cpp/src/neighbors/detail/vpq_dataset.cuh b/cpp/src/neighbors/detail/vpq_dataset.cuh
index d85bad920..0d7882b4b 100644
--- a/cpp/src/neighbors/detail/vpq_dataset.cuh
+++ b/cpp/src/neighbors/detail/vpq_dataset.cuh
@@ -25,7 +25,7 @@
 
 #include <raft/core/device_mdarray.hpp>
 #include <raft/core/device_mdspan.hpp>
-#include <raft/core/logger-ext.hpp>
+#include <raft/core/logger.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/core/resources.hpp>
 #include <raft/linalg/map.cuh>
diff --git a/cpp/src/neighbors/ivf_flat/ivf_flat_build.cuh b/cpp/src/neighbors/ivf_flat/ivf_flat_build.cuh
index d6ffc1218..f594343c7 100644
--- a/cpp/src/neighbors/ivf_flat/ivf_flat_build.cuh
+++ b/cpp/src/neighbors/ivf_flat/ivf_flat_build.cuh
@@ -27,7 +27,8 @@
 #include "../../cluster/kmeans_balanced.cuh"
 #include "../detail/ann_utils.cuh"
 #include <cuvs/distance/distance.hpp>
-#include <raft/core/logger-ext.hpp>
+#include <raft/core/logger-macros.hpp>
+#include <raft/core/logger.hpp>
 #include <raft/core/mdarray.hpp>
 #include <raft/core/operators.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
diff --git a/cpp/src/neighbors/ivf_flat/ivf_flat_interleaved_scan.cuh b/cpp/src/neighbors/ivf_flat/ivf_flat_interleaved_scan.cuh
index f5a4267cd..79b4f1a18 100644
--- a/cpp/src/neighbors/ivf_flat/ivf_flat_interleaved_scan.cuh
+++ b/cpp/src/neighbors/ivf_flat/ivf_flat_interleaved_scan.cuh
@@ -23,7 +23,7 @@
 
 #include "../detail/ann_utils.cuh"
 #include <cuvs/distance/distance.hpp>
-#include <raft/core/logger-ext.hpp>  // RAFT_LOG_TRACE
+#include <raft/core/logger.hpp>
 #include <raft/core/operators.hpp>
 #include <raft/matrix/detail/select_warpsort.cuh>
 #include <raft/util/cuda_rt_essentials.hpp>  // RAFT_CUDA_TRY
diff --git a/cpp/src/neighbors/ivf_flat/ivf_flat_search.cuh b/cpp/src/neighbors/ivf_flat/ivf_flat_search.cuh
index 032b6a8ff..2df6f4f0e 100644
--- a/cpp/src/neighbors/ivf_flat/ivf_flat_search.cuh
+++ b/cpp/src/neighbors/ivf_flat/ivf_flat_search.cuh
@@ -27,7 +27,8 @@
 #include <cuvs/distance/distance.hpp>   // is_min_close, DistanceType
 #include <cuvs/selection/select_k.hpp>  // cuvs::selection::select_k
 #include <raft/core/error.hpp>
-#include <raft/core/logger-ext.hpp>  // RAFT_LOG_TRACE
+#include <raft/core/logger-macros.hpp>
+#include <raft/core/logger.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/core/resources.hpp>   // raft::resources
 #include <raft/linalg/gemm.cuh>      // raft::linalg::gemm
diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_build.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_build.cuh
index 1d4acea1e..44a1b11fa 100644
--- a/cpp/src/neighbors/ivf_pq/ivf_pq_build.cuh
+++ b/cpp/src/neighbors/ivf_pq/ivf_pq_build.cuh
@@ -30,7 +30,7 @@
 #include "../../cluster/kmeans_balanced.cuh"
 
 #include <raft/core/device_mdarray.hpp>
-#include <raft/core/logger-ext.hpp>
+#include <raft/core/logger.hpp>
 #include <raft/core/mdspan.hpp>
 #include <raft/core/operators.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
@@ -68,7 +68,6 @@
 #include <variant>
 
 namespace cuvs::neighbors::ivf_pq::detail {
-using raft::RAFT_NAME;                       // TODO: this is required for RAFT_LOG_XXX messages.
 using namespace cuvs::spatial::knn::detail;  // NOLINT
 
 using internal_extents_t = int64_t;  // The default mdspan extent type used internally.
diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh
index 5b41e5f3d..1b098ac5c 100644
--- a/cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh
+++ b/cpp/src/neighbors/ivf_pq/ivf_pq_fp_8bit.cuh
@@ -20,7 +20,7 @@
 
 #include <raft/core/cudart_utils.hpp>
 #include <raft/core/device_mdarray.hpp>
-#include <raft/core/logger-ext.hpp>
+#include <raft/core/logger.hpp>
 #include <raft/core/nvtx.hpp>
 #include <raft/core/operators.hpp>
 #include <raft/core/resources.hpp>
diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_search.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_search.cuh
index db8f9fbd3..05bb99353 100644
--- a/cpp/src/neighbors/ivf_pq/ivf_pq_search.cuh
+++ b/cpp/src/neighbors/ivf_pq/ivf_pq_search.cuh
@@ -28,7 +28,7 @@
 #include <cuvs/selection/select_k.hpp>
 #include <raft/core/cudart_utils.hpp>
 #include <raft/core/device_mdarray.hpp>
-#include <raft/core/logger-ext.hpp>
+#include <raft/core/logger.hpp>
 #include <raft/core/operators.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/core/resource/custom_resource.hpp>
diff --git a/cpp/src/neighbors/ivf_pq/ivf_pq_serialize.cuh b/cpp/src/neighbors/ivf_pq/ivf_pq_serialize.cuh
index 5eaebe69d..4af9dbb8e 100644
--- a/cpp/src/neighbors/ivf_pq/ivf_pq_serialize.cuh
+++ b/cpp/src/neighbors/ivf_pq/ivf_pq_serialize.cuh
@@ -21,7 +21,7 @@
 #include <cuvs/neighbors/common.hpp>
 #include <cuvs/neighbors/ivf_pq.hpp>
 #include <raft/core/host_mdarray.hpp>
-#include <raft/core/logger-ext.hpp>
+#include <raft/core/logger.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/core/resources.hpp>
 #include <raft/core/serialize.hpp>
diff --git a/cpp/src/neighbors/mg/omp_checks.cpp b/cpp/src/neighbors/mg/omp_checks.cpp
index e09182dfe..c8cc27414 100644
--- a/cpp/src/neighbors/mg/omp_checks.cpp
+++ b/cpp/src/neighbors/mg/omp_checks.cpp
@@ -18,7 +18,6 @@
 #include <raft/core/logger.hpp>
 
 namespace cuvs::neighbors::mg {
-using raft::RAFT_NAME;
 
 void check_omp_threads(const int requirements)
 {
diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt
index 4d13daaed..cca061455 100644
--- a/cpp/test/CMakeLists.txt
+++ b/cpp/test/CMakeLists.txt
@@ -89,7 +89,7 @@ function(ConfigureTest)
 endfunction()
 
 add_library(test_rmm_logger OBJECT)
-target_link_libraries(test_rmm_logger PRIVATE rmm::rmm_logger_impl)
+target_link_libraries(test_rmm_logger PRIVATE rmm::rmm_logger_impl raft::raft_logger_impl)
 
 # ##################################################################################################
 # test sources ##################################################################################
@@ -236,7 +236,7 @@ if(BUILD_TESTS)
     NAME SPARSE_TEST PATH sparse/cluster/cluster_solvers.cu sparse/cluster/eigen_solvers.cu
     sparse/cluster/spectral.cu GPUS 1 PERCENT 100
   )
-  
+
   ConfigureTest(
     NAME PREPROCESSING_TEST PATH preprocessing/scalar_quantization.cu GPUS 1 PERCENT 100
   )
diff --git a/cpp/test/neighbors/ann_ivf_pq.cuh b/cpp/test/neighbors/ann_ivf_pq.cuh
index 3a92b5e3d..01efd804e 100644
--- a/cpp/test/neighbors/ann_ivf_pq.cuh
+++ b/cpp/test/neighbors/ann_ivf_pq.cuh
@@ -31,8 +31,6 @@
 
 namespace cuvs::neighbors::ivf_pq {
 
-using raft::RAFT_NAME;  // For logging
-
 struct test_ivf_sample_filter {
   static constexpr unsigned offset = 300;
 };
diff --git a/cpp/test/neighbors/ann_utils.cuh b/cpp/test/neighbors/ann_utils.cuh
index 94bccade2..ded8cb5af 100644
--- a/cpp/test/neighbors/ann_utils.cuh
+++ b/cpp/test/neighbors/ann_utils.cuh
@@ -38,8 +38,6 @@
 
 namespace cuvs::neighbors {
 
-using raft::RAFT_NAME;  // For logging
-
 struct print_dtype {
   cudaDataType_t value;
 };
diff --git a/cpp/test/neighbors/brute_force.cu b/cpp/test/neighbors/brute_force.cu
index 8c354baa9..2cefb1098 100644
--- a/cpp/test/neighbors/brute_force.cu
+++ b/cpp/test/neighbors/brute_force.cu
@@ -76,11 +76,9 @@ class KNNTest : public ::testing::TestWithParam<KNNInputs<T>> {
  protected:
   void testBruteForce()
   {
-    // #if (RAFT_ACTIVE_LEVEL >= RAFT_LEVEL_DEBUG)
     raft::print_device_vector("Input array: ", input_.data(), rows_ * cols_, std::cout);
     std::cout << "K: " << k_ << std::endl;
     raft::print_device_vector("Labels array: ", search_labels_.data(), rows_, std::cout);
-    // #endif
 
     auto index = raft::make_device_matrix_view<const T, IdxT, raft::row_major>(
       (const T*)(input_.data()), rows_, cols_);

From 55c5a7f0f9c3e103a33264a913dbd17b059eff78 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Mon, 30 Dec 2024 18:48:13 -0800
Subject: [PATCH 18/39] Get Breathe from conda again (#554)

As part of https://github.com/rapidsai/cuvs/pull/528 cuvs's doc builds were modified to pull Breathe from pip. That was necessary because the nvidia-sphinx-theme requires Sphinx 8 but [the conda-forge Breathe package was not compatible with that Sphinx version](https://github.com/conda-forge/breathe-feedstock/issues/63). I fixed that in https://github.com/conda-forge/breathe-feedstock/pull/64, so now we can go back to using Breathe from conda to avoid mixing pip and conda for dependency management in the same environment.

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cuvs/pull/554
---
 conda/environments/all_cuda-118_arch-aarch64.yaml | 2 +-
 conda/environments/all_cuda-118_arch-x86_64.yaml  | 2 +-
 conda/environments/all_cuda-125_arch-aarch64.yaml | 2 +-
 conda/environments/all_cuda-125_arch-x86_64.yaml  | 2 +-
 dependencies.yaml                                 | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/conda/environments/all_cuda-118_arch-aarch64.yaml b/conda/environments/all_cuda-118_arch-aarch64.yaml
index a6d98ea3b..01853da84 100644
--- a/conda/environments/all_cuda-118_arch-aarch64.yaml
+++ b/conda/environments/all_cuda-118_arch-aarch64.yaml
@@ -7,6 +7,7 @@ channels:
 - conda-forge
 - nvidia
 dependencies:
+- breathe>=4.35.0
 - c-compiler
 - clang
 - clang-tools=16.0.6
@@ -56,6 +57,5 @@ dependencies:
 - sphinx>=8.0.0
 - sysroot_linux-aarch64==2.17
 - pip:
-  - breathe>=4.35.0
   - nvidia-sphinx-theme
 name: all_cuda-118_arch-aarch64
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index 1063e4d6c..a1ad68d7f 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -7,6 +7,7 @@ channels:
 - conda-forge
 - nvidia
 dependencies:
+- breathe>=4.35.0
 - c-compiler
 - clang
 - clang-tools=16.0.6
@@ -56,6 +57,5 @@ dependencies:
 - sphinx>=8.0.0
 - sysroot_linux-64==2.17
 - pip:
-  - breathe>=4.35.0
   - nvidia-sphinx-theme
 name: all_cuda-118_arch-x86_64
diff --git a/conda/environments/all_cuda-125_arch-aarch64.yaml b/conda/environments/all_cuda-125_arch-aarch64.yaml
index ee7b37695..ee0213fff 100644
--- a/conda/environments/all_cuda-125_arch-aarch64.yaml
+++ b/conda/environments/all_cuda-125_arch-aarch64.yaml
@@ -7,6 +7,7 @@ channels:
 - conda-forge
 - nvidia
 dependencies:
+- breathe>=4.35.0
 - c-compiler
 - clang
 - clang-tools=16.0.6
@@ -52,6 +53,5 @@ dependencies:
 - sphinx>=8.0.0
 - sysroot_linux-aarch64==2.17
 - pip:
-  - breathe>=4.35.0
   - nvidia-sphinx-theme
 name: all_cuda-125_arch-aarch64
diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml
index 7c8e1fd99..d93dcaf7a 100644
--- a/conda/environments/all_cuda-125_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-125_arch-x86_64.yaml
@@ -7,6 +7,7 @@ channels:
 - conda-forge
 - nvidia
 dependencies:
+- breathe>=4.35.0
 - c-compiler
 - clang
 - clang-tools=16.0.6
@@ -52,6 +53,5 @@ dependencies:
 - sphinx>=8.0.0
 - sysroot_linux-64==2.17
 - pip:
-  - breathe>=4.35.0
   - nvidia-sphinx-theme
 name: all_cuda-125_arch-x86_64
diff --git a/dependencies.yaml b/dependencies.yaml
index a73fe7b8f..a11e59e31 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -394,6 +394,7 @@ dependencies:
     common:
       - output_types: [conda]
         packages:
+          - breathe>=4.35.0
           - doxygen>=1.8.20
           - graphviz
           - ipython
@@ -404,7 +405,6 @@ dependencies:
           - sphinx-markdown-tables
           - pip:
             - nvidia-sphinx-theme
-            - breathe>=4.35.0
   rust:
     common:
       - output_types: [conda]

From 0e735ea025f8e1e24e8e9b3d3f2ac502711f5387 Mon Sep 17 00:00:00 2001
From: James Lamb <jlamb@nvidia.com>
Date: Tue, 7 Jan 2025 12:32:02 -0600
Subject: [PATCH 19/39] remove setup.cfg files, other packaging cleanup (#544)

Similar to https://github.com/rapidsai/raft/pull/2532, this proposes some small packaging cleanup.

* removes `setup.cfg` files
   - *these are currently being ignored by tools, in favor of identical configuration in `pyproject.toml` and `.flake8` files*
   - e.g. https://github.com/rapidsai/cuvs/blob/b3ce774d39e149d4e34c401068f24136eac44e13/.pre-commit-config.yaml#L31-L35
* alphabetizes dependency lists in `dependencies.yaml`
* changes `cupy:` group in `dependencies.yaml` to `depends_on_cupy:` (for consistency with other dependencies)

Authors:
  - James Lamb (https://github.com/jameslamb)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Micka (https://github.com/lowener)

URL: https://github.com/rapidsai/cuvs/pull/544
---
 .pre-commit-config.yaml |  3 +--
 dependencies.yaml       | 40 +++++++++++++++---------------
 pyproject.toml          |  4 +--
 python/cuvs/setup.cfg   | 39 -----------------------------
 setup.cfg               | 55 -----------------------------------------
 5 files changed, 23 insertions(+), 118 deletions(-)
 delete mode 100644 python/cuvs/setup.cfg
 delete mode 100644 setup.cfg

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 5e53abd92..fcfc7e1fa 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -108,8 +108,7 @@ repos:
                   [.](cmake|cpp|cu|cuh|h|hpp|sh|pxd|py|pyx|rs)$|
                   CMakeLists[.]txt$|
                   CMakeLists_standalone[.]txt$|
-                  meta[.]yaml$|
-                  setup[.]cfg$
+                  meta[.]yaml$
             exclude: |
               (?x)
                   docs/source/sphinxext/github_link\.py|
diff --git a/dependencies.yaml b/dependencies.yaml
index a11e59e31..fbd1d8372 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -7,39 +7,39 @@ files:
       arch: [x86_64, aarch64]
     includes:
       - build
-      - rapids_build
       - build_py_cuvs
+      - build_wheels
+      - checks
       - cuda
       - cuda_version
-      - depends_on_pylibraft
+      - depends_on_cupy
       - depends_on_librmm
+      - depends_on_pylibraft
       - develop
-      - checks
-      - build_wheels
-      - test_libcuvs
       - docs
+      - rapids_build
       - run_py_cuvs
+      - rust
+      - test_libcuvs
       - test_python_common
       - test_py_cuvs
-      - cupy
-      - rust
   bench_ann:
     output: conda
     matrix:
       cuda: ["11.8", "12.5"]
       arch: [x86_64, aarch64]
     includes:
-      - rapids_build
+      - bench
+      - bench_python
       - build_py_cuvs
       - cuda
       - cuda_version
+      - depends_on_cupy
       - depends_on_pylibraft
       - depends_on_librmm
       - develop
-      - bench
-      - bench_python
+      - rapids_build
       - rapids_build_setuptools
-      - cupy
   test_cpp:
     output: none
     includes:
@@ -49,10 +49,10 @@ files:
     output: none
     includes:
       - cuda_version
+      - depends_on_cupy
       - py_version
       - test_python_common
       - test_py_cuvs
-      - cupy
   checks:
     output: none
     includes:
@@ -61,19 +61,19 @@ files:
   docs:
     output: none
     includes:
+      - cuda
       - cuda_version
-      - cupy
+      - depends_on_cupy
       - docs
       - py_version
-      - rust
       - rapids_build
-      - cuda
+      - rust
   rust:
     output: none
     includes:
+      - cuda
       - cuda_version
       - rapids_build
-      - cuda
       - rust
   py_build_cuvs:
     output: pyproject
@@ -89,8 +89,8 @@ files:
       table: tool.rapids-build-backend
       key: requires
     includes:
-      - rapids_build
       - build_py_cuvs
+      - rapids_build
   py_run_cuvs:
     output: pyproject
     pyproject_dir: python/cuvs
@@ -98,8 +98,8 @@ files:
       table: project
     includes:
       - cuda_wheels
-      - run_py_cuvs
       - depends_on_pylibraft
+      - run_py_cuvs
   py_test_cuvs:
     output: pyproject
     pyproject_dir: python/cuvs
@@ -107,9 +107,9 @@ files:
       table: project.optional-dependencies
       key: test
     includes:
+      - depends_on_cupy
       - test_python_common
       - test_py_cuvs
-      - cupy
   py_build_cuvs_bench:
     output: pyproject
     pyproject_dir: python/cuvs_bench
@@ -368,7 +368,7 @@ dependencies:
               - nvidia-cusolver
               - nvidia-cusparse
 
-  cupy:
+  depends_on_cupy:
     common:
       - output_types: conda
         packages:
diff --git a/pyproject.toml b/pyproject.toml
index fbf4cf41f..417514466 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -25,7 +25,7 @@ force-exclude = '''
 # unlike the match option above this match-dir will have no effect when
 # pydocstyle is invoked from pre-commit. Therefore this exclusion list must
 # also be maintained in the pre-commit config file.
-match-dir = "^(?!(ci|cpp|conda|docs)).*$"
+match-dir = "^(?!(ci|cpp|conda|docs|notebooks)).*$"
 select = "D201, D204, D206, D207, D208, D209, D210, D211, D214, D215, D300, D301, D302, D403, D405, D406, D407, D408, D409, D410, D411, D412, D414, D418"
     # Would like to enable the following rules in the future:
     # D200, D202, D205, D400
@@ -42,6 +42,6 @@ follow_imports = "skip"
 skip = "./.git,./.github,./cpp/build,.*egg-info.*,./.mypy_cache,.*_skbuild"
 # ignore short words, and typename parameters like OffsetT
 ignore-regex = "\\b(.{1,4}|[A-Z]\\w*T)\\b"
-ignore-words-list = "inout,numer"
+ignore-words-list = "inout,unparseable,numer"
 builtin = "clear"
 quiet-level = 3
diff --git a/python/cuvs/setup.cfg b/python/cuvs/setup.cfg
deleted file mode 100644
index 57b4954bc..000000000
--- a/python/cuvs/setup.cfg
+++ /dev/null
@@ -1,39 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-
-[isort]
-line_length=79
-multi_line_output=3
-include_trailing_comma=True
-force_grid_wrap=0
-combine_as_imports=True
-order_by_type=True
-known_dask=
-    dask
-    distributed
-    dask_cuda
-known_rapids=
-    cuvs
-    nvtext
-    cudf
-    cuml
-    raft
-    cugraph
-    dask_cudf
-    rmm
-known_first_party=
-    cuvs
-default_section=THIRDPARTY
-sections=FUTURE,STDLIB,THIRDPARTY,DASK,RAPIDS,FIRSTPARTY,LOCALFOLDER
-skip=
-    thirdparty
-    .eggs
-    .git
-    .hg
-    .mypy_cache
-    .tox
-    .venv
-    _build
-    buck-out
-    build
-    dist
-    __init__.py
diff --git a/setup.cfg b/setup.cfg
deleted file mode 100644
index e64641d05..000000000
--- a/setup.cfg
+++ /dev/null
@@ -1,55 +0,0 @@
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.
-
-[flake8]
-filename = *.py, *.pyx, *.pxd, *.pxi
-exclude = __init__.py, *.egg, build, docs, .git
-force-check = True
-ignore =
-    # line break before binary operator
-    W503,
-    # whitespace before :
-    E203
-per-file-ignores =
-    # Rules ignored only in Cython:
-    # E211: whitespace before '(' (used in multi-line imports)
-    # E225: Missing whitespace around operators (breaks cython casting syntax like <int>)
-    # E226: Missing whitespace around arithmetic operators (breaks cython pointer syntax like int*)
-    # E227: Missing whitespace around bitwise or shift operator (Can also break casting syntax)
-    # E275: Missing whitespace after keyword (Doesn't work with Cython except?)
-    # E402: invalid syntax (works for Python, not Cython)
-    # E999: invalid syntax (works for Python, not Cython)
-    # W504: line break after binary operator (breaks lines that end with a pointer)
-    *.pyx: E211, E225, E226, E227, E275, E402, E999, W504
-    *.pxd: E211, E225, E226, E227, E275, E402, E999, W504
-    *.pxi: E211, E225, E226, E227, E275, E402, E999, W504
-
-[pydocstyle]
-# Due to https://github.com/PyCQA/pydocstyle/issues/363, we must exclude rather
-# than include using match-dir. Note that as discussed in
-# https://stackoverflow.com/questions/65478393/how-to-filter-directories-using-the-match-dir-flag-for-pydocstyle,
-# unlike the match option above this match-dir will have no effect when
-# pydocstyle is invoked from pre-commit. Therefore this exclusion list must
-# also be maintained in the pre-commit config file.
-match-dir = ^(?!(ci|cpp|conda|docs|java|notebooks)).*$
-# Allow missing docstrings for docutils
-ignore-decorators = .*(docutils|doc_apply|copy_docstring).*
-select =
-    D201, D204, D206, D207, D208, D209, D210, D211, D214, D215, D300, D301, D302, D403, D405, D406, D407, D408, D409, D410, D411, D412, D414, D418
-    # Would like to enable the following rules in the future:
-    # D200, D202, D205, D400
-
-[mypy]
-ignore_missing_imports = True
-# If we don't specify this, then mypy will check excluded files if
-# they are imported by a checked file.
-follow_imports = skip
-
-[codespell]
-# note: pre-commit passes explicit lists of files here, which this skip file list doesn't override -
-# this is only to allow you to run codespell interactively
-skip = ./.git,./.github,./cpp/build,.*egg-info.*,./.mypy_cache,.*_skbuild
-# ignore short words, and typename parameters like OffsetT
-ignore-regex = \b(.{1,4}|[A-Z]\w*T)\b
-ignore-words-list = inout,unparseable,numer
-builtin = clear
-quiet-level = 3

From e3244123a3021f52f1374fdafedbd8f37546d112 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Tue, 7 Jan 2025 11:55:06 -0800
Subject: [PATCH 20/39] Support raft's logger targets (#557)

rapidsai/raft#2530 added new targets that we need to make global in cuvs's CMake as well.

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cuvs/pull/557
---
 cpp/cmake/thirdparty/get_raft.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/cmake/thirdparty/get_raft.cmake b/cpp/cmake/thirdparty/get_raft.cmake
index 5def74f4b..2e57df84e 100644
--- a/cpp/cmake/thirdparty/get_raft.cmake
+++ b/cpp/cmake/thirdparty/get_raft.cmake
@@ -39,7 +39,7 @@ function(find_and_configure_raft)
     # Invoke CPM find_package()
     #-----------------------------------------------------
     rapids_cpm_find(raft ${PKG_VERSION}
-            GLOBAL_TARGETS      raft::raft
+            GLOBAL_TARGETS      raft::raft raft::raft_logger raft::raft_logger_impl
             BUILD_EXPORT_SET    cuvs-exports
             INSTALL_EXPORT_SET  cuvs-exports
             COMPONENTS          ${RAFT_COMPONENTS}

From 2a10353fcaa1e1da8429267a542f2c5a6e3412b1 Mon Sep 17 00:00:00 2001
From: Ben Frederickson <ben@benfrederickson.com>
Date: Tue, 7 Jan 2025 16:45:22 -0800
Subject: [PATCH 21/39] Change brute_force api to match ivf*/cagra (#536)

This changes the brute_force knn API to match that of ivf-* and cagra by adding `search_params` and `index_params` structures to the relevant calls.

This allows us to use the dynamic batching code on brute_force knn, as well as provide a more standardized API for our users.

Authors:
  - Ben Frederickson (https://github.com/benfred)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)

URL: https://github.com/rapidsai/cuvs/pull/536
---
 cpp/include/cuvs/neighbors/brute_force.hpp    | 139 +++++++++++++-----
 cpp/src/neighbors/brute_force.cu              | 116 ++++++++++-----
 cpp/src/neighbors/brute_force_c.cpp           |  15 +-
 cpp/src/neighbors/dynamic_batching.cu         |   3 +
 cpp/test/CMakeLists.txt                       |   1 +
 cpp/test/neighbors/brute_force.cu             |  34 +++--
 .../dynamic_batching/test_brute_force.cu      |  40 +++++
 7 files changed, 259 insertions(+), 89 deletions(-)
 create mode 100644 cpp/test/neighbors/dynamic_batching/test_brute_force.cu

diff --git a/cpp/include/cuvs/neighbors/brute_force.hpp b/cpp/include/cuvs/neighbors/brute_force.hpp
index d040e03db..8fca9da83 100644
--- a/cpp/include/cuvs/neighbors/brute_force.hpp
+++ b/cpp/include/cuvs/neighbors/brute_force.hpp
@@ -16,7 +16,6 @@
 
 #pragma once
 
-#include "common.hpp"
 #include <cuvs/neighbors/common.hpp>
 #include <raft/core/device_csr_matrix.hpp>
 #include <raft/core/device_mdarray.hpp>
@@ -28,6 +27,10 @@
 
 namespace cuvs::neighbors::brute_force {
 
+struct index_params : cuvs::neighbors::index_params {};
+
+struct search_params : cuvs::neighbors::search_params {};
+
 /**
  * @defgroup bruteforce_cpp_index Bruteforce index
  * @{
@@ -41,6 +44,11 @@ namespace cuvs::neighbors::brute_force {
  */
 template <typename T, typename DistT = T>
 struct index : cuvs::neighbors::index {
+  using index_params_type  = brute_force::index_params;
+  using search_params_type = brute_force::search_params;
+  using index_type         = int64_t;
+  using value_type         = T;
+
  public:
   index(const index&)            = delete;
   index(index&&)                 = default;
@@ -181,20 +189,26 @@ struct index : cuvs::neighbors::index {
  * @code{.cpp}
  *   using namespace cuvs::neighbors;
  *   // create and fill the index from a [N, D] dataset
- *   auto index = brute_force::build(handle, dataset, metric);
+ *   brute_force::index_params index_params;
+ *   auto index = brute_force::build(handle, index_params, dataset);
  * @endcode
  *
  * @param[in] handle
+ * @param[in] index_params parameters such as the distance metric to use
  * @param[in] dataset a device pointer to a row-major matrix [n_rows, dim]
- * @param[in] metric cuvs::distance::DistanceType
- * @param[in] metric_arg metric argument
  *
  * @return the constructed brute-force index
  */
 auto build(raft::resources const& handle,
-           raft::device_matrix_view<const float, int64_t, raft::row_major> dataset,
-           cuvs::distance::DistanceType metric = cuvs::distance::DistanceType::L2Unexpanded,
-           float metric_arg = 0) -> cuvs::neighbors::brute_force::index<float, float>;
+           const cuvs::neighbors::brute_force::index_params& index_params,
+           raft::device_matrix_view<const float, int64_t, raft::row_major> dataset)
+  -> cuvs::neighbors::brute_force::index<float, float>;
+
+[[deprecated]] auto build(
+  raft::resources const& handle,
+  raft::device_matrix_view<const float, int64_t, raft::row_major> dataset,
+  cuvs::distance::DistanceType metric = cuvs::distance::DistanceType::L2Unexpanded,
+  float metric_arg                    = 0) -> cuvs::neighbors::brute_force::index<float, float>;
 /**
  * @brief Build the index from the dataset for efficient search.
  *
@@ -202,62 +216,78 @@ auto build(raft::resources const& handle,
  * @code{.cpp}
  *   using namespace cuvs::neighbors;
  *   // create and fill the index from a [N, D] dataset
- *   auto index = brute_force::build(handle, dataset, metric);
+ *   brute_force::index_params index_params;
+ *   auto index = brute_force::build(handle, index_params, dataset);
  * @endcode
  *
  * @param[in] handle
+ * @param[in] index_params parameters such as the distance metric to use
  * @param[in] dataset a device pointer to a row-major matrix [n_rows, dim]
- * @param[in] metric cuvs::distance::DistanceType
- * @param[in] metric_arg metric argument
  *
- * @return the constructed ivf-flat index
+ * @return the constructed brute force index
  */
 auto build(raft::resources const& handle,
-           raft::device_matrix_view<const half, int64_t, raft::row_major> dataset,
-           cuvs::distance::DistanceType metric = cuvs::distance::DistanceType::L2Unexpanded,
-           float metric_arg = 0) -> cuvs::neighbors::brute_force::index<half, float>;
+           const cuvs::neighbors::brute_force::index_params& index_params,
+           raft::device_matrix_view<const half, int64_t, raft::row_major> dataset)
+  -> cuvs::neighbors::brute_force::index<half, float>;
+
+[[deprecated]] auto build(
+  raft::resources const& handle,
+  raft::device_matrix_view<const half, int64_t, raft::row_major> dataset,
+  cuvs::distance::DistanceType metric = cuvs::distance::DistanceType::L2Unexpanded,
+  float metric_arg                    = 0) -> cuvs::neighbors::brute_force::index<half, float>;
+
 /**
  * @brief Build the index from the dataset for efficient search.
  *
  * Usage example:
  * @code{.cpp}
- *   using namespace cuvs::neighbors;
- *   // create and fill the index from a [N, D] dataset
- *   auto index = brute_force::build(handle, dataset, metric);
+ *   brute_force::index_params index_params;
+ *   auto index = brute_force::build(handle, index_params, dataset);
  * @endcode
  *
  * @param[in] handle
- * @param[in] dataset a device pointer to a col-major matrix [n_rows, dim]
- * @param[in] metric cuvs::distance::DistanceType
- * @param[in] metric_arg metric argument
+ * @param[in] index_params parameters such as the distance metric to use
+ * @param[in] dataset a device pointer to a col-major matrix [n_rows, dim]
  *
- * @return the constructed bruteforce index
+ * @return the constructed brute force index
  */
 auto build(raft::resources const& handle,
-           raft::device_matrix_view<const float, int64_t, raft::col_major> dataset,
-           cuvs::distance::DistanceType metric = cuvs::distance::DistanceType::L2Unexpanded,
-           float metric_arg = 0) -> cuvs::neighbors::brute_force::index<float, float>;
+           const cuvs::neighbors::brute_force::index_params& index_params,
+           raft::device_matrix_view<const float, int64_t, raft::col_major> dataset)
+  -> cuvs::neighbors::brute_force::index<float, float>;
+
+[[deprecated]] auto build(
+  raft::resources const& handle,
+  raft::device_matrix_view<const float, int64_t, raft::col_major> dataset,
+  cuvs::distance::DistanceType metric = cuvs::distance::DistanceType::L2Unexpanded,
+  float metric_arg                    = 0) -> cuvs::neighbors::brute_force::index<float, float>;
+
 /**
  * @brief Build the index from the dataset for efficient search.
  *
  * Usage example:
  * @code{.cpp}
- *   using namespace cuvs::neighbors;
- *   // create and fill the index from a [N, D] dataset
- *   auto index = brute_force::build(handle, dataset, metric);
+ *   brute_force::index_params index_params;
+ *   auto index = brute_force::build(handle, index_params, dataset);
  * @endcode
  *
  * @param[in] handle
- * @param[in] dataset a device pointer to a col-major matrix [n_rows, dim]
- * @param[in] metric cuvs::distance::DistanceType
- * @param[in] metric_arg metric argument
+ * @param[in] index_params parameters such as the distance metric to use
+ * @param[in] dataset a device pointer to a col-major matrix [n_rows, dim]
  *
- * @return the constructed bruteforce index
+ * @return the constructed brute force index
  */
 auto build(raft::resources const& handle,
-           raft::device_matrix_view<const half, int64_t, raft::col_major> dataset,
-           cuvs::distance::DistanceType metric = cuvs::distance::DistanceType::L2Unexpanded,
-           float metric_arg = 0) -> cuvs::neighbors::brute_force::index<half, float>;
+           const cuvs::neighbors::brute_force::index_params& index_params,
+           raft::device_matrix_view<const half, int64_t, raft::col_major> dataset)
+  -> cuvs::neighbors::brute_force::index<half, float>;
+
+[[deprecated]] auto build(
+  raft::resources const& handle,
+  raft::device_matrix_view<const half, int64_t, raft::col_major> dataset,
+  cuvs::distance::DistanceType metric = cuvs::distance::DistanceType::L2Unexpanded,
+  float metric_arg                    = 0) -> cuvs::neighbors::brute_force::index<half, float>;
 /**
  * @}
  */
@@ -286,6 +316,7 @@ auto build(raft::resources const& handle,
  * @endcode
  *
  * @param[in] handle
+ * @param[in] params parameters configuring the search
  * @param[in] index brute-force constructed index
  * @param[in] queries a device pointer to a row-major matrix [n_queries, index->dim()]
  * @param[out] neighbors a device pointer to the indices of the neighbors in the source dataset
@@ -296,6 +327,7 @@ auto build(raft::resources const& handle,
  * `index->size()` bits to indicate whether queries[0] should compute the distance with dataset.
  */
 void search(raft::resources const& handle,
+            const cuvs::neighbors::brute_force::search_params& params,
             const cuvs::neighbors::brute_force::index<float, float>& index,
             raft::device_matrix_view<const float, int64_t, raft::row_major> queries,
             raft::device_matrix_view<int64_t, int64_t, raft::row_major> neighbors,
@@ -303,6 +335,14 @@ void search(raft::resources const& handle,
             const cuvs::neighbors::filtering::base_filter& sample_filter =
               cuvs::neighbors::filtering::none_sample_filter{});
 
+[[deprecated]] void search(raft::resources const& handle,
+                           const cuvs::neighbors::brute_force::index<float, float>& index,
+                           raft::device_matrix_view<const float, int64_t, raft::row_major> queries,
+                           raft::device_matrix_view<int64_t, int64_t, raft::row_major> neighbors,
+                           raft::device_matrix_view<float, int64_t, raft::row_major> distances,
+                           const cuvs::neighbors::filtering::base_filter& sample_filter =
+                             cuvs::neighbors::filtering::none_sample_filter{});
+
 /**
  * @brief Search ANN using the constructed index.
  *
@@ -323,6 +363,7 @@ void search(raft::resources const& handle,
  * @endcode
  *
  * @param[in] handle
+ * @param[in] params parameters configuring the search
  * @param[in] index ivf-flat constructed index
  * @param[in] queries a device pointer to a row-major matrix [n_queries, index->dim()]
  * @param[out] neighbors a device pointer to the indices of the neighbors in the source dataset
@@ -332,18 +373,28 @@ void search(raft::resources const& handle,
  * given
  */
 void search(raft::resources const& handle,
+            const cuvs::neighbors::brute_force::search_params& params,
             const cuvs::neighbors::brute_force::index<half, float>& index,
             raft::device_matrix_view<const half, int64_t, raft::row_major> queries,
             raft::device_matrix_view<int64_t, int64_t, raft::row_major> neighbors,
             raft::device_matrix_view<float, int64_t, raft::row_major> distances,
             const cuvs::neighbors::filtering::base_filter& sample_filter =
               cuvs::neighbors::filtering::none_sample_filter{});
+
+[[deprecated]] void search(raft::resources const& handle,
+                           const cuvs::neighbors::brute_force::index<half, float>& index,
+                           raft::device_matrix_view<const half, int64_t, raft::row_major> queries,
+                           raft::device_matrix_view<int64_t, int64_t, raft::row_major> neighbors,
+                           raft::device_matrix_view<float, int64_t, raft::row_major> distances,
+                           const cuvs::neighbors::filtering::base_filter& sample_filter =
+                             cuvs::neighbors::filtering::none_sample_filter{});
 /**
  * @brief Search ANN using the constructed index.
  *
  * See the [brute_force::build](#brute_force::build) documentation for a usage example.
  *
  * @param[in] handle
+ * @param[in] params parameters configuring the search
  * @param[in] index bruteforce constructed index
  * @param[in] queries a device pointer to a col-major matrix [n_queries, index->dim()]
  * @param[out] neighbors a device pointer to the indices of the neighbors in the source dataset
@@ -353,18 +404,28 @@ void search(raft::resources const& handle,
  * given query
  */
 void search(raft::resources const& handle,
+            const cuvs::neighbors::brute_force::search_params& params,
             const cuvs::neighbors::brute_force::index<float, float>& index,
             raft::device_matrix_view<const float, int64_t, raft::col_major> queries,
             raft::device_matrix_view<int64_t, int64_t, raft::row_major> neighbors,
             raft::device_matrix_view<float, int64_t, raft::row_major> distances,
             const cuvs::neighbors::filtering::base_filter& sample_filter =
               cuvs::neighbors::filtering::none_sample_filter{});
+
+[[deprecated]] void search(raft::resources const& handle,
+                           const cuvs::neighbors::brute_force::index<float, float>& index,
+                           raft::device_matrix_view<const float, int64_t, raft::col_major> queries,
+                           raft::device_matrix_view<int64_t, int64_t, raft::row_major> neighbors,
+                           raft::device_matrix_view<float, int64_t, raft::row_major> distances,
+                           const cuvs::neighbors::filtering::base_filter& sample_filter =
+                             cuvs::neighbors::filtering::none_sample_filter{});
 /**
  * @brief Search ANN using the constructed index.
  *
  * See the [brute_force::build](#brute_force::build) documentation for a usage example.
  *
  * @param[in] handle
+ * @param[in] params parameters configuring the search
  * @param[in] index bruteforce constructed index
  * @param[in] queries a device pointer to a col-major matrix [n_queries, index->dim()]
  * @param[out] neighbors a device pointer to the indices of the neighbors in the source dataset
@@ -374,12 +435,21 @@ void search(raft::resources const& handle,
  * given query
  */
 void search(raft::resources const& handle,
+            const cuvs::neighbors::brute_force::search_params& params,
             const cuvs::neighbors::brute_force::index<half, float>& index,
             raft::device_matrix_view<const half, int64_t, raft::col_major> queries,
             raft::device_matrix_view<int64_t, int64_t, raft::row_major> neighbors,
             raft::device_matrix_view<float, int64_t, raft::row_major> distances,
             const cuvs::neighbors::filtering::base_filter& sample_filter =
               cuvs::neighbors::filtering::none_sample_filter{});
+
+[[deprecated]] void search(raft::resources const& handle,
+                           const cuvs::neighbors::brute_force::index<half, float>& index,
+                           raft::device_matrix_view<const half, int64_t, raft::col_major> queries,
+                           raft::device_matrix_view<int64_t, int64_t, raft::row_major> neighbors,
+                           raft::device_matrix_view<float, int64_t, raft::row_major> distances,
+                           const cuvs::neighbors::filtering::base_filter& sample_filter =
+                             cuvs::neighbors::filtering::none_sample_filter{});
 /**
  * @}
  */
@@ -472,6 +542,7 @@ struct sparse_search_params {
  * @brief Search the sparse bruteforce index for nearest neighbors
  *
  * @param[in] handle
+ * @param[in] params parameters configuring the search
  * @param[in] index Sparse brute-force constructed index
  * @param[in] queries a sparse CSR matrix on the device to query
  * @param[out] neighbors a device pointer to the indices of the neighbors in the source dataset
diff --git a/cpp/src/neighbors/brute_force.cu b/cpp/src/neighbors/brute_force.cu
index d534676e3..a9980a390 100644
--- a/cpp/src/neighbors/brute_force.cu
+++ b/cpp/src/neighbors/brute_force.cu
@@ -160,45 +160,81 @@ void index<T, DistT>::update_dataset(
   dataset_view_ = raft::make_const_mdspan(dataset_.view());
 }
 
-#define CUVS_INST_BFKNN(T, DistT)                                                    \
-  auto build(raft::resources const& res,                                             \
-             raft::device_matrix_view<const T, int64_t, raft::row_major> dataset,    \
-             cuvs::distance::DistanceType metric,                                    \
-             DistT metric_arg)                                                       \
-    ->cuvs::neighbors::brute_force::index<T, DistT>                                  \
-  {                                                                                  \
-    return detail::build<T, DistT>(res, dataset, metric, metric_arg);                \
-  }                                                                                  \
-  auto build(raft::resources const& res,                                             \
-             raft::device_matrix_view<const T, int64_t, raft::col_major> dataset,    \
-             cuvs::distance::DistanceType metric,                                    \
-             DistT metric_arg)                                                       \
-    ->cuvs::neighbors::brute_force::index<T, DistT>                                  \
-  {                                                                                  \
-    return detail::build<T, DistT>(res, dataset, metric, metric_arg);                \
-  }                                                                                  \
-                                                                                     \
-  void search(raft::resources const& res,                                            \
-              const cuvs::neighbors::brute_force::index<T, DistT>& idx,              \
-              raft::device_matrix_view<const T, int64_t, raft::row_major> queries,   \
-              raft::device_matrix_view<int64_t, int64_t, raft::row_major> neighbors, \
-              raft::device_matrix_view<DistT, int64_t, raft::row_major> distances,   \
-              const cuvs::neighbors::filtering::base_filter& sample_filter)          \
-  {                                                                                  \
-    detail::search<T, int64_t, DistT, raft::row_major>(                              \
-      res, idx, queries, neighbors, distances, sample_filter);                       \
-  }                                                                                  \
-  void search(raft::resources const& res,                                            \
-              const cuvs::neighbors::brute_force::index<T, DistT>& idx,              \
-              raft::device_matrix_view<const T, int64_t, raft::col_major> queries,   \
-              raft::device_matrix_view<int64_t, int64_t, raft::row_major> neighbors, \
-              raft::device_matrix_view<DistT, int64_t, raft::row_major> distances,   \
-              const cuvs::neighbors::filtering::base_filter& sample_filter)          \
-  {                                                                                  \
-    detail::search<T, int64_t, DistT, raft::col_major>(                              \
-      res, idx, queries, neighbors, distances, sample_filter);                       \
-  }                                                                                  \
-                                                                                     \
+#define CUVS_INST_BFKNN(T, DistT)                                                               \
+  auto build(raft::resources const& res,                                                        \
+             const cuvs::neighbors::brute_force::index_params& index_params,                    \
+             raft::device_matrix_view<const T, int64_t, raft::row_major> dataset)               \
+    ->cuvs::neighbors::brute_force::index<T, DistT>                                             \
+  {                                                                                             \
+    return detail::build<T, DistT>(res, dataset, index_params.metric, index_params.metric_arg); \
+  }                                                                                             \
+  auto build(raft::resources const& res,                                                        \
+             raft::device_matrix_view<const T, int64_t, raft::row_major> dataset,               \
+             cuvs::distance::DistanceType metric,                                               \
+             DistT metric_arg)                                                                  \
+    ->cuvs::neighbors::brute_force::index<T, DistT>                                             \
+  {                                                                                             \
+    return detail::build<T, DistT>(res, dataset, metric, metric_arg);                           \
+  }                                                                                             \
+  auto build(raft::resources const& res,                                                        \
+             const cuvs::neighbors::brute_force::index_params& index_params,                    \
+             raft::device_matrix_view<const T, int64_t, raft::col_major> dataset)               \
+    ->cuvs::neighbors::brute_force::index<T, DistT>                                             \
+  {                                                                                             \
+    return detail::build<T, DistT>(res, dataset, index_params.metric, index_params.metric_arg); \
+  }                                                                                             \
+  auto build(raft::resources const& res,                                                        \
+             raft::device_matrix_view<const T, int64_t, raft::col_major> dataset,               \
+             cuvs::distance::DistanceType metric,                                               \
+             DistT metric_arg)                                                                  \
+    ->cuvs::neighbors::brute_force::index<T, DistT>                                             \
+  {                                                                                             \
+    return detail::build<T, DistT>(res, dataset, metric, metric_arg);                           \
+  }                                                                                             \
+                                                                                                \
+  void search(raft::resources const& res,                                                       \
+              const cuvs::neighbors::brute_force::search_params& params,                        \
+              const cuvs::neighbors::brute_force::index<T, DistT>& idx,                         \
+              raft::device_matrix_view<const T, int64_t, raft::row_major> queries,              \
+              raft::device_matrix_view<int64_t, int64_t, raft::row_major> neighbors,            \
+              raft::device_matrix_view<DistT, int64_t, raft::row_major> distances,              \
+              const cuvs::neighbors::filtering::base_filter& sample_filter)                     \
+  {                                                                                             \
+    detail::search<T, int64_t, DistT, raft::row_major>(                                         \
+      res, idx, queries, neighbors, distances, sample_filter);                                  \
+  }                                                                                             \
+  void search(raft::resources const& res,                                                       \
+              const cuvs::neighbors::brute_force::index<T, DistT>& idx,                         \
+              raft::device_matrix_view<const T, int64_t, raft::row_major> queries,              \
+              raft::device_matrix_view<int64_t, int64_t, raft::row_major> neighbors,            \
+              raft::device_matrix_view<DistT, int64_t, raft::row_major> distances,              \
+              const cuvs::neighbors::filtering::base_filter& sample_filter)                     \
+  {                                                                                             \
+    detail::search<T, int64_t, DistT, raft::row_major>(                                         \
+      res, idx, queries, neighbors, distances, sample_filter);                                  \
+  }                                                                                             \
+  void search(raft::resources const& res,                                                       \
+              const cuvs::neighbors::brute_force::search_params& params,                        \
+              const cuvs::neighbors::brute_force::index<T, DistT>& idx,                         \
+              raft::device_matrix_view<const T, int64_t, raft::col_major> queries,              \
+              raft::device_matrix_view<int64_t, int64_t, raft::row_major> neighbors,            \
+              raft::device_matrix_view<DistT, int64_t, raft::row_major> distances,              \
+              const cuvs::neighbors::filtering::base_filter& sample_filter)                     \
+  {                                                                                             \
+    detail::search<T, int64_t, DistT, raft::col_major>(                                         \
+      res, idx, queries, neighbors, distances, sample_filter);                                  \
+  }                                                                                             \
+  void search(raft::resources const& res,                                                       \
+              const cuvs::neighbors::brute_force::index<T, DistT>& idx,                         \
+              raft::device_matrix_view<const T, int64_t, raft::col_major> queries,              \
+              raft::device_matrix_view<int64_t, int64_t, raft::row_major> neighbors,            \
+              raft::device_matrix_view<DistT, int64_t, raft::row_major> distances,              \
+              const cuvs::neighbors::filtering::base_filter& sample_filter)                     \
+  {                                                                                             \
+    detail::search<T, int64_t, DistT, raft::col_major>(                                         \
+      res, idx, queries, neighbors, distances, sample_filter);                                  \
+  }                                                                                             \
+                                                                                                \
   template struct cuvs::neighbors::brute_force::index<T, DistT>;
 
 CUVS_INST_BFKNN(float, float);
@@ -206,4 +242,4 @@ CUVS_INST_BFKNN(half, float);
 
 #undef CUVS_INST_BFKNN
 
-}  // namespace cuvs::neighbors::brute_force
\ No newline at end of file
+}  // namespace cuvs::neighbors::brute_force
diff --git a/cpp/src/neighbors/brute_force_c.cpp b/cpp/src/neighbors/brute_force_c.cpp
index f1a8c995d..2b8980863 100644
--- a/cpp/src/neighbors/brute_force_c.cpp
+++ b/cpp/src/neighbors/brute_force_c.cpp
@@ -44,10 +44,12 @@ void* _build(cuvsResources_t res,
   using mdspan_type = raft::device_matrix_view<T const, int64_t, raft::row_major>;
   auto mds          = cuvs::core::from_dlpack<mdspan_type>(dataset_tensor);
 
-  auto index_on_stack = cuvs::neighbors::brute_force::build(
-    *res_ptr, mds, static_cast<cuvs::distance::DistanceType>((int)metric), metric_arg);
-  auto index_on_heap = new cuvs::neighbors::brute_force::index<T>(std::move(index_on_stack));
+  cuvs::neighbors::brute_force::index_params params;
+  params.metric     = metric;
+  params.metric_arg = metric_arg;
 
+  auto index_on_stack = cuvs::neighbors::brute_force::build(*res_ptr, params, mds);
+  auto index_on_heap  = new cuvs::neighbors::brute_force::index<T>(std::move(index_on_stack));
   return index_on_heap;
 }
 
@@ -72,8 +74,11 @@ void _search(cuvsResources_t res,
   auto neighbors_mds = cuvs::core::from_dlpack<neighbors_mdspan_type>(neighbors_tensor);
   auto distances_mds = cuvs::core::from_dlpack<distances_mdspan_type>(distances_tensor);
 
+  cuvs::neighbors::brute_force::search_params params;
+
   if (prefilter.type == NO_FILTER) {
     cuvs::neighbors::brute_force::search(*res_ptr,
+                                         params,
                                          *index_ptr,
                                          queries_mds,
                                          neighbors_mds,
@@ -87,7 +92,7 @@ void _search(cuvsResources_t res,
                          queries_mds.extent(0),
                          index_ptr->dataset().extent(0)));
     cuvs::neighbors::brute_force::search(
-      *res_ptr, *index_ptr, queries_mds, neighbors_mds, distances_mds, prefilter_view);
+      *res_ptr, params, *index_ptr, queries_mds, neighbors_mds, distances_mds, prefilter_view);
   } else {
     RAFT_FAIL("Unsupported prefilter type: BITSET");
   }
@@ -226,4 +231,4 @@ extern "C" cuvsError_t cuvsBruteForceSerialize(cuvsResources_t res,
       RAFT_FAIL("Unsupported index dtype: %d and bits: %d", index->dtype.code, index->dtype.bits);
     }
   });
-}
\ No newline at end of file
+}
diff --git a/cpp/src/neighbors/dynamic_batching.cu b/cpp/src/neighbors/dynamic_batching.cu
index 6be70353b..84c8a2cf1 100644
--- a/cpp/src/neighbors/dynamic_batching.cu
+++ b/cpp/src/neighbors/dynamic_batching.cu
@@ -16,6 +16,7 @@
 
 #include "detail/dynamic_batching.cuh"
 
+#include <cuvs/neighbors/brute_force.hpp>
 #include <cuvs/neighbors/cagra.hpp>
 #include <cuvs/neighbors/ivf_flat.hpp>
 #include <cuvs/neighbors/ivf_pq.hpp>
@@ -53,6 +54,8 @@ namespace cuvs::neighbors::dynamic_batching {
     return index.runner->search(res, params, queries, neighbors, distances);       \
   }
 
+CUVS_INST_DYNAMIC_BATCHING_INDEX(float, int64_t, cuvs::neighbors::brute_force, index<float, float>);
+
 CUVS_INST_DYNAMIC_BATCHING_INDEX(float, uint32_t, cuvs::neighbors::cagra, index<float, uint32_t>);
 CUVS_INST_DYNAMIC_BATCHING_INDEX(half, uint32_t, cuvs::neighbors::cagra, index<half, uint32_t>);
 CUVS_INST_DYNAMIC_BATCHING_INDEX(int8_t, uint32_t, cuvs::neighbors::cagra, index<int8_t, uint32_t>);
diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt
index cca061455..9aa596a6e 100644
--- a/cpp/test/CMakeLists.txt
+++ b/cpp/test/CMakeLists.txt
@@ -183,6 +183,7 @@ if(BUILD_TESTS)
     NAME
     NEIGHBORS_DYNAMIC_BATCHING_TEST
     PATH
+    neighbors/dynamic_batching/test_brute_force.cu
     neighbors/dynamic_batching/test_cagra.cu
     neighbors/dynamic_batching/test_ivf_flat.cu
     neighbors/dynamic_batching/test_ivf_pq.cu
diff --git a/cpp/test/neighbors/brute_force.cu b/cpp/test/neighbors/brute_force.cu
index 2cefb1098..b1c819a26 100644
--- a/cpp/test/neighbors/brute_force.cu
+++ b/cpp/test/neighbors/brute_force.cu
@@ -89,10 +89,18 @@ class KNNTest : public ::testing::TestWithParam<KNNInputs<T>> {
     auto distances =
       raft::make_device_matrix_view<DistT, IdxT, raft::row_major>(distances_.data(), rows_, k_);
 
-    auto metric = cuvs::distance::DistanceType::L2Unexpanded;
-    auto idx    = cuvs::neighbors::brute_force::build(handle, index, metric);
-    cuvs::neighbors::brute_force::search(
-      handle, idx, search, indices, distances, cuvs::neighbors::filtering::none_sample_filter{});
+    cuvs::neighbors::brute_force::index_params index_params;
+    index_params.metric = cuvs::distance::DistanceType::L2Unexpanded;
+
+    auto idx = cuvs::neighbors::brute_force::build(handle, index_params, index);
+    cuvs::neighbors::brute_force::search_params search_params;
+    cuvs::neighbors::brute_force::search(handle,
+                                         search_params,
+                                         idx,
+                                         search,
+                                         indices,
+                                         distances,
+                                         cuvs::neighbors::filtering::none_sample_filter{});
 
     build_actual_output<<<raft::ceildiv(rows_ * k_, 32), 32, 0, stream>>>(
       actual_labels_.data(), rows_, k_, search_labels_.data(), indices_.data());
@@ -385,16 +393,22 @@ class RandomBruteForceKNNTest : public ::testing::TestWithParam<RandomKNNInputs>
     auto distances = raft::make_device_matrix_view<DistT, int64_t, raft::row_major>(
       cuvs_distances_.data(), params_.num_queries, params_.k);
 
+    cuvs::neighbors::brute_force::index_params index_params;
+    index_params.metric     = metric;
+    index_params.metric_arg = metric_arg;
+
+    cuvs::neighbors::brute_force::search_params search_params;
+
     if (params_.row_major) {
       auto idx =
         cuvs::neighbors::brute_force::build(handle_,
+                                            index_params,
                                             raft::make_device_matrix_view<const T, int64_t>(
-                                              database.data(), params_.num_db_vecs, params_.dim),
-                                            metric,
-                                            metric_arg);
+                                              database.data(), params_.num_db_vecs, params_.dim));
 
       cuvs::neighbors::brute_force::search(
         handle_,
+        search_params,
         idx,
         raft::make_device_matrix_view<const T, int64_t>(
           search_queries.data(), params_.num_queries, params_.dim),
@@ -404,13 +418,13 @@ class RandomBruteForceKNNTest : public ::testing::TestWithParam<RandomKNNInputs>
     } else {
       auto idx = cuvs::neighbors::brute_force::build(
         handle_,
+        index_params,
         raft::make_device_matrix_view<const T, int64_t, raft::col_major>(
-          database.data(), params_.num_db_vecs, params_.dim),
-        metric,
-        metric_arg);
+          database.data(), params_.num_db_vecs, params_.dim));
 
       cuvs::neighbors::brute_force::search(
         handle_,
+        search_params,
         idx,
         raft::make_device_matrix_view<const T, int64_t, raft::col_major>(
           search_queries.data(), params_.num_queries, params_.dim),
diff --git a/cpp/test/neighbors/dynamic_batching/test_brute_force.cu b/cpp/test/neighbors/dynamic_batching/test_brute_force.cu
new file mode 100644
index 000000000..11f468374
--- /dev/null
+++ b/cpp/test/neighbors/dynamic_batching/test_brute_force.cu
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+
+#include "../dynamic_batching.cuh"
+
+#include <cuvs/neighbors/brute_force.hpp>
+
+namespace cuvs::neighbors::dynamic_batching {
+// Dynamic-batching test fixture specialized for brute_force::index<float, float>.
+using brute_force_float32 = dynamic_batching_test<float,
+                                                  int64_t,
+                                                  brute_force::index<float, float>,
+                                                  brute_force::build,
+                                                  brute_force::search>;
+
+TEST_P(brute_force_float32, defaults)
+{
+  build_all();
+  search_all();
+  check_neighbors();
+}
+
+INSTANTIATE_TEST_CASE_P(dynamic_batching, brute_force_float32, ::testing::ValuesIn(inputs));
+
+}  // namespace cuvs::neighbors::dynamic_batching

From 1e548f8c3a773452ce69556f4db72fc712efae02 Mon Sep 17 00:00:00 2001
From: Ben Frederickson <ben@benfrederickson.com>
Date: Fri, 10 Jan 2025 07:38:42 -0800
Subject: [PATCH 22/39] Allow brute_force::build to work on host matrix dataset
 (#562)

Closes #538

Authors:
  - Ben Frederickson (https://github.com/benfred)

Approvers:
  - Micka (https://github.com/lowener)

URL: https://github.com/rapidsai/cuvs/pull/562
---
 cpp/include/cuvs/neighbors/brute_force.hpp   |  28 ++++
 cpp/src/neighbors/brute_force.cu             |   7 +
 cpp/src/neighbors/detail/knn_brute_force.cuh |  22 ++-
 cpp/test/neighbors/brute_force.cu            | 142 ++++++++++++-------
 4 files changed, 140 insertions(+), 59 deletions(-)

diff --git a/cpp/include/cuvs/neighbors/brute_force.hpp b/cpp/include/cuvs/neighbors/brute_force.hpp
index 8fca9da83..99581469f 100644
--- a/cpp/include/cuvs/neighbors/brute_force.hpp
+++ b/cpp/include/cuvs/neighbors/brute_force.hpp
@@ -204,6 +204,20 @@ auto build(raft::resources const& handle,
            raft::device_matrix_view<const float, int64_t, raft::row_major> dataset)
   -> cuvs::neighbors::brute_force::index<float, float>;
 
+/**
+ * @brief Build a brute-force index from a dataset that resides in host memory.
+ *
+ * @param[in] handle raft resources used for the build
+ * @param[in] index_params parameters such as the distance metric (and its argument) to use
+ * @param[in] dataset a row-major host matrix view of the dataset, [n_rows, dim]
+ *
+ * @return the constructed brute-force index
+ */
+auto build(raft::resources const& handle,
+           const cuvs::neighbors::brute_force::index_params& index_params,
+           raft::host_matrix_view<const float, int64_t, raft::row_major> dataset)
+  -> cuvs::neighbors::brute_force::index<float, float>;
+
 [[deprecated]] auto build(
   raft::resources const& handle,
   raft::device_matrix_view<const float, int64_t, raft::row_major> dataset,
@@ -231,6 +245,20 @@ auto build(raft::resources const& handle,
            raft::device_matrix_view<const half, int64_t, raft::row_major> dataset)
   -> cuvs::neighbors::brute_force::index<half, float>;
 
+/**
+ * @brief Build a brute-force index from a dataset that resides in host memory.
+ *
+ * @param[in] handle raft resources used for the build
+ * @param[in] index_params parameters such as the distance metric (and its argument) to use
+ * @param[in] dataset a row-major host matrix view of the dataset, [n_rows, dim]
+ *
+ * @return the constructed brute-force index
+ */
+auto build(raft::resources const& handle,
+           const cuvs::neighbors::brute_force::index_params& index_params,
+           raft::host_matrix_view<const half, int64_t, raft::row_major> dataset)
+  -> cuvs::neighbors::brute_force::index<half, float>;
+
 [[deprecated]] auto build(
   raft::resources const& handle,
   raft::device_matrix_view<const half, int64_t, raft::row_major> dataset,
diff --git a/cpp/src/neighbors/brute_force.cu b/cpp/src/neighbors/brute_force.cu
index a9980a390..d54a75879 100644
--- a/cpp/src/neighbors/brute_force.cu
+++ b/cpp/src/neighbors/brute_force.cu
@@ -168,6 +168,13 @@ void index<T, DistT>::update_dataset(
   {                                                                                             \
     return detail::build<T, DistT>(res, dataset, index_params.metric, index_params.metric_arg); \
   }                                                                                             \
+  auto build(raft::resources const& res,                                                        \
+             const cuvs::neighbors::brute_force::index_params& index_params,                    \
+             raft::host_matrix_view<const T, int64_t, raft::row_major> dataset)                 \
+    ->cuvs::neighbors::brute_force::index<T, DistT>                                             \
+  {                                                                                             \
+    return detail::build<T, DistT>(res, dataset, index_params.metric, index_params.metric_arg); \
+  }                                                                                             \
   auto build(raft::resources const& res,                                                        \
              raft::device_matrix_view<const T, int64_t, raft::row_major> dataset,               \
              cuvs::distance::DistanceType metric,                                               \
diff --git a/cpp/src/neighbors/detail/knn_brute_force.cuh b/cpp/src/neighbors/detail/knn_brute_force.cuh
index e5eeecbc9..f1976e002 100644
--- a/cpp/src/neighbors/detail/knn_brute_force.cuh
+++ b/cpp/src/neighbors/detail/knn_brute_force.cuh
@@ -28,6 +28,7 @@
 #include "./knn_utils.cuh"
 
 #include <raft/core/bitmap.cuh>
+#include <raft/core/copy.cuh>
 #include <raft/core/device_csr_matrix.hpp>
 #include <raft/core/host_mdspan.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
@@ -750,10 +751,10 @@ void search(raft::resources const& res,
   }
 }
 
-template <typename T, typename DistT, typename LayoutT = raft::row_major>
+template <typename T, typename DistT, typename AccessorT, typename LayoutT = raft::row_major>
 cuvs::neighbors::brute_force::index<T, DistT> build(
   raft::resources const& res,
-  raft::device_matrix_view<const T, int64_t, LayoutT> dataset,
+  mdspan<const T, matrix_extent<int64_t>, LayoutT, AccessorT> dataset,
   cuvs::distance::DistanceType metric,
   DistT metric_arg)
 {
@@ -764,18 +765,31 @@ cuvs::neighbors::brute_force::index<T, DistT> build(
   if (metric == cuvs::distance::DistanceType::L2Expanded ||
       metric == cuvs::distance::DistanceType::L2SqrtExpanded ||
       metric == cuvs::distance::DistanceType::CosineExpanded) {
+    auto dataset_storage = std::optional<device_matrix<T, int64_t, LayoutT>>{};
+    auto dataset_view    = [&res, &dataset_storage, dataset]() {
+      if constexpr (std::is_same_v<decltype(dataset),
+                                   raft::device_matrix_view<const T, int64_t, LayoutT>>) {
+        return dataset;  // already a device view (any layout): compute norms in place, no copy
+      } else {  // e.g. a host view: stage a device copy that outlives this lambda
+        dataset_storage =
+          make_device_matrix<T, int64_t, LayoutT>(res, dataset.extent(0), dataset.extent(1));
+        raft::copy(res, dataset_storage->view(), dataset);
+        return raft::make_const_mdspan(dataset_storage->view());
+      }
+    }();
+
     norms = raft::make_device_vector<DistT, int64_t>(res, dataset.extent(0));
     // cosine needs the l2norm, where as l2 distances needs the squared norm
     if (metric == cuvs::distance::DistanceType::CosineExpanded) {
       raft::linalg::norm(res,
-                         dataset,
+                         dataset_view,
                          norms->view(),
                          raft::linalg::NormType::L2Norm,
                          raft::linalg::Apply::ALONG_ROWS,
                          raft::sqrt_op{});
     } else {
       raft::linalg::norm(res,
-                         dataset,
+                         dataset_view,
                          norms->view(),
                          raft::linalg::NormType::L2Norm,
                          raft::linalg::Apply::ALONG_ROWS);
diff --git a/cpp/test/neighbors/brute_force.cu b/cpp/test/neighbors/brute_force.cu
index b1c819a26..a9ad4bf1c 100644
--- a/cpp/test/neighbors/brute_force.cu
+++ b/cpp/test/neighbors/brute_force.cu
@@ -21,6 +21,7 @@
 #include <cuvs/selection/select_k.hpp>
 
 #include <cuvs/neighbors/brute_force.hpp>
+#include <raft/core/host_mdarray.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/linalg/transpose.cuh>
 #include <raft/matrix/init.cuh>
@@ -210,6 +211,7 @@ struct RandomKNNInputs {
   int k;
   cuvs::distance::DistanceType metric;
   bool row_major;
+  bool host_dataset;
 };
 
 std::ostream& operator<<(std::ostream& os, const RandomKNNInputs& input)
@@ -217,7 +219,7 @@ std::ostream& operator<<(std::ostream& os, const RandomKNNInputs& input)
   return os << "num_queries:" << input.num_queries << " num_vecs:" << input.num_db_vecs
             << " dim:" << input.dim << " k:" << input.k
             << " metric:" << cuvs::neighbors::print_metric{input.metric}
-            << " row_major:" << input.row_major;
+            << " row_major:" << input.row_major << " host_dataset:" << input.host_dataset;
 }
 
 template <typename T, typename DistT = T>
@@ -399,12 +401,15 @@ class RandomBruteForceKNNTest : public ::testing::TestWithParam<RandomKNNInputs>
 
     cuvs::neighbors::brute_force::search_params search_params;
 
-    if (params_.row_major) {
-      auto idx =
-        cuvs::neighbors::brute_force::build(handle_,
-                                            index_params,
-                                            raft::make_device_matrix_view<const T, int64_t>(
-                                              database.data(), params_.num_db_vecs, params_.dim));
+    if (params_.host_dataset) {
+      // test building from a dataset in host memory
+      auto host_database =
+        raft::make_host_matrix<T, int64_t, raft::row_major>(params_.num_db_vecs, params_.dim);
+      raft::copy(
+        host_database.data_handle(), database.data(), params_.num_db_vecs * params_.dim, stream_);
+
+      auto idx = cuvs::neighbors::brute_force::build(
+        handle_, index_params, raft::make_const_mdspan(host_database.view()));
 
       cuvs::neighbors::brute_force::search(
         handle_,
@@ -416,21 +421,39 @@ class RandomBruteForceKNNTest : public ::testing::TestWithParam<RandomKNNInputs>
         distances,
         cuvs::neighbors::filtering::none_sample_filter{});
     } else {
-      auto idx = cuvs::neighbors::brute_force::build(
-        handle_,
-        index_params,
-        raft::make_device_matrix_view<const T, int64_t, raft::col_major>(
-          database.data(), params_.num_db_vecs, params_.dim));
+      if (params_.row_major) {
+        auto idx =
+          cuvs::neighbors::brute_force::build(handle_,
+                                              index_params,
+                                              raft::make_device_matrix_view<const T, int64_t>(
+                                                database.data(), params_.num_db_vecs, params_.dim));
 
-      cuvs::neighbors::brute_force::search(
-        handle_,
-        search_params,
-        idx,
-        raft::make_device_matrix_view<const T, int64_t, raft::col_major>(
-          search_queries.data(), params_.num_queries, params_.dim),
-        indices,
-        distances,
-        cuvs::neighbors::filtering::none_sample_filter{});
+        cuvs::neighbors::brute_force::search(
+          handle_,
+          search_params,
+          idx,
+          raft::make_device_matrix_view<const T, int64_t>(
+            search_queries.data(), params_.num_queries, params_.dim),
+          indices,
+          distances,
+          cuvs::neighbors::filtering::none_sample_filter{});
+      } else {
+        auto idx = cuvs::neighbors::brute_force::build(
+          handle_,
+          index_params,
+          raft::make_device_matrix_view<const T, int64_t, raft::col_major>(
+            database.data(), params_.num_db_vecs, params_.dim));
+
+        cuvs::neighbors::brute_force::search(
+          handle_,
+          search_params,
+          idx,
+          raft::make_device_matrix_view<const T, int64_t, raft::col_major>(
+            search_queries.data(), params_.num_queries, params_.dim),
+          indices,
+          distances,
+          cuvs::neighbors::filtering::none_sample_filter{});
+      }
     }
 
     ASSERT_TRUE(cuvs::neighbors::devArrMatchKnnPair(ref_indices_.data(),
@@ -480,42 +503,51 @@ class RandomBruteForceKNNTest : public ::testing::TestWithParam<RandomKNNInputs>
 
 const std::vector<RandomKNNInputs> random_inputs = {
   // test each distance metric on a small-ish input, with row-major inputs
-  {100, 256, 2, 65, cuvs::distance::DistanceType::L2Expanded, true},
-  {256, 512, 16, 8, cuvs::distance::DistanceType::L2Unexpanded, true},
-  {256, 512, 16, 8, cuvs::distance::DistanceType::L2SqrtExpanded, true},
-  {256, 512, 16, 8, cuvs::distance::DistanceType::L2SqrtUnexpanded, true},
-  {256, 512, 16, 8, cuvs::distance::DistanceType::L1, true},
-  {256, 512, 16, 8, cuvs::distance::DistanceType::Linf, true},
-  {256, 512, 16, 8, cuvs::distance::DistanceType::InnerProduct, true},
-  {256, 512, 16, 8, cuvs::distance::DistanceType::CorrelationExpanded, true},
-  {256, 512, 16, 8, cuvs::distance::DistanceType::CosineExpanded, true},
-  {256, 512, 16, 8, cuvs::distance::DistanceType::LpUnexpanded, true},
-  {256, 512, 16, 8, cuvs::distance::DistanceType::JensenShannon, true},
-  {256, 512, 16, 8, cuvs::distance::DistanceType::L2SqrtExpanded, true},
-  {256, 512, 16, 8, cuvs::distance::DistanceType::Canberra, true},
+  {100, 256, 2, 65, cuvs::distance::DistanceType::L2Expanded, true, false},
+  {256, 512, 16, 8, cuvs::distance::DistanceType::L2Unexpanded, true, false},
+  {256, 512, 16, 8, cuvs::distance::DistanceType::L2SqrtExpanded, true, false},
+  {256, 512, 16, 8, cuvs::distance::DistanceType::L2SqrtUnexpanded, true, false},
+  {256, 512, 16, 8, cuvs::distance::DistanceType::L1, true, false},
+  {256, 512, 16, 8, cuvs::distance::DistanceType::Linf, true, false},
+  {256, 512, 16, 8, cuvs::distance::DistanceType::InnerProduct, true, false},
+  {256, 512, 16, 8, cuvs::distance::DistanceType::CorrelationExpanded, true, false},
+  {256, 512, 16, 8, cuvs::distance::DistanceType::CosineExpanded, true, false},
+  {256, 512, 16, 8, cuvs::distance::DistanceType::LpUnexpanded, true, false},
+  {256, 512, 16, 8, cuvs::distance::DistanceType::JensenShannon, true, false},
+  {256, 512, 16, 8, cuvs::distance::DistanceType::L2SqrtExpanded, true, false},
+  {256, 512, 16, 8, cuvs::distance::DistanceType::Canberra, true, false},
   // test each distance metric with col-major inputs
-  {256, 512, 16, 7, cuvs::distance::DistanceType::L2Expanded, false},
-  {256, 512, 16, 8, cuvs::distance::DistanceType::L2Unexpanded, false},
-  {256, 512, 16, 8, cuvs::distance::DistanceType::L2SqrtExpanded, false},
-  {256, 512, 16, 8, cuvs::distance::DistanceType::L2SqrtUnexpanded, false},
-  {256, 512, 16, 8, cuvs::distance::DistanceType::L1, false},
-  {256, 512, 16, 8, cuvs::distance::DistanceType::Linf, false},
-  {256, 512, 16, 8, cuvs::distance::DistanceType::InnerProduct, false},
-  {256, 512, 16, 8, cuvs::distance::DistanceType::CorrelationExpanded, false},
-  {256, 512, 16, 8, cuvs::distance::DistanceType::CosineExpanded, false},
-  {256, 512, 16, 8, cuvs::distance::DistanceType::LpUnexpanded, false},
-  {256, 512, 16, 8, cuvs::distance::DistanceType::JensenShannon, false},
-  {256, 512, 16, 8, cuvs::distance::DistanceType::L2SqrtExpanded, false},
-  {256, 512, 16, 8, cuvs::distance::DistanceType::Canberra, false},
+  {256, 512, 16, 7, cuvs::distance::DistanceType::L2Expanded, false, false},
+  {256, 512, 16, 8, cuvs::distance::DistanceType::L2Unexpanded, false, false},
+  {256, 512, 16, 8, cuvs::distance::DistanceType::L2SqrtExpanded, false, false},
+  {256, 512, 16, 8, cuvs::distance::DistanceType::L2SqrtUnexpanded, false, false},
+  {256, 512, 16, 8, cuvs::distance::DistanceType::L1, false, false},
+  {256, 512, 16, 8, cuvs::distance::DistanceType::Linf, false, false},
+  {256, 512, 16, 8, cuvs::distance::DistanceType::InnerProduct, false, false},
+  {256, 512, 16, 8, cuvs::distance::DistanceType::CorrelationExpanded, false, false},
+  {256, 512, 16, 8, cuvs::distance::DistanceType::CosineExpanded, false, false},
+  {256, 512, 16, 8, cuvs::distance::DistanceType::LpUnexpanded, false, false},
+  {256, 512, 16, 8, cuvs::distance::DistanceType::JensenShannon, false, false},
+  {256, 512, 16, 8, cuvs::distance::DistanceType::L2SqrtExpanded, false, false},
+  {256, 512, 16, 8, cuvs::distance::DistanceType::Canberra, false, false},
   // larger tests on different sized data / k values
-  {10000, 40000, 32, 30, cuvs::distance::DistanceType::L2Expanded, false},
-  {345, 1023, 16, 128, cuvs::distance::DistanceType::CosineExpanded, true},
-  {789, 20516, 64, 256, cuvs::distance::DistanceType::L2SqrtExpanded, false},
-  {1000, 200000, 128, 128, cuvs::distance::DistanceType::L2Expanded, true},
-  {1000, 200000, 128, 128, cuvs::distance::DistanceType::L2Expanded, false},
-  {1000, 5000, 128, 128, cuvs::distance::DistanceType::LpUnexpanded, true},
-  {1000, 5000, 128, 128, cuvs::distance::DistanceType::L2SqrtExpanded, false},
-  {1000, 5000, 128, 128, cuvs::distance::DistanceType::InnerProduct, false}};
+  {10000, 40000, 32, 30, cuvs::distance::DistanceType::L2Expanded, false, false},
+  {345, 1023, 16, 128, cuvs::distance::DistanceType::CosineExpanded, true, false},
+  {789, 20516, 64, 256, cuvs::distance::DistanceType::L2SqrtExpanded, false, false},
+  {1000, 200000, 128, 128, cuvs::distance::DistanceType::L2Expanded, true, false},
+  {1000, 200000, 128, 128, cuvs::distance::DistanceType::L2Expanded, false, false},
+  {1000, 5000, 128, 128, cuvs::distance::DistanceType::LpUnexpanded, true, false},
+  {1000, 5000, 128, 128, cuvs::distance::DistanceType::L2SqrtExpanded, false, false},
+  {1000, 5000, 128, 128, cuvs::distance::DistanceType::InnerProduct, false, false},
+  // test with datasets on host memory
+  {256, 512, 16, 8, cuvs::distance::DistanceType::L2Expanded, true, true},
+  {256, 512, 32, 16, cuvs::distance::DistanceType::L2Unexpanded, true, true},
+  {256, 512, 8, 8, cuvs::distance::DistanceType::L2SqrtExpanded, true, true},
+  {256, 128, 32, 8, cuvs::distance::DistanceType::L2SqrtUnexpanded, true, true},
+  {256, 512, 16, 8, cuvs::distance::DistanceType::L1, true, true},
+  {256, 512, 16, 8, cuvs::distance::DistanceType::Linf, true, true},
+  {256, 512, 16, 8, cuvs::distance::DistanceType::InnerProduct, true, true},
+  {256, 512, 16, 7, cuvs::distance::DistanceType::L2Expanded, true, true}};
 
 typedef RandomBruteForceKNNTest<float, float> RandomBruteForceKNNTestF;
 TEST_P(RandomBruteForceKNNTestF, BruteForce) { this->testBruteForce(); }

From 28d9990821e26b9bef7b452d9f797bec2972a92e Mon Sep 17 00:00:00 2001
From: Micka <ide.mickael@gmail.com>
Date: Mon, 13 Jan 2025 20:38:03 +0100
Subject: [PATCH 23/39] Add support for refinement with `uint32_t` index type
 (#563)

Closes #537.
Needed change for the transition from Raft to cuVS.

Authors:
  - Micka (https://github.com/lowener)

Approvers:
  - Ben Frederickson (https://github.com/benfred)

URL: https://github.com/rapidsai/cuvs/pull/563
---
 cpp/include/cuvs/neighbors/refine.hpp         | 45 +++++++++++++++++++
 cpp/src/neighbors/ivf_flat_index.cpp          |  1 +
 .../detail/refine_device_float_float.cu       |  1 +
 cpp/src/neighbors/refine/refine_device.cuh    | 13 +++---
 4 files changed, 54 insertions(+), 6 deletions(-)

diff --git a/cpp/include/cuvs/neighbors/refine.hpp b/cpp/include/cuvs/neighbors/refine.hpp
index 19fbd30bb..5e60ff537 100644
--- a/cpp/include/cuvs/neighbors/refine.hpp
+++ b/cpp/include/cuvs/neighbors/refine.hpp
@@ -76,6 +76,51 @@ void refine(raft::resources const& handle,
             raft::device_matrix_view<float, int64_t, raft::row_major> distances,
             cuvs::distance::DistanceType metric = cuvs::distance::DistanceType::L2Unexpanded);
 
+/**
+ * @brief Refine nearest neighbor search.
+ *
+ * Refinement is an operation that follows an approximate NN search. The approximate search has
+ * already selected n_candidates neighbor candidates for each query. We narrow it down to k
+ * neighbors. For each query, we calculate the exact distance between the query and its
+ * n_candidates neighbor candidate, and select the k nearest ones.
+ *
+ * The k nearest neighbors and distances are returned.
+ *
+ * Example usage
+ * @code{.cpp}
+ *   using namespace cuvs::neighbors;
+ *   // use default index parameters
+ *   ivf_pq::index_params index_params;
+ *   // create and fill the index from a [N, D] dataset
+ *   auto index = ivf_pq::build(handle, index_params, dataset);
+ *   // use default search parameters
+ *   ivf_pq::search_params search_params;
+ *   // search m = 4 * k nearest neighbours for each of the N queries
+ *   ivf_pq::search(handle, search_params, index, queries, neighbor_candidates,
+ *                  out_dists_tmp);
+ *   // refine it to the k nearest one
+ *   refine(handle, dataset, queries, neighbor_candidates, out_indices, out_dists,
+ *           index.metric());
+ * @endcode
+ *
+ *
+ * @param[in] handle the raft handle
+ * @param[in] dataset device matrix that stores the dataset [n_rows, dims]
+ * @param[in] queries device matrix of the queries [n_queris, dims]
+ * @param[in] neighbor_candidates indices of candidate vectors [n_queries, n_candidates], where
+ *   n_candidates >= k
+ * @param[out] indices device matrix that stores the refined indices [n_queries, k]
+ * @param[out] distances device matrix that stores the refined distances [n_queries, k]
+ * @param[in] metric distance metric to use. Euclidean (L2) is used by default
+ */
+void refine(raft::resources const& handle,
+            raft::device_matrix_view<const float, int64_t, raft::row_major> dataset,
+            raft::device_matrix_view<const float, int64_t, raft::row_major> queries,
+            raft::device_matrix_view<const uint32_t, int64_t, raft::row_major> neighbor_candidates,
+            raft::device_matrix_view<uint32_t, int64_t, raft::row_major> indices,
+            raft::device_matrix_view<float, int64_t, raft::row_major> distances,
+            cuvs::distance::DistanceType metric = cuvs::distance::DistanceType::L2Unexpanded);
+
 /**
  * @brief Refine nearest neighbor search.
  *
diff --git a/cpp/src/neighbors/ivf_flat_index.cpp b/cpp/src/neighbors/ivf_flat_index.cpp
index 6f7d11e50..c16dc47aa 100644
--- a/cpp/src/neighbors/ivf_flat_index.cpp
+++ b/cpp/src/neighbors/ivf_flat_index.cpp
@@ -226,6 +226,7 @@ void index<T, IdxT>::check_consistency()
     "inconsistent number of lists (clusters)");
 }
 
+template struct index<float, uint32_t>;  // Used for refine function
 template struct index<float, int64_t>;
 template struct index<half, int64_t>;
 template struct index<int8_t, int64_t>;
diff --git a/cpp/src/neighbors/refine/detail/refine_device_float_float.cu b/cpp/src/neighbors/refine/detail/refine_device_float_float.cu
index 25bad201b..76b792d1c 100644
--- a/cpp/src/neighbors/refine/detail/refine_device_float_float.cu
+++ b/cpp/src/neighbors/refine/detail/refine_device_float_float.cu
@@ -43,5 +43,6 @@
   }
 
 instantiate_cuvs_neighbors_refine_d(int64_t, float, float, int64_t);
+instantiate_cuvs_neighbors_refine_d(uint32_t, float, float, int64_t);
 
 #undef instantiate_cuvs_neighbors_refine_d
diff --git a/cpp/src/neighbors/refine/refine_device.cuh b/cpp/src/neighbors/refine/refine_device.cuh
index 6184e540b..a5491be0d 100644
--- a/cpp/src/neighbors/refine/refine_device.cuh
+++ b/cpp/src/neighbors/refine/refine_device.cuh
@@ -84,12 +84,13 @@ void refine_device(
   cuvs::neighbors::ivf_flat::index<data_t, idx_t> refinement_index(
     handle, cuvs::distance::DistanceType(metric), n_queries, false, true, dim);
 
-  cuvs::neighbors::ivf_flat::detail::fill_refinement_index(handle,
-                                                           &refinement_index,
-                                                           dataset.data_handle(),
-                                                           neighbor_candidates.data_handle(),
-                                                           n_queries,
-                                                           n_candidates);
+  cuvs::neighbors::ivf_flat::detail::fill_refinement_index<data_t, idx_t>(
+    handle,
+    &refinement_index,
+    dataset.data_handle(),
+    neighbor_candidates.data_handle(),
+    static_cast<idx_t>(n_queries),
+    static_cast<uint32_t>(n_candidates));
   uint32_t grid_dim_x = 1;
 
   // the neighbor ids will be computed in uint32_t as offset

From 898ccfb588cb3013cd4de90fccefd406668e7dce Mon Sep 17 00:00:00 2001
From: Ben Frederickson <ben@benfrederickson.com>
Date: Wed, 15 Jan 2025 08:46:50 -0800
Subject: [PATCH 24/39] Expose col-major pairwise distances to python (#572)

Authors:
  - Ben Frederickson (https://github.com/benfred)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)

URL: https://github.com/rapidsai/cuvs/pull/572
---
 cpp/include/cuvs/core/detail/interop.hpp | 33 ++++++++++-
 cpp/include/cuvs/core/interop.hpp        | 18 +++++-
 cpp/src/distance/pairwise_distance_c.cpp | 73 +++++++++++++++++++-----
 python/cuvs/cuvs/common/cydlpack.pyx     | 13 ++++-
 python/cuvs/cuvs/distance/distance.pyx   |  4 +-
 python/cuvs/cuvs/test/test_distance.py   |  7 ++-
 6 files changed, 128 insertions(+), 20 deletions(-)

diff --git a/cpp/include/cuvs/core/detail/interop.hpp b/cpp/include/cuvs/core/detail/interop.hpp
index 2ed0b330d..19e4a922c 100644
--- a/cpp/include/cuvs/core/detail/interop.hpp
+++ b/cpp/include/cuvs/core/detail/interop.hpp
@@ -86,7 +86,6 @@ inline MdspanType from_dlpack(DLManagedTensor* managed_tensor)
   RAFT_EXPECTS(to_data_type.lanes == tensor.dtype.lanes,
                "lanes mismatch between return mdspan and DLTensor");
   RAFT_EXPECTS(tensor.dtype.lanes == 1, "More than 1 DLTensor lanes not supported");
-  RAFT_EXPECTS(tensor.strides == nullptr, "Strided memory layout for DLTensor not supported");
 
   auto to_device = accessor_type_to_DLDevice<typename MdspanType::accessor_type>();
   if (to_device.device_type == kDLCUDA) {
@@ -110,4 +109,36 @@ inline MdspanType from_dlpack(DLManagedTensor* managed_tensor)
   return MdspanType{reinterpret_cast<typename MdspanType::data_handle_type>(tensor.data), exts};
 }
 
+inline bool is_f_contiguous(DLManagedTensor* managed_tensor)
+{
+  auto tensor = managed_tensor->dl_tensor;
+
+  if (!tensor.strides) { return false; }
+  int64_t expected_stride = 1;
+  for (int64_t i = 0; i < tensor.ndim; ++i) {
+    if (tensor.strides[i] != expected_stride) { return false; }
+    expected_stride *= tensor.shape[i];
+  }
+
+  return true;
+}
+
+inline bool is_c_contiguous(DLManagedTensor* managed_tensor)
+{
+  auto tensor = managed_tensor->dl_tensor;
+
+  if (!tensor.strides) {
+    // no stride information indicates a row-major tensor according to the dlpack spec
+    return true;
+  }
+
+  int64_t expected_stride = 1;
+  for (int64_t i = tensor.ndim - 1; i >= 0; --i) {
+    if (tensor.strides[i] != expected_stride) { return false; }
+    expected_stride *= tensor.shape[i];
+  }
+
+  return true;
+}
+
 }  // namespace cuvs::core::detail
diff --git a/cpp/include/cuvs/core/interop.hpp b/cpp/include/cuvs/core/interop.hpp
index 2462f02ec..096885f2f 100644
--- a/cpp/include/cuvs/core/interop.hpp
+++ b/cpp/include/cuvs/core/interop.hpp
@@ -51,9 +51,25 @@ inline bool is_dlpack_host_compatible(DLTensor tensor)
   return detail::is_dlpack_host_compatible(tensor);
 }
 
+/**
+ * @brief Check if DLManagedTensor has a row-major (c-contiguous) layout
+ *
+ * @param tensor DLManagedTensor object to check
+ * @return bool
+ */
+inline bool is_c_contiguous(DLManagedTensor* tensor) { return detail::is_c_contiguous(tensor); }
+
+/**
+ * @brief Check if DLManagedTensor has a col-major (f-contiguous) layout
+ *
+ * @param tensor DLManagedTensor object to check
+ * @return bool
+ */
+inline bool is_f_contiguous(DLManagedTensor* tensor) { return detail::is_f_contiguous(tensor); }
+
 /**
  * @brief Convert a DLManagedTensor to an mdspan
- * NOTE: This function only supports compact row-major layouts.
+ * NOTE: This function only supports compact row-major and col-major layouts.
  *
  * @code {.cpp}
  * #include <raft/core/device_mdspan.hpp>
diff --git a/cpp/src/distance/pairwise_distance_c.cpp b/cpp/src/distance/pairwise_distance_c.cpp
index 061adaa2c..5344a554c 100644
--- a/cpp/src/distance/pairwise_distance_c.cpp
+++ b/cpp/src/distance/pairwise_distance_c.cpp
@@ -29,7 +29,7 @@
 
 namespace {
 
-template <typename T, typename DistT>
+template <typename T, typename DistT, typename LayoutT = raft::row_major>
 void _pairwise_distance(cuvsResources_t res,
                         DLManagedTensor* x_tensor,
                         DLManagedTensor* y_tensor,
@@ -39,8 +39,8 @@ void _pairwise_distance(cuvsResources_t res,
 {
   auto res_ptr = reinterpret_cast<raft::resources*>(res);
 
-  using mdspan_type           = raft::device_matrix_view<T const, int64_t, raft::row_major>;
-  using distances_mdspan_type = raft::device_matrix_view<DistT, int64_t, raft::row_major>;
+  using mdspan_type           = raft::device_matrix_view<T const, int64_t, LayoutT>;
+  using distances_mdspan_type = raft::device_matrix_view<DistT, int64_t, LayoutT>;
 
   auto x_mds         = cuvs::core::from_dlpack<mdspan_type>(x_tensor);
   auto y_mds         = cuvs::core::from_dlpack<mdspan_type>(y_tensor);
@@ -70,17 +70,64 @@ extern "C" cuvsError_t cuvsPairwiseDistance(cuvsResources_t res,
       RAFT_FAIL("Inputs to cuvsPairwiseDistance must all have the same dtype");
     }
 
-    if (x_dt.bits == 32) {
-      _pairwise_distance<float, float>(
-        res, x_tensor, y_tensor, distances_tensor, metric, metric_arg);
-    } else if (x_dt.bits == 16) {
-      _pairwise_distance<half, float>(
-        res, x_tensor, y_tensor, distances_tensor, metric, metric_arg);
-    } else if (x_dt.bits == 64) {
-      _pairwise_distance<double, double>(
-        res, x_tensor, y_tensor, distances_tensor, metric, metric_arg);
+    bool x_row_major;
+    if (cuvs::core::is_c_contiguous(x_tensor)) {
+      x_row_major = true;
+    } else if (cuvs::core::is_f_contiguous(x_tensor)) {
+      x_row_major = false;
     } else {
-      RAFT_FAIL("Unsupported DLtensor dtype: %d and bits: %d", x_dt.code, x_dt.bits);
+      RAFT_FAIL("X input to cuvsPairwiseDistance must be contiguous (non-strided)");
+    }
+
+    bool y_row_major;
+    if (cuvs::core::is_c_contiguous(y_tensor)) {
+      y_row_major = true;
+    } else if (cuvs::core::is_f_contiguous(y_tensor)) {
+      y_row_major = false;
+    } else {
+      RAFT_FAIL("Y input to cuvsPairwiseDistance must be contiguous (non-strided)");
+    }
+
+    bool distances_row_major;
+    if (cuvs::core::is_c_contiguous(distances_tensor)) {
+      distances_row_major = true;
+    } else if (cuvs::core::is_f_contiguous(distances_tensor)) {
+      distances_row_major = false;
+    } else {
+      RAFT_FAIL("distances input to cuvsPairwiseDistance must be contiguous (non-strided)");
+    }
+
+    if ((x_row_major != y_row_major) || (x_row_major != distances_row_major)) {
+      RAFT_FAIL(
+        "Inputs to cuvsPairwiseDistance must all have the same layout (row-major or col-major");
+    }
+
+    if (x_row_major) {
+      if (x_dt.bits == 32) {
+        _pairwise_distance<float, float>(
+          res, x_tensor, y_tensor, distances_tensor, metric, metric_arg);
+      } else if (x_dt.bits == 16) {
+        _pairwise_distance<half, float>(
+          res, x_tensor, y_tensor, distances_tensor, metric, metric_arg);
+      } else if (x_dt.bits == 64) {
+        _pairwise_distance<double, double>(
+          res, x_tensor, y_tensor, distances_tensor, metric, metric_arg);
+      } else {
+        RAFT_FAIL("Unsupported DLtensor dtype: %d and bits: %d", x_dt.code, x_dt.bits);
+      }
+    } else {
+      if (x_dt.bits == 32) {
+        _pairwise_distance<float, float, raft::col_major>(
+          res, x_tensor, y_tensor, distances_tensor, metric, metric_arg);
+      } else if (x_dt.bits == 16) {
+        _pairwise_distance<half, float, raft::col_major>(
+          res, x_tensor, y_tensor, distances_tensor, metric, metric_arg);
+      } else if (x_dt.bits == 64) {
+        _pairwise_distance<double, double, raft::col_major>(
+          res, x_tensor, y_tensor, distances_tensor, metric, metric_arg);
+      } else {
+        RAFT_FAIL("Unsupported DLtensor dtype: %d and bits: %d", x_dt.code, x_dt.bits);
+      }
     }
   });
 }
diff --git a/python/cuvs/cuvs/common/cydlpack.pyx b/python/cuvs/cuvs/common/cydlpack.pyx
index 79f88cddc..bee8d9afa 100644
--- a/python/cuvs/cuvs/common/cydlpack.pyx
+++ b/python/cuvs/cuvs/common/cydlpack.pyx
@@ -25,6 +25,8 @@ cdef void deleter(DLManagedTensor* tensor) noexcept:
     if tensor.manager_ctx is NULL:
         return
     stdlib.free(tensor.dl_tensor.shape)
+    if tensor.dl_tensor.strides is not NULL:
+        stdlib.free(tensor.dl_tensor.strides)
     tensor.manager_ctx = NULL
     stdlib.free(tensor)
 
@@ -95,11 +97,20 @@ cdef DLManagedTensor* dlpack_c(ary):
     tensor.data = <void*> tensor_ptr
     tensor.device = dev
     tensor.dtype = dtype
-    tensor.strides = NULL
     tensor.ndim = ndim
     tensor.shape = shape
     tensor.byte_offset = 0
 
+    if ary.c_contiguous:
+        tensor.strides = NULL
+    elif ary.f_contiguous:
+        tensor.strides = <int64_t*>stdlib.malloc(ndim * sizeof(int64_t))
+        tensor.strides[0] = 1
+        for i in range(1, ndim):
+            tensor.strides[i] = tensor.strides[i-1] * tensor.shape[i-1]
+    else:
+        raise ValueError("Input data must be contiguous")
+
     dlm.dl_tensor = tensor
     dlm.manager_ctx = NULL
     dlm.deleter = deleter
diff --git a/python/cuvs/cuvs/distance/distance.pyx b/python/cuvs/cuvs/distance/distance.pyx
index 187532bfe..d00e6b1b1 100644
--- a/python/cuvs/cuvs/distance/distance.pyx
+++ b/python/cuvs/cuvs/distance/distance.pyx
@@ -103,7 +103,9 @@ def pairwise_distance(X, Y, out=None, metric="euclidean", metric_arg=2.0,
         output_dtype = y_cai.dtype
         if np.issubdtype(y_cai.dtype, np.float16):
             output_dtype = np.float32
-        out = device_ndarray.empty((m, n), dtype=output_dtype)
+
+        order = "C" if getattr(X, "flags", X).c_contiguous else "F"
+        out = device_ndarray.empty((m, n), dtype=output_dtype, order=order)
     out_cai = wrap_array(out)
 
     x_k = x_cai.shape[1]
diff --git a/python/cuvs/cuvs/test/test_distance.py b/python/cuvs/cuvs/test/test_distance.py
index f466c2743..9f206064c 100644
--- a/python/cuvs/cuvs/test/test_distance.py
+++ b/python/cuvs/cuvs/test/test_distance.py
@@ -40,10 +40,11 @@
     ],
 )
 @pytest.mark.parametrize("inplace", [True, False])
+@pytest.mark.parametrize("order", ["F", "C"])
 @pytest.mark.parametrize("dtype", [np.float32, np.float64, np.float16])
-def test_distance(n_rows, n_cols, inplace, metric, dtype):
+def test_distance(n_rows, n_cols, inplace, order, metric, dtype):
     input1 = np.random.random_sample((n_rows, n_cols))
-    input1 = np.asarray(input1).astype(dtype)
+    input1 = np.asarray(input1, order=order).astype(dtype)
 
     # RussellRao expects boolean arrays
     if metric == "russellrao":
@@ -58,7 +59,7 @@ def test_distance(n_rows, n_cols, inplace, metric, dtype):
     output_dtype = dtype
     if np.issubdtype(dtype, np.float16):
         output_dtype = np.float32
-    output = np.zeros((n_rows, n_rows), dtype=output_dtype)
+    output = np.zeros((n_rows, n_rows), dtype=output_dtype, order=order)
 
     if metric == "inner_product":
         expected = np.matmul(input1, input1.T)

From 47d71c391453f002f4070512056075e37c8fcd3e Mon Sep 17 00:00:00 2001
From: "Artem M. Chirkin" <9253178+achirkin@users.noreply.github.com>
Date: Thu, 16 Jan 2025 05:24:14 +0100
Subject: [PATCH 25/39] Reduce the recall threshold for IVF-PQ low-precision
 LUT inner product tests (#573)

IVF-PQ allows using a low-precision lookup table (LUT) during search to improve QPS. When used with the inner product distance, this takes an extra toll on recall. This PR lowers the expected recall in this case in response to occasional test failures in CI.

Authors:
  - Artem M. Chirkin (https://github.com/achirkin)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)

URL: https://github.com/rapidsai/cuvs/pull/573
---
 cpp/test/neighbors/ann_ivf_pq.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/test/neighbors/ann_ivf_pq.cuh b/cpp/test/neighbors/ann_ivf_pq.cuh
index 01efd804e..6c0fdc608 100644
--- a/cpp/test/neighbors/ann_ivf_pq.cuh
+++ b/cpp/test/neighbors/ann_ivf_pq.cuh
@@ -879,7 +879,7 @@ inline auto enum_variety_ip() -> test_cases_t
         // InnerProduct score is signed,
         // thus we're forced to used signed 8-bit representation,
         // thus we have one bit less precision
-        y.min_recall = y.min_recall.value() * 0.90;
+        y.min_recall = y.min_recall.value() * 0.88;
       } else {
         // In other cases it seems to perform a little bit better, still worse than L2
         y.min_recall = y.min_recall.value() * 0.94;

From c49ba7bf9b26633f08b254968ae33bb74039a104 Mon Sep 17 00:00:00 2001
From: Ben Frederickson <ben@benfrederickson.com>
Date: Wed, 15 Jan 2025 20:26:05 -0800
Subject: [PATCH 26/39] expose col-major bfknn to python (#575)

Follow-up to #572: exposes col-major inputs for brute-force kNN to Python.

Authors:
  - Ben Frederickson (https://github.com/benfred)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)

URL: https://github.com/rapidsai/cuvs/pull/575
---
 cpp/src/distance/pairwise_distance_c.cpp      |  2 +-
 cpp/src/neighbors/brute_force_c.cpp           | 28 ++++++++++++++-----
 .../neighbors/brute_force/brute_force.pyx     |  4 +--
 python/cuvs/cuvs/neighbors/common.py          |  6 ++--
 python/cuvs/cuvs/test/test_brute_force.py     |  9 ++++--
 5 files changed, 34 insertions(+), 15 deletions(-)

diff --git a/cpp/src/distance/pairwise_distance_c.cpp b/cpp/src/distance/pairwise_distance_c.cpp
index 5344a554c..121574880 100644
--- a/cpp/src/distance/pairwise_distance_c.cpp
+++ b/cpp/src/distance/pairwise_distance_c.cpp
@@ -99,7 +99,7 @@ extern "C" cuvsError_t cuvsPairwiseDistance(cuvsResources_t res,
 
     if ((x_row_major != y_row_major) || (x_row_major != distances_row_major)) {
       RAFT_FAIL(
-        "Inputs to cuvsPairwiseDistance must all have the same layout (row-major or col-major");
+        "Inputs to cuvsPairwiseDistance must all have the same layout (row-major or col-major)");
     }
 
     if (x_row_major) {
diff --git a/cpp/src/neighbors/brute_force_c.cpp b/cpp/src/neighbors/brute_force_c.cpp
index 2b8980863..1693ac930 100644
--- a/cpp/src/neighbors/brute_force_c.cpp
+++ b/cpp/src/neighbors/brute_force_c.cpp
@@ -33,7 +33,7 @@
 
 namespace {
 
-template <typename T>
+template <typename T, typename LayoutT = raft::row_major>
 void* _build(cuvsResources_t res,
              DLManagedTensor* dataset_tensor,
              cuvsDistanceType metric,
@@ -41,7 +41,7 @@ void* _build(cuvsResources_t res,
 {
   auto res_ptr = reinterpret_cast<raft::resources*>(res);
 
-  using mdspan_type = raft::device_matrix_view<T const, int64_t, raft::row_major>;
+  using mdspan_type = raft::device_matrix_view<T const, int64_t, LayoutT>;
   auto mds          = cuvs::core::from_dlpack<mdspan_type>(dataset_tensor);
 
   cuvs::neighbors::brute_force::index_params params;
@@ -53,7 +53,7 @@ void* _build(cuvsResources_t res,
   return index_on_heap;
 }
 
-template <typename T>
+template <typename T, typename QueriesLayoutT = raft::row_major>
 void _search(cuvsResources_t res,
              cuvsBruteForceIndex index,
              DLManagedTensor* queries_tensor,
@@ -64,7 +64,7 @@ void _search(cuvsResources_t res,
   auto res_ptr   = reinterpret_cast<raft::resources*>(res);
   auto index_ptr = reinterpret_cast<cuvs::neighbors::brute_force::index<T>*>(index.addr);
 
-  using queries_mdspan_type   = raft::device_matrix_view<T const, int64_t, raft::row_major>;
+  using queries_mdspan_type   = raft::device_matrix_view<T const, int64_t, QueriesLayoutT>;
   using neighbors_mdspan_type = raft::device_matrix_view<int64_t, int64_t, raft::row_major>;
   using distances_mdspan_type = raft::device_matrix_view<float, int64_t, raft::row_major>;
   using prefilter_mds_type    = raft::device_vector_view<const uint32_t, int64_t>;
@@ -150,8 +150,15 @@ extern "C" cuvsError_t cuvsBruteForceBuild(cuvsResources_t res,
     auto dataset = dataset_tensor->dl_tensor;
 
     if (dataset.dtype.code == kDLFloat && dataset.dtype.bits == 32) {
-      index->addr =
-        reinterpret_cast<uintptr_t>(_build<float>(res, dataset_tensor, metric, metric_arg));
+      if (cuvs::core::is_c_contiguous(dataset_tensor)) {
+        index->addr =
+          reinterpret_cast<uintptr_t>(_build<float>(res, dataset_tensor, metric, metric_arg));
+      } else if (cuvs::core::is_f_contiguous(dataset_tensor)) {
+        index->addr = reinterpret_cast<uintptr_t>(
+          _build<float, raft::col_major>(res, dataset_tensor, metric, metric_arg));
+      } else {
+        RAFT_FAIL("dataset input to cuvsBruteForceBuild must be contiguous (non-strided)");
+      }
       index->dtype = dataset.dtype;
     } else {
       RAFT_FAIL("Unsupported dataset DLtensor dtype: %d and bits: %d",
@@ -189,7 +196,14 @@ extern "C" cuvsError_t cuvsBruteForceSearch(cuvsResources_t res,
     RAFT_EXPECTS(queries.dtype.code == index.dtype.code, "type mismatch between index and queries");
 
     if (queries.dtype.code == kDLFloat && queries.dtype.bits == 32) {
-      _search<float>(res, index, queries_tensor, neighbors_tensor, distances_tensor, prefilter);
+      if (cuvs::core::is_c_contiguous(queries_tensor)) {
+        _search<float>(res, index, queries_tensor, neighbors_tensor, distances_tensor, prefilter);
+      } else if (cuvs::core::is_f_contiguous(queries_tensor)) {
+        _search<float, raft::col_major>(
+          res, index, queries_tensor, neighbors_tensor, distances_tensor, prefilter);
+      } else {
+        RAFT_FAIL("queries input to cuvsBruteForceSearch must be contiguous (non-strided)");
+      }
     } else {
       RAFT_FAIL("Unsupported queries DLtensor dtype: %d and bits: %d",
                 queries.dtype.code,
diff --git a/python/cuvs/cuvs/neighbors/brute_force/brute_force.pyx b/python/cuvs/cuvs/neighbors/brute_force/brute_force.pyx
index 9d43bfb29..f71acd086 100644
--- a/python/cuvs/cuvs/neighbors/brute_force/brute_force.pyx
+++ b/python/cuvs/cuvs/neighbors/brute_force/brute_force.pyx
@@ -102,7 +102,7 @@ def build(dataset, metric="sqeuclidean", metric_arg=2.0, resources=None):
     """
 
     dataset_ai = wrap_array(dataset)
-    _check_input_array(dataset_ai, [np.dtype('float32')])
+    _check_input_array(dataset_ai, [np.dtype('float32')], exp_row_major=False)
 
     cdef cuvsResources_t res = <cuvsResources_t>resources.get_c_obj()
 
@@ -218,7 +218,7 @@ def search(Index index,
     cdef cuvsResources_t res = <cuvsResources_t>resources.get_c_obj()
 
     queries_cai = wrap_array(queries)
-    _check_input_array(queries_cai, [np.dtype('float32')])
+    _check_input_array(queries_cai, [np.dtype('float32')], exp_row_major=False)
 
     cdef uint32_t n_queries = queries_cai.shape[0]
 
diff --git a/python/cuvs/cuvs/neighbors/common.py b/python/cuvs/cuvs/neighbors/common.py
index c14b9f8c9..f49d9eb1f 100644
--- a/python/cuvs/cuvs/neighbors/common.py
+++ b/python/cuvs/cuvs/neighbors/common.py
@@ -14,11 +14,13 @@
 # limitations under the License.
 
 
-def _check_input_array(cai, exp_dt, exp_rows=None, exp_cols=None):
+def _check_input_array(
+    cai, exp_dt, exp_rows=None, exp_cols=None, exp_row_major=True
+):
     if cai.dtype not in exp_dt:
         raise TypeError("dtype %s not supported" % cai.dtype)
 
-    if not cai.c_contiguous:
+    if exp_row_major and not cai.c_contiguous:
         raise ValueError("Row major input is expected")
 
     if exp_cols is not None and cai.shape[1] != exp_cols:
diff --git a/python/cuvs/cuvs/test/test_brute_force.py b/python/cuvs/cuvs/test/test_brute_force.py
index acf347ec3..0b37ad885 100644
--- a/python/cuvs/cuvs/test/test_brute_force.py
+++ b/python/cuvs/cuvs/test/test_brute_force.py
@@ -40,12 +40,15 @@
     ],
 )
 @pytest.mark.parametrize("inplace", [True, False])
+@pytest.mark.parametrize("order", ["F", "C"])
 @pytest.mark.parametrize("dtype", [np.float32])
 def test_brute_force_knn(
-    n_index_rows, n_query_rows, n_cols, k, inplace, metric, dtype
+    n_index_rows, n_query_rows, n_cols, k, inplace, order, metric, dtype
 ):
-    index = np.random.random_sample((n_index_rows, n_cols)).astype(dtype)
-    queries = np.random.random_sample((n_query_rows, n_cols)).astype(dtype)
+    index = np.random.random_sample((n_index_rows, n_cols))
+    index = np.asarray(index, order=order).astype(dtype)
+    queries = np.random.random_sample((n_query_rows, n_cols))
+    queries = np.asarray(queries, order=order).astype(dtype)
 
     # RussellRao expects boolean arrays
     if metric == "russellrao":

From 8aae7069141ffe413f03cd4cd036860394fc0c44 Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Wed, 15 Jan 2025 23:32:54 -0500
Subject: [PATCH 27/39] Small fixes to docs and pairwise distances (#570)

Authors:
  - Corey J. Nolet (https://github.com/cjnolet)

Approvers:
  - Ben Frederickson (https://github.com/benfred)

URL: https://github.com/rapidsai/cuvs/pull/570
---
 docs/source/cuvs_bench/index.rst       | 24 ++++++++++++++----------
 python/cuvs/cuvs/distance/distance.pyx |  6 +++---
 python/cuvs/cuvs/test/test_distance.py |  6 ++----
 3 files changed, 19 insertions(+), 17 deletions(-)

diff --git a/docs/source/cuvs_bench/index.rst b/docs/source/cuvs_bench/index.rst
index 81fb7537c..820c44c4f 100644
--- a/docs/source/cuvs_bench/index.rst
+++ b/docs/source/cuvs_bench/index.rst
@@ -24,7 +24,7 @@ This tool offers several benefits, including
 
   * `Docker`_
 
-- `How to run the benchmarks`_
+- `How benchmarks are run`_
 
   * `Step 1: Prepare the dataset`_
 
@@ -93,32 +93,36 @@ We provide images for GPU enabled systems, as well as systems without a GPU. The
 - `cuvs-bench-datasets`: Contains the GPU and CPU benchmarks with million-scale datasets already included in the container. Best suited for users that want to run multiple million scale datasets already included in the image.
 - `cuvs-bench-cpu`: Contains only CPU benchmarks with minimal size. Best suited for users that want the smallest containers to reproduce benchmarks on systems without a GPU.
 
-Nightly images are located in `dockerhub <https://hub.docker.com/r/rapidsai/cuvs-bench/tags>`_, meanwhile release (stable) versions are located in `NGC <https://hub.docker.com/r/rapidsai/cuvs-bench>`_, starting with release 24.10.
+Nightly images are located in `dockerhub <https://hub.docker.com/r/rapidsai/cuvs-bench/tags>`_.
 
-The following command pulls the nightly container for Python version 3.10, CUDA version 12.0, and cuVS version 24.10:
+The following command pulls the nightly container for Python version 3.10, CUDA version 12.5, and cuVS version 24.12:
 
 .. code-block:: bash
 
-   docker pull rapidsai/cuvs-bench:24.10a-cuda12.0-py3.10 #substitute cuvs-bench for the exact desired container.
+   docker pull rapidsai/cuvs-bench:24.12a-cuda12.5-py3.10 #substitute cuvs-bench for the exact desired container.
 
 The CUDA and python versions can be changed for the supported values:
-- Supported CUDA versions: 11.4 and 12.x
-- Supported Python versions: 3.9 and 3.10.
+- Supported CUDA versions: 11.8 and 12.5
+- Supported Python versions: 3.10 and 3.11.
 
 You can see the exact versions as well in the dockerhub site:
 - `cuVS bench images <https://hub.docker.com/r/rapidsai/cuvs-bench/tags>`_
-- `cuVS bench with datasets preloaded images <https://hub.docker.com/r/rapidsai/cuvs-bench-cpu/tags>`_
+- `cuVS bench with pre-loaded million-scale datasets images <https://hub.docker.com/r/rapidsai/cuvs-bench-datasets/tags>`_
 - `cuVS bench CPU only images <https://hub.docker.com/r/rapidsai/cuvs-bench-datasets/tags>`_
 
 **Note:** GPU containers use the CUDA toolkit from inside the container, the only requirement is a driver installed on the host machine that supports that version. So, for example, CUDA 11.8 containers can run in systems with a CUDA 12.x capable driver. Please also note that the Nvidia-Docker runtime from the `Nvidia Container Toolkit <https://github.com/NVIDIA/nvidia-docker>`_ is required to use GPUs inside docker containers.
 
-How to run the benchmarks
-=========================
+How benchmarks are run
+======================
+
+The `cuvs-bench` package contains lightweight Python scripts to run the benchmarks. There are 4 general steps to running the benchmarks and visualizing the results.
 
-We provide a collection of lightweight Python scripts to run the benchmarks. There are 4 general steps to running the benchmarks and visualizing the results.
 #. Prepare Dataset
+
 #. Build Index and Search Index
+
 #. Data Export
+
 #. Plot Results
 
 Step 1: Prepare the dataset
diff --git a/python/cuvs/cuvs/distance/distance.pyx b/python/cuvs/cuvs/distance/distance.pyx
index d00e6b1b1..d50fc152f 100644
--- a/python/cuvs/cuvs/distance/distance.pyx
+++ b/python/cuvs/cuvs/distance/distance.pyx
@@ -56,7 +56,7 @@ SUPPORTED_DISTANCES = ["euclidean", "l1", "cityblock", "l2", "inner_product",
 
 @auto_sync_resources
 @auto_convert_output
-def pairwise_distance(X, Y, out=None, metric="euclidean", metric_arg=2.0,
+def pairwise_distance(X, Y, out=None, metric="euclidean", p=2.0,
                       resources=None):
     """
     Compute pairwise distances between X and Y
@@ -74,7 +74,7 @@ def pairwise_distance(X, Y, out=None, metric="euclidean", metric_arg=2.0,
     Y : CUDA array interface compliant matrix shape (n, k)
     out : Optional writable CUDA array interface matrix shape (m, n)
     metric : string denoting the metric type (default="euclidean")
-    metric_arg : metric parameter (currently used only for "minkowski")
+    p : metric parameter (currently used only for "minkowski")
     {resources_docstring}
 
     Examples
@@ -139,6 +139,6 @@ def pairwise_distance(X, Y, out=None, metric="euclidean", metric_arg=2.0,
                                     y_dlpack,
                                     out_dlpack,
                                     distance_type,
-                                    metric_arg))
+                                    p))
 
     return out
diff --git a/python/cuvs/cuvs/test/test_distance.py b/python/cuvs/cuvs/test/test_distance.py
index 9f206064c..483d5d201 100644
--- a/python/cuvs/cuvs/test/test_distance.py
+++ b/python/cuvs/cuvs/test/test_distance.py
@@ -35,6 +35,7 @@
         "jensenshannon",
         "russellrao",
         "cosine",
+        "minkowski",
         "sqeuclidean",
         "inner_product",
     ],
@@ -70,10 +71,7 @@ def test_distance(n_rows, n_cols, inplace, order, metric, dtype):
     output_device = device_ndarray(output) if inplace else None
 
     ret_output = pairwise_distance(
-        input1_device,
-        input1_device,
-        output_device,
-        metric,
+        input1_device, input1_device, output_device, metric, p=2.0
     )
 
     output_device = ret_output if not inplace else output_device

From 3ffb29ff06ff0b8fffa140780750cf1999626f7e Mon Sep 17 00:00:00 2001
From: Micka <mide@nvidia.com>
Date: Thu, 16 Jan 2025 07:14:47 +0100
Subject: [PATCH 28/39] Fix broken link to python doc (#564)

Apply the same change as https://github.com/rapidsai/cuml/pull/6202 to fix Python links to source code.
Closes #533

Authors:
  - Micka (https://github.com/lowener)
  - Corey J. Nolet (https://github.com/cjnolet)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)

URL: https://github.com/rapidsai/cuvs/pull/564
---
 docs/source/conf.py                  |  2 +-
 docs/source/sphinxext/github_link.py | 22 ++++++++++++++++++----
 2 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/docs/source/conf.py b/docs/source/conf.py
index c14919568..ca7330279 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -198,7 +198,7 @@ def setup(app):
 linkcode_resolve = make_linkcode_resolve(
     "cuvs",
     "https://github.com/rapidsai/cuvs/"
-    "blob/{revision}/python/cuvs/cuvs/"
+    "blob/{revision}/python/cuvs/"
     "{package}/{path}#L{lineno}",
 )
 
diff --git a/docs/source/sphinxext/github_link.py b/docs/source/sphinxext/github_link.py
index 2c52488ca..75acfbd6e 100644
--- a/docs/source/sphinxext/github_link.py
+++ b/docs/source/sphinxext/github_link.py
@@ -1,5 +1,20 @@
 # This contains code with copyright by the scikit-learn project, subject to the
 # license in /thirdparty/LICENSES/LICENSE.scikit_learn
+#
+# Copyright (c) 2024-2025, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
 
 import inspect
 import os
@@ -101,10 +116,9 @@ def _linkcode_resolve(domain, info, package, url_fmt, revision):
         else:
             return
     else:
-        # Test if we are absolute or not (pyx are relative)
-        if (not os.path.isabs(fn)):
-            # Should be relative to docs right now
-            fn = os.path.abspath(os.path.join("..", "python", fn))
+        if fn.endswith(".pyx"):
+            sp_path = next(x for x in sys.path if re.match(".*site-packages$", x))
+            fn = fn.replace("/opt/conda/conda-bld/work/python/cuvs", sp_path)
 
         # Convert to relative from module root
         fn = os.path.relpath(fn,

From 86b4ee8c6244b9ee2f1a718dad8dea0b1476ed15 Mon Sep 17 00:00:00 2001
From: Nathan VanBenschoten <nvanbenschoten@gmail.com>
Date: Wed, 15 Jan 2025 23:15:31 -0700
Subject: [PATCH 29/39] Fix typos in README (#543)

Spotted while learning about the project.

Authors:
  - Nathan VanBenschoten (https://github.com/nvanbenschoten)
  - Corey J. Nolet (https://github.com/cjnolet)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)

URL: https://github.com/rapidsai/cuvs/pull/543
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 47f094039..dac71c881 100755
--- a/README.md
+++ b/README.md
@@ -67,7 +67,7 @@ There are several benefits to using cuVS and GPUs for vector search, including
 6. Multiple language support
 7. Building blocks for composing new or accelerating existing algorithms
 
-In addition to the items above, cuVS takes on the burden of keeping non-trivial accelerated code up to date as new NVIDIA architectures and CUDA versions are released. This provides a deslightful development experimence, guaranteeing that any libraries, databases, or applications built on top of it will always be getting the best performance and scale. 
+In addition to the items above, cuVS takes on the burden of keeping non-trivial accelerated code up to date as new NVIDIA architectures and CUDA versions are released. This provides a delightful development experience, guaranteeing that any libraries, databases, or applications built on top of it will always be getting the best performance and scale.
 
 ## cuVS Technology Stack
 

From b9f71fe346af926de55223b313979a5c85a01b1b Mon Sep 17 00:00:00 2001
From: "Artem M. Chirkin" <9253178+achirkin@users.noreply.github.com>
Date: Thu, 16 Jan 2025 15:40:52 +0100
Subject: [PATCH 30/39] Fix the use of constexpr in the dynamic batching header
 (#582)

Remove `constexpr` from the `batch_token(uint32_t)` constructor and `make_empty_token`, neither of which can be constexpr under the C++17 rules.

Authors:
  - Artem M. Chirkin (https://github.com/achirkin)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Corey J. Nolet (https://github.com/cjnolet)

URL: https://github.com/rapidsai/cuvs/pull/582
---
 cpp/src/neighbors/detail/dynamic_batching.cuh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/cpp/src/neighbors/detail/dynamic_batching.cuh b/cpp/src/neighbors/detail/dynamic_batching.cuh
index cb8e08ef5..23c5c07f6 100644
--- a/cpp/src/neighbors/detail/dynamic_batching.cuh
+++ b/cpp/src/neighbors/detail/dynamic_batching.cuh
@@ -238,8 +238,8 @@ enum struct slot_state : int32_t {
 struct batch_token {
   uint64_t value = 0;
 
-  constexpr inline batch_token() {}
-  explicit constexpr inline batch_token(uint32_t buffer_id) { id() = buffer_id; }
+  constexpr inline batch_token() = default;
+  RAFT_INLINE_FUNCTION explicit batch_token(uint32_t buffer_id) { id() = buffer_id; }
 
   /**
    * Sequential id of the batch in the array of batches.
@@ -492,7 +492,7 @@ struct batch_queue_t {
    * NB: "round" is the number of times the queue counters went over the whole ring buffer.
    *     It's used to avoid the ABA problem for atomic token updates.
    */
-  static constexpr inline auto make_empty_token(seq_order_id seq_id) noexcept -> batch_token
+  static inline auto make_empty_token(seq_order_id seq_id) noexcept -> batch_token
   {
     // Modify the seq_id to identify that the token slot is empty
     auto empty_round    = static_cast<uint32_t>(slot_state::kEmptyPast) * kSize;

From 6371aa32e957b6e8df0cae3347b94ef5604e6e7b Mon Sep 17 00:00:00 2001
From: Robert Maynard <rmaynard@nvidia.com>
Date: Thu, 16 Jan 2025 20:00:54 -0500
Subject: [PATCH 31/39] run_cuvs_pytests.sh uses proper test dir (#584)

Authors:
  - Robert Maynard (https://github.com/robertmaynard)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cuvs/pull/584
---
 ci/run_cuvs_pytests.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ci/run_cuvs_pytests.sh b/ci/run_cuvs_pytests.sh
index 4de8927b1..57df9af94 100755
--- a/ci/run_cuvs_pytests.sh
+++ b/ci/run_cuvs_pytests.sh
@@ -6,4 +6,4 @@ set -euo pipefail
 # Support invoking run_pytests.sh outside the script directory
 cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../python/cuvs/cuvs
 
-pytest --cache-clear --verbose "$@" tests
+pytest --cache-clear --verbose "$@" test

From bd603a97ea7c095dd109c802387b33dc1d591b54 Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Fri, 17 Jan 2025 13:16:16 -0500
Subject: [PATCH 32/39] Fixing small typo in cuvs bench docs (#586)

Authors:
  - Corey J. Nolet (https://github.com/cjnolet)

Approvers:
  - Ben Frederickson (https://github.com/benfred)

URL: https://github.com/rapidsai/cuvs/pull/586
---
 docs/source/cuvs_bench/index.rst | 195 +------------------------------
 1 file changed, 3 insertions(+), 192 deletions(-)

diff --git a/docs/source/cuvs_bench/index.rst b/docs/source/cuvs_bench/index.rst
index 820c44c4f..c15aa41c1 100644
--- a/docs/source/cuvs_bench/index.rst
+++ b/docs/source/cuvs_bench/index.rst
@@ -24,16 +24,6 @@ This tool offers several benefits, including
 
   * `Docker`_
 
-- `How benchmarks are run`_
-
-  * `Step 1: Prepare the dataset`_
-
-  * `Step 2: Build and search index`_
-
-  * `Step 3: Data export`_
-
-  * `Step 4: Plot the results`_
-
 - `Running the benchmarks`_
 
   * `End-to-end: smaller-scale benchmarks (<1M to 10M)`_
@@ -75,7 +65,7 @@ Conda
    conda activate cuvs_benchmarks
 
    # to install GPU package:
-   conda install -c rapidsai -c conda-forge -c nvidia cuvs-ann-bench=<rapids_version> cuda-version=11.8*
+   conda install -c rapidsai -c conda-forge -c nvidia cuvs-bench=<rapids_version> cuda-version=11.8*
 
    # to install CPU package for usage in CPU-only systems:
    conda install -c rapidsai -c conda-forge  cuvs-bench-cpu
@@ -99,7 +89,7 @@ The following command pulls the nightly container for Python version 3.10, CUDA
 
 .. code-block:: bash
 
-   docker pull rapidsai/cuvs-bench:24.12a-cuda12.5-py3.10 #substitute cuvs-bench for the exact desired container.
+   docker pull rapidsai/cuvs-bench:24.12a-cuda12.5-py3.10 # substitute cuvs-bench for the exact desired container.
 
 The CUDA and python versions can be changed for the supported values:
 - Supported CUDA versions: 11.8 and 12.5
@@ -112,185 +102,6 @@ You can see the exact versions as well in the dockerhub site:
 
 **Note:** GPU containers use the CUDA toolkit from inside the container, the only requirement is a driver installed on the host machine that supports that version. So, for example, CUDA 11.8 containers can run in systems with a CUDA 12.x capable driver. Please also note that the Nvidia-Docker runtime from the `Nvidia Container Toolkit <https://github.com/NVIDIA/nvidia-docker>`_ is required to use GPUs inside docker containers.
 
-How benchmarks are run
-======================
-
-The `cuvs-bench` package contains lightweight Python scripts to run the benchmarks. There are 4 general steps to running the benchmarks and visualizing the results.
-
-#. Prepare Dataset
-
-#. Build Index and Search Index
-
-#. Data Export
-
-#. Plot Results
-
-Step 1: Prepare the dataset
----------------------------
-
-The script `cuvs_bench.get_dataset` will download and unpack the dataset in directory that the user provides. As of now, only million-scale datasets are supported by this script. For more information on :doc:`datasets and formats <datasets>`.
-
-The usage of this script is:
-
-.. code-block:: bash
-
-    usage: get_dataset.py [-h] [--name NAME] [--dataset-path DATASET_PATH] [--normalize]
-
-    options:
-      -h, --help            show this help message and exit
-      --dataset DATASET     dataset to download (default: glove-100-angular)
-      --dataset-path DATASET_PATH
-                            path to download dataset (default: ${RAPIDS_DATASET_ROOT_DIR})
-      --normalize           normalize cosine distance to inner product (default: False)
-
-When option `normalize` is provided to the script, any dataset that has cosine distances
-will be normalized to inner product. So, for example, the dataset `glove-100-angular`
-will be written at location `datasets/glove-100-inner/`.
-
-Step 2: Build and search index
-------------------------------
-
-The script `cuvs_bench.run` will build and search indices for a given dataset and its
-specified configuration.
-
-The usage of the script `cuvs_bench.run` is:
-
-.. code-block:: bash
-
-    usage: __main__.py [-h] [--subset-size SUBSET_SIZE] [-k COUNT] [-bs BATCH_SIZE] [--dataset-configuration DATASET_CONFIGURATION] [--configuration CONFIGURATION] [--dataset DATASET]
-                       [--dataset-path DATASET_PATH] [--build] [--search] [--algorithms ALGORITHMS] [--groups GROUPS] [--algo-groups ALGO_GROUPS] [-f] [-m SEARCH_MODE]
-
-    options:
-      -h, --help            show this help message and exit
-      --subset-size SUBSET_SIZE
-                            the number of subset rows of the dataset to build the index (default: None)
-      -k COUNT, --count COUNT
-                            the number of nearest neighbors to search for (default: 10)
-      -bs BATCH_SIZE, --batch-size BATCH_SIZE
-                            number of query vectors to use in each query trial (default: 10000)
-      --dataset-configuration DATASET_CONFIGURATION
-                            path to YAML configuration file for datasets (default: None)
-      --configuration CONFIGURATION
-                            path to YAML configuration file or directory for algorithms Any run groups found in the specified file/directory will automatically override groups of the same name
-                            present in the default configurations, including `base` (default: None)
-      --dataset DATASET     name of dataset (default: glove-100-inner)
-      --dataset-path DATASET_PATH
-                            path to dataset folder, by default will look in RAPIDS_DATASET_ROOT_DIR if defined, otherwise a datasets subdirectory from the calling directory (default:
-                            os.getcwd()/datasets/)
-      --build
-      --search
-      --algorithms ALGORITHMS
-                            run only comma separated list of named algorithms. If parameters `groups` and `algo-groups` are both undefined, then group `base` is run by default (default: None)
-      --groups GROUPS       run only comma separated groups of parameters (default: base)
-      --algo-groups ALGO_GROUPS
-                            add comma separated <algorithm>.<group> to run. Example usage: "--algo-groups=cuvs_cagra.large,hnswlib.large" (default: None)
-      -f, --force           re-run algorithms even if their results already exist (default: False)
-      -m SEARCH_MODE, --search-mode SEARCH_MODE
-                            run search in 'latency' (measure individual batches) or 'throughput' (pipeline batches and measure end-to-end) mode (default: throughput)
-      -t SEARCH_THREADS, --search-threads SEARCH_THREADS
-                            specify the number threads to use for throughput benchmark. Single value or a pair of min and max separated by ':'. Example --search-threads=1:4. Power of 2 values between 'min' and 'max' will be used. If only 'min' is
-                            specified, then a single test is run with 'min' threads. By default min=1, max=<num hyper threads>. (default: None)
-      -r, --dry-run         dry-run mode will convert the yaml config for the specified algorithms and datasets to the json format that's consumed by the lower-level c++ binaries and then print the command to run execute the benchmarks but
-                            will not actually execute the command. (default: False)
-
-`dataset`: name of the dataset to be searched in `datasets.yaml`_
-
-`dataset-configuration`: optional filepath to custom dataset YAML config which has an entry for arg `dataset`
-
-`configuration`: optional filepath to YAML configuration for an algorithm or to directory that contains YAML configurations for several algorithms. Refer to `Dataset.yaml config`_ for more info.
-
-`algorithms`: runs all algorithms that it can find in YAML configs found by `configuration`. By default, only `base` group will be run.
-
-`groups`: run only specific groups of parameters configurations for an algorithm. Groups are defined in YAML configs (see `configuration`), and by default run `base` group
-
-`algo-groups`: this parameter is helpful to append any specific algorithm+group combination to run the benchmark for in addition to all the arguments from `algorithms` and `groups`. It is of the format `<algorithm>.<group>`, or for example, `cuvs_cagra.large`
-
-For every algorithm run by this script, it outputs an index build statistics JSON file in `<dataset-path/<dataset>/result/build/<{algo},{group}.json>`
-and an index search statistics JSON file in `<dataset-path/<dataset>/result/search/<{algo},{group},k{k},bs{batch_size}.json>`. NOTE: The filenames will not have ",{group}" if `group = "base"`.
-
-For every algorithm run by this script, it outputs an index build statistics JSON file in `<dataset-path/<dataset>/result/build/<{algo},{group}.json>`
-and an index search statistics JSON file in `<dataset-path/<dataset>/result/search/<{algo},{group},k{k},bs{batch_size}.json>`. NOTE: The filenames will not have ",{group}" if `group = "base"`.
-
-`dataset-path` :
-#. data is read from `<dataset-path>/<dataset>`
-#. indices are built in `<dataset-path>/<dataset>/index`
-#. build/search results are stored in `<dataset-path>/<dataset>/result`
-
-`build` and `search` : if both parameters are not supplied to the script then it is assumed both are `True`.
-
-`indices` and `algorithms` : these parameters ensure that the algorithm specified for an index is available in `algos.yaml` and not disabled, as well as having an associated executable.
-
-Step 3: Data export
--------------------
-
-The script `cuvs_bench.data_export` will convert the intermediate JSON outputs produced by `cuvs_bench.run` to more easily readable CSV files, which are needed to build charts made by `cuvs_bench.plot`.
-
-.. code-block:: bash
-
-    usage: data_export.py [-h] [--dataset DATASET] [--dataset-path DATASET_PATH]
-
-    options:
-      -h, --help            show this help message and exit
-      --dataset DATASET     dataset to download (default: glove-100-inner)
-      --dataset-path DATASET_PATH
-                            path to dataset folder (default: ${RAPIDS_DATASET_ROOT_DIR})
-
-Build statistics CSV file is stored in `<dataset-path/<dataset>/result/build/<{algo},{group}.csv>`
-and index search statistics CSV file in `<dataset-path/<dataset>/result/search/<{algo},{group},k{k},bs{batch_size},{suffix}.csv>`, where suffix has three values:
-#. `raw`: All search results are exported
-#. `throughput`: Pareto frontier of throughput results is exported
-#. `latency`: Pareto frontier of latency results is exported
-
-Step 4: Plot the results
-------------------------
-
-The script `cuvs_bench.plot` will plot results for all algorithms found in index search statistics CSV files `<dataset-path/<dataset>/result/search/*.csv`.
-
-The usage of this script is:
-
-.. code-block:: bash
-
-    usage:  [-h] [--dataset DATASET] [--dataset-path DATASET_PATH] [--output-filepath OUTPUT_FILEPATH] [--algorithms ALGORITHMS] [--groups GROUPS] [--algo-groups ALGO_GROUPS]
-            [-k COUNT] [-bs BATCH_SIZE] [--build] [--search] [--x-scale X_SCALE] [--y-scale {linear,log,symlog,logit}] [--x-start X_START] [--mode {throughput,latency}]
-            [--time-unit {s,ms,us}] [--raw]
-
-    options:
-      -h, --help            show this help message and exit
-      --dataset DATASET     dataset to plot (default: glove-100-inner)
-      --dataset-path DATASET_PATH
-                            path to dataset folder (default: /home/coder/cuvs/datasets/)
-      --output-filepath OUTPUT_FILEPATH
-                            directory for PNG to be saved (default: /home/coder/cuvs)
-      --algorithms ALGORITHMS
-                            plot only comma separated list of named algorithms. If parameters `groups` and `algo-groups are both undefined, then group `base` is plot by default
-                            (default: None)
-      --groups GROUPS       plot only comma separated groups of parameters (default: base)
-      --algo-groups ALGO_GROUPS, --algo-groups ALGO_GROUPS
-                            add comma separated <algorithm>.<group> to plot. Example usage: "--algo-groups=cuvs_cagra.large,hnswlib.large" (default: None)
-      -k COUNT, --count COUNT
-                            the number of nearest neighbors to search for (default: 10)
-      -bs BATCH_SIZE, --batch-size BATCH_SIZE
-                            number of query vectors to use in each query trial (default: 10000)
-      --build
-      --search
-      --x-scale X_SCALE     Scale to use when drawing the X-axis. Typically linear, logit or a2 (default: linear)
-      --y-scale {linear,log,symlog,logit}
-                            Scale to use when drawing the Y-axis (default: linear)
-      --x-start X_START     Recall values to start the x-axis from (default: 0.8)
-      --mode {throughput,latency}
-                            search mode whose Pareto frontier is used on the y-axis (default: throughput)
-      --time-unit {s,ms,us}
-                            time unit to plot when mode is latency (default: ms)
-      --raw                 Show raw results (not just Pareto frontier) of mode arg (default: False)
-
-`mode`: plots pareto frontier of `throughput` or `latency` results exported in the previous step
-
-`algorithms`: plots all algorithms that it can find results for the specified `dataset`. By default, only `base` group will be plotted.
-
-`groups`: plot only specific groups of parameters configurations for an algorithm. Groups are defined in YAML configs (see `configuration`), and by default run `base` group
-
-`algo-groups`: this parameter is helpful to append any specific algorithm+group combination to plot results for in addition to all the arguments from `algorithms` and `groups`. It is of the format `<algorithm>.<group>`, or for example, `cuvs_cagra.large`
-
 Running the benchmarks
 ======================
 
@@ -576,7 +387,7 @@ Creating and customizing dataset configurations
 
 A single configuration will often define a set of algorithms, with associated index and search parameters, that can be generalize across datasets. We use YAML to define dataset specific and algorithm specific configurations.
 
-A default `datasets.yaml` is provided by CUVS in `${CUVS_HOME}/python/cuvs-ann-bench/src/cuvs_bench/run/conf` with configurations available for several datasets. Here's a simple example entry for the `sift-128-euclidean` dataset:
+A default `datasets.yaml` is provided by CUVS in `${CUVS_HOME}/python/cuvs_bench/src/cuvs_bench/run/conf` with configurations available for several datasets. Here's a simple example entry for the `sift-128-euclidean` dataset:
 
 .. code-block:: yaml
 

From f1de1b287f776bcb03d20765fad85da3555969d6 Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Tue, 21 Jan 2025 11:30:32 -0600
Subject: [PATCH 33/39] Use GCC 13 in CUDA 12 conda builds. (#567)

## Description
conda-forge is using GCC 13 for CUDA 12 builds. This PR updates CUDA 12
conda builds to use GCC 13, for alignment.

These PRs should be merged in a specific order; see
https://github.com/rapidsai/build-planning/issues/129 for details.

Closes https://github.com/rapidsai/build-planning/issues/129.
---
 .../all_cuda-118_arch-aarch64.yaml            |  2 +-
 .../all_cuda-118_arch-x86_64.yaml             |  2 +-
 .../all_cuda-125_arch-aarch64.yaml            |  4 ++--
 .../all_cuda-125_arch-x86_64.yaml             |  4 ++--
 .../bench_ann_cuda-118_arch-aarch64.yaml      |  2 +-
 .../bench_ann_cuda-118_arch-x86_64.yaml       |  2 +-
 .../bench_ann_cuda-125_arch-aarch64.yaml      |  4 ++--
 .../bench_ann_cuda-125_arch-x86_64.yaml       |  4 ++--
 .../cuvs-bench-cpu/conda_build_config.yaml    |  8 ++++---
 .../cuvs-bench/conda_build_config.yaml        | 14 +++++------
 conda/recipes/cuvs-bench/meta.yaml            |  6 ++---
 conda/recipes/cuvs/conda_build_config.yaml    | 14 +++++------
 conda/recipes/cuvs/meta.yaml                  |  6 ++---
 conda/recipes/libcuvs/conda_build_config.yaml | 14 +++++------
 conda/recipes/libcuvs/meta.yaml               | 24 +++++++------------
 cpp/test/distance/masked_nn.cu                |  4 ++--
 dependencies.yaml                             | 18 ++++++++++++--
 17 files changed, 68 insertions(+), 64 deletions(-)

diff --git a/conda/environments/all_cuda-118_arch-aarch64.yaml b/conda/environments/all_cuda-118_arch-aarch64.yaml
index 01853da84..4c464ef4e 100644
--- a/conda/environments/all_cuda-118_arch-aarch64.yaml
+++ b/conda/environments/all_cuda-118_arch-aarch64.yaml
@@ -55,7 +55,7 @@ dependencies:
 - sphinx-copybutton
 - sphinx-markdown-tables
 - sphinx>=8.0.0
-- sysroot_linux-aarch64==2.17
+- sysroot_linux-aarch64==2.28
 - pip:
   - nvidia-sphinx-theme
 name: all_cuda-118_arch-aarch64
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index a1ad68d7f..71cbeeaf3 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -55,7 +55,7 @@ dependencies:
 - sphinx-copybutton
 - sphinx-markdown-tables
 - sphinx>=8.0.0
-- sysroot_linux-64==2.17
+- sysroot_linux-64==2.28
 - pip:
   - nvidia-sphinx-theme
 name: all_cuda-118_arch-x86_64
diff --git a/conda/environments/all_cuda-125_arch-aarch64.yaml b/conda/environments/all_cuda-125_arch-aarch64.yaml
index ee0213fff..4fd08fa97 100644
--- a/conda/environments/all_cuda-125_arch-aarch64.yaml
+++ b/conda/environments/all_cuda-125_arch-aarch64.yaml
@@ -24,7 +24,7 @@ dependencies:
 - cython>=3.0.0
 - dlpack>=0.8,<1.0
 - doxygen>=1.8.20
-- gcc_linux-aarch64=11.*
+- gcc_linux-aarch64=13.*
 - graphviz
 - ipython
 - libclang
@@ -51,7 +51,7 @@ dependencies:
 - sphinx-copybutton
 - sphinx-markdown-tables
 - sphinx>=8.0.0
-- sysroot_linux-aarch64==2.17
+- sysroot_linux-aarch64==2.28
 - pip:
   - nvidia-sphinx-theme
 name: all_cuda-125_arch-aarch64
diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml
index d93dcaf7a..de5f341fa 100644
--- a/conda/environments/all_cuda-125_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-125_arch-x86_64.yaml
@@ -24,7 +24,7 @@ dependencies:
 - cython>=3.0.0
 - dlpack>=0.8,<1.0
 - doxygen>=1.8.20
-- gcc_linux-64=11.*
+- gcc_linux-64=13.*
 - graphviz
 - ipython
 - libclang
@@ -51,7 +51,7 @@ dependencies:
 - sphinx-copybutton
 - sphinx-markdown-tables
 - sphinx>=8.0.0
-- sysroot_linux-64==2.17
+- sysroot_linux-64==2.28
 - pip:
   - nvidia-sphinx-theme
 name: all_cuda-125_arch-x86_64
diff --git a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml
index a90dc03e7..fb69ac251 100644
--- a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml
+++ b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml
@@ -47,6 +47,6 @@ dependencies:
 - pyyaml
 - rapids-build-backend>=0.3.0,<0.4.0.dev0
 - setuptools
-- sysroot_linux-aarch64==2.17
+- sysroot_linux-aarch64==2.28
 - wheel
 name: bench_ann_cuda-118_arch-aarch64
diff --git a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
index b7344c822..123033b08 100644
--- a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
@@ -47,6 +47,6 @@ dependencies:
 - pyyaml
 - rapids-build-backend>=0.3.0,<0.4.0.dev0
 - setuptools
-- sysroot_linux-64==2.17
+- sysroot_linux-64==2.28
 - wheel
 name: bench_ann_cuda-118_arch-x86_64
diff --git a/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml
index da7229004..fa2ae7955 100644
--- a/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml
+++ b/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml
@@ -24,7 +24,7 @@ dependencies:
 - cxx-compiler
 - cython>=3.0.0
 - dlpack>=0.8,<1.0
-- gcc_linux-aarch64=11.*
+- gcc_linux-aarch64=13.*
 - glog>=0.6.0
 - h5py>=3.8.0
 - libcublas-dev
@@ -43,6 +43,6 @@ dependencies:
 - pyyaml
 - rapids-build-backend>=0.3.0,<0.4.0.dev0
 - setuptools
-- sysroot_linux-aarch64==2.17
+- sysroot_linux-aarch64==2.28
 - wheel
 name: bench_ann_cuda-125_arch-aarch64
diff --git a/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml
index 5d1dd8fc7..76b005e3c 100644
--- a/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml
+++ b/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml
@@ -24,7 +24,7 @@ dependencies:
 - cxx-compiler
 - cython>=3.0.0
 - dlpack>=0.8,<1.0
-- gcc_linux-64=11.*
+- gcc_linux-64=13.*
 - glog>=0.6.0
 - h5py>=3.8.0
 - libcublas-dev
@@ -43,6 +43,6 @@ dependencies:
 - pyyaml
 - rapids-build-backend>=0.3.0,<0.4.0.dev0
 - setuptools
-- sysroot_linux-64==2.17
+- sysroot_linux-64==2.28
 - wheel
 name: bench_ann_cuda-125_arch-x86_64
diff --git a/conda/recipes/cuvs-bench-cpu/conda_build_config.yaml b/conda/recipes/cuvs-bench-cpu/conda_build_config.yaml
index ed6f708e1..5407d7c17 100644
--- a/conda/recipes/cuvs-bench-cpu/conda_build_config.yaml
+++ b/conda/recipes/cuvs-bench-cpu/conda_build_config.yaml
@@ -1,14 +1,16 @@
 c_compiler_version:
-  - 11
+  - 13  # [not os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")]
+  - 11  # [os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")]
 
 cxx_compiler_version:
-  - 11
+  - 13  # [not os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")]
+  - 11  # [os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")]
 
 c_stdlib:
   - sysroot
 
 c_stdlib_version:
-  - "2.17"
+  - "2.28"
 
 cmake_version:
   - ">=3.26.4,!=3.30.0"
diff --git a/conda/recipes/cuvs-bench/conda_build_config.yaml b/conda/recipes/cuvs-bench/conda_build_config.yaml
index 47bd730da..ccd7341d1 100644
--- a/conda/recipes/cuvs-bench/conda_build_config.yaml
+++ b/conda/recipes/cuvs-bench/conda_build_config.yaml
@@ -1,20 +1,20 @@
 c_compiler_version:
-  - 11
+  - 13  # [not os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")]
+  - 11  # [os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")]
 
 cxx_compiler_version:
-  - 11
+  - 13  # [not os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")]
+  - 11  # [os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")]
 
 cuda_compiler:
-  - cuda-nvcc
-
-cuda11_compiler:
-  - nvcc
+  - cuda-nvcc  # [not os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")]
+  - nvcc  # [os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")]
 
 c_stdlib:
   - sysroot
 
 c_stdlib_version:
-  - "2.17"
+  - "2.28"
 
 cmake_version:
   - ">=3.26.4,!=3.30.0"
diff --git a/conda/recipes/cuvs-bench/meta.yaml b/conda/recipes/cuvs-bench/meta.yaml
index d77aee8ce..33b1745ec 100644
--- a/conda/recipes/cuvs-bench/meta.yaml
+++ b/conda/recipes/cuvs-bench/meta.yaml
@@ -37,10 +37,8 @@ build:
   number: {{ GIT_DESCRIBE_NUMBER }}
   string: cuda{{ cuda_major }}_py{{ py_version }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }}
   ignore_run_exports_from:
-    {% if cuda_major == "11" %}
-    - {{ compiler('cuda11') }}
-    {% else %}
     - {{ compiler('cuda') }}
+    {% if cuda_major != "11" %}
     - cuda-cudart-dev
     - libcublas-dev
     {% endif %}
@@ -50,7 +48,7 @@ requirements:
     - {{ compiler('c') }}
     - {{ compiler('cxx') }}
     {% if cuda_major == "11" %}
-    - {{ compiler('cuda11') }} ={{ cuda_version }}
+    - {{ compiler('cuda') }} ={{ cuda_version }}
     {% else %}
     - {{ compiler('cuda') }}
     {% endif %}
diff --git a/conda/recipes/cuvs/conda_build_config.yaml b/conda/recipes/cuvs/conda_build_config.yaml
index 001878ff2..83f5ebcb1 100644
--- a/conda/recipes/cuvs/conda_build_config.yaml
+++ b/conda/recipes/cuvs/conda_build_config.yaml
@@ -1,20 +1,20 @@
 c_compiler_version:
-  - 11
+  - 13  # [not os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")]
+  - 11  # [os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")]
 
 cxx_compiler_version:
-  - 11
+  - 13  # [not os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")]
+  - 11  # [os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")]
 
 cuda_compiler:
-  - cuda-nvcc
-
-cuda11_compiler:
-  - nvcc
+  - cuda-nvcc  # [not os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")]
+  - nvcc  # [os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")]
 
 c_stdlib:
   - sysroot
 
 c_stdlib_version:
-  - "2.17"
+  - "2.28"
 
 cmake_version:
   - ">=3.26.4,!=3.30.0"
diff --git a/conda/recipes/cuvs/meta.yaml b/conda/recipes/cuvs/meta.yaml
index ad7ffe756..25fc204a8 100644
--- a/conda/recipes/cuvs/meta.yaml
+++ b/conda/recipes/cuvs/meta.yaml
@@ -20,10 +20,8 @@ build:
   number: {{ GIT_DESCRIBE_NUMBER }}
   string: cuda{{ cuda_major }}_py{{ py_version }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }}
   ignore_run_exports_from:
-    {% if cuda_major == "11" %}
-    - {{ compiler('cuda11') }}
-    {% else %}
     - {{ compiler('cuda') }}
+    {% if cuda_major != "11" %}
     - cuda-cudart-dev
     {% endif %}
     - cuda-python
@@ -33,7 +31,7 @@ requirements:
     - {{ compiler('c') }}
     - {{ compiler('cxx') }}
     {% if cuda_major == "11" %}
-    - {{ compiler('cuda11') }} ={{ cuda_version }}
+    - {{ compiler('cuda') }} ={{ cuda_version }}
     {% else %}
     - {{ compiler('cuda') }}
     {% endif %}
diff --git a/conda/recipes/libcuvs/conda_build_config.yaml b/conda/recipes/libcuvs/conda_build_config.yaml
index b8c49943e..72cc4415d 100644
--- a/conda/recipes/libcuvs/conda_build_config.yaml
+++ b/conda/recipes/libcuvs/conda_build_config.yaml
@@ -1,20 +1,20 @@
 c_compiler_version:
-  - 11
+  - 13  # [not os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")]
+  - 11  # [os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")]
 
 cxx_compiler_version:
-  - 11
+  - 13  # [not os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")]
+  - 11  # [os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")]
 
 cuda_compiler:
-  - cuda-nvcc
-
-cuda11_compiler:
-  - nvcc
+  - cuda-nvcc  # [not os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")]
+  - nvcc  # [os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")]
 
 c_stdlib:
   - sysroot
 
 c_stdlib_version:
-  - "2.17"
+  - "2.28"
 
 cmake_version:
   - ">=3.26.4,!=3.30.0"
diff --git a/conda/recipes/libcuvs/meta.yaml b/conda/recipes/libcuvs/meta.yaml
index 46552c397..fd466cd22 100644
--- a/conda/recipes/libcuvs/meta.yaml
+++ b/conda/recipes/libcuvs/meta.yaml
@@ -39,10 +39,8 @@ outputs:
       number: {{ GIT_DESCRIBE_NUMBER }}
       string: cuda{{ cuda_major }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }}
       ignore_run_exports_from:
-        {% if cuda_major == "11" %}
-        - {{ compiler('cuda11') }}
-        {% else %}
         - {{ compiler('cuda') }}
+        {% if cuda_major != "11" %}
         - cuda-cudart-dev
         - libcublas-dev
         - libcurand-dev
@@ -54,7 +52,7 @@ outputs:
         - {{ compiler('c') }}
         - {{ compiler('cxx') }}
         {% if cuda_major == "11" %}
-        - {{ compiler('cuda11') }} ={{ cuda_version }}
+        - {{ compiler('cuda') }} ={{ cuda_version }}
         {% else %}
         - {{ compiler('cuda') }}
         {% endif %}
@@ -106,10 +104,8 @@ outputs:
       number: {{ GIT_DESCRIBE_NUMBER }}
       string: cuda{{ cuda_major }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }}
       ignore_run_exports_from:
-        {% if cuda_major == "11" %}
-        - {{ compiler('cuda11') }}
-        {% else %}
         - {{ compiler('cuda') }}
+        {% if cuda_major != "11" %}
         - cuda-cudart-dev
         - libcublas-dev
         - libcurand-dev
@@ -121,7 +117,7 @@ outputs:
         - {{ compiler('c') }}
         - {{ compiler('cxx') }}
         {% if cuda_major == "11" %}
-        - {{ compiler('cuda11') }} ={{ cuda_version }}
+        - {{ compiler('cuda') }} ={{ cuda_version }}
         {% else %}
         - {{ compiler('cuda') }}
         {% endif %}
@@ -174,10 +170,8 @@ outputs:
       number: {{ GIT_DESCRIBE_NUMBER }}
       string: cuda{{ cuda_major }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }}
       ignore_run_exports_from:
-        {% if cuda_major == "11" %}
-        - {{ compiler('cuda11') }}
-        {% else %}
         - {{ compiler('cuda') }}
+        {% if cuda_major != "11" %}
         - cuda-cudart-dev
         - libcublas-dev
         - libcurand-dev
@@ -189,7 +183,7 @@ outputs:
         - {{ compiler('c') }}
         - {{ compiler('cxx') }}
         {% if cuda_major == "11" %}
-        - {{ compiler('cuda11') }} ={{ cuda_version }}
+        - {{ compiler('cuda') }} ={{ cuda_version }}
         {% else %}
         - {{ compiler('cuda') }}
         {% endif %}
@@ -246,10 +240,8 @@ outputs:
       number: {{ GIT_DESCRIBE_NUMBER }}
       string: cuda{{ cuda_major }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }}
       ignore_run_exports_from:
-        {% if cuda_major == "11" %}
-        - {{ compiler('cuda11') }}
-        {% else %}
         - {{ compiler('cuda') }}
+        {% if cuda_major != "11" %}
         - cuda-cudart-dev
         - libcublas-dev
         - libcurand-dev
@@ -261,7 +253,7 @@ outputs:
         - {{ compiler('c') }}
         - {{ compiler('cxx') }}
         {% if cuda_major == "11" %}
-        - {{ compiler('cuda11') }} ={{ cuda_version }}
+        - {{ compiler('cuda') }} ={{ cuda_version }}
         {% else %}
         - {{ compiler('cuda') }}
         {% endif %}
diff --git a/cpp/test/distance/masked_nn.cu b/cpp/test/distance/masked_nn.cu
index a8f2f5163..a1c784669 100644
--- a/cpp/test/distance/masked_nn.cu
+++ b/cpp/test/distance/masked_nn.cu
@@ -314,8 +314,8 @@ template <typename K, typename V, typename L>
                                        cudaStream_t stream = 0)
 {
   typedef typename raft::KeyValuePair<K, V> KVP;
-  std::shared_ptr<KVP> exp_h(new KVP[size]);
-  std::shared_ptr<KVP> act_h(new KVP[size]);
+  std::shared_ptr<KVP[]> exp_h(new KVP[size]);
+  std::shared_ptr<KVP[]> act_h(new KVP[size]);
   raft::update_host<KVP>(exp_h.get(), expected, size, stream);
   raft::update_host<KVP>(act_h.get(), actual, size, stream);
   RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
diff --git a/dependencies.yaml b/dependencies.yaml
index fbd1d8372..d23c118c0 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -166,14 +166,28 @@ dependencies:
         matrices:
           - matrix:
               arch: x86_64
+              cuda: "11.*"
             packages:
               - gcc_linux-64=11.*
-              - sysroot_linux-64==2.17
+              - sysroot_linux-64==2.28
           - matrix:
               arch: aarch64
+              cuda: "11.*"
             packages:
               - gcc_linux-aarch64=11.*
-              - sysroot_linux-aarch64==2.17
+              - sysroot_linux-aarch64==2.28
+          - matrix:
+              arch: x86_64
+              cuda: "12.*"
+            packages:
+              - gcc_linux-64=13.*
+              - sysroot_linux-64==2.28
+          - matrix:
+              arch: aarch64
+              cuda: "12.*"
+            packages:
+              - gcc_linux-aarch64=13.*
+              - sysroot_linux-aarch64==2.28
       - output_types: conda
         matrices:
           - matrix: {cuda: "12.*"}

From 9b7bb975249f6863f72aa04147bf423d130a25c3 Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Wed, 22 Jan 2025 00:41:17 -0600
Subject: [PATCH 34/39] Temporarily skip CUDA 11 wheel CI (#599)

Due to some failures coming from libraft C++ wheels, CUDA 11 wheel CI will not pass. This PR temporarily disables CUDA 11 wheel tests until those issues can be resolved.

See https://github.com/rapidsai/build-planning/issues/137.

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - James Lamb (https://github.com/jameslamb)

URL: https://github.com/rapidsai/cuvs/pull/599
---
 .github/workflows/pr.yaml   | 3 +++
 .github/workflows/test.yaml | 3 +++
 2 files changed, 6 insertions(+)

diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index 91f51bd90..ca85c5c2e 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -150,6 +150,9 @@ jobs:
     with:
       build_type: pull-request
       script: ci/test_wheel_cuvs.sh
+      # CUDA 11 wheel CI is disabled until
+      # https://github.com/rapidsai/build-planning/issues/137 is resolved.
+      matrix_filter: map(select(.CUDA_VER | startswith("11") | not))
   devcontainer:
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-25.02
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index e3bf5d16f..cf081d579 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -49,3 +49,6 @@ jobs:
       date: ${{ inputs.date }}
       sha: ${{ inputs.sha }}
       script: ci/test_wheel_cuvs.sh
+      # CUDA 11 wheel CI is disabled until
+      # https://github.com/rapidsai/build-planning/issues/137 is resolved.
+      matrix_filter: map(select(.CUDA_VER | startswith("11") | not))

From d6476f122e57d20e54cf1df6eb3a5b2baeefb175 Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Wed, 22 Jan 2025 11:05:14 -0600
Subject: [PATCH 35/39] Use cuda.bindings layout. (#588)

This PR updates cuVS to use the new cuda-python `cuda.bindings` layout. See https://github.com/rapidsai/build-planning/issues/117.

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Dante Gama Dessavre (https://github.com/dantegd)
  - https://github.com/jakirkham

URL: https://github.com/rapidsai/cuvs/pull/588
---
 python/cuvs/cuvs/common/c_api.pxd     | 2 +-
 python/cuvs/cuvs/common/resources.pyx | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/cuvs/cuvs/common/c_api.pxd b/python/cuvs/cuvs/common/c_api.pxd
index f99fd5348..dae93d750 100644
--- a/python/cuvs/cuvs/common/c_api.pxd
+++ b/python/cuvs/cuvs/common/c_api.pxd
@@ -16,7 +16,7 @@
 # cython: language_level=3
 
 
-from cuda.ccudart cimport cudaStream_t
+from cuda.bindings.cyruntime cimport cudaStream_t
 from libc.stdint cimport uintptr_t
 
 
diff --git a/python/cuvs/cuvs/common/resources.pyx b/python/cuvs/cuvs/common/resources.pyx
index c0b72ae34..0edf53fc1 100644
--- a/python/cuvs/cuvs/common/resources.pyx
+++ b/python/cuvs/cuvs/common/resources.pyx
@@ -17,7 +17,7 @@
 
 import functools
 
-from cuda.ccudart cimport cudaStream_t
+from cuda.bindings.cyruntime cimport cudaStream_t
 
 from cuvs.common.c_api cimport (
     cuvsResources_t,

From 43969ca29e054fb94820f518c047e8446e6730a0 Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Wed, 22 Jan 2025 14:31:23 -0600
Subject: [PATCH 36/39] Revert "Temporarily skip CUDA 11 wheel CI" (#601)

Reverts rapidsai/cuvs#599 now that https://github.com/rapidsai/raft/pull/2548 has landed.

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - James Lamb (https://github.com/jameslamb)

URL: https://github.com/rapidsai/cuvs/pull/601
---
 .github/workflows/pr.yaml   | 3 ---
 .github/workflows/test.yaml | 3 ---
 2 files changed, 6 deletions(-)

diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index ca85c5c2e..91f51bd90 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -150,9 +150,6 @@ jobs:
     with:
       build_type: pull-request
       script: ci/test_wheel_cuvs.sh
-      # CUDA 11 wheel CI is disabled until
-      # https://github.com/rapidsai/build-planning/issues/137 is resolved.
-      matrix_filter: map(select(.CUDA_VER | startswith("11") | not))
   devcontainer:
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-25.02
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index cf081d579..e3bf5d16f 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -49,6 +49,3 @@ jobs:
       date: ${{ inputs.date }}
       sha: ${{ inputs.sha }}
       script: ci/test_wheel_cuvs.sh
-      # CUDA 11 wheel CI is disabled until
-      # https://github.com/rapidsai/build-planning/issues/137 is resolved.
-      matrix_filter: map(select(.CUDA_VER | startswith("11") | not))

From 1c91e1f6bf38321fb1f57250bc32980ea0674138 Mon Sep 17 00:00:00 2001
From: rhdong <rhdong2017@gmail.com>
Date: Wed, 22 Jan 2025 13:33:10 -0800
Subject: [PATCH 37/39] [Fix] l2_exp random fail in half-float32 mixed
 precision on self-neighboring (#596)

Authors:
  - rhdong (https://github.com/rhdong)

Approvers:
  - Ben Frederickson (https://github.com/benfred)

URL: https://github.com/rapidsai/cuvs/pull/596
---
 .../distance/detail/distance_ops/l2_exp.cuh   | 34 ++++++++++++-------
 python/cuvs/cuvs/test/test_distance.py        |  5 ++-
 2 files changed, 23 insertions(+), 16 deletions(-)

diff --git a/cpp/src/distance/detail/distance_ops/l2_exp.cuh b/cpp/src/distance/detail/distance_ops/l2_exp.cuh
index 04817aa8b..f49771605 100644
--- a/cpp/src/distance/detail/distance_ops/l2_exp.cuh
+++ b/cpp/src/distance/detail/distance_ops/l2_exp.cuh
@@ -28,14 +28,14 @@ namespace cuvs::distance::detail::ops {
  * for round-off error tolerance.
  * @tparam DataT
  */
-template <typename DataT>
-__device__ constexpr DataT get_clamp_precision()
+template <typename DataT, typename AccT>
+__device__ constexpr AccT get_clamp_precision()
 {
   switch (sizeof(DataT)) {
-    case 2: return 1e-3;
-    case 4: return 1e-6;
-    case 8: return 1e-15;
-    default: return 0;
+    case 2: return AccT{1e-3};
+    case 4: return AccT{1e-6};
+    case 8: return AccT{1e-15};
+    default: return AccT{0};
   }
 }
 
@@ -46,19 +46,27 @@ struct l2_exp_cutlass_op {
 
   __device__ l2_exp_cutlass_op() noexcept : sqrt(false) {}
   __device__ l2_exp_cutlass_op(bool isSqrt) noexcept : sqrt(isSqrt) {}
-  inline __device__ AccT operator()(DataT aNorm, DataT bNorm, DataT accVal) const noexcept
+  inline __device__ AccT operator()(AccT aNorm, AccT bNorm, AccT accVal) const noexcept
   {
-    AccT outVal = aNorm + bNorm - DataT(2.0) * accVal;
+    AccT outVal = aNorm + bNorm - AccT(2.0) * accVal;
 
     /**
      * Self-neighboring points should have (aNorm == bNorm) == accVal and the dot product (accVal)
      * can sometimes have round-off errors, which will cause (aNorm == bNorm) ~ accVal instead.
      */
-    outVal = outVal * AccT(!((outVal * outVal < get_clamp_precision<AccT>()) * (aNorm == bNorm)));
+    outVal =
+      outVal * AccT(!((outVal * outVal < get_clamp_precision<DataT, AccT>()) * (aNorm == bNorm)));
     return sqrt ? raft::sqrt(outVal * static_cast<AccT>(outVal > AccT(0))) : outVal;
   }
 
-  __device__ AccT operator()(DataT aData) const noexcept { return aData; }
+  __device__ AccT operator()(DataT aData) const noexcept
+  {
+    if constexpr (std::is_same_v<DataT, half> && std::is_same_v<AccT, float>) {
+      return __half2float(aData);
+    } else {
+      return aData;
+    }
+  }
 };
 
 /**
@@ -121,9 +129,9 @@ struct l2_exp_distance_op {
          * (accVal) can sometimes have round-off errors, which will cause (aNorm == bNorm) ~ accVal
          * instead.
          */
-        acc[i][j] =
-          val * static_cast<AccT>((val > AccT(0))) *
-          static_cast<AccT>(!((val * val < get_clamp_precision<AccT>()) * (regxn[i] == regyn[j])));
+        acc[i][j] = val * static_cast<AccT>((val > AccT(0))) *
+                    static_cast<AccT>(
+                      !((val * val < get_clamp_precision<DataT, AccT>()) * (regxn[i] == regyn[j])));
       }
     }
     if (sqrt) {
diff --git a/python/cuvs/cuvs/test/test_distance.py b/python/cuvs/cuvs/test/test_distance.py
index 483d5d201..370dd773a 100644
--- a/python/cuvs/cuvs/test/test_distance.py
+++ b/python/cuvs/cuvs/test/test_distance.py
@@ -21,6 +21,7 @@
 from cuvs.distance import pairwise_distance
 
 
+@pytest.mark.parametrize("times", range(20))
 @pytest.mark.parametrize("n_rows", [50, 100])
 @pytest.mark.parametrize("n_cols", [10, 50])
 @pytest.mark.parametrize(
@@ -43,7 +44,7 @@
 @pytest.mark.parametrize("inplace", [True, False])
 @pytest.mark.parametrize("order", ["F", "C"])
 @pytest.mark.parametrize("dtype", [np.float32, np.float64, np.float16])
-def test_distance(n_rows, n_cols, inplace, order, metric, dtype):
+def test_distance(n_rows, n_cols, inplace, order, metric, dtype, times):
     input1 = np.random.random_sample((n_rows, n_cols))
     input1 = np.asarray(input1, order=order).astype(dtype)
 
@@ -79,7 +80,5 @@ def test_distance(n_rows, n_cols, inplace, order, metric, dtype):
     actual = output_device.copy_to_host()
 
     tol = 1e-3
-    if np.issubdtype(dtype, np.float16):
-        tol = 1e-1
 
     assert np.allclose(expected, actual, atol=tol, rtol=tol)

From b62b11aa73c5b39fe5ccd7328ccdcf468c4c2323 Mon Sep 17 00:00:00 2001
From: James Lamb <jlamb@nvidia.com>
Date: Wed, 22 Jan 2025 21:53:22 -0600
Subject: [PATCH 38/39] introduce libcuvs wheels (#594)

Contributes to https://github.com/rapidsai/build-planning/issues/33.

Proposes packaging `libcuvs` as a wheel, which is then re-used by `cuvs-cu{11,12}`.

Similar changes were recently made in RAFT: https://github.com/rapidsai/raft/pull/2531

As part of this, also proposes:

* introducing a new CMake option, `CUVS_COMPILE_DYNAMIC_ONLY`, to allow building/installing only the dynamic shared library (i.e. skipping the static library)
* enforcing `rapids-cmake`'s preferred CMake style (similar to https://github.com/rapidsai/raft/pull/2531#discussion_r1917039870)
* standardizing `clang` pins across the project, and pinning to `clang` 16 for Rust builds (https://github.com/rapidsai/cuvs/pull/594#discussion_r1924484839)

## Notes for Reviewers

### Benefits of these changes

* smaller wheels for projects that depend on cuVS (they can dynamically link against `libcuvs` instead of statically linking in the pieces they need)
* fewer CI resources used for cuVS wheels (no more re-compiling for every Python minor version)
* faster, cheaper cuML wheel builds (https://github.com/rapidsai/cuml/pull/6199#discussion_r1920830933)
* other benefits mentioned in https://github.com/rapidsai/build-planning/issues/33

### Wheel contents

`libcuvs`:

* `libcuvs.so` and `libcuvs_c.so` (shared libraries)
* cuVS headers
* vendored dependencies (hnswlib)

`cuvs`:

* `cuvs` Python / Cython code and compiled Cython extensions

### Size changes (CUDA 12, Python 3.12, x86_64)

| wheel                | num files (before) | num files (this PR) | size (before)  | size (this PR) |
|:---------------:|------------------:|-----------------:|--------------:|---------------:|
| `libcuvs`          |   ---                        |  67                        | ---                   | 843M               |
| `cuvs`              | 88                          |   84                       |845M               | 2M                    |
|**TOTAL**          |   **88**                |   **131**                 | **845M**       | **845M**         |

*NOTES: size = compressed, "before" = 2025-01-22 nightlies*

<details><summary>how I calculated those (click me)</summary>

* nightly commit = https://github.com/rapidsai/cuml/commit/7c715c494dff71274d0fdec774bdee12a7e78827
* PR = this PR

```shell
docker run \
    --rm \
    --network host \
    --env RAPIDS_NIGHTLY_DATE=2025-01-22 \
    --env CUVS_NIGHTLY_SHA=f1de1b2 \
    --env CUVS_PR="pull-request/594" \
    --env CUVS_PR_SHA="97c56178cd0e07e4b6b138bb0904af78379f1bb3" \
    --env RAPIDS_PY_CUDA_SUFFIX=cu12 \
    --env WHEEL_DIR_BEFORE=/tmp/wheels-before \
    --env WHEEL_DIR_AFTER=/tmp/wheels-after \
    -it rapidsai/ci-wheel:cuda12.5.1-rockylinux8-py3.12 \
    bash

# --- nightly wheels --- #
mkdir -p ./wheels-before

export RAPIDS_BUILD_TYPE=branch
export RAPIDS_REF_NAME="branch-25.02"

# cuvs
RAPIDS_PY_WHEEL_NAME="cuvs_${RAPIDS_PY_CUDA_SUFFIX}" \
RAPIDS_REPOSITORY=rapidsai/cuvs \
RAPIDS_SHA=${CUVS_NIGHTLY_SHA} \
    rapids-download-wheels-from-s3 python ./wheels-before

# --- wheels from CI --- #
mkdir -p ./wheels-after

export RAPIDS_BUILD_TYPE="pull-request"

# libcuvs
RAPIDS_PY_WHEEL_NAME="libcuvs_${RAPIDS_PY_CUDA_SUFFIX}" \
RAPIDS_REPOSITORY=rapidsai/cuvs \
RAPIDS_REF_NAME="${CUVS_PR}" \
RAPIDS_SHA="${CUVS_PR_SHA}" \
    rapids-download-wheels-from-s3 cpp ./wheels-after

# cuvs
RAPIDS_PY_WHEEL_NAME="cuvs_${RAPIDS_PY_CUDA_SUFFIX}" \
RAPIDS_REPOSITORY=rapidsai/cuvs \
RAPIDS_REF_NAME="${CUVS_PR}" \
RAPIDS_SHA="${CUVS_PR_SHA}" \
    rapids-download-wheels-from-s3 python ./wheels-after

pip install pydistcheck
pydistcheck \
    --inspect \
    --select 'distro-too-large-compressed' \
    ./wheels-before/*.whl \
| grep -E '^checking|files: | compressed' \
> ./before.txt

# get more exact sizes
du -sh ./wheels-before/*

pydistcheck \
    --inspect \
    --select 'distro-too-large-compressed' \
    ./wheels-after/*.whl \
| grep -E '^checking|files: | compressed' \
> ./after.txt

# get more exact sizes
du -sh ./wheels-after/*
```

</details>

### How I tested this

* https://github.com/rapidsai/devcontainers/pull/440
* https://github.com/rapidsai/cuml/pull/6199

Authors:
  - James Lamb (https://github.com/jameslamb)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Ben Frederickson (https://github.com/benfred)

URL: https://github.com/rapidsai/cuvs/pull/594
---
 .github/workflows/build.yaml                  |  24 ++++
 .github/workflows/pr.yaml                     |  12 +-
 .pre-commit-config.yaml                       |   2 +-
 build.sh                                      |   8 +-
 ci/build_wheel.sh                             |  34 +++---
 ci/build_wheel_cuvs.sh                        |  23 ++--
 ci/build_wheel_libcuvs.sh                     |  32 +++++
 ci/check_style.sh                             |   7 ++
 ci/release/update-version.sh                  |   4 +-
 ci/test_wheel_cuvs.sh                         |   9 +-
 ci/validate_wheel.sh                          |  12 --
 .../all_cuda-118_arch-aarch64.yaml            |   5 +-
 .../all_cuda-118_arch-x86_64.yaml             |   5 +-
 .../all_cuda-125_arch-aarch64.yaml            |   5 +-
 .../all_cuda-125_arch-x86_64.yaml             |   5 +-
 .../bench_ann_cuda-118_arch-aarch64.yaml      |   3 +-
 .../bench_ann_cuda-118_arch-x86_64.yaml       |   3 +-
 .../bench_ann_cuda-125_arch-aarch64.yaml      |   3 +-
 .../bench_ann_cuda-125_arch-x86_64.yaml       |   3 +-
 cpp/CMakeLists.txt                            |  77 +++++++-----
 cpp/cmake/thirdparty/get_raft.cmake           |   1 +
 dependencies.yaml                             | 110 +++++++++++++++---
 examples/cpp/CMakeLists.txt                   |  22 +++-
 python/cuvs/CMakeLists.txt                    |  70 +----------
 python/cuvs/cuvs/__init__.py                  |  11 ++
 python/cuvs/cuvs/common/CMakeLists.txt        |   2 +-
 python/cuvs/cuvs/distance/CMakeLists.txt      |   2 +-
 python/cuvs/cuvs/neighbors/CMakeLists.txt     |   2 +-
 .../cuvs/neighbors/brute_force/CMakeLists.txt |   3 +-
 .../cuvs/cuvs/neighbors/cagra/CMakeLists.txt  |   2 +-
 .../cuvs/neighbors/filters/CMakeLists.txt     |   2 +-
 .../cuvs/cuvs/neighbors/hnsw/CMakeLists.txt   |   2 +-
 .../cuvs/neighbors/ivf_flat/CMakeLists.txt    |   2 +-
 .../cuvs/cuvs/neighbors/ivf_pq/CMakeLists.txt |   2 +-
 python/cuvs/pyproject.toml                    |  19 ++-
 python/libcuvs/CMakeLists.txt                 |  69 +++++++++++
 python/libcuvs/LICENSE                        |   1 +
 python/libcuvs/README.md                      |   1 +
 python/libcuvs/libcuvs/VERSION                |   1 +
 python/libcuvs/libcuvs/__init__.py            |  16 +++
 python/libcuvs/libcuvs/_version.py            |  33 ++++++
 python/libcuvs/libcuvs/load.py                | 100 ++++++++++++++++
 python/libcuvs/pyproject.toml                 | 108 +++++++++++++++++
 43 files changed, 653 insertions(+), 204 deletions(-)
 create mode 100755 ci/build_wheel_libcuvs.sh
 create mode 100644 python/libcuvs/CMakeLists.txt
 create mode 120000 python/libcuvs/LICENSE
 create mode 120000 python/libcuvs/README.md
 create mode 120000 python/libcuvs/libcuvs/VERSION
 create mode 100644 python/libcuvs/libcuvs/__init__.py
 create mode 100644 python/libcuvs/libcuvs/_version.py
 create mode 100644 python/libcuvs/libcuvs/load.py
 create mode 100644 python/libcuvs/pyproject.toml

diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index e93b7a694..59b8e00de 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -80,7 +80,30 @@ jobs:
       node_type: "gpu-v100-latest-1"
       run_script: "ci/build_docs.sh"
       sha: ${{ inputs.sha }}
+  wheel-build-libcuvs:
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.02
+    with:
+      build_type: ${{ inputs.build_type || 'branch' }}
+      branch: ${{ inputs.branch }}
+      sha: ${{ inputs.sha }}
+      date: ${{ inputs.date }}
+      script: ci/build_wheel_libcuvs.sh
+      # build for every combination of arch and CUDA version, but only for the latest Python
+      matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber)))
+  wheel-publish-libcuvs:
+    needs: wheel-build-libcuvs
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.02
+    with:
+      build_type: ${{ inputs.build_type || 'branch' }}
+      branch: ${{ inputs.branch }}
+      sha: ${{ inputs.sha }}
+      date: ${{ inputs.date }}
+      package-name: libcuvs
+      package-type: cpp
   wheel-build-cuvs:
+    needs: wheel-build-libcuvs
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.02
     with:
@@ -99,3 +122,4 @@ jobs:
       sha: ${{ inputs.sha }}
       date: ${{ inputs.date }}
       package-name: cuvs
+      package-type: python
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index 91f51bd90..843439f26 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -22,6 +22,7 @@ jobs:
       - conda-python-tests
       - docs-build
       - rust-build
+      - wheel-build-libcuvs
       - wheel-build-cuvs
       - wheel-tests-cuvs
       - devcontainer
@@ -135,10 +136,19 @@ jobs:
       arch: "amd64"
       container_image: "rapidsai/ci-conda:latest"
       run_script: "ci/build_rust.sh"
-  wheel-build-cuvs:
+  wheel-build-libcuvs:
     needs: checks
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.02
+    with:
+      build_type: pull-request
+      script: ci/build_wheel_libcuvs.sh
+      # build for every combination of arch and CUDA version, but only for the latest Python
+      matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber)))
+  wheel-build-cuvs:
+    needs: wheel-build-libcuvs
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.02
     with:
       build_type: pull-request
       script: ci/build_wheel_cuvs.sh
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index fcfc7e1fa..240f82be6 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -115,7 +115,7 @@ repos:
                   cpp/cmake/modules/FindAVX\.cmake|
           - id: verify-alpha-spec
       - repo: https://github.com/rapidsai/dependency-file-generator
-        rev: v1.16.0
+        rev: v1.17.0
         hooks:
             - id: rapids-dependency-file-generator
               args: ["--clean"]
diff --git a/build.sh b/build.sh
index bd5fa649b..3b9a9a3a8 100755
--- a/build.sh
+++ b/build.sh
@@ -313,12 +313,6 @@ if [[ ${CMAKE_TARGET} == "" ]]; then
     CMAKE_TARGET="all"
 fi
 
-
-SKBUILD_EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS}"
-if [[ "${EXTRA_CMAKE_ARGS}" != *"DFIND_CUVS_CPP"* ]]; then
-    SKBUILD_EXTRA_CMAKE_ARGS="${SKBUILD_EXTRA_CMAKE_ARGS};-DFIND_CUVS_CPP=ON"
-fi
-
 # If clean given, run it prior to any other steps
 if (( ${CLEAN} == 1 )); then
     # If the dirs to clean are mounted dirs in a container, the
@@ -434,7 +428,7 @@ fi
 
 # Build and (optionally) install the cuvs Python package
 if (( ${NUMARGS} == 0 )) || hasArg python; then
-    SKBUILD_CMAKE_ARGS="${SKBUILD_EXTRA_CMAKE_ARGS}" \
+    SKBUILD_CMAKE_ARGS="${EXTRA_CMAKE_ARGS}" \
         SKBUILD_BUILD_OPTIONS="-j${PARALLEL_LEVEL}" \
         python -m pip install --no-build-isolation --no-deps --config-settings rapidsai.disable-cuda=true ${REPODIR}/python/cuvs
 fi
diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh
index 4994374a8..c6f1232b3 100755
--- a/ci/build_wheel.sh
+++ b/ci/build_wheel.sh
@@ -1,10 +1,11 @@
 #!/bin/bash
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
+# Copyright (c) 2023-2025, NVIDIA CORPORATION.
 
 set -euo pipefail
 
 package_name=$1
 package_dir=$2
+package_type=$3
 underscore_package_name=$(echo "${package_name}" | tr "-" "_")
 
 source rapids-configure-sccache
@@ -16,21 +17,22 @@ rapids-generate-version > ./VERSION
 
 cd "${package_dir}"
 
-case "${RAPIDS_CUDA_VERSION}" in
-  12.*)
-    EXCLUDE_ARGS=(
-      --exclude "libcublas.so.12"
-      --exclude "libcublasLt.so.12"
-      --exclude "libcurand.so.10"
-      --exclude "libcusolver.so.11"
-      --exclude "libcusparse.so.12"
-      --exclude "libnvJitLink.so.12"
+EXCLUDE_ARGS=(
+  --exclude "libraft.so"
+  --exclude "libcublas.so.*"
+  --exclude "libcublasLt.so.*"
+  --exclude "libcurand.so.*"
+  --exclude "libcusolver.so.*"
+  --exclude "libcusparse.so.*"
+  --exclude "libnvJitLink.so.*"
+)
+
+if [[ "${package_dir}" != "python/libcuvs" ]]; then
+    EXCLUDE_ARGS+=(
+      --exclude "libcuvs_c.so"
+      --exclude "libcuvs.so"
     )
-  ;;
-  11.*)
-    EXCLUDE_ARGS=()
-  ;;
-esac
+fi
 
 rapids-logger "Building '${package_name}' wheel"
 
@@ -48,4 +50,4 @@ sccache --show-adv-stats
 mkdir -p final_dist
 python -m auditwheel repair -w final_dist "${EXCLUDE_ARGS[@]}" dist/*
 
-RAPIDS_PY_WHEEL_NAME="${underscore_package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 python final_dist
+RAPIDS_PY_WHEEL_NAME="${underscore_package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 ${package_type} final_dist
diff --git a/ci/build_wheel_cuvs.sh b/ci/build_wheel_cuvs.sh
index 444657cc0..fb40d1459 100755
--- a/ci/build_wheel_cuvs.sh
+++ b/ci/build_wheel_cuvs.sh
@@ -1,21 +1,20 @@
 #!/bin/bash
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
+# Copyright (c) 2023-2025, NVIDIA CORPORATION.
 
 set -euo pipefail
 
 package_dir="python/cuvs"
 
-case "${RAPIDS_CUDA_VERSION}" in
-  12.*)
-    EXTRA_CMAKE_ARGS=";-DUSE_CUDA_MATH_WHEELS=ON"
-  ;;
-  11.*)
-    EXTRA_CMAKE_ARGS=";-DUSE_CUDA_MATH_WHEELS=OFF"
-  ;;
-esac
+RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
 
-# Set up skbuild options. Enable sccache in skbuild config options
-export SKBUILD_CMAKE_ARGS="-DDETECT_CONDA_ENV=OFF;-DFIND_CUVS_CPP=OFF${EXTRA_CMAKE_ARGS}"
+# Downloads libcuvs wheels from this current build,
+# then ensures 'cuvs' wheel builds always use the 'libcuvs' just built in the same CI run.
+#
+# Using env variable PIP_CONSTRAINT is necessary to ensure the constraints
+# are used when creating the isolated build environment.
+RAPIDS_PY_WHEEL_NAME="libcuvs_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 cpp /tmp/libcuvs_dist
+echo "libcuvs-${RAPIDS_PY_CUDA_SUFFIX} @ file://$(echo /tmp/libcuvs_dist/libcuvs_*.whl)" > /tmp/constraints.txt
+export PIP_CONSTRAINT="/tmp/constraints.txt"
 
-ci/build_wheel.sh cuvs ${package_dir}
+ci/build_wheel.sh cuvs ${package_dir} python
 ci/validate_wheel.sh ${package_dir} final_dist
diff --git a/ci/build_wheel_libcuvs.sh b/ci/build_wheel_libcuvs.sh
new file mode 100755
index 000000000..148be89a2
--- /dev/null
+++ b/ci/build_wheel_libcuvs.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+# Copyright (c) 2025, NVIDIA CORPORATION.
+
+set -euo pipefail
+
+package_name="libcuvs"
+package_dir="python/libcuvs"
+
+rapids-logger "Generating build requirements"
+matrix_selectors="cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION};cuda_suffixed=true"
+
+rapids-dependency-file-generator \
+  --output requirements \
+  --file-key "py_build_${package_name}" \
+  --file-key "py_rapids_build_${package_name}" \
+  --matrix "${matrix_selectors}" \
+| tee /tmp/requirements-build.txt
+
+rapids-logger "Installing build requirements"
+python -m pip install \
+    -v \
+    --prefer-binary \
+    -r /tmp/requirements-build.txt
+
+# build with '--no-build-isolation', for better sccache hit rate
+# 0 really means "add --no-build-isolation" (ref: https://github.com/pypa/pip/issues/5735)
+export PIP_NO_BUILD_ISOLATION=0
+
+RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
+
+ci/build_wheel.sh libcuvs ${package_dir} cpp
+ci/validate_wheel.sh ${package_dir} final_dist libcuvs
diff --git a/ci/check_style.sh b/ci/check_style.sh
index c22f3f9f0..952e94bf1 100755
--- a/ci/check_style.sh
+++ b/ci/check_style.sh
@@ -14,5 +14,12 @@ rapids-dependency-file-generator \
 rapids-mamba-retry env create --yes -f env.yaml -n checks
 conda activate checks
 
+# get config for cmake-format checks
+RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)"
+FORMAT_FILE_URL="https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-${RAPIDS_VERSION_MAJOR_MINOR}/cmake-format-rapids-cmake.json"
+export RAPIDS_CMAKE_FORMAT_FILE=/tmp/rapids_cmake_ci/cmake-formats-rapids-cmake.json
+mkdir -p $(dirname ${RAPIDS_CMAKE_FORMAT_FILE})
+wget -O ${RAPIDS_CMAKE_FORMAT_FILE} ${FORMAT_FILE_URL}
+
 # Run pre-commit checks
 pre-commit run --all-files --show-diff-on-failure
diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh
index 4cf1f0617..7562035a9 100755
--- a/ci/release/update-version.sh
+++ b/ci/release/update-version.sh
@@ -44,8 +44,10 @@ echo "${NEXT_FULL_TAG}" > VERSION
 DEPENDENCIES=(
   dask-cuda
   cuvs
-  pylibraft
+  libcuvs
+  libraft
   librmm
+  pylibraft
   rmm
   rapids-dask-dependency
 )
diff --git a/ci/test_wheel_cuvs.sh b/ci/test_wheel_cuvs.sh
index 7033003e9..862c69a3a 100755
--- a/ci/test_wheel_cuvs.sh
+++ b/ci/test_wheel_cuvs.sh
@@ -1,13 +1,16 @@
 #!/bin/bash
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
+# Copyright (c) 2023-2025, NVIDIA CORPORATION.
 
 set -euo pipefail
 
 mkdir -p ./dist
 RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
-RAPIDS_PY_WHEEL_NAME="cuvs_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist
+RAPIDS_PY_WHEEL_NAME="libcuvs_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 cpp ./local-libcuvs-dep
+RAPIDS_PY_WHEEL_NAME="cuvs_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python ./dist
 
 # echo to expand wildcard before adding `[extra]` requires for pip
-python -m pip install $(echo ./dist/cuvs*.whl)[test]
+python -m pip install \
+    ./local-libcuvs-dep/libcuvs*.whl \
+    "$(echo ./dist/cuvs*.whl)[test]"
 
 python -m pytest ./python/cuvs/cuvs/test
diff --git a/ci/validate_wheel.sh b/ci/validate_wheel.sh
index f2b235765..19d413fa2 100755
--- a/ci/validate_wheel.sh
+++ b/ci/validate_wheel.sh
@@ -8,24 +8,12 @@ wheel_dir_relative_path=$2
 
 RAPIDS_CUDA_MAJOR="${RAPIDS_CUDA_VERSION%%.*}"
 
-# some packages are much larger on CUDA 11 than on CUDA 12
-if [[ "${RAPIDS_CUDA_MAJOR}" == "11" ]]; then
-    PYDISTCHECK_ARGS=(
-        --max-allowed-size-compressed '1.4G'
-    )
-else
-    PYDISTCHECK_ARGS=(
-        --max-allowed-size-compressed '950M'
-    )
-fi
-
 cd "${package_dir}"
 
 rapids-logger "validate packages with 'pydistcheck'"
 
 pydistcheck \
     --inspect \
-    "${PYDISTCHECK_ARGS[@]}" \
     "$(echo ${wheel_dir_relative_path}/*.whl)"
 
 rapids-logger "validate packages with 'twine'"
diff --git a/conda/environments/all_cuda-118_arch-aarch64.yaml b/conda/environments/all_cuda-118_arch-aarch64.yaml
index 4c464ef4e..123acb421 100644
--- a/conda/environments/all_cuda-118_arch-aarch64.yaml
+++ b/conda/environments/all_cuda-118_arch-aarch64.yaml
@@ -9,8 +9,7 @@ channels:
 dependencies:
 - breathe>=4.35.0
 - c-compiler
-- clang
-- clang-tools=16.0.6
+- clang-tools==16.0.6
 - clang==16.0.6
 - cmake>=3.26.4,!=3.30.0
 - cuda-nvtx=11.8
@@ -26,7 +25,7 @@ dependencies:
 - gcc_linux-aarch64=11.*
 - graphviz
 - ipython
-- libclang
+- libclang==16.0.6
 - libcublas-dev=11.11.3.6
 - libcublas=11.11.3.6
 - libcurand-dev=10.3.0.86
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index 71cbeeaf3..c6a65e684 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -9,8 +9,7 @@ channels:
 dependencies:
 - breathe>=4.35.0
 - c-compiler
-- clang
-- clang-tools=16.0.6
+- clang-tools==16.0.6
 - clang==16.0.6
 - cmake>=3.26.4,!=3.30.0
 - cuda-nvtx=11.8
@@ -26,7 +25,7 @@ dependencies:
 - gcc_linux-64=11.*
 - graphviz
 - ipython
-- libclang
+- libclang==16.0.6
 - libcublas-dev=11.11.3.6
 - libcublas=11.11.3.6
 - libcurand-dev=10.3.0.86
diff --git a/conda/environments/all_cuda-125_arch-aarch64.yaml b/conda/environments/all_cuda-125_arch-aarch64.yaml
index 4fd08fa97..b71f5ed43 100644
--- a/conda/environments/all_cuda-125_arch-aarch64.yaml
+++ b/conda/environments/all_cuda-125_arch-aarch64.yaml
@@ -9,8 +9,7 @@ channels:
 dependencies:
 - breathe>=4.35.0
 - c-compiler
-- clang
-- clang-tools=16.0.6
+- clang-tools==16.0.6
 - clang==16.0.6
 - cmake>=3.26.4,!=3.30.0
 - cuda-cudart-dev
@@ -27,7 +26,7 @@ dependencies:
 - gcc_linux-aarch64=13.*
 - graphviz
 - ipython
-- libclang
+- libclang==16.0.6
 - libcublas-dev
 - libcurand-dev
 - libcusolver-dev
diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml
index de5f341fa..16cd595d3 100644
--- a/conda/environments/all_cuda-125_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-125_arch-x86_64.yaml
@@ -9,8 +9,7 @@ channels:
 dependencies:
 - breathe>=4.35.0
 - c-compiler
-- clang
-- clang-tools=16.0.6
+- clang-tools==16.0.6
 - clang==16.0.6
 - cmake>=3.26.4,!=3.30.0
 - cuda-cudart-dev
@@ -27,7 +26,7 @@ dependencies:
 - gcc_linux-64=13.*
 - graphviz
 - ipython
-- libclang
+- libclang==16.0.6
 - libcublas-dev
 - libcurand-dev
 - libcusolver-dev
diff --git a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml
index fb69ac251..2e2ad8446 100644
--- a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml
+++ b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml
@@ -9,7 +9,7 @@ channels:
 dependencies:
 - benchmark>=1.8.2
 - c-compiler
-- clang-tools=16.0.6
+- clang-tools==16.0.6
 - clang==16.0.6
 - click
 - cmake>=3.26.4,!=3.30.0
@@ -26,6 +26,7 @@ dependencies:
 - gcc_linux-aarch64=11.*
 - glog>=0.6.0
 - h5py>=3.8.0
+- libclang==16.0.6
 - libcublas-dev=11.11.3.6
 - libcublas=11.11.3.6
 - libcurand-dev=10.3.0.86
diff --git a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
index 123033b08..90243415c 100644
--- a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
@@ -9,7 +9,7 @@ channels:
 dependencies:
 - benchmark>=1.8.2
 - c-compiler
-- clang-tools=16.0.6
+- clang-tools==16.0.6
 - clang==16.0.6
 - click
 - cmake>=3.26.4,!=3.30.0
@@ -26,6 +26,7 @@ dependencies:
 - gcc_linux-64=11.*
 - glog>=0.6.0
 - h5py>=3.8.0
+- libclang==16.0.6
 - libcublas-dev=11.11.3.6
 - libcublas=11.11.3.6
 - libcurand-dev=10.3.0.86
diff --git a/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml
index fa2ae7955..34e01aeea 100644
--- a/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml
+++ b/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml
@@ -9,7 +9,7 @@ channels:
 dependencies:
 - benchmark>=1.8.2
 - c-compiler
-- clang-tools=16.0.6
+- clang-tools==16.0.6
 - clang==16.0.6
 - click
 - cmake>=3.26.4,!=3.30.0
@@ -27,6 +27,7 @@ dependencies:
 - gcc_linux-aarch64=13.*
 - glog>=0.6.0
 - h5py>=3.8.0
+- libclang==16.0.6
 - libcublas-dev
 - libcurand-dev
 - libcusolver-dev
diff --git a/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml
index 76b005e3c..dcfb54a22 100644
--- a/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml
+++ b/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml
@@ -9,7 +9,7 @@ channels:
 dependencies:
 - benchmark>=1.8.2
 - c-compiler
-- clang-tools=16.0.6
+- clang-tools==16.0.6
 - clang==16.0.6
 - click
 - cmake>=3.26.4,!=3.30.0
@@ -27,6 +27,7 @@ dependencies:
 - gcc_linux-64=13.*
 - glog>=0.6.0
 - h5py>=3.8.0
+- libclang==16.0.6
 - libcublas-dev
 - libcurand-dev
 - libcusolver-dev
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 26c0b82d3..11f21db44 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -68,6 +68,7 @@ option(CUDA_LOG_COMPILE_TIME "Write a log of compilation times to nvcc_compile_l
 option(DETECT_CONDA_ENV "Enable detection of conda environment for dependencies" ON)
 option(DISABLE_DEPRECATION_WARNINGS "Disable deprecaction warnings " ON)
 option(DISABLE_OPENMP "Disable OpenMP" OFF)
+option(CUVS_COMPILE_DYNAMIC_ONLY "Only build the shared library and skip the static library." OFF)
 option(CUVS_NVTX "Enable nvtx markers" OFF)
 option(CUVS_RAFT_CLONE_ON_PIN "Explicitly clone RAFT branch when pinned to non-feature branch" ON)
 
@@ -94,6 +95,7 @@ include(CMakeDependentOption)
 message(VERBOSE "cuVS: Build cuVS unit-tests: ${BUILD_TESTS}")
 message(VERBOSE "cuVS: Build CPU only components: ${BUILD_CPU_ONLY}")
 message(VERBOSE "cuVS: Build ANN benchmarks: ${BUILD_CUVS_BENCH}")
+message(VERBOSE "cuVS: Build only the shared library: ${CUVS_COMPILE_DYNAMIC_ONLY}")
 message(VERBOSE "cuVS: Enable detection of conda environment for dependencies: ${DETECT_CONDA_ENV}")
 message(VERBOSE "cuVS: Disable depreaction warnings " ${DISABLE_DEPRECATION_WARNINGS})
 message(VERBOSE "cuVS: Disable OpenMP: ${DISABLE_OPENMP}")
@@ -493,7 +495,10 @@ if(BUILD_SHARED_LIBS)
   )
 
   add_library(cuvs SHARED $<FILTER:$<TARGET_OBJECTS:cuvs_objs>,EXCLUDE,rmm.*logger>)
-  add_library(cuvs_static STATIC $<FILTER:$<TARGET_OBJECTS:cuvs_objs>,EXCLUDE,rmm.*logger>)
+
+  if(NOT CUVS_COMPILE_DYNAMIC_ONLY)
+    add_library(cuvs_static STATIC $<FILTER:$<TARGET_OBJECTS:cuvs_objs>,EXCLUDE,rmm.*logger>)
+  endif()
 
   target_compile_options(
     cuvs INTERFACE $<$<COMPILE_LANG_AND_ID:CUDA,NVIDIA>:--expt-extended-lambda
@@ -501,20 +506,23 @@ if(BUILD_SHARED_LIBS)
   )
 
   add_library(cuvs::cuvs ALIAS cuvs)
-  add_library(cuvs::cuvs_static ALIAS cuvs_static)
 
-  set_target_properties(
-    cuvs_static
-    PROPERTIES BUILD_RPATH "\$ORIGIN"
-               INSTALL_RPATH "\$ORIGIN"
-               CXX_STANDARD 17
-               CXX_STANDARD_REQUIRED ON
-               POSITION_INDEPENDENT_CODE ON
-               INTERFACE_POSITION_INDEPENDENT_CODE ON
-               EXPORT_NAME cuvs_static
-  )
+  if(NOT CUVS_COMPILE_DYNAMIC_ONLY)
+    add_library(cuvs::cuvs_static ALIAS cuvs_static)
 
-  target_compile_options(cuvs_static PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:${CUVS_CXX_FLAGS}>")
+    set_target_properties(
+      cuvs_static
+      PROPERTIES BUILD_RPATH "\$ORIGIN"
+                 INSTALL_RPATH "\$ORIGIN"
+                 CXX_STANDARD 17
+                 CXX_STANDARD_REQUIRED ON
+                 POSITION_INDEPENDENT_CODE ON
+                 INTERFACE_POSITION_INDEPENDENT_CODE ON
+                 EXPORT_NAME cuvs_static
+    )
+
+    target_compile_options(cuvs_static PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:${CUVS_CXX_FLAGS}>")
+  endif()
 
   target_include_directories(
     cuvs_objs
@@ -523,19 +531,21 @@ if(BUILD_SHARED_LIBS)
     INTERFACE "$<INSTALL_INTERFACE:include>"
   )
 
-  target_include_directories(
-    cuvs_static
-    PUBLIC "$<BUILD_INTERFACE:${DLPACK_INCLUDE_DIR}>"
-    INTERFACE "$<INSTALL_INTERFACE:include>"
-  )
+  if(NOT CUVS_COMPILE_DYNAMIC_ONLY)
+    target_include_directories(
+      cuvs_static
+      PUBLIC "$<BUILD_INTERFACE:${DLPACK_INCLUDE_DIR}>"
+      INTERFACE "$<INSTALL_INTERFACE:include>"
+    )
 
-  # ensure CUDA symbols aren't relocated to the middle of the debug build binaries
-  target_link_options(cuvs_static PRIVATE $<HOST_LINK:${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld>)
+    # ensure CUDA symbols aren't relocated to the middle of the debug build binaries
+    target_link_options(cuvs_static PRIVATE $<HOST_LINK:${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld>)
 
-  target_include_directories(
-    cuvs_static PUBLIC "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>"
-                       "$<INSTALL_INTERFACE:include>"
-  )
+    target_include_directories(
+      cuvs_static PUBLIC "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>"
+                         "$<INSTALL_INTERFACE:include>"
+    )
+  endif()
 
   target_include_directories(
     cuvs PUBLIC "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>"
@@ -571,11 +581,13 @@ if(BUILD_SHARED_LIBS)
               cuvs-cagra-search ${CUVS_COMMS_DEPENDENCY}
     )
 
-    target_link_libraries(
-      cuvs_static
-      PUBLIC rmm::rmm raft::raft ${CUVS_CTK_MATH_DEPENDENCIES}
-      PRIVATE nvidia::cutlass::cutlass $<TARGET_NAME_IF_EXISTS:OpenMP::OpenMP_CXX>
-    )
+    if(NOT CUVS_COMPILE_DYNAMIC_ONLY)
+      target_link_libraries(
+        cuvs_static
+        PUBLIC rmm::rmm raft::raft ${CUVS_CTK_MATH_DEPENDENCIES}
+        PRIVATE nvidia::cutlass::cutlass $<TARGET_NAME_IF_EXISTS:OpenMP::OpenMP_CXX>
+      )
+    endif()
   endif()
 
   if(BUILD_MG_ALGOS)
@@ -718,8 +730,13 @@ target_compile_definitions(cuvs::cuvs INTERFACE $<$<BOOL:${CUVS_NVTX}>:NVTX_ENAB
   include(GNUInstallDirs)
   include(CPack)
 
+  set(_cuvs_lib_targets cuvs)
+  if(NOT CUVS_COMPILE_DYNAMIC_ONLY)
+    list(APPEND _cuvs_lib_targets cuvs_static)
+  endif()
+
   install(
-    TARGETS cuvs cuvs_static
+    TARGETS ${_cuvs_lib_targets}
     DESTINATION ${lib_dir}
     COMPONENT cuvs
     EXPORT cuvs-exports
diff --git a/cpp/cmake/thirdparty/get_raft.cmake b/cpp/cmake/thirdparty/get_raft.cmake
index 2e57df84e..845c7a833 100644
--- a/cpp/cmake/thirdparty/get_raft.cmake
+++ b/cpp/cmake/thirdparty/get_raft.cmake
@@ -44,6 +44,7 @@ function(find_and_configure_raft)
             INSTALL_EXPORT_SET  cuvs-exports
             COMPONENTS          ${RAFT_COMPONENTS}
             CPM_ARGS
+              EXCLUDE_FROM_ALL TRUE  
               GIT_REPOSITORY        https://github.com/${PKG_FORK}/raft.git
               GIT_TAG               ${PKG_PINNED_TAG}
               SOURCE_SUBDIR         cpp
diff --git a/dependencies.yaml b/dependencies.yaml
index d23c118c0..478b2acc2 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -7,15 +7,16 @@ files:
       arch: [x86_64, aarch64]
     includes:
       - build
+      - build_cython
       - build_py_cuvs
       - build_wheels
       - checks
+      - clang
       - cuda
       - cuda_version
       - depends_on_cupy
       - depends_on_librmm
       - depends_on_pylibraft
-      - develop
       - docs
       - rapids_build
       - run_py_cuvs
@@ -31,13 +32,15 @@ files:
     includes:
       - bench
       - bench_python
+      - build_cython
       - build_py_cuvs
+      - clang
       - cuda
       - cuda_version
       - depends_on_cupy
       - depends_on_pylibraft
+      - depends_on_libcuvs
       - depends_on_librmm
-      - develop
       - rapids_build
       - rapids_build_setuptools
   test_cpp:
@@ -61,6 +64,7 @@ files:
   docs:
     output: none
     includes:
+      - clang
       - cuda
       - cuda_version
       - depends_on_cupy
@@ -71,10 +75,37 @@ files:
   rust:
     output: none
     includes:
+      # clang/libclang only needed for bindgen support
+      - clang
       - cuda
       - cuda_version
       - rapids_build
       - rust
+  py_build_libcuvs:
+    output: pyproject
+    pyproject_dir: python/libcuvs
+    extras:
+      table: build-system
+    includes:
+      - build
+  py_rapids_build_libcuvs:
+    output: pyproject
+    pyproject_dir: python/libcuvs
+    extras:
+      table: tool.rapids-build-backend
+      key: requires
+    includes:
+      - depends_on_libraft
+      - depends_on_librmm
+      - rapids_build
+  py_run_libcuvs:
+    output: pyproject
+    pyproject_dir: python/libcuvs
+    extras:
+      table: project
+    includes:
+      - cuda_wheels
+      - depends_on_libraft
   py_build_cuvs:
     output: pyproject
     pyproject_dir: python/cuvs
@@ -89,7 +120,11 @@ files:
       table: tool.rapids-build-backend
       key: requires
     includes:
+      - build_cython
       - build_py_cuvs
+      - depends_on_libcuvs
+      - depends_on_libraft
+      - depends_on_librmm
       - rapids_build
   py_run_cuvs:
     output: pyproject
@@ -97,7 +132,6 @@ files:
     extras:
       table: project
     includes:
-      - cuda_wheels
       - depends_on_pylibraft
       - run_py_cuvs
   py_test_cuvs:
@@ -149,12 +183,16 @@ dependencies:
       - output_types: [requirements, pyproject]
         packages:
           - scikit-build-core[pyproject]>=0.10.0
+  build_cython:
+    common:
+      - output_types: [conda, requirements, pyproject]
+        packages:
+          - cython>=3.0.0
   rapids_build:
     common:
       - output_types: [conda, requirements, pyproject]
         packages:
           - &cmake_ver cmake>=3.26.4,!=3.30.0
-          - cython>=3.0.0
           - ninja
       - output_types: [conda]
         packages:
@@ -241,12 +279,13 @@ dependencies:
       - output_types: [conda, requirements]
         packages:
           - pre-commit
-  develop:
+  clang:
     common:
       - output_types: conda
         packages:
           - clang==16.0.6
-          - clang-tools=16.0.6
+          - clang-tools==16.0.6
+          - libclang==16.0.6
   cuda_version:
     specific:
       - output_types: conda
@@ -363,13 +402,14 @@ dependencies:
               - nvidia-curand-cu12
               - nvidia-cusolver-cu12
               - nvidia-cusparse-cu12
-          # CUDA 11 does not provide wheels, so use the system libraries instead
           - matrix:
               cuda: "11.*"
               use_cuda_wheels: "true"
             packages:
-          # if use_cuda_wheels=false is provided, do not add dependencies on any CUDA wheels
-          # (e.g. for DLFW and pip devcontainers)
+              - nvidia-cublas-cu11
+              - nvidia-curand-cu11
+              - nvidia-cusolver-cu11
+              - nvidia-cusparse-cu11
           - matrix:
               use_cuda_wheels: "false"
             packages:
@@ -425,9 +465,6 @@ dependencies:
         packages:
           - make
           - rust
-          # clang/libclang only needed for bindgen support
-          - clang
-          - libclang
   build_wheels:
     common:
       - output_types: [requirements, pyproject]
@@ -492,7 +529,6 @@ dependencies:
           - h5py>=3.8.0
           - benchmark>=1.8.2
           - openblas
-          - libcuvs==25.2.*,>=0.0.0a0
   bench_python:
     common:
       - output_types: [conda, pyproject, requirements]
@@ -502,6 +538,54 @@ dependencies:
           - matplotlib
           - pandas
           - pyyaml
+  depends_on_libcuvs:
+    common:
+      - output_types: conda
+        packages:
+          - &libcuvs_unsuffixed libcuvs==25.2.*,>=0.0.0a0
+      - output_types: requirements
+        packages:
+          # pip recognizes the index as a global option for the requirements.txt file
+          - --extra-index-url=https://pypi.nvidia.com
+          - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple
+    specific:
+      - output_types: [requirements, pyproject]
+        matrices:
+          - matrix:
+              cuda: "12.*"
+              cuda_suffixed: "true"
+            packages:
+              - libcuvs-cu12==25.2.*,>=0.0.0a0
+          - matrix:
+              cuda: "11.*"
+              cuda_suffixed: "true"
+            packages:
+              - libcuvs-cu11==25.2.*,>=0.0.0a0
+          - {matrix: null, packages: [*libcuvs_unsuffixed]}
+  depends_on_libraft:
+    common:
+      - output_types: conda
+        packages:
+          - &libraft_unsuffixed libraft==25.2.*,>=0.0.0a0
+      - output_types: requirements
+        packages:
+          # pip recognizes the index as a global option for the requirements.txt file
+          - --extra-index-url=https://pypi.nvidia.com
+          - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple
+    specific:
+      - output_types: [requirements, pyproject]
+        matrices:
+          - matrix:
+              cuda: "12.*"
+              cuda_suffixed: "true"
+            packages:
+              - libraft-cu12==25.2.*,>=0.0.0a0
+          - matrix:
+              cuda: "11.*"
+              cuda_suffixed: "true"
+            packages:
+              - libraft-cu11==25.2.*,>=0.0.0a0
+          - {matrix: null, packages: [*libraft_unsuffixed]}
   depends_on_librmm:
     common:
       - output_types: conda
diff --git a/examples/cpp/CMakeLists.txt b/examples/cpp/CMakeLists.txt
index 9554207bb..b0d0ae9ee 100644
--- a/examples/cpp/CMakeLists.txt
+++ b/examples/cpp/CMakeLists.txt
@@ -48,13 +48,23 @@ add_executable(VAMANA_EXAMPLE src/vamana_example.cu)
 add_library(rmm_logger OBJECT)
 target_link_libraries(rmm_logger PRIVATE rmm::rmm_logger_impl)
 
-target_link_libraries(CAGRA_EXAMPLE PRIVATE cuvs::cuvs $<TARGET_NAME_IF_EXISTS:conda_env> rmm_logger)
 target_link_libraries(
-  CAGRA_PERSISTENT_EXAMPLE PRIVATE cuvs::cuvs $<TARGET_NAME_IF_EXISTS:conda_env> Threads::Threads rmm_logger
+  CAGRA_EXAMPLE PRIVATE cuvs::cuvs $<TARGET_NAME_IF_EXISTS:conda_env> rmm_logger
 )
 target_link_libraries(
-  DYNAMIC_BATCHING_EXAMPLE PRIVATE cuvs::cuvs $<TARGET_NAME_IF_EXISTS:conda_env> Threads::Threads rmm_logger
+  CAGRA_PERSISTENT_EXAMPLE PRIVATE cuvs::cuvs $<TARGET_NAME_IF_EXISTS:conda_env> Threads::Threads
+                                   rmm_logger
+)
+target_link_libraries(
+  DYNAMIC_BATCHING_EXAMPLE PRIVATE cuvs::cuvs $<TARGET_NAME_IF_EXISTS:conda_env> Threads::Threads
+                                   rmm_logger
+)
+target_link_libraries(
+  IVF_PQ_EXAMPLE PRIVATE cuvs::cuvs $<TARGET_NAME_IF_EXISTS:conda_env> rmm_logger
+)
+target_link_libraries(
+  IVF_FLAT_EXAMPLE PRIVATE cuvs::cuvs $<TARGET_NAME_IF_EXISTS:conda_env> rmm_logger
+)
+target_link_libraries(
+  VAMANA_EXAMPLE PRIVATE cuvs::cuvs $<TARGET_NAME_IF_EXISTS:conda_env> rmm_logger
 )
-target_link_libraries(IVF_PQ_EXAMPLE PRIVATE cuvs::cuvs $<TARGET_NAME_IF_EXISTS:conda_env> rmm_logger)
-target_link_libraries(IVF_FLAT_EXAMPLE PRIVATE cuvs::cuvs $<TARGET_NAME_IF_EXISTS:conda_env> rmm_logger)
-target_link_libraries(VAMANA_EXAMPLE PRIVATE cuvs::cuvs $<TARGET_NAME_IF_EXISTS:conda_env> rmm_logger)
diff --git a/python/cuvs/CMakeLists.txt b/python/cuvs/CMakeLists.txt
index c0990995f..f3feae9a7 100644
--- a/python/cuvs/CMakeLists.txt
+++ b/python/cuvs/CMakeLists.txt
@@ -31,18 +31,6 @@ project(
             C CXX CUDA
 )
 
-# ##################################################################################################
-# * User Options  --------------------------------------------------------------
-
-option(FIND_CUVS_CPP "Search for existing CUVS C++ installations before defaulting to local files"
-       OFF
-)
-option(USE_CUDA_MATH_WHEELS "Use the CUDA math wheels instead of the system libraries" OFF)
-
-message(
-  "CUVS_PY: Searching for existing cuVS C/C++ installations before defaulting to local files: ${FIND_CUVS_CPP}"
-)
-
 # ##################################################################################################
 # * Process User Options  ------------------------------------------------------
 
@@ -54,56 +42,14 @@ include(rapids-find)
 
 rapids_cpm_init()
 
-# If the user requested it we attempt to find CUVS.
-if(FIND_CUVS_CPP)
-  find_package(cuvs "${RAPIDS_VERSION}" REQUIRED COMPONENTS c_api)
-  include(../../cpp/cmake/thirdparty/get_dlpack.cmake)
-else()
-  set(cuvs_FOUND OFF)
-endif()
+# --- cuVS ---#
+find_package(cuvs "${RAPIDS_VERSION}" REQUIRED COMPONENTS c_api)
 
-if(NOT cuvs_FOUND)
-  find_package(CUDAToolkit REQUIRED)
+# --- dlpack ---#
+include(../../cpp/cmake/thirdparty/get_dlpack.cmake)
 
-  set(BUILD_TESTS OFF)
-  set(BUILD_C_LIBRARY ON)
-
-  # Statically link dependencies if building wheels
-  set(CUDA_STATIC_RUNTIME ON)
-  set(CUDA_STATIC_MATH_LIBRARIES ON)
-  set(CUVS_USE_RAFT_STATIC ON)
-
-  if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 12.0)
-    set(CUDA_STATIC_MATH_LIBRARIES OFF)
-  elseif(USE_CUDA_MATH_WHEELS)
-    message(FATAL_ERROR "Cannot use CUDA math wheels with CUDA < 12.0")
-  endif()
-
-  add_subdirectory(../../cpp cuvs-cpp EXCLUDE_FROM_ALL)
-
-  if(NOT CUDA_STATIC_MATH_LIBRARIES AND USE_CUDA_MATH_WHEELS)
-    set(rpaths
-        "$ORIGIN/../nvidia/cublas/lib"
-        "$ORIGIN/../nvidia/curand/lib"
-        "$ORIGIN/../nvidia/cusolver/lib"
-        "$ORIGIN/../nvidia/cusparse/lib"
-        "$ORIGIN/../nvidia/nvjitlink/lib"
-    )
-    set_property(
-      TARGET cuvs
-      PROPERTY INSTALL_RPATH ${rpaths}
-      APPEND
-    )
-    set_property(
-      TARGET cuvs_c
-      PROPERTY INSTALL_RPATH ${rpaths}
-      APPEND
-    )
-  endif()
-
-  set(cython_lib_dir cuvs)
-  install(TARGETS cuvs cuvs_c DESTINATION ${cython_lib_dir})
-endif()
+# ensure Cython targets can find dlpack headers (these do not come installed with cuVS)
+target_include_directories(cuvs::cuvs INTERFACE "$<BUILD_INTERFACE:${DLPACK_INCLUDE_DIR}>")
 
 # ##################################################################################################
 # * Build Cython artifacts -----------------------------------------------------
@@ -116,7 +62,3 @@ target_link_libraries(cuvs_rmm_logger PRIVATE rmm::rmm_logger_impl)
 add_subdirectory(cuvs/common)
 add_subdirectory(cuvs/distance)
 add_subdirectory(cuvs/neighbors)
-
-if(DEFINED cython_lib_dir)
-  rapids_cython_add_rpath_entries(TARGET cuvs PATHS "${cython_lib_dir}")
-endif()
diff --git a/python/cuvs/cuvs/__init__.py b/python/cuvs/cuvs/__init__.py
index 9f0481cb7..1a41f0d76 100644
--- a/python/cuvs/cuvs/__init__.py
+++ b/python/cuvs/cuvs/__init__.py
@@ -13,4 +13,15 @@
 # limitations under the License.
 #
 
+# If libcuvs was installed as a wheel, we must request it to load the library
+# symbols. Otherwise, we assume that the library was installed in a system path that ld
+# can find.
+try:
+    import libcuvs
+except ModuleNotFoundError:
+    pass
+else:
+    libcuvs.load_library()
+    del libcuvs
+
 from cuvs._version import __git_commit__, __version__
diff --git a/python/cuvs/cuvs/common/CMakeLists.txt b/python/cuvs/cuvs/common/CMakeLists.txt
index 361f2fafc..b0e1cb335 100644
--- a/python/cuvs/cuvs/common/CMakeLists.txt
+++ b/python/cuvs/cuvs/common/CMakeLists.txt
@@ -20,7 +20,7 @@ set(linked_libraries cuvs::cuvs cuvs::c_api)
 rapids_cython_create_modules(
   CXX
   SOURCE_FILES "${cython_sources}"
-  LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS cuvs MODULE_PREFIX common_
+  LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX common_
 )
 
 foreach(tgt IN LISTS RAPIDS_CYTHON_CREATED_TARGETS)
diff --git a/python/cuvs/cuvs/distance/CMakeLists.txt b/python/cuvs/cuvs/distance/CMakeLists.txt
index 514b08c43..ded07395c 100644
--- a/python/cuvs/cuvs/distance/CMakeLists.txt
+++ b/python/cuvs/cuvs/distance/CMakeLists.txt
@@ -20,7 +20,7 @@ set(linked_libraries cuvs::cuvs cuvs::c_api)
 rapids_cython_create_modules(
   CXX
   SOURCE_FILES "${cython_sources}"
-  LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS cuvs MODULE_PREFIX distance_
+  LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX distance_
 )
 
 foreach(tgt IN LISTS RAPIDS_CYTHON_CREATED_TARGETS)
diff --git a/python/cuvs/cuvs/neighbors/CMakeLists.txt b/python/cuvs/cuvs/neighbors/CMakeLists.txt
index 031fd485e..b9161eefc 100644
--- a/python/cuvs/cuvs/neighbors/CMakeLists.txt
+++ b/python/cuvs/cuvs/neighbors/CMakeLists.txt
@@ -27,7 +27,7 @@ set(linked_libraries cuvs::cuvs cuvs::c_api)
 rapids_cython_create_modules(
   CXX
   SOURCE_FILES "${cython_sources}"
-  LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS cuvs MODULE_PREFIX neighbors_refine_
+  LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX neighbors_refine_
 )
 
 foreach(tgt IN LISTS RAPIDS_CYTHON_CREATED_TARGETS)
diff --git a/python/cuvs/cuvs/neighbors/brute_force/CMakeLists.txt b/python/cuvs/cuvs/neighbors/brute_force/CMakeLists.txt
index 61eda649c..3c646f498 100644
--- a/python/cuvs/cuvs/neighbors/brute_force/CMakeLists.txt
+++ b/python/cuvs/cuvs/neighbors/brute_force/CMakeLists.txt
@@ -20,8 +20,7 @@ set(linked_libraries cuvs::cuvs cuvs::c_api)
 rapids_cython_create_modules(
   CXX
   SOURCE_FILES "${cython_sources}"
-  LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS cuvs MODULE_PREFIX
-                   neighbors_brute_force_
+  LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX neighbors_brute_force_
 )
 
 foreach(tgt IN LISTS RAPIDS_CYTHON_CREATED_TARGETS)
diff --git a/python/cuvs/cuvs/neighbors/cagra/CMakeLists.txt b/python/cuvs/cuvs/neighbors/cagra/CMakeLists.txt
index 1f40daab2..6cf0956a2 100644
--- a/python/cuvs/cuvs/neighbors/cagra/CMakeLists.txt
+++ b/python/cuvs/cuvs/neighbors/cagra/CMakeLists.txt
@@ -20,7 +20,7 @@ set(linked_libraries cuvs::cuvs cuvs::c_api)
 rapids_cython_create_modules(
   CXX
   SOURCE_FILES "${cython_sources}"
-  LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS cuvs MODULE_PREFIX neighbors_cagra_
+  LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX neighbors_cagra_
 )
 
 foreach(tgt IN LISTS RAPIDS_CYTHON_CREATED_TARGETS)
diff --git a/python/cuvs/cuvs/neighbors/filters/CMakeLists.txt b/python/cuvs/cuvs/neighbors/filters/CMakeLists.txt
index a678852d9..43e008363 100644
--- a/python/cuvs/cuvs/neighbors/filters/CMakeLists.txt
+++ b/python/cuvs/cuvs/neighbors/filters/CMakeLists.txt
@@ -20,7 +20,7 @@ set(linked_libraries cuvs::cuvs cuvs::c_api)
 rapids_cython_create_modules(
   CXX
   SOURCE_FILES "${cython_sources}"
-  LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS cuvs MODULE_PREFIX neighbors_prefilter_
+  LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX neighbors_prefilter_
 )
 
 foreach(tgt IN LISTS RAPIDS_CYTHON_CREATED_TARGETS)
diff --git a/python/cuvs/cuvs/neighbors/hnsw/CMakeLists.txt b/python/cuvs/cuvs/neighbors/hnsw/CMakeLists.txt
index 8351916e6..c33313c3c 100644
--- a/python/cuvs/cuvs/neighbors/hnsw/CMakeLists.txt
+++ b/python/cuvs/cuvs/neighbors/hnsw/CMakeLists.txt
@@ -20,7 +20,7 @@ set(linked_libraries cuvs::cuvs cuvs::c_api)
 rapids_cython_create_modules(
   CXX
   SOURCE_FILES "${cython_sources}"
-  LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS cuvs MODULE_PREFIX neighbors_hnsw_
+  LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX neighbors_hnsw_
 )
 
 foreach(tgt IN LISTS RAPIDS_CYTHON_CREATED_TARGETS)
diff --git a/python/cuvs/cuvs/neighbors/ivf_flat/CMakeLists.txt b/python/cuvs/cuvs/neighbors/ivf_flat/CMakeLists.txt
index f5663cdaa..eadb8934c 100644
--- a/python/cuvs/cuvs/neighbors/ivf_flat/CMakeLists.txt
+++ b/python/cuvs/cuvs/neighbors/ivf_flat/CMakeLists.txt
@@ -20,7 +20,7 @@ set(linked_libraries cuvs::cuvs cuvs::c_api)
 rapids_cython_create_modules(
   CXX
   SOURCE_FILES "${cython_sources}"
-  LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS cuvs MODULE_PREFIX neighbors_ivf_flat_
+  LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX neighbors_ivf_flat_
 )
 
 foreach(tgt IN LISTS RAPIDS_CYTHON_CREATED_TARGETS)
diff --git a/python/cuvs/cuvs/neighbors/ivf_pq/CMakeLists.txt b/python/cuvs/cuvs/neighbors/ivf_pq/CMakeLists.txt
index a24320ded..df61793b8 100644
--- a/python/cuvs/cuvs/neighbors/ivf_pq/CMakeLists.txt
+++ b/python/cuvs/cuvs/neighbors/ivf_pq/CMakeLists.txt
@@ -20,7 +20,7 @@ set(linked_libraries cuvs::cuvs cuvs::c_api)
 rapids_cython_create_modules(
   CXX
   SOURCE_FILES "${cython_sources}"
-  LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS cuvs MODULE_PREFIX neighbors_pq_
+  LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX neighbors_pq_
 )
 
 foreach(tgt IN LISTS RAPIDS_CYTHON_CREATED_TARGETS)
diff --git a/python/cuvs/pyproject.toml b/python/cuvs/pyproject.toml
index 155e454a8..30658623b 100644
--- a/python/cuvs/pyproject.toml
+++ b/python/cuvs/pyproject.toml
@@ -33,10 +33,6 @@ requires-python = ">=3.10"
 dependencies = [
     "cuda-python",
     "numpy>=1.23,<3.0a0",
-    "nvidia-cublas",
-    "nvidia-curand",
-    "nvidia-cusolver",
-    "nvidia-cusparse",
     "pylibraft==25.2.*,>=0.0.0a0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [
@@ -59,12 +55,6 @@ test = [
 Homepage = "https://github.com/rapidsai/cuvs"
 Documentation = "https://docs.rapids.ai/api/cuvs/stable/"
 
-[tool.setuptools]
-license-files = ["LICENSE"]
-
-[tool.setuptools.dynamic]
-version = {file = "cuvs/VERSION"}
-
 [tool.isort]
 line_length = 79
 multi_line_output = 3
@@ -127,18 +117,23 @@ requires = [
     "cmake>=3.26.4,!=3.30.0",
     "cuda-python",
     "cython>=3.0.0",
+    "libcuvs==25.2.*,>=0.0.0a0",
+    "libraft==25.2.*,>=0.0.0a0",
+    "librmm==25.2.*,>=0.0.0a0",
     "ninja",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 build-backend = "scikit_build_core.build"
 dependencies-file = "../../dependencies.yaml"
-matrix-entry = "cuda_suffixed=true;use_cuda_wheels=true"
+matrix-entry = "cuda_suffixed=true"
 
 [tool.pydistcheck]
 select = [
-    # NOTE: size threshold is managed via CLI args in CI scripts
     "distro-too-large-compressed",
 ]
 
+# PyPI limit is 100 MiB, fail CI before we get too close to that
+max_allowed_size_compressed = '75M'
+
 [tool.pytest.ini_options]
 filterwarnings = [
     "error",
diff --git a/python/libcuvs/CMakeLists.txt b/python/libcuvs/CMakeLists.txt
new file mode 100644
index 000000000..569652b71
--- /dev/null
+++ b/python/libcuvs/CMakeLists.txt
@@ -0,0 +1,69 @@
+# =============================================================================
+# Copyright (c) 2025, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+# or implied. See the License for the specific language governing permissions and limitations under
+# the License.
+# =============================================================================
+
+cmake_minimum_required(VERSION 3.26.4 FATAL_ERROR)
+
+include(../../rapids_config.cmake)
+
+include(rapids-cuda)
+rapids_cuda_init_architectures(libcuvs-python)
+
+project(
+  libcuvs-python
+  VERSION "${RAPIDS_VERSION}"
+  LANGUAGES CXX CUDA
+)
+
+# Check if cuVS is already available. If so, it is the user's responsibility to ensure that the
+# CMake package is also available at build time of the Python cuvs package.
+find_package(cuvs "${RAPIDS_VERSION}")
+
+if(cuvs_FOUND)
+  return()
+endif()
+
+unset(cuvs_FOUND)
+
+# --- CUDA --- #
+set(CUDA_STATIC_RUNTIME ON)
+set(CUDA_STATIC_MATH_LIBRARIES OFF)
+
+# --- RAFT ---#
+set(CUVS_USE_RAFT_STATIC OFF)
+
+# --- cuVS ---#
+set(BUILD_TESTS OFF)
+set(BUILD_C_LIBRARY ON)
+set(CUVS_COMPILE_DYNAMIC_ONLY ON)
+
+add_subdirectory(../../cpp cuvs-cpp)
+
+# assumes libcuvs.so is installed 2 levels deep, e.g. site-packages/libcuvs/lib64/libcuvs.so
+set(rpaths
+    "$ORIGIN/../../nvidia/cublas/lib"
+    "$ORIGIN/../../nvidia/curand/lib"
+    "$ORIGIN/../../nvidia/cusolver/lib"
+    "$ORIGIN/../../nvidia/cusparse/lib"
+    "$ORIGIN/../../nvidia/nvjitlink/lib"
+)
+set_property(
+  TARGET cuvs
+  PROPERTY INSTALL_RPATH ${rpaths}
+  APPEND
+)
+set_property(
+  TARGET cuvs_c
+  PROPERTY INSTALL_RPATH ${rpaths}
+  APPEND
+)
diff --git a/python/libcuvs/LICENSE b/python/libcuvs/LICENSE
new file mode 120000
index 000000000..30cff7403
--- /dev/null
+++ b/python/libcuvs/LICENSE
@@ -0,0 +1 @@
+../../LICENSE
\ No newline at end of file
diff --git a/python/libcuvs/README.md b/python/libcuvs/README.md
new file mode 120000
index 000000000..fe8400541
--- /dev/null
+++ b/python/libcuvs/README.md
@@ -0,0 +1 @@
+../../README.md
\ No newline at end of file
diff --git a/python/libcuvs/libcuvs/VERSION b/python/libcuvs/libcuvs/VERSION
new file mode 120000
index 000000000..d62dc733e
--- /dev/null
+++ b/python/libcuvs/libcuvs/VERSION
@@ -0,0 +1 @@
+../../../VERSION
\ No newline at end of file
diff --git a/python/libcuvs/libcuvs/__init__.py b/python/libcuvs/libcuvs/__init__.py
new file mode 100644
index 000000000..2d3a86015
--- /dev/null
+++ b/python/libcuvs/libcuvs/__init__.py
@@ -0,0 +1,16 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from libcuvs._version import __git_commit__, __version__
+from libcuvs.load import load_library
diff --git a/python/libcuvs/libcuvs/_version.py b/python/libcuvs/libcuvs/_version.py
new file mode 100644
index 000000000..530bf8bea
--- /dev/null
+++ b/python/libcuvs/libcuvs/_version.py
@@ -0,0 +1,33 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import importlib.resources
+
+__version__ = (
+    importlib.resources.files(__package__)
+    .joinpath("VERSION")
+    .read_text()
+    .strip()
+)
+try:
+    __git_commit__ = (
+        importlib.resources.files(__package__)
+        .joinpath("GIT_COMMIT")
+        .read_text()
+        .strip()
+    )
+except FileNotFoundError:
+    __git_commit__ = ""
+
+__all__ = ["__git_commit__", "__version__"]
diff --git a/python/libcuvs/libcuvs/load.py b/python/libcuvs/libcuvs/load.py
new file mode 100644
index 000000000..a9c6a9325
--- /dev/null
+++ b/python/libcuvs/libcuvs/load.py
@@ -0,0 +1,100 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import ctypes
+import os
+
+# Loading with RTLD_LOCAL adds the library itself to the loader's
+# loaded library cache without loading any symbols into the global
+# namespace. This allows libraries that express a dependency on
+# this library to be loaded later and successfully satisfy this dependency
+# without polluting the global symbol table with symbols from
+# libcuvs that could conflict with symbols from other DSOs.
+PREFERRED_LOAD_FLAG = ctypes.RTLD_LOCAL
+
+
+def _load_system_installation(soname: str):
+    """Try to dlopen() the library indicated by ``soname``
+    Raises ``OSError`` if library cannot be loaded.
+    """
+    return ctypes.CDLL(soname, PREFERRED_LOAD_FLAG)
+
+
+def _load_wheel_installation(soname: str):
+    """Try to dlopen() the library indicated by ``soname``
+    Returns ``None`` if the library cannot be loaded.
+    """
+    if os.path.isfile(
+        lib := os.path.join(os.path.dirname(__file__), "lib64", soname)
+    ):
+        return ctypes.CDLL(lib, PREFERRED_LOAD_FLAG)
+    return None
+
+
+def load_library():
+    """Dynamically load libcuvs.so and its dependencies"""
+    try:
+        # libraft must be loaded before libcuvs because libcuvs
+        # references its symbols
+        import libraft
+
+        libraft.load_library()
+    except ModuleNotFoundError:
+        # 'libcuvs' has a runtime dependency on 'libraft'. However,
+        # that dependency might be satisfied by the 'libraft' conda package
+        # (which does not have any Python modules), instead of the
+        # 'libraft' wheel.
+        #
+        # In that situation, assume that 'libraft.so' is in a place where
+        # the loader can find it.
+        pass
+
+    prefer_system_installation = (
+        os.getenv("RAPIDS_LIBCUVS_PREFER_SYSTEM_LIBRARY", "false").lower()
+        != "false"
+    )
+
+    libs_to_return = []
+    for soname in ["libcuvs.so", "libcuvs_c.so"]:
+        libcuvs_lib = None
+        if prefer_system_installation:
+            # Prefer a system library if one is present to
+            # avoid clobbering symbols that other packages might expect,
+            # but if no other library is present use the one in the wheel.
+            try:
+                libcuvs_lib = _load_system_installation(soname)
+            except OSError:
+                libcuvs_lib = _load_wheel_installation(soname)
+        else:
+            # Prefer the libraries bundled in this package. If they aren't
+            # found (which might be the case in builds where the library was
+            # prebuilt before packaging the wheel), look for a system
+            # installation.
+            try:
+                libcuvs_lib = _load_wheel_installation(soname)
+                if libcuvs_lib is None:
+                    libcuvs_lib = _load_system_installation(soname)
+            except OSError:
+                # If none of the searches above succeed, just silently return
+                # None and rely on other mechanisms (like RPATHs on other DSOs)
+                # to help the loader find the library.
+                pass
+        if libcuvs_lib:
+            libs_to_return.append(libcuvs_lib)
+
+    # The caller almost never needs to do anything with this library, but no
+    # harm in offering the option since this object at least provides a handle
+    # to inspect where libcuvs was loaded from.
+    return libs_to_return
diff --git a/python/libcuvs/pyproject.toml b/python/libcuvs/pyproject.toml
new file mode 100644
index 000000000..28443b782
--- /dev/null
+++ b/python/libcuvs/pyproject.toml
@@ -0,0 +1,108 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+[build-system]
+requires = [
+    "rapids-build-backend>=0.3.0,<0.4.0.dev0",
+    "scikit-build-core[pyproject]>=0.10.0",
+] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
+build-backend = "rapids_build_backend.build"
+
+[project]
+name = "libcuvs"
+dynamic = ["version"]
+description = "cuVS: Vector Search on the GPU (C++)"
+readme = { file = "README.md", content-type = "text/markdown" }
+authors = [
+    { name = "NVIDIA Corporation" },
+]
+license = { text = "Apache 2.0" }
+requires-python = ">=3.10"
+dependencies = [
+    "libraft==25.2.*,>=0.0.0a0",
+    "nvidia-cublas",
+    "nvidia-curand",
+    "nvidia-cusolver",
+    "nvidia-cusparse",
+] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
+classifiers = [
+    "Intended Audience :: Developers",
+]
+
+[project.urls]
+Homepage = "https://github.com/rapidsai/cuvs"
+Documentation = "https://docs.rapids.ai/api/cuvs/stable/"
+
+[project.entry-points."cmake.prefix"]
+libcuvs = "libcuvs"
+
+[tool.isort]
+line_length = 79
+multi_line_output = 3
+include_trailing_comma = true
+force_grid_wrap = 0
+combine_as_imports = true
+order_by_type = true
+known_first_party = [
+    "libcuvs",
+]
+skip = [
+    "thirdparty",
+    ".eggs",
+    ".git",
+    ".hg",
+    ".mypy_cache",
+    ".tox",
+    ".venv",
+    "_build",
+    "buck-out",
+    "build",
+    "dist",
+    "__init__.py",
+]
+
+[tool.scikit-build]
+build-dir = "build/{wheel_tag}"
+cmake.build-type = "Release"
+cmake.version = "CMakeLists.txt"
+minimum-version = "build-system.requires"
+ninja.make-fallback = true
+sdist.reproducible = true
+wheel.install-dir = "libcuvs"
+wheel.packages = ["libcuvs"]
+wheel.py-api = "py3"
+
+[tool.scikit-build.metadata.version]
+provider = "scikit_build_core.metadata.regex"
+input = "libcuvs/VERSION"
+regex = "(?P<value>.*)"
+
+[tool.rapids-build-backend]
+build-backend = "scikit_build_core.build"
+requires = [
+    "cmake>=3.26.4,!=3.30.0",
+    "libraft==25.2.*,>=0.0.0a0",
+    "librmm==25.2.*,>=0.0.0a0",
+    "ninja",
+] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
+dependencies-file = "../../dependencies.yaml"
+matrix-entry = "cuda_suffixed=true;use_cuda_wheels=true"
+
+[tool.pydistcheck]
+select = [
+    "distro-too-large-compressed",
+]
+
+# detect when package size grows significantly
+max_allowed_size_compressed = '1.1G'

From 80370a1220af0928d69ca96a66e6b6ff1220bab7 Mon Sep 17 00:00:00 2001
From: tsuki <12711693+enp1s0@users.noreply.github.com>
Date: Fri, 24 Jan 2025 02:35:08 +0900
Subject: [PATCH 39/39] Improve the performance of CAGRA new vector addition
 with the default params (#569)

This PR updates the default chunk size of the CAGRA graph extension and also adds a knob to control the batch size of the CAGRA searches that are run internally, for better throughput.

The default chunk size was set to 1 in the current implementation because of a potential low-recall problem when the chunk size is large: no edges are created between nodes in the same chunk. However, my investigation shows that the low-recall problem rarely occurs with large chunk sizes.

# Search performance

The performance was measured after applying a bugfix https://github.com/rapidsai/cuvs/pull/565

## degree = 32


![extend-ir0 9-degree32](https://github.com/user-attachments/assets/a5bb2fb6-8c12-49ad-b96a-1b384d79a96b)


(I don't know why the performance is unstable on NYTimes.)

## degree = 64
![extend-ir0 9-degree64](https://github.com/user-attachments/assets/8e926e1c-d772-4682-9419-9cc027f09d3f)

So in this PR I increase the default chunk size to the size of the new dataset vectors for better throughput. I also expose a knob for the search batch size in the `extend` function, to control the balance between throughput and memory consumption.

Authors:
  - tsuki (https://github.com/enp1s0)
  - Corey J. Nolet (https://github.com/cjnolet)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)
  - Tamas Bela Feher (https://github.com/tfeher)

URL: https://github.com/rapidsai/cuvs/pull/569
---
 cpp/src/neighbors/detail/cagra/add_nodes.cuh | 34 ++++++++++++++------
 1 file changed, 24 insertions(+), 10 deletions(-)

diff --git a/cpp/src/neighbors/detail/cagra/add_nodes.cuh b/cpp/src/neighbors/detail/cagra/add_nodes.cuh
index 358b7643e..453928992 100644
--- a/cpp/src/neighbors/detail/cagra/add_nodes.cuh
+++ b/cpp/src/neighbors/detail/cagra/add_nodes.cuh
@@ -37,7 +37,8 @@ void add_node_core(
   const cuvs::neighbors::cagra::index<T, IdxT>& idx,
   raft::mdspan<const T, raft::matrix_extent<int64_t>, raft::layout_stride, Accessor>
     additional_dataset_view,
-  raft::host_matrix_view<IdxT, std::int64_t> updated_graph)
+  raft::host_matrix_view<IdxT, std::int64_t> updated_graph,
+  const cuvs::neighbors::cagra::extend_params& extend_params)
 {
   using DistanceT                 = float;
   const std::size_t degree        = idx.graph_degree();
@@ -68,7 +69,19 @@ void add_node_core(
              new_size,
              raft::resource::get_cuda_stream(handle));
 
-  const std::size_t max_chunk_size = 1024;
+  std::size_t data_size_per_vector =
+    sizeof(IdxT) * base_degree + sizeof(DistanceT) * base_degree + sizeof(T) * dim;
+  cudaPointerAttributes attr;
+  RAFT_CUDA_TRY(cudaPointerGetAttributes(&attr, additional_dataset_view.data_handle()));
+  if (attr.devicePointer == nullptr) {
+    // for batch_load_iterator
+    data_size_per_vector += sizeof(T) * dim;
+  }
+
+  const std::size_t max_search_batch_size =
+    std::min(std::max(1lu, raft::resource::get_workspace_free_bytes(handle) / data_size_per_vector),
+             num_add);
+  RAFT_EXPECTS(max_search_batch_size > 0, "No enough working memory space is left.");
 
   cuvs::neighbors::cagra::search_params params;
   params.itopk_size = std::max(base_degree * 2lu, 256lu);
@@ -77,24 +90,24 @@ void add_node_core(
   auto mr = raft::resource::get_workspace_resource(handle);
 
   auto neighbor_indices = raft::make_device_mdarray<IdxT, std::int64_t>(
-    handle, mr, raft::make_extents<std::int64_t>(max_chunk_size, base_degree));
+    handle, mr, raft::make_extents<std::int64_t>(max_search_batch_size, base_degree));
 
   auto neighbor_distances = raft::make_device_mdarray<DistanceT, std::int64_t>(
-    handle, mr, raft::make_extents<std::int64_t>(max_chunk_size, base_degree));
+    handle, mr, raft::make_extents<std::int64_t>(max_search_batch_size, base_degree));
 
   auto queries = raft::make_device_mdarray<T, std::int64_t>(
-    handle, mr, raft::make_extents<std::int64_t>(max_chunk_size, dim));
+    handle, mr, raft::make_extents<std::int64_t>(max_search_batch_size, dim));
 
   auto host_neighbor_indices =
-    raft::make_host_matrix<IdxT, std::int64_t>(max_chunk_size, base_degree);
+    raft::make_host_matrix<IdxT, std::int64_t>(max_search_batch_size, base_degree);
 
   cuvs::spatial::knn::detail::utils::batch_load_iterator<T> additional_dataset_batch(
     additional_dataset_view.data_handle(),
     num_add,
     additional_dataset_view.stride(0),
-    max_chunk_size,
+    max_search_batch_size,
     raft::resource::get_cuda_stream(handle),
-    raft::resource::get_workspace_resource(handle));
+    mr);
   for (const auto& batch : additional_dataset_batch) {
     // Step 1: Obtain K (=base_degree) nearest neighbors of the new vectors by CAGRA search
     // Create queries
@@ -254,7 +267,8 @@ void add_graph_nodes(
   const std::size_t degree               = index.graph_degree();
   const std::size_t dim                  = index.dim();
   const std::size_t stride               = input_updated_dataset_view.stride(0);
-  const std::size_t max_chunk_size_      = params.max_chunk_size == 0 ? 1 : params.max_chunk_size;
+  const std::size_t max_chunk_size_ =
+    params.max_chunk_size == 0 ? new_dataset_size : params.max_chunk_size;
 
   raft::copy(updated_graph_view.data_handle(),
              index.graph().data_handle(),
@@ -298,7 +312,7 @@ void add_graph_nodes(
       stride);
 
     neighbors::cagra::add_node_core<T, IdxT>(
-      handle, internal_index, additional_dataset_view, updated_graph);
+      handle, internal_index, additional_dataset_view, updated_graph, params);
     raft::resource::sync_stream(handle);
   }
 }