diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index e93b7a694..59b8e00de 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -80,7 +80,30 @@ jobs:
       node_type: "gpu-v100-latest-1"
       run_script: "ci/build_docs.sh"
       sha: ${{ inputs.sha }}
+  wheel-build-libcuvs:
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.02
+    with:
+      build_type: ${{ inputs.build_type || 'branch' }}
+      branch: ${{ inputs.branch }}
+      sha: ${{ inputs.sha }}
+      date: ${{ inputs.date }}
+      script: ci/build_wheel_libcuvs.sh
+      # build for every combination of arch and CUDA version, but only for the latest Python
+      matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber)))
+  wheel-publish-libcuvs:
+    needs: wheel-build-libcuvs
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.02
+    with:
+      build_type: ${{ inputs.build_type || 'branch' }}
+      branch: ${{ inputs.branch }}
+      sha: ${{ inputs.sha }}
+      date: ${{ inputs.date }}
+      package-name: libcuvs
+      package-type: cpp
   wheel-build-cuvs:
+    needs: wheel-build-libcuvs
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.02
     with:
@@ -99,3 +122,4 @@ jobs:
       sha: ${{ inputs.sha }}
       date: ${{ inputs.date }}
       package-name: cuvs
+      package-type: python
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index 91f51bd90..843439f26 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -22,6 +22,7 @@ jobs:
       - conda-python-tests
       - docs-build
       - rust-build
+      - wheel-build-libcuvs
       - wheel-build-cuvs
       - wheel-tests-cuvs
       - devcontainer
@@ -135,10 +136,19 @@ jobs:
       arch: "amd64"
       container_image: "rapidsai/ci-conda:latest"
       run_script: "ci/build_rust.sh"
-  wheel-build-cuvs:
+  wheel-build-libcuvs:
     needs: checks
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.02
+    with:
+      build_type: pull-request
+      script: ci/build_wheel_libcuvs.sh
+      # build for every combination of arch and CUDA version, but only for the latest Python
+      matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber)))
+  wheel-build-cuvs:
+    needs: wheel-build-libcuvs
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.02
     with:
       build_type: pull-request
       script: ci/build_wheel_cuvs.sh
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index fcfc7e1fa..240f82be6 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -115,7 +115,7 @@ repos:
                   cpp/cmake/modules/FindAVX\.cmake|
           - id: verify-alpha-spec
       - repo: https://github.com/rapidsai/dependency-file-generator
-        rev: v1.16.0
+        rev: v1.17.0
         hooks:
             - id: rapids-dependency-file-generator
               args: ["--clean"]
diff --git a/build.sh b/build.sh
index bd5fa649b..3b9a9a3a8 100755
--- a/build.sh
+++ b/build.sh
@@ -313,12 +313,6 @@ if [[ ${CMAKE_TARGET} == "" ]]; then
     CMAKE_TARGET="all"
 fi
 
-
-SKBUILD_EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS}"
-if [[ "${EXTRA_CMAKE_ARGS}" != *"DFIND_CUVS_CPP"* ]]; then
-    SKBUILD_EXTRA_CMAKE_ARGS="${SKBUILD_EXTRA_CMAKE_ARGS};-DFIND_CUVS_CPP=ON"
-fi
-
 # If clean given, run it prior to any other steps
 if (( ${CLEAN} == 1 )); then
     # If the dirs to clean are mounted dirs in a container, the
@@ -434,7 +428,7 @@ fi
 
 # Build and (optionally) install the cuvs Python package
 if (( ${NUMARGS} == 0 )) || hasArg python; then
-    SKBUILD_CMAKE_ARGS="${SKBUILD_EXTRA_CMAKE_ARGS}" \
+    SKBUILD_CMAKE_ARGS="${EXTRA_CMAKE_ARGS}" \
         SKBUILD_BUILD_OPTIONS="-j${PARALLEL_LEVEL}" \
         python -m pip install --no-build-isolation --no-deps --config-settings rapidsai.disable-cuda=true ${REPODIR}/python/cuvs
 fi
diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh
index 4994374a8..c6f1232b3 100755
--- a/ci/build_wheel.sh
+++ b/ci/build_wheel.sh
@@ -1,10 +1,11 @@
 #!/bin/bash
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
+# Copyright (c) 2023-2025, NVIDIA CORPORATION.
 
 set -euo pipefail
 
 package_name=$1
 package_dir=$2
+package_type=$3
 underscore_package_name=$(echo "${package_name}" | tr "-" "_")
 
 source rapids-configure-sccache
@@ -16,21 +17,22 @@ rapids-generate-version > ./VERSION
 
 cd "${package_dir}"
 
-case "${RAPIDS_CUDA_VERSION}" in
-  12.*)
-    EXCLUDE_ARGS=(
-      --exclude "libcublas.so.12"
-      --exclude "libcublasLt.so.12"
-      --exclude "libcurand.so.10"
-      --exclude "libcusolver.so.11"
-      --exclude "libcusparse.so.12"
-      --exclude "libnvJitLink.so.12"
+EXCLUDE_ARGS=(
+  --exclude "libraft.so"
+  --exclude "libcublas.so.*"
+  --exclude "libcublasLt.so.*"
+  --exclude "libcurand.so.*"
+  --exclude "libcusolver.so.*"
+  --exclude "libcusparse.so.*"
+  --exclude "libnvJitLink.so.*"
+)
+
+if [[ "${package_dir}" != "python/libcuvs" ]]; then
+    EXCLUDE_ARGS+=(
+      --exclude "libcuvs_c.so"
+      --exclude "libcuvs.so"
     )
-  ;;
-  11.*)
-    EXCLUDE_ARGS=()
-  ;;
-esac
+fi
 
 rapids-logger "Building '${package_name}' wheel"
 
@@ -48,4 +50,4 @@ sccache --show-adv-stats
 mkdir -p final_dist
 python -m auditwheel repair -w final_dist "${EXCLUDE_ARGS[@]}" dist/*
 
-RAPIDS_PY_WHEEL_NAME="${underscore_package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 python final_dist
+RAPIDS_PY_WHEEL_NAME="${underscore_package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 "${package_type}" final_dist
diff --git a/ci/build_wheel_cuvs.sh b/ci/build_wheel_cuvs.sh
index 444657cc0..fb40d1459 100755
--- a/ci/build_wheel_cuvs.sh
+++ b/ci/build_wheel_cuvs.sh
@@ -1,21 +1,20 @@
 #!/bin/bash
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
+# Copyright (c) 2023-2025, NVIDIA CORPORATION.
 
 set -euo pipefail
 
 package_dir="python/cuvs"
 
-case "${RAPIDS_CUDA_VERSION}" in
-  12.*)
-    EXTRA_CMAKE_ARGS=";-DUSE_CUDA_MATH_WHEELS=ON"
-  ;;
-  11.*)
-    EXTRA_CMAKE_ARGS=";-DUSE_CUDA_MATH_WHEELS=OFF"
-  ;;
-esac
+RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen "${RAPIDS_CUDA_VERSION}")"
 
-# Set up skbuild options. Enable sccache in skbuild config options
-export SKBUILD_CMAKE_ARGS="-DDETECT_CONDA_ENV=OFF;-DFIND_CUVS_CPP=OFF${EXTRA_CMAKE_ARGS}"
+# Downloads libcuvs wheels from this current build,
+# then ensures 'cuvs' wheel builds always use the 'libcuvs' just built in the same CI run.
+#
+# Using env variable PIP_CONSTRAINT is necessary to ensure the constraints
+# are used when creating the isolated build environment.
+RAPIDS_PY_WHEEL_NAME="libcuvs_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 cpp /tmp/libcuvs_dist
+echo "libcuvs-${RAPIDS_PY_CUDA_SUFFIX} @ file://$(echo /tmp/libcuvs_dist/libcuvs_*.whl)" > /tmp/constraints.txt
+export PIP_CONSTRAINT="/tmp/constraints.txt"
 
-ci/build_wheel.sh cuvs ${package_dir}
+ci/build_wheel.sh cuvs "${package_dir}" python
 ci/validate_wheel.sh ${package_dir} final_dist
diff --git a/ci/build_wheel_libcuvs.sh b/ci/build_wheel_libcuvs.sh
new file mode 100755
index 000000000..148be89a2
--- /dev/null
+++ b/ci/build_wheel_libcuvs.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+# Copyright (c) 2025, NVIDIA CORPORATION.
+
+# Builds the 'libcuvs' wheel: installs the build requirements up front, then
+# hands off to ci/build_wheel.sh and ci/validate_wheel.sh.
+
+set -euo pipefail
+
+package_name="libcuvs"
+package_dir="python/libcuvs"
+
+rapids-logger "Generating build requirements"
+# selectors for rapids-dependency-file-generator: CUDA version with its last
+# dotted component stripped, machine architecture, and Python version
+matrix_selectors="cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION};cuda_suffixed=true"
+
+rapids-dependency-file-generator \
+  --output requirements \
+  --file-key "py_build_${package_name}" \
+  --file-key "py_rapids_build_${package_name}" \
+  --matrix "${matrix_selectors}" \
+| tee /tmp/requirements-build.txt
+
+rapids-logger "Installing build requirements"
+python -m pip install \
+    -v \
+    --prefer-binary \
+    -r /tmp/requirements-build.txt
+
+# build with '--no-build-isolation', for better sccache hit rate
+# 0 really means "add --no-build-isolation" (ref: https://github.com/pypa/pip/issues/5735)
+export PIP_NO_BUILD_ISOLATION=0
+
+# NOTE(review): assigned but neither exported nor read again in this script;
+# confirm ci/build_wheel.sh derives its own value before relying on this one.
+RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen "${RAPIDS_CUDA_VERSION}")"
+
+# arguments: package name, package directory, wheel type
+ci/build_wheel.sh "${package_name}" "${package_dir}" cpp
+ci/validate_wheel.sh "${package_dir}" final_dist "${package_name}"
diff --git a/ci/check_style.sh b/ci/check_style.sh
index c22f3f9f0..952e94bf1 100755
--- a/ci/check_style.sh
+++ b/ci/check_style.sh
@@ -14,5 +14,12 @@ rapids-dependency-file-generator \
 rapids-mamba-retry env create --yes -f env.yaml -n checks
 conda activate checks
 
+# get config for cmake-format checks
+RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)"
+FORMAT_FILE_URL="https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-${RAPIDS_VERSION_MAJOR_MINOR}/cmake-format-rapids-cmake.json"
+export RAPIDS_CMAKE_FORMAT_FILE=/tmp/rapids_cmake_ci/cmake-formats-rapids-cmake.json
+mkdir -p $(dirname ${RAPIDS_CMAKE_FORMAT_FILE})
+wget -O ${RAPIDS_CMAKE_FORMAT_FILE} ${FORMAT_FILE_URL}
+
 # Run pre-commit checks
 pre-commit run --all-files --show-diff-on-failure
diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh
index 4cf1f0617..7562035a9 100755
--- a/ci/release/update-version.sh
+++ b/ci/release/update-version.sh
@@ -44,8 +44,10 @@ echo "${NEXT_FULL_TAG}" > VERSION
 DEPENDENCIES=(
   dask-cuda
   cuvs
-  pylibraft
+  libcuvs
+  libraft
   librmm
+  pylibraft
   rmm
   rapids-dask-dependency
 )
diff --git a/ci/test_wheel_cuvs.sh b/ci/test_wheel_cuvs.sh
index 7033003e9..862c69a3a 100755
--- a/ci/test_wheel_cuvs.sh
+++ b/ci/test_wheel_cuvs.sh
@@ -1,13 +1,16 @@
 #!/bin/bash
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
+# Copyright (c) 2023-2025, NVIDIA CORPORATION.
 
 set -euo pipefail
 
 mkdir -p ./dist
 RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
-RAPIDS_PY_WHEEL_NAME="cuvs_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist
+RAPIDS_PY_WHEEL_NAME="libcuvs_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 cpp ./local-libcuvs-dep
+RAPIDS_PY_WHEEL_NAME="cuvs_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python ./dist
 
 # echo to expand wildcard before adding `[extra]` requires for pip
-python -m pip install $(echo ./dist/cuvs*.whl)[test]
+python -m pip install \
+    ./local-libcuvs-dep/libcuvs*.whl \
+    "$(echo ./dist/cuvs*.whl)[test]"
 
 python -m pytest ./python/cuvs/cuvs/test
diff --git a/ci/validate_wheel.sh b/ci/validate_wheel.sh
index f2b235765..19d413fa2 100755
--- a/ci/validate_wheel.sh
+++ b/ci/validate_wheel.sh
@@ -8,24 +8,12 @@ wheel_dir_relative_path=$2
 
 RAPIDS_CUDA_MAJOR="${RAPIDS_CUDA_VERSION%%.*}"
 
-# some packages are much larger on CUDA 11 than on CUDA 12
-if [[ "${RAPIDS_CUDA_MAJOR}" == "11" ]]; then
-    PYDISTCHECK_ARGS=(
-        --max-allowed-size-compressed '1.4G'
-    )
-else
-    PYDISTCHECK_ARGS=(
-        --max-allowed-size-compressed '950M'
-    )
-fi
-
 cd "${package_dir}"
 
 rapids-logger "validate packages with 'pydistcheck'"
 
 pydistcheck \
     --inspect \
-    "${PYDISTCHECK_ARGS[@]}" \
     "$(echo ${wheel_dir_relative_path}/*.whl)"
 
 rapids-logger "validate packages with 'twine'"
diff --git a/conda/environments/all_cuda-118_arch-aarch64.yaml b/conda/environments/all_cuda-118_arch-aarch64.yaml
index 01853da84..123acb421 100644
--- a/conda/environments/all_cuda-118_arch-aarch64.yaml
+++ b/conda/environments/all_cuda-118_arch-aarch64.yaml
@@ -9,8 +9,7 @@ channels:
 dependencies:
 - breathe>=4.35.0
 - c-compiler
-- clang
-- clang-tools=16.0.6
+- clang-tools==16.0.6
 - clang==16.0.6
 - cmake>=3.26.4,!=3.30.0
 - cuda-nvtx=11.8
@@ -26,7 +25,7 @@ dependencies:
 - gcc_linux-aarch64=11.*
 - graphviz
 - ipython
-- libclang
+- libclang==16.0.6
 - libcublas-dev=11.11.3.6
 - libcublas=11.11.3.6
 - libcurand-dev=10.3.0.86
@@ -55,7 +54,7 @@ dependencies:
 - sphinx-copybutton
 - sphinx-markdown-tables
 - sphinx>=8.0.0
-- sysroot_linux-aarch64==2.17
+- sysroot_linux-aarch64==2.28
 - pip:
   - nvidia-sphinx-theme
 name: all_cuda-118_arch-aarch64
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index a1ad68d7f..c6a65e684 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -9,8 +9,7 @@ channels:
 dependencies:
 - breathe>=4.35.0
 - c-compiler
-- clang
-- clang-tools=16.0.6
+- clang-tools==16.0.6
 - clang==16.0.6
 - cmake>=3.26.4,!=3.30.0
 - cuda-nvtx=11.8
@@ -26,7 +25,7 @@ dependencies:
 - gcc_linux-64=11.*
 - graphviz
 - ipython
-- libclang
+- libclang==16.0.6
 - libcublas-dev=11.11.3.6
 - libcublas=11.11.3.6
 - libcurand-dev=10.3.0.86
@@ -55,7 +54,7 @@ dependencies:
 - sphinx-copybutton
 - sphinx-markdown-tables
 - sphinx>=8.0.0
-- sysroot_linux-64==2.17
+- sysroot_linux-64==2.28
 - pip:
   - nvidia-sphinx-theme
 name: all_cuda-118_arch-x86_64
diff --git a/conda/environments/all_cuda-125_arch-aarch64.yaml b/conda/environments/all_cuda-125_arch-aarch64.yaml
index ee0213fff..b71f5ed43 100644
--- a/conda/environments/all_cuda-125_arch-aarch64.yaml
+++ b/conda/environments/all_cuda-125_arch-aarch64.yaml
@@ -9,8 +9,7 @@ channels:
 dependencies:
 - breathe>=4.35.0
 - c-compiler
-- clang
-- clang-tools=16.0.6
+- clang-tools==16.0.6
 - clang==16.0.6
 - cmake>=3.26.4,!=3.30.0
 - cuda-cudart-dev
@@ -24,10 +23,10 @@ dependencies:
 - cython>=3.0.0
 - dlpack>=0.8,<1.0
 - doxygen>=1.8.20
-- gcc_linux-aarch64=11.*
+- gcc_linux-aarch64=13.*
 - graphviz
 - ipython
-- libclang
+- libclang==16.0.6
 - libcublas-dev
 - libcurand-dev
 - libcusolver-dev
@@ -51,7 +50,7 @@ dependencies:
 - sphinx-copybutton
 - sphinx-markdown-tables
 - sphinx>=8.0.0
-- sysroot_linux-aarch64==2.17
+- sysroot_linux-aarch64==2.28
 - pip:
   - nvidia-sphinx-theme
 name: all_cuda-125_arch-aarch64
diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml
index d93dcaf7a..16cd595d3 100644
--- a/conda/environments/all_cuda-125_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-125_arch-x86_64.yaml
@@ -9,8 +9,7 @@ channels:
 dependencies:
 - breathe>=4.35.0
 - c-compiler
-- clang
-- clang-tools=16.0.6
+- clang-tools==16.0.6
 - clang==16.0.6
 - cmake>=3.26.4,!=3.30.0
 - cuda-cudart-dev
@@ -24,10 +23,10 @@ dependencies:
 - cython>=3.0.0
 - dlpack>=0.8,<1.0
 - doxygen>=1.8.20
-- gcc_linux-64=11.*
+- gcc_linux-64=13.*
 - graphviz
 - ipython
-- libclang
+- libclang==16.0.6
 - libcublas-dev
 - libcurand-dev
 - libcusolver-dev
@@ -51,7 +50,7 @@ dependencies:
 - sphinx-copybutton
 - sphinx-markdown-tables
 - sphinx>=8.0.0
-- sysroot_linux-64==2.17
+- sysroot_linux-64==2.28
 - pip:
   - nvidia-sphinx-theme
 name: all_cuda-125_arch-x86_64
diff --git a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml
index a90dc03e7..2e2ad8446 100644
--- a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml
+++ b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml
@@ -9,7 +9,7 @@ channels:
 dependencies:
 - benchmark>=1.8.2
 - c-compiler
-- clang-tools=16.0.6
+- clang-tools==16.0.6
 - clang==16.0.6
 - click
 - cmake>=3.26.4,!=3.30.0
@@ -26,6 +26,7 @@ dependencies:
 - gcc_linux-aarch64=11.*
 - glog>=0.6.0
 - h5py>=3.8.0
+- libclang==16.0.6
 - libcublas-dev=11.11.3.6
 - libcublas=11.11.3.6
 - libcurand-dev=10.3.0.86
@@ -47,6 +48,6 @@ dependencies:
 - pyyaml
 - rapids-build-backend>=0.3.0,<0.4.0.dev0
 - setuptools
-- sysroot_linux-aarch64==2.17
+- sysroot_linux-aarch64==2.28
 - wheel
 name: bench_ann_cuda-118_arch-aarch64
diff --git a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
index b7344c822..90243415c 100644
--- a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
@@ -9,7 +9,7 @@ channels:
 dependencies:
 - benchmark>=1.8.2
 - c-compiler
-- clang-tools=16.0.6
+- clang-tools==16.0.6
 - clang==16.0.6
 - click
 - cmake>=3.26.4,!=3.30.0
@@ -26,6 +26,7 @@ dependencies:
 - gcc_linux-64=11.*
 - glog>=0.6.0
 - h5py>=3.8.0
+- libclang==16.0.6
 - libcublas-dev=11.11.3.6
 - libcublas=11.11.3.6
 - libcurand-dev=10.3.0.86
@@ -47,6 +48,6 @@ dependencies:
 - pyyaml
 - rapids-build-backend>=0.3.0,<0.4.0.dev0
 - setuptools
-- sysroot_linux-64==2.17
+- sysroot_linux-64==2.28
 - wheel
 name: bench_ann_cuda-118_arch-x86_64
diff --git a/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml
index da7229004..34e01aeea 100644
--- a/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml
+++ b/conda/environments/bench_ann_cuda-125_arch-aarch64.yaml
@@ -9,7 +9,7 @@ channels:
 dependencies:
 - benchmark>=1.8.2
 - c-compiler
-- clang-tools=16.0.6
+- clang-tools==16.0.6
 - clang==16.0.6
 - click
 - cmake>=3.26.4,!=3.30.0
@@ -24,9 +24,10 @@ dependencies:
 - cxx-compiler
 - cython>=3.0.0
 - dlpack>=0.8,<1.0
-- gcc_linux-aarch64=11.*
+- gcc_linux-aarch64=13.*
 - glog>=0.6.0
 - h5py>=3.8.0
+- libclang==16.0.6
 - libcublas-dev
 - libcurand-dev
 - libcusolver-dev
@@ -43,6 +44,6 @@ dependencies:
 - pyyaml
 - rapids-build-backend>=0.3.0,<0.4.0.dev0
 - setuptools
-- sysroot_linux-aarch64==2.17
+- sysroot_linux-aarch64==2.28
 - wheel
 name: bench_ann_cuda-125_arch-aarch64
diff --git a/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml
index 5d1dd8fc7..dcfb54a22 100644
--- a/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml
+++ b/conda/environments/bench_ann_cuda-125_arch-x86_64.yaml
@@ -9,7 +9,7 @@ channels:
 dependencies:
 - benchmark>=1.8.2
 - c-compiler
-- clang-tools=16.0.6
+- clang-tools==16.0.6
 - clang==16.0.6
 - click
 - cmake>=3.26.4,!=3.30.0
@@ -24,9 +24,10 @@ dependencies:
 - cxx-compiler
 - cython>=3.0.0
 - dlpack>=0.8,<1.0
-- gcc_linux-64=11.*
+- gcc_linux-64=13.*
 - glog>=0.6.0
 - h5py>=3.8.0
+- libclang==16.0.6
 - libcublas-dev
 - libcurand-dev
 - libcusolver-dev
@@ -43,6 +44,6 @@ dependencies:
 - pyyaml
 - rapids-build-backend>=0.3.0,<0.4.0.dev0
 - setuptools
-- sysroot_linux-64==2.17
+- sysroot_linux-64==2.28
 - wheel
 name: bench_ann_cuda-125_arch-x86_64
diff --git a/conda/recipes/cuvs-bench-cpu/conda_build_config.yaml b/conda/recipes/cuvs-bench-cpu/conda_build_config.yaml
index ed6f708e1..5407d7c17 100644
--- a/conda/recipes/cuvs-bench-cpu/conda_build_config.yaml
+++ b/conda/recipes/cuvs-bench-cpu/conda_build_config.yaml
@@ -1,14 +1,16 @@
 c_compiler_version:
-  - 11
+  - 13  # [not os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")]
+  - 11  # [os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")]
 
 cxx_compiler_version:
-  - 11
+  - 13  # [not os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")]
+  - 11  # [os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")]
 
 c_stdlib:
   - sysroot
 
 c_stdlib_version:
-  - "2.17"
+  - "2.28"
 
 cmake_version:
   - ">=3.26.4,!=3.30.0"
diff --git a/conda/recipes/cuvs-bench/conda_build_config.yaml b/conda/recipes/cuvs-bench/conda_build_config.yaml
index 47bd730da..ccd7341d1 100644
--- a/conda/recipes/cuvs-bench/conda_build_config.yaml
+++ b/conda/recipes/cuvs-bench/conda_build_config.yaml
@@ -1,20 +1,20 @@
 c_compiler_version:
-  - 11
+  - 13  # [not os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")]
+  - 11  # [os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")]
 
 cxx_compiler_version:
-  - 11
+  - 13  # [not os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")]
+  - 11  # [os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")]
 
 cuda_compiler:
-  - cuda-nvcc
-
-cuda11_compiler:
-  - nvcc
+  - cuda-nvcc  # [not os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")]
+  - nvcc  # [os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")]
 
 c_stdlib:
   - sysroot
 
 c_stdlib_version:
-  - "2.17"
+  - "2.28"
 
 cmake_version:
   - ">=3.26.4,!=3.30.0"
diff --git a/conda/recipes/cuvs-bench/meta.yaml b/conda/recipes/cuvs-bench/meta.yaml
index d77aee8ce..33b1745ec 100644
--- a/conda/recipes/cuvs-bench/meta.yaml
+++ b/conda/recipes/cuvs-bench/meta.yaml
@@ -37,10 +37,8 @@ build:
   number: {{ GIT_DESCRIBE_NUMBER }}
   string: cuda{{ cuda_major }}_py{{ py_version }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }}
   ignore_run_exports_from:
-    {% if cuda_major == "11" %}
-    - {{ compiler('cuda11') }}
-    {% else %}
     - {{ compiler('cuda') }}
+    {% if cuda_major != "11" %}
     - cuda-cudart-dev
     - libcublas-dev
     {% endif %}
@@ -50,7 +48,7 @@ requirements:
     - {{ compiler('c') }}
     - {{ compiler('cxx') }}
     {% if cuda_major == "11" %}
-    - {{ compiler('cuda11') }} ={{ cuda_version }}
+    - {{ compiler('cuda') }} ={{ cuda_version }}
     {% else %}
     - {{ compiler('cuda') }}
     {% endif %}
diff --git a/conda/recipes/cuvs/conda_build_config.yaml b/conda/recipes/cuvs/conda_build_config.yaml
index 001878ff2..83f5ebcb1 100644
--- a/conda/recipes/cuvs/conda_build_config.yaml
+++ b/conda/recipes/cuvs/conda_build_config.yaml
@@ -1,20 +1,20 @@
 c_compiler_version:
-  - 11
+  - 13  # [not os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")]
+  - 11  # [os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")]
 
 cxx_compiler_version:
-  - 11
+  - 13  # [not os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")]
+  - 11  # [os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")]
 
 cuda_compiler:
-  - cuda-nvcc
-
-cuda11_compiler:
-  - nvcc
+  - cuda-nvcc  # [not os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")]
+  - nvcc  # [os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")]
 
 c_stdlib:
   - sysroot
 
 c_stdlib_version:
-  - "2.17"
+  - "2.28"
 
 cmake_version:
   - ">=3.26.4,!=3.30.0"
diff --git a/conda/recipes/cuvs/meta.yaml b/conda/recipes/cuvs/meta.yaml
index ad7ffe756..25fc204a8 100644
--- a/conda/recipes/cuvs/meta.yaml
+++ b/conda/recipes/cuvs/meta.yaml
@@ -20,10 +20,8 @@ build:
   number: {{ GIT_DESCRIBE_NUMBER }}
   string: cuda{{ cuda_major }}_py{{ py_version }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }}
   ignore_run_exports_from:
-    {% if cuda_major == "11" %}
-    - {{ compiler('cuda11') }}
-    {% else %}
     - {{ compiler('cuda') }}
+    {% if cuda_major != "11" %}
     - cuda-cudart-dev
     {% endif %}
     - cuda-python
@@ -33,7 +31,7 @@ requirements:
     - {{ compiler('c') }}
     - {{ compiler('cxx') }}
     {% if cuda_major == "11" %}
-    - {{ compiler('cuda11') }} ={{ cuda_version }}
+    - {{ compiler('cuda') }} ={{ cuda_version }}
     {% else %}
     - {{ compiler('cuda') }}
     {% endif %}
diff --git a/conda/recipes/libcuvs/conda_build_config.yaml b/conda/recipes/libcuvs/conda_build_config.yaml
index b8c49943e..72cc4415d 100644
--- a/conda/recipes/libcuvs/conda_build_config.yaml
+++ b/conda/recipes/libcuvs/conda_build_config.yaml
@@ -1,20 +1,20 @@
 c_compiler_version:
-  - 11
+  - 13  # [not os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")]
+  - 11  # [os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")]
 
 cxx_compiler_version:
-  - 11
+  - 13  # [not os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")]
+  - 11  # [os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")]
 
 cuda_compiler:
-  - cuda-nvcc
-
-cuda11_compiler:
-  - nvcc
+  - cuda-nvcc  # [not os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")]
+  - nvcc  # [os.environ.get("RAPIDS_CUDA_VERSION", "").startswith("11")]
 
 c_stdlib:
   - sysroot
 
 c_stdlib_version:
-  - "2.17"
+  - "2.28"
 
 cmake_version:
   - ">=3.26.4,!=3.30.0"
diff --git a/conda/recipes/libcuvs/meta.yaml b/conda/recipes/libcuvs/meta.yaml
index 46552c397..fd466cd22 100644
--- a/conda/recipes/libcuvs/meta.yaml
+++ b/conda/recipes/libcuvs/meta.yaml
@@ -39,10 +39,8 @@ outputs:
       number: {{ GIT_DESCRIBE_NUMBER }}
       string: cuda{{ cuda_major }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }}
       ignore_run_exports_from:
-        {% if cuda_major == "11" %}
-        - {{ compiler('cuda11') }}
-        {% else %}
         - {{ compiler('cuda') }}
+        {% if cuda_major != "11" %}
         - cuda-cudart-dev
         - libcublas-dev
         - libcurand-dev
@@ -54,7 +52,7 @@ outputs:
         - {{ compiler('c') }}
         - {{ compiler('cxx') }}
         {% if cuda_major == "11" %}
-        - {{ compiler('cuda11') }} ={{ cuda_version }}
+        - {{ compiler('cuda') }} ={{ cuda_version }}
         {% else %}
         - {{ compiler('cuda') }}
         {% endif %}
@@ -106,10 +104,8 @@ outputs:
       number: {{ GIT_DESCRIBE_NUMBER }}
       string: cuda{{ cuda_major }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }}
       ignore_run_exports_from:
-        {% if cuda_major == "11" %}
-        - {{ compiler('cuda11') }}
-        {% else %}
         - {{ compiler('cuda') }}
+        {% if cuda_major != "11" %}
         - cuda-cudart-dev
         - libcublas-dev
         - libcurand-dev
@@ -121,7 +117,7 @@ outputs:
         - {{ compiler('c') }}
         - {{ compiler('cxx') }}
         {% if cuda_major == "11" %}
-        - {{ compiler('cuda11') }} ={{ cuda_version }}
+        - {{ compiler('cuda') }} ={{ cuda_version }}
         {% else %}
         - {{ compiler('cuda') }}
         {% endif %}
@@ -174,10 +170,8 @@ outputs:
       number: {{ GIT_DESCRIBE_NUMBER }}
       string: cuda{{ cuda_major }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }}
       ignore_run_exports_from:
-        {% if cuda_major == "11" %}
-        - {{ compiler('cuda11') }}
-        {% else %}
         - {{ compiler('cuda') }}
+        {% if cuda_major != "11" %}
         - cuda-cudart-dev
         - libcublas-dev
         - libcurand-dev
@@ -189,7 +183,7 @@ outputs:
         - {{ compiler('c') }}
         - {{ compiler('cxx') }}
         {% if cuda_major == "11" %}
-        - {{ compiler('cuda11') }} ={{ cuda_version }}
+        - {{ compiler('cuda') }} ={{ cuda_version }}
         {% else %}
         - {{ compiler('cuda') }}
         {% endif %}
@@ -246,10 +240,8 @@ outputs:
       number: {{ GIT_DESCRIBE_NUMBER }}
       string: cuda{{ cuda_major }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }}
       ignore_run_exports_from:
-        {% if cuda_major == "11" %}
-        - {{ compiler('cuda11') }}
-        {% else %}
         - {{ compiler('cuda') }}
+        {% if cuda_major != "11" %}
         - cuda-cudart-dev
         - libcublas-dev
         - libcurand-dev
@@ -261,7 +253,7 @@ outputs:
         - {{ compiler('c') }}
         - {{ compiler('cxx') }}
         {% if cuda_major == "11" %}
-        - {{ compiler('cuda11') }} ={{ cuda_version }}
+        - {{ compiler('cuda') }} ={{ cuda_version }}
         {% else %}
         - {{ compiler('cuda') }}
         {% endif %}
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 26c0b82d3..11f21db44 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -68,6 +68,7 @@ option(CUDA_LOG_COMPILE_TIME "Write a log of compilation times to nvcc_compile_l
 option(DETECT_CONDA_ENV "Enable detection of conda environment for dependencies" ON)
 option(DISABLE_DEPRECATION_WARNINGS "Disable deprecaction warnings " ON)
 option(DISABLE_OPENMP "Disable OpenMP" OFF)
+option(CUVS_COMPILE_DYNAMIC_ONLY "Only build the shared library and skip the static library." OFF)
 option(CUVS_NVTX "Enable nvtx markers" OFF)
 option(CUVS_RAFT_CLONE_ON_PIN "Explicitly clone RAFT branch when pinned to non-feature branch" ON)
 
@@ -94,6 +95,7 @@ include(CMakeDependentOption)
 message(VERBOSE "cuVS: Build cuVS unit-tests: ${BUILD_TESTS}")
 message(VERBOSE "cuVS: Build CPU only components: ${BUILD_CPU_ONLY}")
 message(VERBOSE "cuVS: Build ANN benchmarks: ${BUILD_CUVS_BENCH}")
+message(VERBOSE "cuVS: Build only the shared library: ${CUVS_COMPILE_DYNAMIC_ONLY}")
 message(VERBOSE "cuVS: Enable detection of conda environment for dependencies: ${DETECT_CONDA_ENV}")
 message(VERBOSE "cuVS: Disable depreaction warnings " ${DISABLE_DEPRECATION_WARNINGS})
 message(VERBOSE "cuVS: Disable OpenMP: ${DISABLE_OPENMP}")
@@ -493,7 +495,10 @@ if(BUILD_SHARED_LIBS)
   )
 
   add_library(cuvs SHARED $<FILTER:$<TARGET_OBJECTS:cuvs_objs>,EXCLUDE,rmm.*logger>)
-  add_library(cuvs_static STATIC $<FILTER:$<TARGET_OBJECTS:cuvs_objs>,EXCLUDE,rmm.*logger>)
+
+  if(NOT CUVS_COMPILE_DYNAMIC_ONLY)
+    add_library(cuvs_static STATIC $<FILTER:$<TARGET_OBJECTS:cuvs_objs>,EXCLUDE,rmm.*logger>)
+  endif()
 
   target_compile_options(
     cuvs INTERFACE $<$<COMPILE_LANG_AND_ID:CUDA,NVIDIA>:--expt-extended-lambda
@@ -501,20 +506,23 @@ if(BUILD_SHARED_LIBS)
   )
 
   add_library(cuvs::cuvs ALIAS cuvs)
-  add_library(cuvs::cuvs_static ALIAS cuvs_static)
 
-  set_target_properties(
-    cuvs_static
-    PROPERTIES BUILD_RPATH "\$ORIGIN"
-               INSTALL_RPATH "\$ORIGIN"
-               CXX_STANDARD 17
-               CXX_STANDARD_REQUIRED ON
-               POSITION_INDEPENDENT_CODE ON
-               INTERFACE_POSITION_INDEPENDENT_CODE ON
-               EXPORT_NAME cuvs_static
-  )
+  if(NOT CUVS_COMPILE_DYNAMIC_ONLY)
+    add_library(cuvs::cuvs_static ALIAS cuvs_static)
 
-  target_compile_options(cuvs_static PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:${CUVS_CXX_FLAGS}>")
+    set_target_properties(
+      cuvs_static
+      PROPERTIES BUILD_RPATH "\$ORIGIN"
+                 INSTALL_RPATH "\$ORIGIN"
+                 CXX_STANDARD 17
+                 CXX_STANDARD_REQUIRED ON
+                 POSITION_INDEPENDENT_CODE ON
+                 INTERFACE_POSITION_INDEPENDENT_CODE ON
+                 EXPORT_NAME cuvs_static
+    )
+
+    target_compile_options(cuvs_static PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:${CUVS_CXX_FLAGS}>")
+  endif()
 
   target_include_directories(
     cuvs_objs
@@ -523,19 +531,21 @@ if(BUILD_SHARED_LIBS)
     INTERFACE "$<INSTALL_INTERFACE:include>"
   )
 
-  target_include_directories(
-    cuvs_static
-    PUBLIC "$<BUILD_INTERFACE:${DLPACK_INCLUDE_DIR}>"
-    INTERFACE "$<INSTALL_INTERFACE:include>"
-  )
+  if(NOT CUVS_COMPILE_DYNAMIC_ONLY)
+    target_include_directories(
+      cuvs_static
+      PUBLIC "$<BUILD_INTERFACE:${DLPACK_INCLUDE_DIR}>"
+      INTERFACE "$<INSTALL_INTERFACE:include>"
+    )
 
-  # ensure CUDA symbols aren't relocated to the middle of the debug build binaries
-  target_link_options(cuvs_static PRIVATE $<HOST_LINK:${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld>)
+    # ensure CUDA symbols aren't relocated to the middle of the debug build binaries
+    target_link_options(cuvs_static PRIVATE $<HOST_LINK:${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld>)
 
-  target_include_directories(
-    cuvs_static PUBLIC "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>"
-                       "$<INSTALL_INTERFACE:include>"
-  )
+    target_include_directories(
+      cuvs_static PUBLIC "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>"
+                         "$<INSTALL_INTERFACE:include>"
+    )
+  endif()
 
   target_include_directories(
     cuvs PUBLIC "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>"
@@ -571,11 +581,13 @@ if(BUILD_SHARED_LIBS)
               cuvs-cagra-search ${CUVS_COMMS_DEPENDENCY}
     )
 
-    target_link_libraries(
-      cuvs_static
-      PUBLIC rmm::rmm raft::raft ${CUVS_CTK_MATH_DEPENDENCIES}
-      PRIVATE nvidia::cutlass::cutlass $<TARGET_NAME_IF_EXISTS:OpenMP::OpenMP_CXX>
-    )
+    if(NOT CUVS_COMPILE_DYNAMIC_ONLY)
+      target_link_libraries(
+        cuvs_static
+        PUBLIC rmm::rmm raft::raft ${CUVS_CTK_MATH_DEPENDENCIES}
+        PRIVATE nvidia::cutlass::cutlass $<TARGET_NAME_IF_EXISTS:OpenMP::OpenMP_CXX>
+      )
+    endif()
   endif()
 
   if(BUILD_MG_ALGOS)
@@ -718,8 +730,13 @@ target_compile_definitions(cuvs::cuvs INTERFACE $<$<BOOL:${CUVS_NVTX}>:NVTX_ENAB
   include(GNUInstallDirs)
   include(CPack)
 
+  set(_cuvs_lib_targets cuvs)
+  if(NOT CUVS_COMPILE_DYNAMIC_ONLY)
+    list(APPEND _cuvs_lib_targets cuvs_static)
+  endif()
+
   install(
-    TARGETS cuvs cuvs_static
+    TARGETS ${_cuvs_lib_targets}
     DESTINATION ${lib_dir}
     COMPONENT cuvs
     EXPORT cuvs-exports
diff --git a/cpp/cmake/thirdparty/get_raft.cmake b/cpp/cmake/thirdparty/get_raft.cmake
index 2e57df84e..845c7a833 100644
--- a/cpp/cmake/thirdparty/get_raft.cmake
+++ b/cpp/cmake/thirdparty/get_raft.cmake
@@ -44,6 +44,7 @@ function(find_and_configure_raft)
             INSTALL_EXPORT_SET  cuvs-exports
             COMPONENTS          ${RAFT_COMPONENTS}
             CPM_ARGS
+              EXCLUDE_FROM_ALL      TRUE
               GIT_REPOSITORY        https://github.com/${PKG_FORK}/raft.git
               GIT_TAG               ${PKG_PINNED_TAG}
               SOURCE_SUBDIR         cpp
diff --git a/cpp/src/distance/detail/distance_ops/l2_exp.cuh b/cpp/src/distance/detail/distance_ops/l2_exp.cuh
index 04817aa8b..f49771605 100644
--- a/cpp/src/distance/detail/distance_ops/l2_exp.cuh
+++ b/cpp/src/distance/detail/distance_ops/l2_exp.cuh
@@ -28,14 +28,14 @@ namespace cuvs::distance::detail::ops {
  * for round-off error tolerance.
  * @tparam DataT
  */
-template <typename DataT>
-__device__ constexpr DataT get_clamp_precision()
+template <typename DataT, typename AccT>
+__device__ constexpr AccT get_clamp_precision()
 {
   switch (sizeof(DataT)) {
-    case 2: return 1e-3;
-    case 4: return 1e-6;
-    case 8: return 1e-15;
-    default: return 0;
+    case 2: return AccT{1e-3};
+    case 4: return AccT{1e-6};
+    case 8: return AccT{1e-15};
+    default: return AccT{0};
   }
 }
 
@@ -46,19 +46,27 @@ struct l2_exp_cutlass_op {
 
   __device__ l2_exp_cutlass_op() noexcept : sqrt(false) {}
   __device__ l2_exp_cutlass_op(bool isSqrt) noexcept : sqrt(isSqrt) {}
-  inline __device__ AccT operator()(DataT aNorm, DataT bNorm, DataT accVal) const noexcept
+  inline __device__ AccT operator()(AccT aNorm, AccT bNorm, AccT accVal) const noexcept
   {
-    AccT outVal = aNorm + bNorm - DataT(2.0) * accVal;
+    AccT outVal = aNorm + bNorm - AccT(2.0) * accVal;
 
     /**
      * Self-neighboring points should have (aNorm == bNorm) == accVal and the dot product (accVal)
      * can sometimes have round-off errors, which will cause (aNorm == bNorm) ~ accVal instead.
      */
-    outVal = outVal * AccT(!((outVal * outVal < get_clamp_precision<AccT>()) * (aNorm == bNorm)));
+    outVal =
+      outVal * AccT(!((outVal * outVal < get_clamp_precision<DataT, AccT>()) * (aNorm == bNorm)));
     return sqrt ? raft::sqrt(outVal * static_cast<AccT>(outVal > AccT(0))) : outVal;
   }
 
-  __device__ AccT operator()(DataT aData) const noexcept { return aData; }
+  __device__ AccT operator()(DataT aData) const noexcept
+  {
+    if constexpr (std::is_same_v<DataT, half> && std::is_same_v<AccT, float>) {
+      return __half2float(aData);
+    } else {
+      return aData;
+    }
+  }
 };
 
 /**
@@ -121,9 +129,9 @@ struct l2_exp_distance_op {
          * (accVal) can sometimes have round-off errors, which will cause (aNorm == bNorm) ~ accVal
          * instead.
          */
-        acc[i][j] =
-          val * static_cast<AccT>((val > AccT(0))) *
-          static_cast<AccT>(!((val * val < get_clamp_precision<AccT>()) * (regxn[i] == regyn[j])));
+        acc[i][j] = val * static_cast<AccT>((val > AccT(0))) *
+                    static_cast<AccT>(
+                      !((val * val < get_clamp_precision<DataT, AccT>()) * (regxn[i] == regyn[j])));
       }
     }
     if (sqrt) {
diff --git a/cpp/src/neighbors/detail/cagra/add_nodes.cuh b/cpp/src/neighbors/detail/cagra/add_nodes.cuh
index 63f5c51a6..913094e2a 100644
--- a/cpp/src/neighbors/detail/cagra/add_nodes.cuh
+++ b/cpp/src/neighbors/detail/cagra/add_nodes.cuh
@@ -37,7 +37,8 @@ void add_node_core(
   const cuvs::neighbors::cagra::index<T, IdxT>& idx,
   raft::mdspan<const T, raft::matrix_extent<int64_t>, raft::layout_stride, Accessor>
     additional_dataset_view,
-  raft::host_matrix_view<IdxT, std::int64_t> updated_graph)
+  raft::host_matrix_view<IdxT, std::int64_t> updated_graph,
+  const cuvs::neighbors::cagra::extend_params& extend_params)
 {
   using DistanceT                 = float;
   const std::size_t degree        = idx.graph_degree();
@@ -68,7 +69,19 @@ void add_node_core(
              new_size,
              raft::resource::get_cuda_stream(handle));
 
-  const std::size_t max_chunk_size = 1024;
+  std::size_t data_size_per_vector =
+    sizeof(IdxT) * base_degree + sizeof(DistanceT) * base_degree + sizeof(T) * dim;
+  cudaPointerAttributes attr;
+  RAFT_CUDA_TRY(cudaPointerGetAttributes(&attr, additional_dataset_view.data_handle()));
+  if (attr.devicePointer == nullptr) {
+    // not device-accessible: reserve one more vector's worth of space for batch_load_iterator
+    data_size_per_vector += sizeof(T) * dim;
+  }
+
+  const std::size_t max_search_batch_size =
+    std::min(std::max(1lu, raft::resource::get_workspace_free_bytes(handle) / data_size_per_vector),
+             num_add);
+  RAFT_EXPECTS(max_search_batch_size > 0, "Not enough working memory space is left.");
 
   cuvs::neighbors::cagra::search_params params;
   params.itopk_size = std::max(base_degree * 2lu, 256lu);
@@ -77,24 +90,24 @@ void add_node_core(
   auto mr = raft::resource::get_workspace_resource(handle);
 
   auto neighbor_indices = raft::make_device_mdarray<IdxT, std::int64_t>(
-    handle, mr, raft::make_extents<std::int64_t>(max_chunk_size, base_degree));
+    handle, mr, raft::make_extents<std::int64_t>(max_search_batch_size, base_degree));
 
   auto neighbor_distances = raft::make_device_mdarray<DistanceT, std::int64_t>(
-    handle, mr, raft::make_extents<std::int64_t>(max_chunk_size, base_degree));
+    handle, mr, raft::make_extents<std::int64_t>(max_search_batch_size, base_degree));
 
   auto queries = raft::make_device_mdarray<T, std::int64_t>(
-    handle, mr, raft::make_extents<std::int64_t>(max_chunk_size, dim));
+    handle, mr, raft::make_extents<std::int64_t>(max_search_batch_size, dim));
 
   auto host_neighbor_indices =
-    raft::make_host_matrix<IdxT, std::int64_t>(max_chunk_size, base_degree);
+    raft::make_host_matrix<IdxT, std::int64_t>(max_search_batch_size, base_degree);
 
   cuvs::spatial::knn::detail::utils::batch_load_iterator<T> additional_dataset_batch(
     additional_dataset_view.data_handle(),
     num_add,
     additional_dataset_view.stride(0),
-    max_chunk_size,
+    max_search_batch_size,
     raft::resource::get_cuda_stream(handle),
-    raft::resource::get_workspace_resource(handle));
+    mr);
   for (const auto& batch : additional_dataset_batch) {
     // Step 1: Obtain K (=base_degree) nearest neighbors of the new vectors by CAGRA search
     // Create queries
@@ -298,7 +311,8 @@ void add_graph_nodes(
   const std::size_t degree               = index.graph_degree();
   const std::size_t dim                  = index.dim();
   const std::size_t stride               = input_updated_dataset_view.stride(0);
-  const std::size_t max_chunk_size_      = params.max_chunk_size == 0 ? 1 : params.max_chunk_size;
+  const std::size_t max_chunk_size_ =
+    params.max_chunk_size == 0 ? new_dataset_size : params.max_chunk_size;
 
   raft::copy(updated_graph_view.data_handle(),
              index.graph().data_handle(),
@@ -342,7 +356,7 @@ void add_graph_nodes(
       stride);
 
     neighbors::cagra::add_node_core<T, IdxT>(
-      handle, internal_index, additional_dataset_view, updated_graph);
+      handle, internal_index, additional_dataset_view, updated_graph, params);
     raft::resource::sync_stream(handle);
   }
 }
diff --git a/cpp/test/distance/masked_nn.cu b/cpp/test/distance/masked_nn.cu
index a8f2f5163..a1c784669 100644
--- a/cpp/test/distance/masked_nn.cu
+++ b/cpp/test/distance/masked_nn.cu
@@ -314,8 +314,8 @@ template <typename K, typename V, typename L>
                                        cudaStream_t stream = 0)
 {
   typedef typename raft::KeyValuePair<K, V> KVP;
-  std::shared_ptr<KVP> exp_h(new KVP[size]);
-  std::shared_ptr<KVP> act_h(new KVP[size]);
+  std::shared_ptr<KVP[]> exp_h(new KVP[size]);
+  std::shared_ptr<KVP[]> act_h(new KVP[size]);
   raft::update_host<KVP>(exp_h.get(), expected, size, stream);
   raft::update_host<KVP>(act_h.get(), actual, size, stream);
   RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
diff --git a/dependencies.yaml b/dependencies.yaml
index fbd1d8372..478b2acc2 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -7,15 +7,16 @@ files:
       arch: [x86_64, aarch64]
     includes:
       - build
+      - build_cython
       - build_py_cuvs
       - build_wheels
       - checks
+      - clang
       - cuda
       - cuda_version
       - depends_on_cupy
       - depends_on_librmm
       - depends_on_pylibraft
-      - develop
       - docs
       - rapids_build
       - run_py_cuvs
@@ -31,13 +32,15 @@ files:
     includes:
       - bench
       - bench_python
+      - build_cython
       - build_py_cuvs
+      - clang
       - cuda
       - cuda_version
       - depends_on_cupy
       - depends_on_pylibraft
+      - depends_on_libcuvs
       - depends_on_librmm
-      - develop
       - rapids_build
       - rapids_build_setuptools
   test_cpp:
@@ -61,6 +64,7 @@ files:
   docs:
     output: none
     includes:
+      - clang
       - cuda
       - cuda_version
       - depends_on_cupy
@@ -71,10 +75,37 @@ files:
   rust:
     output: none
     includes:
+      # clang/libclang only needed for bindgen support
+      - clang
       - cuda
       - cuda_version
       - rapids_build
       - rust
+  py_build_libcuvs:
+    output: pyproject
+    pyproject_dir: python/libcuvs
+    extras:
+      table: build-system
+    includes:
+      - build
+  py_rapids_build_libcuvs:
+    output: pyproject
+    pyproject_dir: python/libcuvs
+    extras:
+      table: tool.rapids-build-backend
+      key: requires
+    includes:
+      - depends_on_libraft
+      - depends_on_librmm
+      - rapids_build
+  py_run_libcuvs:
+    output: pyproject
+    pyproject_dir: python/libcuvs
+    extras:
+      table: project
+    includes:
+      - cuda_wheels
+      - depends_on_libraft
   py_build_cuvs:
     output: pyproject
     pyproject_dir: python/cuvs
@@ -89,7 +120,11 @@ files:
       table: tool.rapids-build-backend
       key: requires
     includes:
+      - build_cython
       - build_py_cuvs
+      - depends_on_libcuvs
+      - depends_on_libraft
+      - depends_on_librmm
       - rapids_build
   py_run_cuvs:
     output: pyproject
@@ -97,7 +132,6 @@ files:
     extras:
       table: project
     includes:
-      - cuda_wheels
       - depends_on_pylibraft
       - run_py_cuvs
   py_test_cuvs:
@@ -149,12 +183,16 @@ dependencies:
       - output_types: [requirements, pyproject]
         packages:
           - scikit-build-core[pyproject]>=0.10.0
+  build_cython:
+    common:
+      - output_types: [conda, requirements, pyproject]
+        packages:
+          - cython>=3.0.0
   rapids_build:
     common:
       - output_types: [conda, requirements, pyproject]
         packages:
           - &cmake_ver cmake>=3.26.4,!=3.30.0
-          - cython>=3.0.0
           - ninja
       - output_types: [conda]
         packages:
@@ -166,14 +204,28 @@ dependencies:
         matrices:
           - matrix:
               arch: x86_64
+              cuda: "11.*"
             packages:
               - gcc_linux-64=11.*
-              - sysroot_linux-64==2.17
+              - sysroot_linux-64==2.28
           - matrix:
               arch: aarch64
+              cuda: "11.*"
             packages:
               - gcc_linux-aarch64=11.*
-              - sysroot_linux-aarch64==2.17
+              - sysroot_linux-aarch64==2.28
+          - matrix:
+              arch: x86_64
+              cuda: "12.*"
+            packages:
+              - gcc_linux-64=13.*
+              - sysroot_linux-64==2.28
+          - matrix:
+              arch: aarch64
+              cuda: "12.*"
+            packages:
+              - gcc_linux-aarch64=13.*
+              - sysroot_linux-aarch64==2.28
       - output_types: conda
         matrices:
           - matrix: {cuda: "12.*"}
@@ -227,12 +279,13 @@ dependencies:
       - output_types: [conda, requirements]
         packages:
           - pre-commit
-  develop:
+  clang:
     common:
       - output_types: conda
         packages:
           - clang==16.0.6
-          - clang-tools=16.0.6
+          - clang-tools==16.0.6
+          - libclang==16.0.6
   cuda_version:
     specific:
       - output_types: conda
@@ -349,13 +402,14 @@ dependencies:
               - nvidia-curand-cu12
               - nvidia-cusolver-cu12
               - nvidia-cusparse-cu12
-          # CUDA 11 does not provide wheels, so use the system libraries instead
           - matrix:
               cuda: "11.*"
               use_cuda_wheels: "true"
             packages:
-          # if use_cuda_wheels=false is provided, do not add dependencies on any CUDA wheels
-          # (e.g. for DLFW and pip devcontainers)
+              - nvidia-cublas-cu11
+              - nvidia-curand-cu11
+              - nvidia-cusolver-cu11
+              - nvidia-cusparse-cu11
           - matrix:
               use_cuda_wheels: "false"
             packages:
@@ -411,9 +465,6 @@ dependencies:
         packages:
           - make
           - rust
-          # clang/libclang only needed for bindgen support
-          - clang
-          - libclang
   build_wheels:
     common:
       - output_types: [requirements, pyproject]
@@ -478,7 +529,6 @@ dependencies:
           - h5py>=3.8.0
           - benchmark>=1.8.2
           - openblas
-          - libcuvs==25.2.*,>=0.0.0a0
   bench_python:
     common:
       - output_types: [conda, pyproject, requirements]
@@ -488,6 +538,54 @@ dependencies:
           - matplotlib
           - pandas
           - pyyaml
+  depends_on_libcuvs:
+    common:
+      - output_types: conda
+        packages:
+          - &libcuvs_unsuffixed libcuvs==25.2.*,>=0.0.0a0
+      - output_types: requirements
+        packages:
+          # pip recognizes the index as a global option for the requirements.txt file
+          - --extra-index-url=https://pypi.nvidia.com
+          - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple
+    specific:
+      - output_types: [requirements, pyproject]
+        matrices:
+          - matrix:
+              cuda: "12.*"
+              cuda_suffixed: "true"
+            packages:
+              - libcuvs-cu12==25.2.*,>=0.0.0a0
+          - matrix:
+              cuda: "11.*"
+              cuda_suffixed: "true"
+            packages:
+              - libcuvs-cu11==25.2.*,>=0.0.0a0
+          - {matrix: null, packages: [*libcuvs_unsuffixed]}
+  depends_on_libraft:
+    common:
+      - output_types: conda
+        packages:
+          - &libraft_unsuffixed libraft==25.2.*,>=0.0.0a0
+      - output_types: requirements
+        packages:
+          # pip recognizes the index as a global option for the requirements.txt file
+          - --extra-index-url=https://pypi.nvidia.com
+          - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple
+    specific:
+      - output_types: [requirements, pyproject]
+        matrices:
+          - matrix:
+              cuda: "12.*"
+              cuda_suffixed: "true"
+            packages:
+              - libraft-cu12==25.2.*,>=0.0.0a0
+          - matrix:
+              cuda: "11.*"
+              cuda_suffixed: "true"
+            packages:
+              - libraft-cu11==25.2.*,>=0.0.0a0
+          - {matrix: null, packages: [*libraft_unsuffixed]}
   depends_on_librmm:
     common:
       - output_types: conda
diff --git a/docs/source/cuvs_bench/index.rst b/docs/source/cuvs_bench/index.rst
index 820c44c4f..c15aa41c1 100644
--- a/docs/source/cuvs_bench/index.rst
+++ b/docs/source/cuvs_bench/index.rst
@@ -24,16 +24,6 @@ This tool offers several benefits, including
 
   * `Docker`_
 
-- `How benchmarks are run`_
-
-  * `Step 1: Prepare the dataset`_
-
-  * `Step 2: Build and search index`_
-
-  * `Step 3: Data export`_
-
-  * `Step 4: Plot the results`_
-
 - `Running the benchmarks`_
 
   * `End-to-end: smaller-scale benchmarks (<1M to 10M)`_
@@ -75,7 +65,7 @@ Conda
    conda activate cuvs_benchmarks
 
    # to install GPU package:
-   conda install -c rapidsai -c conda-forge -c nvidia cuvs-ann-bench=<rapids_version> cuda-version=11.8*
+   conda install -c rapidsai -c conda-forge -c nvidia cuvs-bench=<rapids_version> cuda-version=11.8*
 
    # to install CPU package for usage in CPU-only systems:
    conda install -c rapidsai -c conda-forge  cuvs-bench-cpu
@@ -99,7 +89,7 @@ The following command pulls the nightly container for Python version 3.10, CUDA
 
 .. code-block:: bash
 
-   docker pull rapidsai/cuvs-bench:24.12a-cuda12.5-py3.10 #substitute cuvs-bench for the exact desired container.
+   docker pull rapidsai/cuvs-bench:24.12a-cuda12.5-py3.10 # substitute cuvs-bench for the exact desired container.
 
 The CUDA and python versions can be changed for the supported values:
 - Supported CUDA versions: 11.8 and 12.5
@@ -112,185 +102,6 @@ You can see the exact versions as well in the dockerhub site:
 
 **Note:** GPU containers use the CUDA toolkit from inside the container, the only requirement is a driver installed on the host machine that supports that version. So, for example, CUDA 11.8 containers can run in systems with a CUDA 12.x capable driver. Please also note that the Nvidia-Docker runtime from the `Nvidia Container Toolkit <https://github.com/NVIDIA/nvidia-docker>`_ is required to use GPUs inside docker containers.
 
-How benchmarks are run
-======================
-
-The `cuvs-bench` package contains lightweight Python scripts to run the benchmarks. There are 4 general steps to running the benchmarks and visualizing the results.
-
-#. Prepare Dataset
-
-#. Build Index and Search Index
-
-#. Data Export
-
-#. Plot Results
-
-Step 1: Prepare the dataset
----------------------------
-
-The script `cuvs_bench.get_dataset` will download and unpack the dataset in directory that the user provides. As of now, only million-scale datasets are supported by this script. For more information on :doc:`datasets and formats <datasets>`.
-
-The usage of this script is:
-
-.. code-block:: bash
-
-    usage: get_dataset.py [-h] [--name NAME] [--dataset-path DATASET_PATH] [--normalize]
-
-    options:
-      -h, --help            show this help message and exit
-      --dataset DATASET     dataset to download (default: glove-100-angular)
-      --dataset-path DATASET_PATH
-                            path to download dataset (default: ${RAPIDS_DATASET_ROOT_DIR})
-      --normalize           normalize cosine distance to inner product (default: False)
-
-When option `normalize` is provided to the script, any dataset that has cosine distances
-will be normalized to inner product. So, for example, the dataset `glove-100-angular`
-will be written at location `datasets/glove-100-inner/`.
-
-Step 2: Build and search index
-------------------------------
-
-The script `cuvs_bench.run` will build and search indices for a given dataset and its
-specified configuration.
-
-The usage of the script `cuvs_bench.run` is:
-
-.. code-block:: bash
-
-    usage: __main__.py [-h] [--subset-size SUBSET_SIZE] [-k COUNT] [-bs BATCH_SIZE] [--dataset-configuration DATASET_CONFIGURATION] [--configuration CONFIGURATION] [--dataset DATASET]
-                       [--dataset-path DATASET_PATH] [--build] [--search] [--algorithms ALGORITHMS] [--groups GROUPS] [--algo-groups ALGO_GROUPS] [-f] [-m SEARCH_MODE]
-
-    options:
-      -h, --help            show this help message and exit
-      --subset-size SUBSET_SIZE
-                            the number of subset rows of the dataset to build the index (default: None)
-      -k COUNT, --count COUNT
-                            the number of nearest neighbors to search for (default: 10)
-      -bs BATCH_SIZE, --batch-size BATCH_SIZE
-                            number of query vectors to use in each query trial (default: 10000)
-      --dataset-configuration DATASET_CONFIGURATION
-                            path to YAML configuration file for datasets (default: None)
-      --configuration CONFIGURATION
-                            path to YAML configuration file or directory for algorithms Any run groups found in the specified file/directory will automatically override groups of the same name
-                            present in the default configurations, including `base` (default: None)
-      --dataset DATASET     name of dataset (default: glove-100-inner)
-      --dataset-path DATASET_PATH
-                            path to dataset folder, by default will look in RAPIDS_DATASET_ROOT_DIR if defined, otherwise a datasets subdirectory from the calling directory (default:
-                            os.getcwd()/datasets/)
-      --build
-      --search
-      --algorithms ALGORITHMS
-                            run only comma separated list of named algorithms. If parameters `groups` and `algo-groups` are both undefined, then group `base` is run by default (default: None)
-      --groups GROUPS       run only comma separated groups of parameters (default: base)
-      --algo-groups ALGO_GROUPS
-                            add comma separated <algorithm>.<group> to run. Example usage: "--algo-groups=cuvs_cagra.large,hnswlib.large" (default: None)
-      -f, --force           re-run algorithms even if their results already exist (default: False)
-      -m SEARCH_MODE, --search-mode SEARCH_MODE
-                            run search in 'latency' (measure individual batches) or 'throughput' (pipeline batches and measure end-to-end) mode (default: throughput)
-      -t SEARCH_THREADS, --search-threads SEARCH_THREADS
-                            specify the number threads to use for throughput benchmark. Single value or a pair of min and max separated by ':'. Example --search-threads=1:4. Power of 2 values between 'min' and 'max' will be used. If only 'min' is
-                            specified, then a single test is run with 'min' threads. By default min=1, max=<num hyper threads>. (default: None)
-      -r, --dry-run         dry-run mode will convert the yaml config for the specified algorithms and datasets to the json format that's consumed by the lower-level c++ binaries and then print the command to run execute the benchmarks but
-                            will not actually execute the command. (default: False)
-
-`dataset`: name of the dataset to be searched in `datasets.yaml`_
-
-`dataset-configuration`: optional filepath to custom dataset YAML config which has an entry for arg `dataset`
-
-`configuration`: optional filepath to YAML configuration for an algorithm or to directory that contains YAML configurations for several algorithms. Refer to `Dataset.yaml config`_ for more info.
-
-`algorithms`: runs all algorithms that it can find in YAML configs found by `configuration`. By default, only `base` group will be run.
-
-`groups`: run only specific groups of parameters configurations for an algorithm. Groups are defined in YAML configs (see `configuration`), and by default run `base` group
-
-`algo-groups`: this parameter is helpful to append any specific algorithm+group combination to run the benchmark for in addition to all the arguments from `algorithms` and `groups`. It is of the format `<algorithm>.<group>`, or for example, `cuvs_cagra.large`
-
-For every algorithm run by this script, it outputs an index build statistics JSON file in `<dataset-path/<dataset>/result/build/<{algo},{group}.json>`
-and an index search statistics JSON file in `<dataset-path/<dataset>/result/search/<{algo},{group},k{k},bs{batch_size}.json>`. NOTE: The filenames will not have ",{group}" if `group = "base"`.
-
-For every algorithm run by this script, it outputs an index build statistics JSON file in `<dataset-path/<dataset>/result/build/<{algo},{group}.json>`
-and an index search statistics JSON file in `<dataset-path/<dataset>/result/search/<{algo},{group},k{k},bs{batch_size}.json>`. NOTE: The filenames will not have ",{group}" if `group = "base"`.
-
-`dataset-path` :
-#. data is read from `<dataset-path>/<dataset>`
-#. indices are built in `<dataset-path>/<dataset>/index`
-#. build/search results are stored in `<dataset-path>/<dataset>/result`
-
-`build` and `search` : if both parameters are not supplied to the script then it is assumed both are `True`.
-
-`indices` and `algorithms` : these parameters ensure that the algorithm specified for an index is available in `algos.yaml` and not disabled, as well as having an associated executable.
-
-Step 3: Data export
--------------------
-
-The script `cuvs_bench.data_export` will convert the intermediate JSON outputs produced by `cuvs_bench.run` to more easily readable CSV files, which are needed to build charts made by `cuvs_bench.plot`.
-
-.. code-block:: bash
-
-    usage: data_export.py [-h] [--dataset DATASET] [--dataset-path DATASET_PATH]
-
-    options:
-      -h, --help            show this help message and exit
-      --dataset DATASET     dataset to download (default: glove-100-inner)
-      --dataset-path DATASET_PATH
-                            path to dataset folder (default: ${RAPIDS_DATASET_ROOT_DIR})
-
-Build statistics CSV file is stored in `<dataset-path/<dataset>/result/build/<{algo},{group}.csv>`
-and index search statistics CSV file in `<dataset-path/<dataset>/result/search/<{algo},{group},k{k},bs{batch_size},{suffix}.csv>`, where suffix has three values:
-#. `raw`: All search results are exported
-#. `throughput`: Pareto frontier of throughput results is exported
-#. `latency`: Pareto frontier of latency results is exported
-
-Step 4: Plot the results
-------------------------
-
-The script `cuvs_bench.plot` will plot results for all algorithms found in index search statistics CSV files `<dataset-path/<dataset>/result/search/*.csv`.
-
-The usage of this script is:
-
-.. code-block:: bash
-
-    usage:  [-h] [--dataset DATASET] [--dataset-path DATASET_PATH] [--output-filepath OUTPUT_FILEPATH] [--algorithms ALGORITHMS] [--groups GROUPS] [--algo-groups ALGO_GROUPS]
-            [-k COUNT] [-bs BATCH_SIZE] [--build] [--search] [--x-scale X_SCALE] [--y-scale {linear,log,symlog,logit}] [--x-start X_START] [--mode {throughput,latency}]
-            [--time-unit {s,ms,us}] [--raw]
-
-    options:
-      -h, --help            show this help message and exit
-      --dataset DATASET     dataset to plot (default: glove-100-inner)
-      --dataset-path DATASET_PATH
-                            path to dataset folder (default: /home/coder/cuvs/datasets/)
-      --output-filepath OUTPUT_FILEPATH
-                            directory for PNG to be saved (default: /home/coder/cuvs)
-      --algorithms ALGORITHMS
-                            plot only comma separated list of named algorithms. If parameters `groups` and `algo-groups are both undefined, then group `base` is plot by default
-                            (default: None)
-      --groups GROUPS       plot only comma separated groups of parameters (default: base)
-      --algo-groups ALGO_GROUPS, --algo-groups ALGO_GROUPS
-                            add comma separated <algorithm>.<group> to plot. Example usage: "--algo-groups=cuvs_cagra.large,hnswlib.large" (default: None)
-      -k COUNT, --count COUNT
-                            the number of nearest neighbors to search for (default: 10)
-      -bs BATCH_SIZE, --batch-size BATCH_SIZE
-                            number of query vectors to use in each query trial (default: 10000)
-      --build
-      --search
-      --x-scale X_SCALE     Scale to use when drawing the X-axis. Typically linear, logit or a2 (default: linear)
-      --y-scale {linear,log,symlog,logit}
-                            Scale to use when drawing the Y-axis (default: linear)
-      --x-start X_START     Recall values to start the x-axis from (default: 0.8)
-      --mode {throughput,latency}
-                            search mode whose Pareto frontier is used on the y-axis (default: throughput)
-      --time-unit {s,ms,us}
-                            time unit to plot when mode is latency (default: ms)
-      --raw                 Show raw results (not just Pareto frontier) of mode arg (default: False)
-
-`mode`: plots pareto frontier of `throughput` or `latency` results exported in the previous step
-
-`algorithms`: plots all algorithms that it can find results for the specified `dataset`. By default, only `base` group will be plotted.
-
-`groups`: plot only specific groups of parameters configurations for an algorithm. Groups are defined in YAML configs (see `configuration`), and by default run `base` group
-
-`algo-groups`: this parameter is helpful to append any specific algorithm+group combination to plot results for in addition to all the arguments from `algorithms` and `groups`. It is of the format `<algorithm>.<group>`, or for example, `cuvs_cagra.large`
-
 Running the benchmarks
 ======================
 
@@ -576,7 +387,7 @@ Creating and customizing dataset configurations
 
 A single configuration will often define a set of algorithms, with associated index and search parameters, that can be generalize across datasets. We use YAML to define dataset specific and algorithm specific configurations.
 
-A default `datasets.yaml` is provided by CUVS in `${CUVS_HOME}/python/cuvs-ann-bench/src/cuvs_bench/run/conf` with configurations available for several datasets. Here's a simple example entry for the `sift-128-euclidean` dataset:
+A default `datasets.yaml` is provided by CUVS in `${CUVS_HOME}/python/cuvs_bench/src/cuvs_bench/run/conf` with configurations available for several datasets. Here's a simple example entry for the `sift-128-euclidean` dataset:
 
 .. code-block:: yaml
 
diff --git a/examples/cpp/CMakeLists.txt b/examples/cpp/CMakeLists.txt
index 9554207bb..b0d0ae9ee 100644
--- a/examples/cpp/CMakeLists.txt
+++ b/examples/cpp/CMakeLists.txt
@@ -48,13 +48,23 @@ add_executable(VAMANA_EXAMPLE src/vamana_example.cu)
 add_library(rmm_logger OBJECT)
 target_link_libraries(rmm_logger PRIVATE rmm::rmm_logger_impl)
 
-target_link_libraries(CAGRA_EXAMPLE PRIVATE cuvs::cuvs $<TARGET_NAME_IF_EXISTS:conda_env> rmm_logger)
 target_link_libraries(
-  CAGRA_PERSISTENT_EXAMPLE PRIVATE cuvs::cuvs $<TARGET_NAME_IF_EXISTS:conda_env> Threads::Threads rmm_logger
+  CAGRA_EXAMPLE PRIVATE cuvs::cuvs $<TARGET_NAME_IF_EXISTS:conda_env> rmm_logger
 )
 target_link_libraries(
-  DYNAMIC_BATCHING_EXAMPLE PRIVATE cuvs::cuvs $<TARGET_NAME_IF_EXISTS:conda_env> Threads::Threads rmm_logger
+  CAGRA_PERSISTENT_EXAMPLE PRIVATE cuvs::cuvs $<TARGET_NAME_IF_EXISTS:conda_env> Threads::Threads
+                                   rmm_logger
+)
+target_link_libraries(
+  DYNAMIC_BATCHING_EXAMPLE PRIVATE cuvs::cuvs $<TARGET_NAME_IF_EXISTS:conda_env> Threads::Threads
+                                   rmm_logger
+)
+target_link_libraries(
+  IVF_PQ_EXAMPLE PRIVATE cuvs::cuvs $<TARGET_NAME_IF_EXISTS:conda_env> rmm_logger
+)
+target_link_libraries(
+  IVF_FLAT_EXAMPLE PRIVATE cuvs::cuvs $<TARGET_NAME_IF_EXISTS:conda_env> rmm_logger
+)
+target_link_libraries(
+  VAMANA_EXAMPLE PRIVATE cuvs::cuvs $<TARGET_NAME_IF_EXISTS:conda_env> rmm_logger
 )
-target_link_libraries(IVF_PQ_EXAMPLE PRIVATE cuvs::cuvs $<TARGET_NAME_IF_EXISTS:conda_env> rmm_logger)
-target_link_libraries(IVF_FLAT_EXAMPLE PRIVATE cuvs::cuvs $<TARGET_NAME_IF_EXISTS:conda_env> rmm_logger)
-target_link_libraries(VAMANA_EXAMPLE PRIVATE cuvs::cuvs $<TARGET_NAME_IF_EXISTS:conda_env> rmm_logger)
diff --git a/python/cuvs/CMakeLists.txt b/python/cuvs/CMakeLists.txt
index c0990995f..f3feae9a7 100644
--- a/python/cuvs/CMakeLists.txt
+++ b/python/cuvs/CMakeLists.txt
@@ -31,18 +31,6 @@ project(
             C CXX CUDA
 )
 
-# ##################################################################################################
-# * User Options  --------------------------------------------------------------
-
-option(FIND_CUVS_CPP "Search for existing CUVS C++ installations before defaulting to local files"
-       OFF
-)
-option(USE_CUDA_MATH_WHEELS "Use the CUDA math wheels instead of the system libraries" OFF)
-
-message(
-  "CUVS_PY: Searching for existing cuVS C/C++ installations before defaulting to local files: ${FIND_CUVS_CPP}"
-)
-
 # ##################################################################################################
 # * Process User Options  ------------------------------------------------------
 
@@ -54,56 +42,14 @@ include(rapids-find)
 
 rapids_cpm_init()
 
-# If the user requested it we attempt to find CUVS.
-if(FIND_CUVS_CPP)
-  find_package(cuvs "${RAPIDS_VERSION}" REQUIRED COMPONENTS c_api)
-  include(../../cpp/cmake/thirdparty/get_dlpack.cmake)
-else()
-  set(cuvs_FOUND OFF)
-endif()
+# --- cuVS ---#
+find_package(cuvs "${RAPIDS_VERSION}" REQUIRED COMPONENTS c_api)
 
-if(NOT cuvs_FOUND)
-  find_package(CUDAToolkit REQUIRED)
+# --- dlpack ---#
+include(../../cpp/cmake/thirdparty/get_dlpack.cmake)
 
-  set(BUILD_TESTS OFF)
-  set(BUILD_C_LIBRARY ON)
-
-  # Statically link dependencies if building wheels
-  set(CUDA_STATIC_RUNTIME ON)
-  set(CUDA_STATIC_MATH_LIBRARIES ON)
-  set(CUVS_USE_RAFT_STATIC ON)
-
-  if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 12.0)
-    set(CUDA_STATIC_MATH_LIBRARIES OFF)
-  elseif(USE_CUDA_MATH_WHEELS)
-    message(FATAL_ERROR "Cannot use CUDA math wheels with CUDA < 12.0")
-  endif()
-
-  add_subdirectory(../../cpp cuvs-cpp EXCLUDE_FROM_ALL)
-
-  if(NOT CUDA_STATIC_MATH_LIBRARIES AND USE_CUDA_MATH_WHEELS)
-    set(rpaths
-        "$ORIGIN/../nvidia/cublas/lib"
-        "$ORIGIN/../nvidia/curand/lib"
-        "$ORIGIN/../nvidia/cusolver/lib"
-        "$ORIGIN/../nvidia/cusparse/lib"
-        "$ORIGIN/../nvidia/nvjitlink/lib"
-    )
-    set_property(
-      TARGET cuvs
-      PROPERTY INSTALL_RPATH ${rpaths}
-      APPEND
-    )
-    set_property(
-      TARGET cuvs_c
-      PROPERTY INSTALL_RPATH ${rpaths}
-      APPEND
-    )
-  endif()
-
-  set(cython_lib_dir cuvs)
-  install(TARGETS cuvs cuvs_c DESTINATION ${cython_lib_dir})
-endif()
+# ensure Cython targets can find dlpack headers (these do not come installed with cuVS)
+target_include_directories(cuvs::cuvs INTERFACE "$<BUILD_INTERFACE:${DLPACK_INCLUDE_DIR}>")
 
 # ##################################################################################################
 # * Build Cython artifacts -----------------------------------------------------
@@ -116,7 +62,3 @@ target_link_libraries(cuvs_rmm_logger PRIVATE rmm::rmm_logger_impl)
 add_subdirectory(cuvs/common)
 add_subdirectory(cuvs/distance)
 add_subdirectory(cuvs/neighbors)
-
-if(DEFINED cython_lib_dir)
-  rapids_cython_add_rpath_entries(TARGET cuvs PATHS "${cython_lib_dir}")
-endif()
diff --git a/python/cuvs/cuvs/__init__.py b/python/cuvs/cuvs/__init__.py
index 9f0481cb7..1a41f0d76 100644
--- a/python/cuvs/cuvs/__init__.py
+++ b/python/cuvs/cuvs/__init__.py
@@ -13,4 +13,15 @@
 # limitations under the License.
 #
 
+# If libcuvs was installed as a wheel, we must request it to load the library
+# symbols. Otherwise, we assume that the library was installed in a system path that ld
+# can find.
+try:
+    import libcuvs
+except ModuleNotFoundError:
+    pass
+else:
+    libcuvs.load_library()
+    del libcuvs
+
 from cuvs._version import __git_commit__, __version__
diff --git a/python/cuvs/cuvs/common/CMakeLists.txt b/python/cuvs/cuvs/common/CMakeLists.txt
index 361f2fafc..b0e1cb335 100644
--- a/python/cuvs/cuvs/common/CMakeLists.txt
+++ b/python/cuvs/cuvs/common/CMakeLists.txt
@@ -20,7 +20,7 @@ set(linked_libraries cuvs::cuvs cuvs::c_api)
 rapids_cython_create_modules(
   CXX
   SOURCE_FILES "${cython_sources}"
-  LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS cuvs MODULE_PREFIX common_
+  LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX common_
 )
 
 foreach(tgt IN LISTS RAPIDS_CYTHON_CREATED_TARGETS)
diff --git a/python/cuvs/cuvs/common/c_api.pxd b/python/cuvs/cuvs/common/c_api.pxd
index f99fd5348..dae93d750 100644
--- a/python/cuvs/cuvs/common/c_api.pxd
+++ b/python/cuvs/cuvs/common/c_api.pxd
@@ -16,7 +16,7 @@
 # cython: language_level=3
 
 
-from cuda.ccudart cimport cudaStream_t
+from cuda.bindings.cyruntime cimport cudaStream_t
 from libc.stdint cimport uintptr_t
 
 
diff --git a/python/cuvs/cuvs/common/resources.pyx b/python/cuvs/cuvs/common/resources.pyx
index c0b72ae34..0edf53fc1 100644
--- a/python/cuvs/cuvs/common/resources.pyx
+++ b/python/cuvs/cuvs/common/resources.pyx
@@ -17,7 +17,7 @@
 
 import functools
 
-from cuda.ccudart cimport cudaStream_t
+from cuda.bindings.cyruntime cimport cudaStream_t
 
 from cuvs.common.c_api cimport (
     cuvsResources_t,
diff --git a/python/cuvs/cuvs/distance/CMakeLists.txt b/python/cuvs/cuvs/distance/CMakeLists.txt
index 514b08c43..ded07395c 100644
--- a/python/cuvs/cuvs/distance/CMakeLists.txt
+++ b/python/cuvs/cuvs/distance/CMakeLists.txt
@@ -20,7 +20,7 @@ set(linked_libraries cuvs::cuvs cuvs::c_api)
 rapids_cython_create_modules(
   CXX
   SOURCE_FILES "${cython_sources}"
-  LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS cuvs MODULE_PREFIX distance_
+  LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX distance_
 )
 
 foreach(tgt IN LISTS RAPIDS_CYTHON_CREATED_TARGETS)
diff --git a/python/cuvs/cuvs/neighbors/CMakeLists.txt b/python/cuvs/cuvs/neighbors/CMakeLists.txt
index 031fd485e..b9161eefc 100644
--- a/python/cuvs/cuvs/neighbors/CMakeLists.txt
+++ b/python/cuvs/cuvs/neighbors/CMakeLists.txt
@@ -27,7 +27,7 @@ set(linked_libraries cuvs::cuvs cuvs::c_api)
 rapids_cython_create_modules(
   CXX
   SOURCE_FILES "${cython_sources}"
-  LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS cuvs MODULE_PREFIX neighbors_refine_
+  LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX neighbors_refine_
 )
 
 foreach(tgt IN LISTS RAPIDS_CYTHON_CREATED_TARGETS)
diff --git a/python/cuvs/cuvs/neighbors/brute_force/CMakeLists.txt b/python/cuvs/cuvs/neighbors/brute_force/CMakeLists.txt
index 61eda649c..3c646f498 100644
--- a/python/cuvs/cuvs/neighbors/brute_force/CMakeLists.txt
+++ b/python/cuvs/cuvs/neighbors/brute_force/CMakeLists.txt
@@ -20,8 +20,7 @@ set(linked_libraries cuvs::cuvs cuvs::c_api)
 rapids_cython_create_modules(
   CXX
   SOURCE_FILES "${cython_sources}"
-  LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS cuvs MODULE_PREFIX
-                   neighbors_brute_force_
+  LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX neighbors_brute_force_
 )
 
 foreach(tgt IN LISTS RAPIDS_CYTHON_CREATED_TARGETS)
diff --git a/python/cuvs/cuvs/neighbors/cagra/CMakeLists.txt b/python/cuvs/cuvs/neighbors/cagra/CMakeLists.txt
index 1f40daab2..6cf0956a2 100644
--- a/python/cuvs/cuvs/neighbors/cagra/CMakeLists.txt
+++ b/python/cuvs/cuvs/neighbors/cagra/CMakeLists.txt
@@ -20,7 +20,7 @@ set(linked_libraries cuvs::cuvs cuvs::c_api)
 rapids_cython_create_modules(
   CXX
   SOURCE_FILES "${cython_sources}"
-  LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS cuvs MODULE_PREFIX neighbors_cagra_
+  LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX neighbors_cagra_
 )
 
 foreach(tgt IN LISTS RAPIDS_CYTHON_CREATED_TARGETS)
diff --git a/python/cuvs/cuvs/neighbors/filters/CMakeLists.txt b/python/cuvs/cuvs/neighbors/filters/CMakeLists.txt
index a678852d9..43e008363 100644
--- a/python/cuvs/cuvs/neighbors/filters/CMakeLists.txt
+++ b/python/cuvs/cuvs/neighbors/filters/CMakeLists.txt
@@ -20,7 +20,7 @@ set(linked_libraries cuvs::cuvs cuvs::c_api)
 rapids_cython_create_modules(
   CXX
   SOURCE_FILES "${cython_sources}"
-  LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS cuvs MODULE_PREFIX neighbors_prefilter_
+  LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX neighbors_prefilter_
 )
 
 foreach(tgt IN LISTS RAPIDS_CYTHON_CREATED_TARGETS)
diff --git a/python/cuvs/cuvs/neighbors/hnsw/CMakeLists.txt b/python/cuvs/cuvs/neighbors/hnsw/CMakeLists.txt
index 8351916e6..c33313c3c 100644
--- a/python/cuvs/cuvs/neighbors/hnsw/CMakeLists.txt
+++ b/python/cuvs/cuvs/neighbors/hnsw/CMakeLists.txt
@@ -20,7 +20,7 @@ set(linked_libraries cuvs::cuvs cuvs::c_api)
 rapids_cython_create_modules(
   CXX
   SOURCE_FILES "${cython_sources}"
-  LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS cuvs MODULE_PREFIX neighbors_hnsw_
+  LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX neighbors_hnsw_
 )
 
 foreach(tgt IN LISTS RAPIDS_CYTHON_CREATED_TARGETS)
diff --git a/python/cuvs/cuvs/neighbors/ivf_flat/CMakeLists.txt b/python/cuvs/cuvs/neighbors/ivf_flat/CMakeLists.txt
index f5663cdaa..eadb8934c 100644
--- a/python/cuvs/cuvs/neighbors/ivf_flat/CMakeLists.txt
+++ b/python/cuvs/cuvs/neighbors/ivf_flat/CMakeLists.txt
@@ -20,7 +20,7 @@ set(linked_libraries cuvs::cuvs cuvs::c_api)
 rapids_cython_create_modules(
   CXX
   SOURCE_FILES "${cython_sources}"
-  LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS cuvs MODULE_PREFIX neighbors_ivf_flat_
+  LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX neighbors_ivf_flat_
 )
 
 foreach(tgt IN LISTS RAPIDS_CYTHON_CREATED_TARGETS)
diff --git a/python/cuvs/cuvs/neighbors/ivf_pq/CMakeLists.txt b/python/cuvs/cuvs/neighbors/ivf_pq/CMakeLists.txt
index a24320ded..df61793b8 100644
--- a/python/cuvs/cuvs/neighbors/ivf_pq/CMakeLists.txt
+++ b/python/cuvs/cuvs/neighbors/ivf_pq/CMakeLists.txt
@@ -20,7 +20,7 @@ set(linked_libraries cuvs::cuvs cuvs::c_api)
 rapids_cython_create_modules(
   CXX
   SOURCE_FILES "${cython_sources}"
-  LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS cuvs MODULE_PREFIX neighbors_pq_
+  LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX neighbors_pq_
 )
 
 foreach(tgt IN LISTS RAPIDS_CYTHON_CREATED_TARGETS)
diff --git a/python/cuvs/cuvs/test/test_distance.py b/python/cuvs/cuvs/test/test_distance.py
index 483d5d201..370dd773a 100644
--- a/python/cuvs/cuvs/test/test_distance.py
+++ b/python/cuvs/cuvs/test/test_distance.py
@@ -21,6 +21,7 @@
 from cuvs.distance import pairwise_distance
 
 
+@pytest.mark.parametrize("times", range(20))
 @pytest.mark.parametrize("n_rows", [50, 100])
 @pytest.mark.parametrize("n_cols", [10, 50])
 @pytest.mark.parametrize(
@@ -43,7 +44,7 @@
 @pytest.mark.parametrize("inplace", [True, False])
 @pytest.mark.parametrize("order", ["F", "C"])
 @pytest.mark.parametrize("dtype", [np.float32, np.float64, np.float16])
-def test_distance(n_rows, n_cols, inplace, order, metric, dtype):
+def test_distance(n_rows, n_cols, inplace, order, metric, dtype, times):
     input1 = np.random.random_sample((n_rows, n_cols))
     input1 = np.asarray(input1, order=order).astype(dtype)
 
@@ -79,7 +80,5 @@ def test_distance(n_rows, n_cols, inplace, order, metric, dtype):
     actual = output_device.copy_to_host()
 
     tol = 1e-3
-    if np.issubdtype(dtype, np.float16):
-        tol = 1e-1
 
     assert np.allclose(expected, actual, atol=tol, rtol=tol)
diff --git a/python/cuvs/pyproject.toml b/python/cuvs/pyproject.toml
index 155e454a8..30658623b 100644
--- a/python/cuvs/pyproject.toml
+++ b/python/cuvs/pyproject.toml
@@ -33,10 +33,6 @@ requires-python = ">=3.10"
 dependencies = [
     "cuda-python",
     "numpy>=1.23,<3.0a0",
-    "nvidia-cublas",
-    "nvidia-curand",
-    "nvidia-cusolver",
-    "nvidia-cusparse",
     "pylibraft==25.2.*,>=0.0.0a0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [
@@ -59,12 +55,6 @@ test = [
 Homepage = "https://github.com/rapidsai/cuvs"
 Documentation = "https://docs.rapids.ai/api/cuvs/stable/"
 
-[tool.setuptools]
-license-files = ["LICENSE"]
-
-[tool.setuptools.dynamic]
-version = {file = "cuvs/VERSION"}
-
 [tool.isort]
 line_length = 79
 multi_line_output = 3
@@ -127,18 +117,23 @@ requires = [
     "cmake>=3.26.4,!=3.30.0",
     "cuda-python",
     "cython>=3.0.0",
+    "libcuvs==25.2.*,>=0.0.0a0",
+    "libraft==25.2.*,>=0.0.0a0",
+    "librmm==25.2.*,>=0.0.0a0",
     "ninja",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 build-backend = "scikit_build_core.build"
 dependencies-file = "../../dependencies.yaml"
-matrix-entry = "cuda_suffixed=true;use_cuda_wheels=true"
+matrix-entry = "cuda_suffixed=true"
 
 [tool.pydistcheck]
 select = [
-    # NOTE: size threshold is managed via CLI args in CI scripts
     "distro-too-large-compressed",
 ]
 
+# PyPI limit is 100 MiB, fail CI before we get too close to that
+max_allowed_size_compressed = '75M'
+
 [tool.pytest.ini_options]
 filterwarnings = [
     "error",
diff --git a/python/libcuvs/CMakeLists.txt b/python/libcuvs/CMakeLists.txt
new file mode 100644
index 000000000..569652b71
--- /dev/null
+++ b/python/libcuvs/CMakeLists.txt
@@ -0,0 +1,69 @@
+# =============================================================================
+# Copyright (c) 2025, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+# or implied. See the License for the specific language governing permissions and limitations under
+# the License.
+# =============================================================================
+
+cmake_minimum_required(VERSION 3.26.4 FATAL_ERROR)
+
+include(../../rapids_config.cmake)
+
+include(rapids-cuda)
+rapids_cuda_init_architectures(libcuvs-python)
+
+project(
+  libcuvs-python
+  VERSION "${RAPIDS_VERSION}"
+  LANGUAGES CXX CUDA
+)
+
+# Check if cuVS is already available. If so, it is the user's responsibility to ensure that the
+# CMake package is also available at build time of the Python cuvs package.
+find_package(cuvs "${RAPIDS_VERSION}")
+
+if(cuvs_FOUND)
+  return()
+endif()
+
+unset(cuvs_FOUND)
+
+# --- CUDA --- #
+set(CUDA_STATIC_RUNTIME ON)
+set(CUDA_STATIC_MATH_LIBRARIES OFF)
+
+# --- RAFT ---#
+set(CUVS_USE_RAFT_STATIC OFF)
+
+# --- cuVS ---#
+set(BUILD_TESTS OFF)
+set(BUILD_C_LIBRARY ON)
+set(CUVS_COMPILE_DYNAMIC_ONLY ON)
+
+add_subdirectory(../../cpp cuvs-cpp)
+
+# assumes libcuvs.so is installed 2 levels deep, e.g. site-packages/libcuvs/lib64/libcuvs.so
+set(rpaths
+    "$ORIGIN/../../nvidia/cublas/lib"
+    "$ORIGIN/../../nvidia/curand/lib"
+    "$ORIGIN/../../nvidia/cusolver/lib"
+    "$ORIGIN/../../nvidia/cusparse/lib"
+    "$ORIGIN/../../nvidia/nvjitlink/lib"
+)
+set_property(
+  TARGET cuvs
+  PROPERTY INSTALL_RPATH ${rpaths}
+  APPEND
+)
+set_property(
+  TARGET cuvs_c
+  PROPERTY INSTALL_RPATH ${rpaths}
+  APPEND
+)
diff --git a/python/libcuvs/LICENSE b/python/libcuvs/LICENSE
new file mode 120000
index 000000000..30cff7403
--- /dev/null
+++ b/python/libcuvs/LICENSE
@@ -0,0 +1 @@
+../../LICENSE
\ No newline at end of file
diff --git a/python/libcuvs/README.md b/python/libcuvs/README.md
new file mode 120000
index 000000000..fe8400541
--- /dev/null
+++ b/python/libcuvs/README.md
@@ -0,0 +1 @@
+../../README.md
\ No newline at end of file
diff --git a/python/libcuvs/libcuvs/VERSION b/python/libcuvs/libcuvs/VERSION
new file mode 120000
index 000000000..d62dc733e
--- /dev/null
+++ b/python/libcuvs/libcuvs/VERSION
@@ -0,0 +1 @@
+../../../VERSION
\ No newline at end of file
diff --git a/python/libcuvs/libcuvs/__init__.py b/python/libcuvs/libcuvs/__init__.py
new file mode 100644
index 000000000..2d3a86015
--- /dev/null
+++ b/python/libcuvs/libcuvs/__init__.py
@@ -0,0 +1,16 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from libcuvs._version import __git_commit__, __version__
+from libcuvs.load import load_library
diff --git a/python/libcuvs/libcuvs/_version.py b/python/libcuvs/libcuvs/_version.py
new file mode 100644
index 000000000..530bf8bea
--- /dev/null
+++ b/python/libcuvs/libcuvs/_version.py
@@ -0,0 +1,33 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import importlib.resources
+
+__version__ = (
+    importlib.resources.files(__package__)
+    .joinpath("VERSION")
+    .read_text()
+    .strip()
+)
+try:
+    __git_commit__ = (
+        importlib.resources.files(__package__)
+        .joinpath("GIT_COMMIT")
+        .read_text()
+        .strip()
+    )
+except FileNotFoundError:
+    __git_commit__ = ""
+
+__all__ = ["__git_commit__", "__version__"]
diff --git a/python/libcuvs/libcuvs/load.py b/python/libcuvs/libcuvs/load.py
new file mode 100644
index 000000000..a9c6a9325
--- /dev/null
+++ b/python/libcuvs/libcuvs/load.py
@@ -0,0 +1,100 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import ctypes
+import os
+
+# Loading with RTLD_LOCAL adds the library itself to the loader's
+# loaded library cache without loading any symbols into the global
+# namespace. This allows libraries that express a dependency on
+# this library to be loaded later and successfully satisfy this dependency
+# without polluting the global symbol table with symbols from
+# libcuvs that could conflict with symbols from other DSOs.
+PREFERRED_LOAD_FLAG = ctypes.RTLD_LOCAL
+
+
+def _load_system_installation(soname: str):
+    """Try to dlopen() the library indicated by ``soname``
+    Raises ``OSError`` if library cannot be loaded.
+    """
+    return ctypes.CDLL(soname, PREFERRED_LOAD_FLAG)
+
+
+def _load_wheel_installation(soname: str):
+    """Try to dlopen() the library indicated by ``soname``
+    Returns ``None`` if the library file is not present in this wheel.
+    """
+    if os.path.isfile(
+        lib := os.path.join(os.path.dirname(__file__), "lib64", soname)
+    ):
+        return ctypes.CDLL(lib, PREFERRED_LOAD_FLAG)
+    return None
+
+
+def load_library():
+    """Dynamically load libcuvs.so and its dependencies"""
+    try:
+        # libraft must be loaded before libcuvs because libcuvs
+        # references its symbols
+        import libraft
+
+        libraft.load_library()
+    except ModuleNotFoundError:
+        # 'libcuvs' has a runtime dependency on 'libraft'. However,
+        # that dependency might be satisfied by the 'libraft' conda package
+        # (which does not have any Python modules), instead of the
+        # 'libraft' wheel.
+        #
+        # In that situation, assume that 'libraft.so' is in a place where
+        # the loader can find it.
+        pass
+
+    prefer_system_installation = (
+        os.getenv("RAPIDS_LIBCUVS_PREFER_SYSTEM_LIBRARY", "false").lower()
+        != "false"
+    )
+
+    libs_to_return = []
+    for soname in ["libcuvs.so", "libcuvs_c.so"]:
+        libcuvs_lib = None
+        if prefer_system_installation:
+            # Prefer a system library if one is present to
+            # avoid clobbering symbols that other packages might expect,
+            # but if no other library is present use the one in the wheel.
+            try:
+                libcuvs_lib = _load_system_installation(soname)
+            except OSError:
+                libcuvs_lib = _load_wheel_installation(soname)
+        else:
+            # Prefer the libraries bundled in this package. If they aren't
+            # found (which might be the case in builds where the library was
+            # prebuilt before packaging the wheel), look for a system
+            # installation.
+            try:
+                libcuvs_lib = _load_wheel_installation(soname)
+                if libcuvs_lib is None:
+                    libcuvs_lib = _load_system_installation(soname)
+            except OSError:
+                # If none of the searches above succeed, silently skip this
+                # library and rely on other mechanisms (like RPATHs on other
+                # DSOs) to help the loader find it.
+                pass
+        if libcuvs_lib:
+            libs_to_return.append(libcuvs_lib)
+
+    # The caller almost never needs to do anything with this library, but no
+    # harm in offering the option since this object at least provides a handle
+    # to inspect where libcuvs was loaded from.
+    return libs_to_return
diff --git a/python/libcuvs/pyproject.toml b/python/libcuvs/pyproject.toml
new file mode 100644
index 000000000..28443b782
--- /dev/null
+++ b/python/libcuvs/pyproject.toml
@@ -0,0 +1,108 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+[build-system]
+requires = [
+    "rapids-build-backend>=0.3.0,<0.4.0.dev0",
+    "scikit-build-core[pyproject]>=0.10.0",
+] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
+build-backend = "rapids_build_backend.build"
+
+[project]
+name = "libcuvs"
+dynamic = ["version"]
+description = "cuVS: Vector Search on the GPU (C++)"
+readme = { file = "README.md", content-type = "text/markdown" }
+authors = [
+    { name = "NVIDIA Corporation" },
+]
+license = { text = "Apache 2.0" }
+requires-python = ">=3.10"
+dependencies = [
+    "libraft==25.2.*,>=0.0.0a0",
+    "nvidia-cublas",
+    "nvidia-curand",
+    "nvidia-cusolver",
+    "nvidia-cusparse",
+] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
+classifiers = [
+    "Intended Audience :: Developers",
+]
+
+[project.urls]
+Homepage = "https://github.com/rapidsai/cuvs"
+Documentation = "https://docs.rapids.ai/api/cuvs/stable/"
+
+[project.entry-points."cmake.prefix"]
+libcuvs = "libcuvs"
+
+[tool.isort]
+line_length = 79
+multi_line_output = 3
+include_trailing_comma = true
+force_grid_wrap = 0
+combine_as_imports = true
+order_by_type = true
+known_first_party = [
+    "libcuvs",
+]
+skip = [
+    "thirdparty",
+    ".eggs",
+    ".git",
+    ".hg",
+    ".mypy_cache",
+    ".tox",
+    ".venv",
+    "_build",
+    "buck-out",
+    "build",
+    "dist",
+    "__init__.py",
+]
+
+[tool.scikit-build]
+build-dir = "build/{wheel_tag}"
+cmake.build-type = "Release"
+cmake.version = "CMakeLists.txt"
+minimum-version = "build-system.requires"
+ninja.make-fallback = true
+sdist.reproducible = true
+wheel.install-dir = "libcuvs"
+wheel.packages = ["libcuvs"]
+wheel.py-api = "py3"
+
+[tool.scikit-build.metadata.version]
+provider = "scikit_build_core.metadata.regex"
+input = "libcuvs/VERSION"
+regex = "(?P<value>.*)"
+
+[tool.rapids-build-backend]
+build-backend = "scikit_build_core.build"
+requires = [
+    "cmake>=3.26.4,!=3.30.0",
+    "libraft==25.2.*,>=0.0.0a0",
+    "librmm==25.2.*,>=0.0.0a0",
+    "ninja",
+] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
+dependencies-file = "../../dependencies.yaml"
+matrix-entry = "cuda_suffixed=true;use_cuda_wheels=true"
+
+[tool.pydistcheck]
+select = [
+    "distro-too-large-compressed",
+]
+
+# detect when package size grows significantly
+max_allowed_size_compressed = '1.1G'