From 00d1ececa78dc1399381c97729e2166af3d5d71e Mon Sep 17 00:00:00 2001 From: Rui Lan Date: Fri, 8 Dec 2023 11:47:07 -0800 Subject: [PATCH 01/14] Add subsample support for PQ codebook generation. More benchmark needed. --- .../raft/neighbors/detail/ivf_pq_build.cuh | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh index 6adc4d583c..0012f3a9d2 100644 --- a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh +++ b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh @@ -401,8 +401,11 @@ void train_per_subset(raft::resources const& handle, auto device_memory = resource::get_workspace_resource(handle); rmm::device_uvector pq_centers_tmp(index.pq_centers().size(), stream, device_memory); - rmm::device_uvector sub_trainset(n_rows * size_t(index.pq_len()), stream, device_memory); - rmm::device_uvector sub_labels(n_rows, stream, device_memory); + // Subsampling the train set for codebook generation. Using similar subsampling strategy as train_per_cluster + size_t big_enough = 256ul * std::max(index.pq_book_size(), index.pq_dim()); + auto pq_n_rows = uint32_t(std::min(big_enough, n_rows)); + rmm::device_uvector sub_trainset(pq_n_rows * size_t(index.pq_len()), stream, device_memory); + rmm::device_uvector sub_labels(pq_n_rows, stream, device_memory); rmm::device_uvector pq_cluster_sizes(index.pq_book_size(), stream, device_memory); @@ -413,7 +416,7 @@ void train_per_subset(raft::resources const& handle, // Get the rotated cluster centers for each training vector. // This will be subtracted from the input vectors afterwards. 
utils::copy_selected( - n_rows, + pq_n_rows, index.pq_len(), index.centers_rot().data_handle() + index.pq_len() * j, labels, @@ -429,7 +432,7 @@ void train_per_subset(raft::resources const& handle, true, false, index.pq_len(), - n_rows, + pq_n_rows, index.dim(), &alpha, index.rotation_matrix().data_handle() + index.dim() * index.pq_len() * j, @@ -443,12 +446,12 @@ void train_per_subset(raft::resources const& handle, // train PQ codebook for this subspace auto sub_trainset_view = - raft::make_device_matrix_view(sub_trainset.data(), n_rows, index.pq_len()); + raft::make_device_matrix_view(sub_trainset.data(), pq_n_rows, index.pq_len()); auto centers_tmp_view = raft::make_device_matrix_view( pq_centers_tmp.data() + index.pq_book_size() * index.pq_len() * j, index.pq_book_size(), index.pq_len()); - auto sub_labels_view = raft::make_device_vector_view(sub_labels.data(), n_rows); + auto sub_labels_view = raft::make_device_vector_view(sub_labels.data(), pq_n_rows); auto cluster_sizes_view = raft::make_device_vector_view(pq_cluster_sizes.data(), index.pq_book_size()); raft::cluster::kmeans_balanced_params kmeans_params; From 7332a274749f715357e726675e0b33650fdf6015 Mon Sep 17 00:00:00 2001 From: Rui Lan Date: Fri, 5 Jan 2024 17:18:02 -0800 Subject: [PATCH 02/14] Add knob to control the amount of PQ codebook training subsampling. 
--- .../ann/src/raft/raft_ann_bench_param_parser.h | 2 ++ .../raft/neighbors/detail/ivf_pq_build.cuh | 7 ++++--- cpp/include/raft/neighbors/ivf_pq_types.hpp | 6 ++++++ .../pylibraft/neighbors/ivf_pq/cpp/c_ivf_pq.pxd | 3 ++- .../pylibraft/neighbors/ivf_pq/ivf_pq.pyx | 15 ++++++++++++++- 5 files changed, 28 insertions(+), 5 deletions(-) diff --git a/cpp/bench/ann/src/raft/raft_ann_bench_param_parser.h b/cpp/bench/ann/src/raft/raft_ann_bench_param_parser.h index 2a021a8a12..c88cee71fc 100644 --- a/cpp/bench/ann/src/raft/raft_ann_bench_param_parser.h +++ b/cpp/bench/ann/src/raft/raft_ann_bench_param_parser.h @@ -86,6 +86,8 @@ void parse_build_param(const nlohmann::json& conf, throw std::runtime_error("codebook_kind: '" + kind + "', should be either 'cluster' or 'subspace'"); } + if (conf.contains("pq_codebook_ratio")) { + param.pq_codebook_trainset_fraction = 1.0 / (double)conf.at("pq_codebook_ratio"); } } } diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh index 0012f3a9d2..afd5f73db5 100644 --- a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh +++ b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh @@ -395,15 +395,15 @@ void train_per_subset(raft::resources const& handle, const float* trainset, // [n_rows, dim] const uint32_t* labels, // [n_rows] uint32_t kmeans_n_iters, + double pq_codebook_trainset_fraction, rmm::mr::device_memory_resource* managed_memory) { auto stream = resource::get_cuda_stream(handle); auto device_memory = resource::get_workspace_resource(handle); rmm::device_uvector pq_centers_tmp(index.pq_centers().size(), stream, device_memory); - // Subsampling the train set for codebook generation. Using similar subsampling strategy as train_per_cluster - size_t big_enough = 256ul * std::max(index.pq_book_size(), index.pq_dim()); - auto pq_n_rows = uint32_t(std::min(big_enough, n_rows)); + // Subsampling the train set for codebook generation based on pq_codebook_trainset_fraction. 
+ auto pq_n_rows = uint32_t(n_rows * pq_codebook_trainset_fraction); rmm::device_uvector sub_trainset(pq_n_rows * size_t(index.pq_len()), stream, device_memory); rmm::device_uvector sub_labels(pq_n_rows, stream, device_memory); @@ -1861,6 +1861,7 @@ auto build(raft::resources const& handle, trainset.data(), labels.data(), params.kmeans_n_iters, + params.pq_codebook_trainset_fraction, &managed_memory_upstream); break; case codebook_gen::PER_CLUSTER: diff --git a/cpp/include/raft/neighbors/ivf_pq_types.hpp b/cpp/include/raft/neighbors/ivf_pq_types.hpp index 45ab18c84f..8340bfde67 100644 --- a/cpp/include/raft/neighbors/ivf_pq_types.hpp +++ b/cpp/include/raft/neighbors/ivf_pq_types.hpp @@ -105,6 +105,12 @@ struct index_params : ann::index_params { * flag to `true` if you prefer to use as little GPU memory for the database as possible. */ bool conservative_memory_allocation = false; + /** + * The fraction of data to use during PQ codebook generation on top of the subsampled data + * controlled by kmeans_trainset_fraction. The parameter is only used when PQ codebook generation + * kind is PER_SUBSPACE and ignored when PQ codebook generation kind is PER_CLUSTER. 
+ */ + double pq_codebook_trainset_fraction = 1; }; struct search_params : ann::search_params { diff --git a/python/pylibraft/pylibraft/neighbors/ivf_pq/cpp/c_ivf_pq.pxd b/python/pylibraft/pylibraft/neighbors/ivf_pq/cpp/c_ivf_pq.pxd index 531c2428e9..81f96df9cd 100644 --- a/python/pylibraft/pylibraft/neighbors/ivf_pq/cpp/c_ivf_pq.pxd +++ b/python/pylibraft/pylibraft/neighbors/ivf_pq/cpp/c_ivf_pq.pxd @@ -78,7 +78,8 @@ cdef extern from "raft/neighbors/ivf_pq_types.hpp" \ codebook_gen codebook_kind bool force_random_rotation bool conservative_memory_allocation - + double pq_codebook_trainset_fraction + cdef cppclass index[IdxT](ann_index): index(const device_resources& handle, DistanceType metric, diff --git a/python/pylibraft/pylibraft/neighbors/ivf_pq/ivf_pq.pyx b/python/pylibraft/pylibraft/neighbors/ivf_pq/ivf_pq.pyx index 0c1bbf6b9c..5b0675d069 100644 --- a/python/pylibraft/pylibraft/neighbors/ivf_pq/ivf_pq.pyx +++ b/python/pylibraft/pylibraft/neighbors/ivf_pq/ivf_pq.pyx @@ -156,6 +156,13 @@ cdef class IndexParams: repeated calls to `extend` (extending the database). To disable this behavior and use as little GPU memory for the database as possible, set this flat to `True`. + pq_codebook_trainset_fraction : int, default = 0.5 + If pq_codebook_trainset_fraction is less than 1, then the dataset is + subsampled for PQ codebook generation, and only n_samples * + pq_codebook_trainset_fraction rows are used for PQ codebook generation. + This subsampling is applied after kmeans_trainset subsampling, + controlled by kmeans_trainset_fraction and only used when codebook_kind + is PER_SUBSPACE. 
""" def __init__(self, *, n_lists=1024, @@ -167,7 +174,8 @@ cdef class IndexParams: codebook_kind="subspace", force_random_rotation=False, add_data_on_build=True, - conservative_memory_allocation=False): + conservative_memory_allocation=False, + pq_codebook_trainset_fraction=0.5): self.params.n_lists = n_lists self.params.metric = _get_metric(metric) self.params.metric_arg = 0 @@ -185,6 +193,8 @@ cdef class IndexParams: self.params.add_data_on_build = add_data_on_build self.params.conservative_memory_allocation = \ conservative_memory_allocation + self.params.pq_codebook_trainset_fraction = \ + pq_codebook_trainset_fraction @property def n_lists(self): @@ -226,6 +236,9 @@ cdef class IndexParams: def conservative_memory_allocation(self): return self.params.conservative_memory_allocation + @property + def pq_codebook_trainset_fraction(self): + return self.params.pq_codebook_trainset_fraction cdef class Index: # We store a pointer to the index because it dose not have a trivial From 3153f0e6607c703dc927e3a05167b2aca08405d9 Mon Sep 17 00:00:00 2001 From: Rui Lan Date: Fri, 12 Jan 2024 16:56:52 -0800 Subject: [PATCH 03/14] Fix if-statement dependency issue in parse build parameter. 
--- cpp/bench/ann/src/raft/raft_ann_bench_param_parser.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cpp/bench/ann/src/raft/raft_ann_bench_param_parser.h b/cpp/bench/ann/src/raft/raft_ann_bench_param_parser.h index c88cee71fc..1b876ea147 100644 --- a/cpp/bench/ann/src/raft/raft_ann_bench_param_parser.h +++ b/cpp/bench/ann/src/raft/raft_ann_bench_param_parser.h @@ -86,8 +86,9 @@ void parse_build_param(const nlohmann::json& conf, throw std::runtime_error("codebook_kind: '" + kind + "', should be either 'cluster' or 'subspace'"); } - if (conf.contains("pq_codebook_ratio")) { - param.pq_codebook_trainset_fraction = 1.0 / (double)conf.at("pq_codebook_ratio"); } + } + if (conf.contains("pq_codebook_ratio")) { + param.pq_codebook_trainset_fraction = 1.0 / (double)conf.at("pq_codebook_ratio"); } } From a03f0af8adcafafe7427d4baf05c26fe6ae994eb Mon Sep 17 00:00:00 2001 From: Ray Douglass Date: Thu, 18 Jan 2024 14:51:02 -0500 Subject: [PATCH 04/14] DOC v24.04 Updates [skip ci] --- .../cuda11.8-conda/devcontainer.json | 4 +-- .devcontainer/cuda11.8-pip/devcontainer.json | 6 ++--- .../cuda12.0-conda/devcontainer.json | 4 +-- .devcontainer/cuda12.0-pip/devcontainer.json | 6 ++--- .github/workflows/build.yaml | 16 ++++++------ .github/workflows/pr.yaml | 26 +++++++++---------- .github/workflows/test.yaml | 10 +++---- README.md | 2 +- VERSION | 2 +- ci/build_docs.sh | 2 +- .../all_cuda-118_arch-aarch64.yaml | 8 +++--- .../all_cuda-118_arch-x86_64.yaml | 8 +++--- .../all_cuda-120_arch-aarch64.yaml | 8 +++--- .../all_cuda-120_arch-x86_64.yaml | 8 +++--- .../bench_ann_cuda-118_arch-aarch64.yaml | 2 +- .../bench_ann_cuda-118_arch-x86_64.yaml | 2 +- .../bench_ann_cuda-120_arch-aarch64.yaml | 2 +- .../bench_ann_cuda-120_arch-x86_64.yaml | 2 +- .../recipes/raft-dask/conda_build_config.yaml | 2 +- cpp/CMakeLists.txt | 4 +-- cpp/doxygen/Doxyfile | 2 +- .../cmake/thirdparty/fetch_rapids.cmake | 2 +- dependencies.yaml | 24 ++++++++--------- docs/source/build.md | 2 
+- docs/source/conf.py | 4 +-- docs/source/developer_guide.md | 8 +++--- docs/source/raft_ann_benchmarks.md | 12 ++++----- fetch_rapids.cmake | 2 +- python/pylibraft/CMakeLists.txt | 2 +- python/pylibraft/pyproject.toml | 4 +-- python/raft-ann-bench/pyproject.toml | 2 +- python/raft-dask/CMakeLists.txt | 2 +- python/raft-dask/pyproject.toml | 8 +++--- 33 files changed, 99 insertions(+), 99 deletions(-) diff --git a/.devcontainer/cuda11.8-conda/devcontainer.json b/.devcontainer/cuda11.8-conda/devcontainer.json index 4c2161cab7..2682510ed1 100644 --- a/.devcontainer/cuda11.8-conda/devcontainer.json +++ b/.devcontainer/cuda11.8-conda/devcontainer.json @@ -5,12 +5,12 @@ "args": { "CUDA": "11.8", "PYTHON_PACKAGE_MANAGER": "conda", - "BASE": "rapidsai/devcontainers:24.02-cpp-llvm16-cuda11.8-mambaforge-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.04-cpp-llvm16-cuda11.8-mambaforge-ubuntu22.04" } }, "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.2": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.4": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.devcontainer/cuda11.8-pip/devcontainer.json b/.devcontainer/cuda11.8-pip/devcontainer.json index 8bbc016620..f96fef205f 100644 --- a/.devcontainer/cuda11.8-pip/devcontainer.json +++ b/.devcontainer/cuda11.8-pip/devcontainer.json @@ -5,13 +5,13 @@ "args": { "CUDA": "11.8", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:24.02-cpp-llvm16-cuda11.8-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.04-cpp-llvm16-cuda11.8-ubuntu22.04" } }, "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/ucx:24.2": {"version": "1.14.1"}, - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.2": {} + "ghcr.io/rapidsai/devcontainers/features/ucx:24.4": {"version": "1.14.1"}, + 
"ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.4": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/ucx", diff --git a/.devcontainer/cuda12.0-conda/devcontainer.json b/.devcontainer/cuda12.0-conda/devcontainer.json index ebb101d9d5..7ce4ea20d3 100644 --- a/.devcontainer/cuda12.0-conda/devcontainer.json +++ b/.devcontainer/cuda12.0-conda/devcontainer.json @@ -5,12 +5,12 @@ "args": { "CUDA": "12.0", "PYTHON_PACKAGE_MANAGER": "conda", - "BASE": "rapidsai/devcontainers:24.02-cpp-mambaforge-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.04-cpp-mambaforge-ubuntu22.04" } }, "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.2": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.4": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.devcontainer/cuda12.0-pip/devcontainer.json b/.devcontainer/cuda12.0-pip/devcontainer.json index 3e2efb2f37..4f97af830a 100644 --- a/.devcontainer/cuda12.0-pip/devcontainer.json +++ b/.devcontainer/cuda12.0-pip/devcontainer.json @@ -5,13 +5,13 @@ "args": { "CUDA": "12.0", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:24.02-cpp-llvm16-cuda12.0-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.04-cpp-llvm16-cuda12.0-ubuntu22.04" } }, "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/ucx:24.2": {"version": "1.14.1"}, - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.2": {} + "ghcr.io/rapidsai/devcontainers/features/ucx:24.4": {"version": "1.14.1"}, + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.4": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/ucx", diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 4c6e3d0ed4..bd8b13d21e 100644 --- a/.github/workflows/build.yaml +++ 
b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: cpp-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.04 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -37,7 +37,7 @@ jobs: python-build: needs: [cpp-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.04 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -46,7 +46,7 @@ jobs: upload-conda: needs: [cpp-build, python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.04 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -57,7 +57,7 @@ jobs: if: github.ref_type == 'branch' needs: python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.04 with: arch: "amd64" branch: ${{ inputs.branch }} @@ -69,7 +69,7 @@ jobs: sha: ${{ inputs.sha }} wheel-build-pylibraft: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.04 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -79,7 +79,7 @@ jobs: wheel-publish-pylibraft: needs: wheel-build-pylibraft secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.04 with: build_type: ${{ inputs.build_type || 
'branch' }} branch: ${{ inputs.branch }} @@ -89,7 +89,7 @@ jobs: wheel-build-raft-dask: needs: wheel-publish-pylibraft secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.04 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -99,7 +99,7 @@ jobs: wheel-publish-raft-dask: needs: wheel-build-raft-dask secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.04 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 19b990c8e9..889eef2a2c 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -25,29 +25,29 @@ jobs: - wheel-tests-raft-dask - devcontainer secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.04 checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.04 with: enable_check_generated_files: false conda-cpp-build: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.04 with: build_type: pull-request node_type: cpu16 conda-cpp-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.04 with: build_type: pull-request conda-cpp-checks: needs: conda-cpp-build secrets: inherit - uses: 
rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.04 with: build_type: pull-request enable_check_symbols: true @@ -55,19 +55,19 @@ jobs: conda-python-build: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.04 with: build_type: pull-request conda-python-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.04 with: build_type: pull-request docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.04 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -77,34 +77,34 @@ jobs: wheel-build-pylibraft: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.04 with: build_type: pull-request script: ci/build_wheel_pylibraft.sh wheel-tests-pylibraft: needs: wheel-build-pylibraft secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.04 with: build_type: pull-request script: ci/test_wheel_pylibraft.sh wheel-build-raft-dask: needs: wheel-tests-pylibraft secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.04 with: build_type: pull-request script: 
"ci/build_wheel_raft_dask.sh" wheel-tests-raft-dask: needs: wheel-build-raft-dask secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.04 with: build_type: pull-request script: ci/test_wheel_raft_dask.sh devcontainer: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.04 with: build_command: | sccache -z; diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 138fd0e300..24a9954b96 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: conda-cpp-checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.04 with: build_type: nightly branch: ${{ inputs.branch }} @@ -26,7 +26,7 @@ jobs: symbol_exclusions: (void (thrust::|cub::)|_ZN\d+raft_cutlass) conda-cpp-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.04 with: build_type: nightly branch: ${{ inputs.branch }} @@ -34,7 +34,7 @@ jobs: sha: ${{ inputs.sha }} conda-python-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.04 with: build_type: nightly branch: ${{ inputs.branch }} @@ -42,7 +42,7 @@ jobs: sha: ${{ inputs.sha }} wheel-tests-pylibraft: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.04 with: 
build_type: nightly branch: ${{ inputs.branch }} @@ -51,7 +51,7 @@ jobs: script: ci/test_wheel_pylibraft.sh wheel-tests-raft-dask: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.02 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.04 with: build_type: nightly branch: ${{ inputs.branch }} diff --git a/README.md b/README.md index 26ddc30ed4..55b8fca8f6 100755 --- a/README.md +++ b/README.md @@ -287,7 +287,7 @@ You can also install the conda packages individually using the `mamba` command a mamba install -c rapidsai -c conda-forge -c nvidia libraft libraft-headers cuda-version=12.0 ``` -If installing the C++ APIs please see [using libraft](https://docs.rapids.ai/api/raft/nightly/using_libraft/) for more information on using the pre-compiled shared library. You can also refer to the [example C++ template project](https://github.com/rapidsai/raft/tree/branch-24.02/cpp/template) for a ready-to-go CMake configuration that you can drop into your project and build against installed RAFT development artifacts above. +If installing the C++ APIs please see [using libraft](https://docs.rapids.ai/api/raft/nightly/using_libraft/) for more information on using the pre-compiled shared library. You can also refer to the [example C++ template project](https://github.com/rapidsai/raft/tree/branch-24.04/cpp/template) for a ready-to-go CMake configuration that you can drop into your project and build against installed RAFT development artifacts above. 
### Installing Python through Pip diff --git a/VERSION b/VERSION index 3c6c5e2b70..4a2fe8aa57 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -24.02.00 +24.04.00 diff --git a/ci/build_docs.sh b/ci/build_docs.sh index b7c8c7a3e0..4c07683642 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -28,7 +28,7 @@ rapids-mamba-retry install \ pylibraft \ raft-dask -export RAPIDS_VERSION_NUMBER="24.02" +export RAPIDS_VERSION_NUMBER="24.04" export RAPIDS_DOCS_DIR="$(mktemp -d)" rapids-logger "Build CPP docs" diff --git a/conda/environments/all_cuda-118_arch-aarch64.yaml b/conda/environments/all_cuda-118_arch-aarch64.yaml index ac076f5505..96c10a763a 100644 --- a/conda/environments/all_cuda-118_arch-aarch64.yaml +++ b/conda/environments/all_cuda-118_arch-aarch64.yaml @@ -20,7 +20,7 @@ dependencies: - cupy>=12.0.0 - cxx-compiler - cython>=3.0.0 -- dask-cuda==24.2.* +- dask-cuda==24.4.* - doxygen>=1.8.20 - gcc_linux-aarch64=11.* - gmock>=1.13.0 @@ -46,9 +46,9 @@ dependencies: - pydata-sphinx-theme - pytest - pytest-cov -- rapids-dask-dependency==24.2.* +- rapids-dask-dependency==24.4.* - recommonmark -- rmm==24.2.* +- rmm==24.4.* - scikit-build-core>=0.7.0 - scikit-learn - scipy @@ -56,6 +56,6 @@ dependencies: - sphinx-markdown-tables - sysroot_linux-aarch64==2.17 - ucx-proc=*=gpu -- ucx-py==0.36.* +- ucx-py==0.37.* - ucx>=1.13.0 name: all_cuda-118_arch-aarch64 diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index b3ded51bb5..aea5bd1961 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -20,7 +20,7 @@ dependencies: - cupy>=12.0.0 - cxx-compiler - cython>=3.0.0 -- dask-cuda==24.2.* +- dask-cuda==24.4.* - doxygen>=1.8.20 - gcc_linux-64=11.* - gmock>=1.13.0 @@ -46,9 +46,9 @@ dependencies: - pydata-sphinx-theme - pytest - pytest-cov -- rapids-dask-dependency==24.2.* +- rapids-dask-dependency==24.4.* - recommonmark -- rmm==24.2.* +- rmm==24.4.* - 
scikit-build-core>=0.7.0 - scikit-learn - scipy @@ -56,6 +56,6 @@ dependencies: - sphinx-markdown-tables - sysroot_linux-64==2.17 - ucx-proc=*=gpu -- ucx-py==0.36.* +- ucx-py==0.37.* - ucx>=1.13.0 name: all_cuda-118_arch-x86_64 diff --git a/conda/environments/all_cuda-120_arch-aarch64.yaml b/conda/environments/all_cuda-120_arch-aarch64.yaml index c0eede1389..0ea4a979d6 100644 --- a/conda/environments/all_cuda-120_arch-aarch64.yaml +++ b/conda/environments/all_cuda-120_arch-aarch64.yaml @@ -21,7 +21,7 @@ dependencies: - cupy>=12.0.0 - cxx-compiler - cython>=3.0.0 -- dask-cuda==24.2.* +- dask-cuda==24.4.* - doxygen>=1.8.20 - gcc_linux-aarch64=11.* - gmock>=1.13.0 @@ -42,9 +42,9 @@ dependencies: - pydata-sphinx-theme - pytest - pytest-cov -- rapids-dask-dependency==24.2.* +- rapids-dask-dependency==24.4.* - recommonmark -- rmm==24.2.* +- rmm==24.4.* - scikit-build-core>=0.7.0 - scikit-learn - scipy @@ -52,6 +52,6 @@ dependencies: - sphinx-markdown-tables - sysroot_linux-aarch64==2.17 - ucx-proc=*=gpu -- ucx-py==0.36.* +- ucx-py==0.37.* - ucx>=1.13.0 name: all_cuda-120_arch-aarch64 diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml index cebaf96493..48a463bc6f 100644 --- a/conda/environments/all_cuda-120_arch-x86_64.yaml +++ b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -21,7 +21,7 @@ dependencies: - cupy>=12.0.0 - cxx-compiler - cython>=3.0.0 -- dask-cuda==24.2.* +- dask-cuda==24.4.* - doxygen>=1.8.20 - gcc_linux-64=11.* - gmock>=1.13.0 @@ -42,9 +42,9 @@ dependencies: - pydata-sphinx-theme - pytest - pytest-cov -- rapids-dask-dependency==24.2.* +- rapids-dask-dependency==24.4.* - recommonmark -- rmm==24.2.* +- rmm==24.4.* - scikit-build-core>=0.7.0 - scikit-learn - scipy @@ -52,6 +52,6 @@ dependencies: - sphinx-markdown-tables - sysroot_linux-64==2.17 - ucx-proc=*=gpu -- ucx-py==0.36.* +- ucx-py==0.37.* - ucx>=1.13.0 name: all_cuda-120_arch-x86_64 diff --git 
a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml index 87e3942e6a..0e0385ceeb 100644 --- a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml +++ b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml @@ -38,7 +38,7 @@ dependencies: - openblas - pandas - pyyaml -- rmm==24.2.* +- rmm==24.4.* - scikit-build-core>=0.7.0 - sysroot_linux-aarch64==2.17 name: bench_ann_cuda-118_arch-aarch64 diff --git a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml index a4ac253a85..dfe76a2948 100644 --- a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml +++ b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml @@ -38,7 +38,7 @@ dependencies: - openblas - pandas - pyyaml -- rmm==24.2.* +- rmm==24.4.* - scikit-build-core>=0.7.0 - sysroot_linux-64==2.17 name: bench_ann_cuda-118_arch-x86_64 diff --git a/conda/environments/bench_ann_cuda-120_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-120_arch-aarch64.yaml index 9ef9799363..0a6567c646 100644 --- a/conda/environments/bench_ann_cuda-120_arch-aarch64.yaml +++ b/conda/environments/bench_ann_cuda-120_arch-aarch64.yaml @@ -34,7 +34,7 @@ dependencies: - openblas - pandas - pyyaml -- rmm==24.2.* +- rmm==24.4.* - scikit-build-core>=0.7.0 - sysroot_linux-aarch64==2.17 name: bench_ann_cuda-120_arch-aarch64 diff --git a/conda/environments/bench_ann_cuda-120_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-120_arch-x86_64.yaml index 5fa09096ba..a89d5317b6 100644 --- a/conda/environments/bench_ann_cuda-120_arch-x86_64.yaml +++ b/conda/environments/bench_ann_cuda-120_arch-x86_64.yaml @@ -34,7 +34,7 @@ dependencies: - openblas - pandas - pyyaml -- rmm==24.2.* +- rmm==24.4.* - scikit-build-core>=0.7.0 - sysroot_linux-64==2.17 name: bench_ann_cuda-120_arch-x86_64 diff --git a/conda/recipes/raft-dask/conda_build_config.yaml b/conda/recipes/raft-dask/conda_build_config.yaml index 
c12c35af3b..483e53026a 100644 --- a/conda/recipes/raft-dask/conda_build_config.yaml +++ b/conda/recipes/raft-dask/conda_build_config.yaml @@ -17,7 +17,7 @@ ucx_version: - ">=1.14.1,<1.16.0" ucx_py_version: - - "0.36.*" + - "0.37.*" cmake_version: - ">=3.26.4" diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index dbce46abfe..1d01d71735 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -10,8 +10,8 @@ # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. See the License for the specific language governing permissions and limitations under # the License. -set(RAPIDS_VERSION "24.02") -set(RAFT_VERSION "24.02.00") +set(RAPIDS_VERSION "24.04") +set(RAFT_VERSION "24.04.00") cmake_minimum_required(VERSION 3.26.4 FATAL_ERROR) include(../fetch_rapids.cmake) diff --git a/cpp/doxygen/Doxyfile b/cpp/doxygen/Doxyfile index 3eb0763eaf..779472d880 100644 --- a/cpp/doxygen/Doxyfile +++ b/cpp/doxygen/Doxyfile @@ -38,7 +38,7 @@ PROJECT_NAME = "RAFT C++ API" # could be handy for archiving the generated documentation or if some version # control system is used. -PROJECT_NUMBER = "24.02" +PROJECT_NUMBER = "24.04" # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a diff --git a/cpp/template/cmake/thirdparty/fetch_rapids.cmake b/cpp/template/cmake/thirdparty/fetch_rapids.cmake index 15b6c43a6f..aadfdb0028 100644 --- a/cpp/template/cmake/thirdparty/fetch_rapids.cmake +++ b/cpp/template/cmake/thirdparty/fetch_rapids.cmake @@ -12,7 +12,7 @@ # the License. 
# Use this variable to update RAPIDS and RAFT versions -set(RAPIDS_VERSION "24.02") +set(RAPIDS_VERSION "24.04") if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/RAFT_RAPIDS.cmake) file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-${RAPIDS_VERSION}/RAPIDS.cmake diff --git a/dependencies.yaml b/dependencies.yaml index 0e4d6d4693..37404a1e37 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -180,7 +180,7 @@ dependencies: common: - output_types: [conda] packages: - - &rmm_conda rmm==24.2.* + - &rmm_conda rmm==24.4.* - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -201,12 +201,12 @@ dependencies: matrices: - matrix: {cuda: "12.2"} packages: &build_pylibraft_packages_cu12 - - &rmm_cu12 rmm-cu12==24.2.* + - &rmm_cu12 rmm-cu12==24.4.* - {matrix: {cuda: "12.1"}, packages: *build_pylibraft_packages_cu12} - {matrix: {cuda: "12.0"}, packages: *build_pylibraft_packages_cu12} - matrix: {cuda: "11.8"} packages: &build_pylibraft_packages_cu11 - - &rmm_cu11 rmm-cu11==24.2.* + - &rmm_cu11 rmm-cu11==24.4.* - {matrix: {cuda: "11.5"}, packages: *build_pylibraft_packages_cu11} - {matrix: {cuda: "11.4"}, packages: *build_pylibraft_packages_cu11} - {matrix: {cuda: "11.2"}, packages: *build_pylibraft_packages_cu11} @@ -459,20 +459,20 @@ dependencies: common: - output_types: [conda, pyproject] packages: - - dask-cuda==24.2.* + - dask-cuda==24.4.* - joblib>=0.11 - numba>=0.57 - *numpy - - rapids-dask-dependency==24.2.* - - ucx-py==0.36.* + - rapids-dask-dependency==24.4.* + - ucx-py==0.37.* - output_types: conda packages: - ucx>=1.13.0 - ucx-proc=*=gpu - - &ucx_py_conda ucx-py==0.36.* + - &ucx_py_conda ucx-py==0.37.* - output_types: pyproject packages: - - &pylibraft_conda pylibraft==24.2.* + - &pylibraft_conda pylibraft==24.4.* - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -484,14 +484,14 @@ dependencies: matrices: - 
matrix: {cuda: "12.2"} packages: &run_raft_dask_packages_cu12 - - &pylibraft_cu12 pylibraft-cu12==24.2.* - - &ucx_py_cu12 ucx-py-cu12==0.36.* + - &pylibraft_cu12 pylibraft-cu12==24.4.* + - &ucx_py_cu12 ucx-py-cu12==0.37.* - {matrix: {cuda: "12.1"}, packages: *run_raft_dask_packages_cu12} - {matrix: {cuda: "12.0"}, packages: *run_raft_dask_packages_cu12} - matrix: {cuda: "11.8"} packages: &run_raft_dask_packages_cu11 - - &pylibraft_cu11 pylibraft-cu11==24.2.* - - &ucx_py_cu11 ucx-py-cu11==0.36.* + - &pylibraft_cu11 pylibraft-cu11==24.4.* + - &ucx_py_cu11 ucx-py-cu11==0.37.* - {matrix: {cuda: "11.5"}, packages: *run_raft_dask_packages_cu11} - {matrix: {cuda: "11.4"}, packages: *run_raft_dask_packages_cu11} - {matrix: {cuda: "11.2"}, packages: *run_raft_dask_packages_cu11} diff --git a/docs/source/build.md b/docs/source/build.md index ae7734d0ed..e76512d16f 100644 --- a/docs/source/build.md +++ b/docs/source/build.md @@ -56,7 +56,7 @@ You can also install the conda packages individually using the `mamba` command a mamba install -c rapidsai -c conda-forge -c nvidia libraft libraft-headers cuda-version=12.0 ``` -If installing the C++ APIs Please see [using libraft](https://docs.rapids.ai/api/raft/nightly/using_libraft/) for more information on using the pre-compiled shared library. You can also refer to the [example C++ template project](https://github.com/rapidsai/raft/tree/branch-24.02/cpp/template) for a ready-to-go CMake configuration that you can drop into your project and build against installed RAFT development artifacts above. +If installing the C++ APIs Please see [using libraft](https://docs.rapids.ai/api/raft/nightly/using_libraft/) for more information on using the pre-compiled shared library. 
You can also refer to the [example C++ template project](https://github.com/rapidsai/raft/tree/branch-24.04/cpp/template) for a ready-to-go CMake configuration that you can drop into your project and build against installed RAFT development artifacts above. ## Installing Python through Pip diff --git a/docs/source/conf.py b/docs/source/conf.py index 2a2c700926..07dd4825fa 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -67,9 +67,9 @@ # built documents. # # The short X.Y version. -version = '24.02' +version = '24.04' # The full version, including alpha/beta/rc tags. -release = '24.02.00' +release = '24.04.00' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/docs/source/developer_guide.md b/docs/source/developer_guide.md index c5bcd03f69..d29130add0 100644 --- a/docs/source/developer_guide.md +++ b/docs/source/developer_guide.md @@ -187,7 +187,7 @@ RAFT relies on `clang-format` to enforce code style across all C++ and CUDA sour 1. Do not split empty functions/records/namespaces. 2. Two-space indentation everywhere, including the line continuations. 3. Disable reflowing of comments. - The reasons behind these deviations from the Google style guide are given in comments [here](https://github.com/rapidsai/raft/blob/branch-24.02/cpp/.clang-format). + The reasons behind these deviations from the Google style guide are given in comments [here](https://github.com/rapidsai/raft/blob/branch-24.04/cpp/.clang-format). [`doxygen`](https://doxygen.nl/) is used as documentation generator and also as a documentation linter. In order to run doxygen as a linter on C++/CUDA code, run @@ -205,7 +205,7 @@ you can run `codespell -i 3 -w .` from the repository root directory. This will bring up an interactive prompt to select which spelling fixes to apply. 
### #include style -[include_checker.py](https://github.com/rapidsai/raft/blob/branch-24.02/cpp/scripts/include_checker.py) is used to enforce the include style as follows: +[include_checker.py](https://github.com/rapidsai/raft/blob/branch-24.04/cpp/scripts/include_checker.py) is used to enforce the include style as follows: 1. `#include "..."` should be used for referencing local files only. It is acceptable to be used for referencing files in a sub-folder/parent-folder of the same algorithm, but should never be used to include files in other algorithms or between algorithms and the primitives or other dependencies. 2. `#include <...>` should be used for referencing everything else @@ -215,7 +215,7 @@ python ./cpp/scripts/include_checker.py --inplace [cpp/include cpp/test ... list ``` ### Copyright header -[copyright.py](https://github.com/rapidsai/raft/blob/branch-24.02/ci/checks/copyright.py) checks the Copyright header for all git-modified files +[copyright.py](https://github.com/rapidsai/raft/blob/branch-24.04/ci/checks/copyright.py) checks the Copyright header for all git-modified files Manually, you can run the following to bulk-fix the header if only the years need to be updated: ```bash @@ -229,7 +229,7 @@ Call CUDA APIs via the provided helper macros `RAFT_CUDA_TRY`, `RAFT_CUBLAS_TRY` ## Logging ### Introduction -Anything and everything about logging is defined inside [logger.hpp](https://github.com/rapidsai/raft/blob/branch-24.02/cpp/include/raft/core/logger.hpp). It uses [spdlog](https://github.com/gabime/spdlog) underneath, but this information is transparent to all. +Anything and everything about logging is defined inside [logger.hpp](https://github.com/rapidsai/raft/blob/branch-24.04/cpp/include/raft/core/logger.hpp). It uses [spdlog](https://github.com/gabime/spdlog) underneath, but this information is transparent to all. 
### Usage ```cpp diff --git a/docs/source/raft_ann_benchmarks.md b/docs/source/raft_ann_benchmarks.md index 3789c3f01f..68fe80f9ce 100644 --- a/docs/source/raft_ann_benchmarks.md +++ b/docs/source/raft_ann_benchmarks.md @@ -62,7 +62,7 @@ Nightly images are located in [dockerhub](https://hub.docker.com/r/rapidsai/raft - The following command pulls the nightly container for python version 10, cuda version 12, and RAFT version 23.10: ```bash -docker pull rapidsai/raft-ann-bench:24.02a-cuda12.0-py3.10 #substitute raft-ann-bench for the exact desired container. +docker pull rapidsai/raft-ann-bench:24.04a-cuda12.0-py3.10 #substitute raft-ann-bench for the exact desired container. ``` The CUDA and python versions can be changed for the supported values: @@ -83,7 +83,7 @@ You can see the exact versions as well in the dockerhub site: [//]: # () [//]: # (```bash) -[//]: # (docker pull nvcr.io/nvidia/rapidsai/raft-ann-bench:24.02-cuda11.8-py3.10 #substitute raft-ann-bench for the exact desired container.) +[//]: # (docker pull nvcr.io/nvidia/rapidsai/raft-ann-bench:24.04-cuda11.8-py3.10 #substitute raft-ann-bench for the exact desired container.) [//]: # (```) @@ -344,7 +344,7 @@ For GPU-enabled systems, the `DATA_FOLDER` variable should be a local folder whe export DATA_FOLDER=path/to/store/datasets/and/results docker run --gpus all --rm -it -u $(id -u) \ -v $DATA_FOLDER:/data/benchmarks \ - rapidsai/raft-ann-bench:24.02a-cuda11.8-py3.10 \ + rapidsai/raft-ann-bench:24.04a-cuda11.8-py3.10 \ "--dataset deep-image-96-angular" \ "--normalize" \ "--algorithms raft_cagra,raft_ivf_pq --batch-size 10 -k 10" \ @@ -355,7 +355,7 @@ Usage of the above command is as follows: | Argument | Description | |-----------------------------------------------------------|----------------------------------------------------------------------------------------------------| -| `rapidsai/raft-ann-bench:24.02a-cuda11.8-py3.10` | Image to use. 
Can be either `raft-ann-bench` or `raft-ann-bench-datasets` | +| `rapidsai/raft-ann-bench:24.04a-cuda11.8-py3.10` | Image to use. Can be either `raft-ann-bench` or `raft-ann-bench-datasets` | | `"--dataset deep-image-96-angular"` | Dataset name | | `"--normalize"` | Whether to normalize the dataset | | `"--algorithms raft_cagra,hnswlib --batch-size 10 -k 10"` | Arguments passed to the `run` script, such as the algorithms to benchmark, the batch size, and `k` | @@ -372,7 +372,7 @@ The container arguments in the above section also be used for the CPU-only conta export DATA_FOLDER=path/to/store/datasets/and/results docker run --rm -it -u $(id -u) \ -v $DATA_FOLDER:/data/benchmarks \ - rapidsai/raft-ann-bench-cpu:24.02a-py3.10 \ + rapidsai/raft-ann-bench-cpu:24.04a-py3.10 \ "--dataset deep-image-96-angular" \ "--normalize" \ "--algorithms hnswlib --batch-size 10 -k 10" \ @@ -389,7 +389,7 @@ docker run --gpus all --rm -it -u $(id -u) \ --entrypoint /bin/bash \ --workdir /data/benchmarks \ -v $DATA_FOLDER:/data/benchmarks \ - rapidsai/raft-ann-bench:24.02a-cuda11.8-py3.10 + rapidsai/raft-ann-bench:24.04a-cuda11.8-py3.10 ``` This will drop you into a command line in the container, with the `raft-ann-bench` python package ready to use, as described in the [Running the benchmarks](#running-the-benchmarks) section above: diff --git a/fetch_rapids.cmake b/fetch_rapids.cmake index ca871c5759..1dca136c97 100644 --- a/fetch_rapids.cmake +++ b/fetch_rapids.cmake @@ -12,7 +12,7 @@ # the License. 
# ============================================================================= if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/RAFT_RAPIDS.cmake) - file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-24.02/RAPIDS.cmake + file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-24.04/RAPIDS.cmake ${CMAKE_CURRENT_BINARY_DIR}/RAFT_RAPIDS.cmake ) endif() diff --git a/python/pylibraft/CMakeLists.txt b/python/pylibraft/CMakeLists.txt index f7e114ae66..c17243728e 100644 --- a/python/pylibraft/CMakeLists.txt +++ b/python/pylibraft/CMakeLists.txt @@ -16,7 +16,7 @@ cmake_minimum_required(VERSION 3.26.4 FATAL_ERROR) include(../../fetch_rapids.cmake) -set(pylibraft_version 24.02.00) +set(pylibraft_version 24.04.00) # We always need CUDA for pylibraft because the raft dependency brings in a header-only cuco # dependency that enables CUDA unconditionally. diff --git a/python/pylibraft/pyproject.toml b/python/pylibraft/pyproject.toml index 5070d6cf6f..ba45a6d33d 100644 --- a/python/pylibraft/pyproject.toml +++ b/python/pylibraft/pyproject.toml @@ -19,7 +19,7 @@ requires = [ "cuda-python>=11.7.1,<12.0a0", "cython>=3.0.0", "ninja", - "rmm==24.2.*", + "rmm==24.4.*", "scikit-build-core[pyproject]>=0.7.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. build-backend = "scikit_build_core.build" @@ -37,7 +37,7 @@ requires-python = ">=3.9" dependencies = [ "cuda-python>=11.7.1,<12.0a0", "numpy>=1.21", - "rmm==24.2.*", + "rmm==24.4.*", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
classifiers = [ "Intended Audience :: Developers", diff --git a/python/raft-ann-bench/pyproject.toml b/python/raft-ann-bench/pyproject.toml index e3ae36ef62..4a185b22ca 100644 --- a/python/raft-ann-bench/pyproject.toml +++ b/python/raft-ann-bench/pyproject.toml @@ -9,7 +9,7 @@ requires = [ [project] name = "raft-ann-bench" -version = "24.02.00" +version = "24.04.00" description = "RAFT ANN benchmarks" authors = [ { name = "NVIDIA Corporation" }, diff --git a/python/raft-dask/CMakeLists.txt b/python/raft-dask/CMakeLists.txt index 1d27e49583..ff441e343e 100644 --- a/python/raft-dask/CMakeLists.txt +++ b/python/raft-dask/CMakeLists.txt @@ -14,7 +14,7 @@ cmake_minimum_required(VERSION 3.26.4 FATAL_ERROR) -set(raft_dask_version 24.02.00) +set(raft_dask_version 24.04.00) include(../../fetch_rapids.cmake) include(rapids-cuda) diff --git a/python/raft-dask/pyproject.toml b/python/raft-dask/pyproject.toml index 37d94be7f9..33e0c15684 100644 --- a/python/raft-dask/pyproject.toml +++ b/python/raft-dask/pyproject.toml @@ -33,13 +33,13 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.9" dependencies = [ - "dask-cuda==24.2.*", + "dask-cuda==24.4.*", "joblib>=0.11", "numba>=0.57", "numpy>=1.21", - "pylibraft==24.2.*", - "rapids-dask-dependency==24.2.*", - "ucx-py==0.36.*", + "pylibraft==24.4.*", + "rapids-dask-dependency==24.4.*", + "ucx-py==0.37.*", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ "Intended Audience :: Developers", From fdd4ad2c78458cd08b663dd4e7fdb511033c2d66 Mon Sep 17 00:00:00 2001 From: Rui Lan Date: Tue, 23 Jan 2024 14:31:05 -0800 Subject: [PATCH 05/14] Revert "Allow topk larger than 1024 in CAGRA (#2097)" This reverts commit 0586fc324598cc25e4d8c9f4f6dc3db2eb3ab521. 
--- .../neighbors/detail/cagra/cagra_search.cuh | 2 +- .../detail/cagra/search_multi_kernel.cuh | 167 ++++-------------- .../neighbors/detail/cagra/search_plan.cuh | 10 +- cpp/test/neighbors/ann_cagra.cuh | 20 --- 4 files changed, 36 insertions(+), 163 deletions(-) diff --git a/cpp/include/raft/neighbors/detail/cagra/cagra_search.cuh b/cpp/include/raft/neighbors/detail/cagra/cagra_search.cuh index 40cc7c76fb..41a43c9bce 100644 --- a/cpp/include/raft/neighbors/detail/cagra/cagra_search.cuh +++ b/cpp/include/raft/neighbors/detail/cagra/cagra_search.cuh @@ -131,7 +131,7 @@ void search_main(raft::resources const& res, factory::create( res, params, index.dim(), index.graph_degree(), topk); - plan->check(topk); + plan->check(neighbors.extent(1)); RAFT_LOG_DEBUG("Cagra search"); const uint32_t max_queries = plan->max_queries; diff --git a/cpp/include/raft/neighbors/detail/cagra/search_multi_kernel.cuh b/cpp/include/raft/neighbors/detail/cagra/search_multi_kernel.cuh index f9bf525503..7be3fedfa2 100644 --- a/cpp/include/raft/neighbors/detail/cagra/search_multi_kernel.cuh +++ b/cpp/include/raft/neighbors/detail/cagra/search_multi_kernel.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -37,7 +37,6 @@ #include "topk_for_cagra/topk_core.cuh" //todo replace with raft kernel #include "utils.hpp" #include -#include #include #include // RAFT_CUDA_TRY_NOT_THROW is used TODO(tfeher): consider moving this to cuda_rt_essentials.hpp @@ -654,12 +653,6 @@ struct search : search_plan_impl { rmm::device_scalar terminate_flag; // dev_terminate_flag, host_terminate_flag.; rmm::device_uvector topk_workspace; - // temporary storage for _find_topk - rmm::device_uvector input_keys_storage; - rmm::device_uvector output_keys_storage; - rmm::device_uvector input_values_storage; - rmm::device_uvector output_values_storage; - search(raft::resources const& res, search_params params, int64_t dim, @@ -672,11 +665,7 @@ struct search : search_plan_impl { parent_node_list(0, resource::get_cuda_stream(res)), topk_hint(0, resource::get_cuda_stream(res)), topk_workspace(0, resource::get_cuda_stream(res)), - terminate_flag(resource::get_cuda_stream(res)), - input_keys_storage(0, resource::get_cuda_stream(res)), - output_keys_storage(0, resource::get_cuda_stream(res)), - input_values_storage(0, resource::get_cuda_stream(res)), - output_values_storage(0, resource::get_cuda_stream(res)) + terminate_flag(resource::get_cuda_stream(res)) { set_params(res); } @@ -706,98 +695,6 @@ struct search : search_plan_impl { ~search() {} - inline void _find_topk(raft::resources const& handle, - uint32_t topK, - uint32_t sizeBatch, - uint32_t numElements, - const float* inputKeys, // [sizeBatch, ldIK,] - uint32_t ldIK, // (*) ldIK >= numElements - const INDEX_T* inputVals, // [sizeBatch, ldIV,] - uint32_t ldIV, // (*) ldIV >= numElements - float* outputKeys, // [sizeBatch, ldOK,] - uint32_t ldOK, // (*) ldOK >= topK - INDEX_T* outputVals, // [sizeBatch, ldOV,] - uint32_t ldOV, // (*) ldOV >= topK - void* workspace, - bool sort, - uint32_t* hints) - { - auto stream = resource::get_cuda_stream(handle); - - // _cuann_find_topk right now is limited to a max-k of 1024. 
- // RAFT has a matrix::select_k function - which handles arbitrary sized values of k, - // but doesn't accept strided inputs unlike _cuann_find_topk - // The multi-kernel search path requires strided access - since its cleverly allocating memory - // (layout described in the search_plan_impl function below), such that both the - // neighbors and the internal_topk are adjacent - in a double buffered format. - // Since this layout doesn't work with the matrix::select_k code - we have to copy - // over to a contiguous (non-strided) access to handle topk larger than 1024, and - // potentially also copy back to a strided layout afterwards - if (topK <= 1024) { - return _cuann_find_topk(topK, - sizeBatch, - numElements, - inputKeys, - ldIK, - inputVals, - ldIV, - outputKeys, - ldOK, - outputVals, - ldOV, - workspace, - sort, - hints, - stream); - } - - if (ldIK > numElements) { - if (input_keys_storage.size() != sizeBatch * numElements) { - input_keys_storage.resize(sizeBatch * numElements, stream); - } - batched_memcpy( - input_keys_storage.data(), numElements, inputKeys, ldIK, numElements, sizeBatch, stream); - inputKeys = input_keys_storage.data(); - } - - if (ldIV > numElements) { - if (input_values_storage.size() != sizeBatch * numElements) { - input_values_storage.resize(sizeBatch * numElements, stream); - } - - batched_memcpy( - input_values_storage.data(), numElements, inputVals, ldIV, numElements, sizeBatch, stream); - inputVals = input_values_storage.data(); - } - - if ((ldOK > topK) && (output_keys_storage.size() != sizeBatch * topK)) { - output_keys_storage.resize(sizeBatch * topK, stream); - } - - if ((ldOV > topK) && (output_values_storage.size() != sizeBatch * topK)) { - output_values_storage.resize(sizeBatch * topK, stream); - } - - raft::matrix::select_k( - handle, - raft::make_device_matrix_view(inputKeys, sizeBatch, numElements), - raft::make_device_matrix_view(inputVals, sizeBatch, numElements), - raft::make_device_matrix_view( - ldOK > topK ? 
output_keys_storage.data() : outputKeys, sizeBatch, topK), - raft::make_device_matrix_view( - ldOV > topK ? output_values_storage.data() : outputVals, sizeBatch, topK), - true, // select_min - sort); - - if (ldOK > topK) { - batched_memcpy(outputKeys, ldOK, output_keys_storage.data(), topK, topK, sizeBatch, stream); - } - - if (ldOV > topK) { - batched_memcpy(outputVals, ldOV, output_values_storage.data(), topK, topK, sizeBatch, stream); - } - } - void operator()(raft::resources const& res, raft::device_matrix_view dataset, raft::device_matrix_view graph, @@ -849,21 +746,21 @@ struct search : search_plan_impl { unsigned iter = 0; while (1) { // Make an index list of internal top-k nodes - _find_topk(res, - itopk_size, - num_queries, - result_buffer_size, - result_distances.data() + (iter & 0x1) * itopk_size, - result_buffer_allocation_size, - result_indices.data() + (iter & 0x1) * itopk_size, - result_buffer_allocation_size, - result_distances.data() + (1 - (iter & 0x1)) * result_buffer_size, - result_buffer_allocation_size, - result_indices.data() + (1 - (iter & 0x1)) * result_buffer_size, - result_buffer_allocation_size, - topk_workspace.data(), - true, - top_hint_ptr); + _cuann_find_topk(itopk_size, + num_queries, + result_buffer_size, + result_distances.data() + (iter & 0x1) * itopk_size, + result_buffer_allocation_size, + result_indices.data() + (iter & 0x1) * itopk_size, + result_buffer_allocation_size, + result_distances.data() + (1 - (iter & 0x1)) * result_buffer_size, + result_buffer_allocation_size, + result_indices.data() + (1 - (iter & 0x1)) * result_buffer_size, + result_buffer_allocation_size, + topk_workspace.data(), + true, + top_hint_ptr, + stream); // termination (1) if ((iter + 1 == max_iterations)) { @@ -944,21 +841,21 @@ struct search : search_plan_impl { result_indices_ptr = result_indices.data() + (1 - (iter & 0x1)) * result_buffer_size; result_distances_ptr = result_distances.data() + (1 - (iter & 0x1)) * result_buffer_size; - 
_find_topk(res, - itopk_size, - num_queries, - result_buffer_size, - result_distances.data() + (iter & 0x1) * itopk_size, - result_buffer_allocation_size, - result_indices.data() + (iter & 0x1) * itopk_size, - result_buffer_allocation_size, - result_distances_ptr, - result_buffer_allocation_size, - result_indices_ptr, - result_buffer_allocation_size, - topk_workspace.data(), - true, - top_hint_ptr); + _cuann_find_topk(itopk_size, + num_queries, + result_buffer_size, + result_distances.data() + (iter & 0x1) * itopk_size, + result_buffer_allocation_size, + result_indices.data() + (iter & 0x1) * itopk_size, + result_buffer_allocation_size, + result_distances_ptr, + result_buffer_allocation_size, + result_indices_ptr, + result_buffer_allocation_size, + topk_workspace.data(), + true, + top_hint_ptr, + stream); } else { // Remove parent bit in search results remove_parent_bit( diff --git a/cpp/include/raft/neighbors/detail/cagra/search_plan.cuh b/cpp/include/raft/neighbors/detail/cagra/search_plan.cuh index 271a1f4955..20df2adf61 100644 --- a/cpp/include/raft/neighbors/detail/cagra/search_plan.cuh +++ b/cpp/include/raft/neighbors/detail/cagra/search_plan.cuh @@ -42,12 +42,9 @@ struct search_plan_impl_base : public search_params { if (itopk_size <= 512 && search_params::max_queries >= num_sm * 2lu) { algo = search_algo::SINGLE_CTA; RAFT_LOG_DEBUG("Auto strategy: selecting single-cta"); - } else if (topk <= 1024) { + } else { algo = search_algo::MULTI_CTA; RAFT_LOG_DEBUG("Auto strategy: selecting multi-cta"); - } else { - algo = search_algo::MULTI_KERNEL; - RAFT_LOG_DEBUG("Auto strategy: selecting multi kernel"); } } } @@ -258,8 +255,7 @@ struct search_plan_impl : public search_plan_impl_base { virtual void check(const uint32_t topk) { // For single-CTA and multi kernel - RAFT_EXPECTS( - topk <= itopk_size, "topk = %u must be smaller than itopk_size = %lu", topk, itopk_size); + RAFT_EXPECTS(topk <= itopk_size, "topk must be smaller than itopk_size = %lu", itopk_size); } 
inline void check_params() @@ -267,7 +263,7 @@ struct search_plan_impl : public search_plan_impl_base { std::string error_message = ""; if (itopk_size > 1024) { - if ((algo == search_algo::MULTI_CTA) || (algo == search_algo::MULTI_KERNEL)) { + if (algo == search_algo::MULTI_CTA) { } else { error_message += std::string("- `internal_topk` (" + std::to_string(itopk_size) + ") must be smaller or equal to 1024"); diff --git a/cpp/test/neighbors/ann_cagra.cuh b/cpp/test/neighbors/ann_cagra.cuh index ef4f27ae64..915ef8a394 100644 --- a/cpp/test/neighbors/ann_cagra.cuh +++ b/cpp/test/neighbors/ann_cagra.cuh @@ -259,7 +259,6 @@ class AnnCagraTest : public ::testing::TestWithParam { search_params.algo = ps.algo; search_params.max_queries = ps.max_queries; search_params.team_size = ps.team_size; - search_params.itopk_size = ps.itopk_size; auto database_view = raft::make_device_matrix_view( (const DataT*)database.data(), ps.n_rows, ps.dim); @@ -497,7 +496,6 @@ class AnnCagraFilterTest : public ::testing::TestWithParam { search_params.algo = ps.algo; search_params.max_queries = ps.max_queries; search_params.team_size = ps.team_size; - search_params.itopk_size = ps.itopk_size; search_params.hashmap_mode = cagra::hash_mode::HASH; auto database_view = raft::make_device_matrix_view( @@ -613,7 +611,6 @@ class AnnCagraFilterTest : public ::testing::TestWithParam { search_params.algo = ps.algo; search_params.max_queries = ps.max_queries; search_params.team_size = ps.team_size; - search_params.itopk_size = ps.itopk_size; search_params.hashmap_mode = cagra::hash_mode::HASH; auto database_view = raft::make_device_matrix_view( @@ -821,23 +818,6 @@ inline std::vector generate_inputs() {0.995}); inputs.insert(inputs.end(), inputs2.begin(), inputs2.end()); - inputs2 = - raft::util::itertools::product({100}, - {20000}, - {32}, - {2048}, // k - {graph_build_algo::NN_DESCENT}, - {search_algo::AUTO}, - {10}, - {0}, - {4096}, // itopk_size - {1}, - {raft::distance::DistanceType::L2Expanded}, - 
{false}, - {false}, - {0.995}); - inputs.insert(inputs.end(), inputs2.begin(), inputs2.end()); - return inputs; } From f8bc4ff4b26754059d1b4a631e1bd68e00eb1a35 Mon Sep 17 00:00:00 2001 From: Rui Lan Date: Tue, 23 Jan 2024 14:53:44 -0800 Subject: [PATCH 06/14] Revert "Revert "Allow topk larger than 1024 in CAGRA (#2097)"" This reverts commit fdd4ad2c78458cd08b663dd4e7fdb511033c2d66. --- .../neighbors/detail/cagra/cagra_search.cuh | 2 +- .../detail/cagra/search_multi_kernel.cuh | 167 ++++++++++++++---- .../neighbors/detail/cagra/search_plan.cuh | 10 +- cpp/test/neighbors/ann_cagra.cuh | 20 +++ 4 files changed, 163 insertions(+), 36 deletions(-) diff --git a/cpp/include/raft/neighbors/detail/cagra/cagra_search.cuh b/cpp/include/raft/neighbors/detail/cagra/cagra_search.cuh index 41a43c9bce..40cc7c76fb 100644 --- a/cpp/include/raft/neighbors/detail/cagra/cagra_search.cuh +++ b/cpp/include/raft/neighbors/detail/cagra/cagra_search.cuh @@ -131,7 +131,7 @@ void search_main(raft::resources const& res, factory::create( res, params, index.dim(), index.graph_degree(), topk); - plan->check(neighbors.extent(1)); + plan->check(topk); RAFT_LOG_DEBUG("Cagra search"); const uint32_t max_queries = plan->max_queries; diff --git a/cpp/include/raft/neighbors/detail/cagra/search_multi_kernel.cuh b/cpp/include/raft/neighbors/detail/cagra/search_multi_kernel.cuh index 7be3fedfa2..f9bf525503 100644 --- a/cpp/include/raft/neighbors/detail/cagra/search_multi_kernel.cuh +++ b/cpp/include/raft/neighbors/detail/cagra/search_multi_kernel.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -37,6 +37,7 @@ #include "topk_for_cagra/topk_core.cuh" //todo replace with raft kernel #include "utils.hpp" #include +#include #include #include // RAFT_CUDA_TRY_NOT_THROW is used TODO(tfeher): consider moving this to cuda_rt_essentials.hpp @@ -653,6 +654,12 @@ struct search : search_plan_impl { rmm::device_scalar terminate_flag; // dev_terminate_flag, host_terminate_flag.; rmm::device_uvector topk_workspace; + // temporary storage for _find_topk + rmm::device_uvector input_keys_storage; + rmm::device_uvector output_keys_storage; + rmm::device_uvector input_values_storage; + rmm::device_uvector output_values_storage; + search(raft::resources const& res, search_params params, int64_t dim, @@ -665,7 +672,11 @@ struct search : search_plan_impl { parent_node_list(0, resource::get_cuda_stream(res)), topk_hint(0, resource::get_cuda_stream(res)), topk_workspace(0, resource::get_cuda_stream(res)), - terminate_flag(resource::get_cuda_stream(res)) + terminate_flag(resource::get_cuda_stream(res)), + input_keys_storage(0, resource::get_cuda_stream(res)), + output_keys_storage(0, resource::get_cuda_stream(res)), + input_values_storage(0, resource::get_cuda_stream(res)), + output_values_storage(0, resource::get_cuda_stream(res)) { set_params(res); } @@ -695,6 +706,98 @@ struct search : search_plan_impl { ~search() {} + inline void _find_topk(raft::resources const& handle, + uint32_t topK, + uint32_t sizeBatch, + uint32_t numElements, + const float* inputKeys, // [sizeBatch, ldIK,] + uint32_t ldIK, // (*) ldIK >= numElements + const INDEX_T* inputVals, // [sizeBatch, ldIV,] + uint32_t ldIV, // (*) ldIV >= numElements + float* outputKeys, // [sizeBatch, ldOK,] + uint32_t ldOK, // (*) ldOK >= topK + INDEX_T* outputVals, // [sizeBatch, ldOV,] + uint32_t ldOV, // (*) ldOV >= topK + void* workspace, + bool sort, + uint32_t* hints) + { + auto stream = resource::get_cuda_stream(handle); + + // _cuann_find_topk right now is limited to a max-k of 1024. 
+ // RAFT has a matrix::select_k function - which handles arbitrary sized values of k, + // but doesn't accept strided inputs unlike _cuann_find_topk + // The multi-kernel search path requires strided access - since its cleverly allocating memory + // (layout described in the search_plan_impl function below), such that both the + // neighbors and the internal_topk are adjacent - in a double buffered format. + // Since this layout doesn't work with the matrix::select_k code - we have to copy + // over to a contiguous (non-strided) access to handle topk larger than 1024, and + // potentially also copy back to a strided layout afterwards + if (topK <= 1024) { + return _cuann_find_topk(topK, + sizeBatch, + numElements, + inputKeys, + ldIK, + inputVals, + ldIV, + outputKeys, + ldOK, + outputVals, + ldOV, + workspace, + sort, + hints, + stream); + } + + if (ldIK > numElements) { + if (input_keys_storage.size() != sizeBatch * numElements) { + input_keys_storage.resize(sizeBatch * numElements, stream); + } + batched_memcpy( + input_keys_storage.data(), numElements, inputKeys, ldIK, numElements, sizeBatch, stream); + inputKeys = input_keys_storage.data(); + } + + if (ldIV > numElements) { + if (input_values_storage.size() != sizeBatch * numElements) { + input_values_storage.resize(sizeBatch * numElements, stream); + } + + batched_memcpy( + input_values_storage.data(), numElements, inputVals, ldIV, numElements, sizeBatch, stream); + inputVals = input_values_storage.data(); + } + + if ((ldOK > topK) && (output_keys_storage.size() != sizeBatch * topK)) { + output_keys_storage.resize(sizeBatch * topK, stream); + } + + if ((ldOV > topK) && (output_values_storage.size() != sizeBatch * topK)) { + output_values_storage.resize(sizeBatch * topK, stream); + } + + raft::matrix::select_k( + handle, + raft::make_device_matrix_view(inputKeys, sizeBatch, numElements), + raft::make_device_matrix_view(inputVals, sizeBatch, numElements), + raft::make_device_matrix_view( + ldOK > topK ? 
output_keys_storage.data() : outputKeys, sizeBatch, topK), + raft::make_device_matrix_view( + ldOV > topK ? output_values_storage.data() : outputVals, sizeBatch, topK), + true, // select_min + sort); + + if (ldOK > topK) { + batched_memcpy(outputKeys, ldOK, output_keys_storage.data(), topK, topK, sizeBatch, stream); + } + + if (ldOV > topK) { + batched_memcpy(outputVals, ldOV, output_values_storage.data(), topK, topK, sizeBatch, stream); + } + } + void operator()(raft::resources const& res, raft::device_matrix_view dataset, raft::device_matrix_view graph, @@ -746,21 +849,21 @@ struct search : search_plan_impl { unsigned iter = 0; while (1) { // Make an index list of internal top-k nodes - _cuann_find_topk(itopk_size, - num_queries, - result_buffer_size, - result_distances.data() + (iter & 0x1) * itopk_size, - result_buffer_allocation_size, - result_indices.data() + (iter & 0x1) * itopk_size, - result_buffer_allocation_size, - result_distances.data() + (1 - (iter & 0x1)) * result_buffer_size, - result_buffer_allocation_size, - result_indices.data() + (1 - (iter & 0x1)) * result_buffer_size, - result_buffer_allocation_size, - topk_workspace.data(), - true, - top_hint_ptr, - stream); + _find_topk(res, + itopk_size, + num_queries, + result_buffer_size, + result_distances.data() + (iter & 0x1) * itopk_size, + result_buffer_allocation_size, + result_indices.data() + (iter & 0x1) * itopk_size, + result_buffer_allocation_size, + result_distances.data() + (1 - (iter & 0x1)) * result_buffer_size, + result_buffer_allocation_size, + result_indices.data() + (1 - (iter & 0x1)) * result_buffer_size, + result_buffer_allocation_size, + topk_workspace.data(), + true, + top_hint_ptr); // termination (1) if ((iter + 1 == max_iterations)) { @@ -841,21 +944,21 @@ struct search : search_plan_impl { result_indices_ptr = result_indices.data() + (1 - (iter & 0x1)) * result_buffer_size; result_distances_ptr = result_distances.data() + (1 - (iter & 0x1)) * result_buffer_size; - 
_cuann_find_topk(itopk_size, - num_queries, - result_buffer_size, - result_distances.data() + (iter & 0x1) * itopk_size, - result_buffer_allocation_size, - result_indices.data() + (iter & 0x1) * itopk_size, - result_buffer_allocation_size, - result_distances_ptr, - result_buffer_allocation_size, - result_indices_ptr, - result_buffer_allocation_size, - topk_workspace.data(), - true, - top_hint_ptr, - stream); + _find_topk(res, + itopk_size, + num_queries, + result_buffer_size, + result_distances.data() + (iter & 0x1) * itopk_size, + result_buffer_allocation_size, + result_indices.data() + (iter & 0x1) * itopk_size, + result_buffer_allocation_size, + result_distances_ptr, + result_buffer_allocation_size, + result_indices_ptr, + result_buffer_allocation_size, + topk_workspace.data(), + true, + top_hint_ptr); } else { // Remove parent bit in search results remove_parent_bit( diff --git a/cpp/include/raft/neighbors/detail/cagra/search_plan.cuh b/cpp/include/raft/neighbors/detail/cagra/search_plan.cuh index 20df2adf61..271a1f4955 100644 --- a/cpp/include/raft/neighbors/detail/cagra/search_plan.cuh +++ b/cpp/include/raft/neighbors/detail/cagra/search_plan.cuh @@ -42,9 +42,12 @@ struct search_plan_impl_base : public search_params { if (itopk_size <= 512 && search_params::max_queries >= num_sm * 2lu) { algo = search_algo::SINGLE_CTA; RAFT_LOG_DEBUG("Auto strategy: selecting single-cta"); - } else { + } else if (topk <= 1024) { algo = search_algo::MULTI_CTA; RAFT_LOG_DEBUG("Auto strategy: selecting multi-cta"); + } else { + algo = search_algo::MULTI_KERNEL; + RAFT_LOG_DEBUG("Auto strategy: selecting multi kernel"); } } } @@ -255,7 +258,8 @@ struct search_plan_impl : public search_plan_impl_base { virtual void check(const uint32_t topk) { // For single-CTA and multi kernel - RAFT_EXPECTS(topk <= itopk_size, "topk must be smaller than itopk_size = %lu", itopk_size); + RAFT_EXPECTS( + topk <= itopk_size, "topk = %u must be smaller than itopk_size = %lu", topk, itopk_size); } 
inline void check_params() @@ -263,7 +267,7 @@ struct search_plan_impl : public search_plan_impl_base { std::string error_message = ""; if (itopk_size > 1024) { - if (algo == search_algo::MULTI_CTA) { + if ((algo == search_algo::MULTI_CTA) || (algo == search_algo::MULTI_KERNEL)) { } else { error_message += std::string("- `internal_topk` (" + std::to_string(itopk_size) + ") must be smaller or equal to 1024"); diff --git a/cpp/test/neighbors/ann_cagra.cuh b/cpp/test/neighbors/ann_cagra.cuh index 915ef8a394..ef4f27ae64 100644 --- a/cpp/test/neighbors/ann_cagra.cuh +++ b/cpp/test/neighbors/ann_cagra.cuh @@ -259,6 +259,7 @@ class AnnCagraTest : public ::testing::TestWithParam { search_params.algo = ps.algo; search_params.max_queries = ps.max_queries; search_params.team_size = ps.team_size; + search_params.itopk_size = ps.itopk_size; auto database_view = raft::make_device_matrix_view( (const DataT*)database.data(), ps.n_rows, ps.dim); @@ -496,6 +497,7 @@ class AnnCagraFilterTest : public ::testing::TestWithParam { search_params.algo = ps.algo; search_params.max_queries = ps.max_queries; search_params.team_size = ps.team_size; + search_params.itopk_size = ps.itopk_size; search_params.hashmap_mode = cagra::hash_mode::HASH; auto database_view = raft::make_device_matrix_view( @@ -611,6 +613,7 @@ class AnnCagraFilterTest : public ::testing::TestWithParam { search_params.algo = ps.algo; search_params.max_queries = ps.max_queries; search_params.team_size = ps.team_size; + search_params.itopk_size = ps.itopk_size; search_params.hashmap_mode = cagra::hash_mode::HASH; auto database_view = raft::make_device_matrix_view( @@ -818,6 +821,23 @@ inline std::vector generate_inputs() {0.995}); inputs.insert(inputs.end(), inputs2.begin(), inputs2.end()); + inputs2 = + raft::util::itertools::product({100}, + {20000}, + {32}, + {2048}, // k + {graph_build_algo::NN_DESCENT}, + {search_algo::AUTO}, + {10}, + {0}, + {4096}, // itopk_size + {1}, + {raft::distance::DistanceType::L2Expanded}, + 
{false}, + {false}, + {0.995}); + inputs.insert(inputs.end(), inputs2.begin(), inputs2.end()); + return inputs; } From 2856bff39532e08e0a84a3dc3af78fc121d90774 Mon Sep 17 00:00:00 2001 From: Rui Lan Date: Tue, 23 Jan 2024 15:04:24 -0800 Subject: [PATCH 07/14] Revert "Merge remote-tracking branch 'upstream/branch-24.04' into subsampling-ivfpq-codebook" This reverts commit aea3d445a18d2a93cf8500ad50390f1976e79317, reversing changes made to c90cdfa3b45744e0e6d90666755355fb7e90da43. --- .../ann/src/raft/raft_ann_bench_param_parser.h | 3 --- .../raft/neighbors/detail/ivf_pq_build.cuh | 16 ++++++---------- cpp/include/raft/neighbors/ivf_pq_types.hpp | 6 ------ .../pylibraft/neighbors/ivf_pq/cpp/c_ivf_pq.pxd | 3 +-- .../pylibraft/neighbors/ivf_pq/ivf_pq.pyx | 15 +-------------- 5 files changed, 8 insertions(+), 35 deletions(-) diff --git a/cpp/bench/ann/src/raft/raft_ann_bench_param_parser.h b/cpp/bench/ann/src/raft/raft_ann_bench_param_parser.h index 1b876ea147..2a021a8a12 100644 --- a/cpp/bench/ann/src/raft/raft_ann_bench_param_parser.h +++ b/cpp/bench/ann/src/raft/raft_ann_bench_param_parser.h @@ -87,9 +87,6 @@ void parse_build_param(const nlohmann::json& conf, "', should be either 'cluster' or 'subspace'"); } } - if (conf.contains("pq_codebook_ratio")) { - param.pq_codebook_trainset_fraction = 1.0 / (double)conf.at("pq_codebook_ratio"); - } } template diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh index edaea70bcc..cc94511fe7 100644 --- a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh +++ b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh @@ -353,17 +353,14 @@ void train_per_subset(raft::resources const& handle, const float* trainset, // [n_rows, dim] const uint32_t* labels, // [n_rows] uint32_t kmeans_n_iters, - double pq_codebook_trainset_fraction, rmm::mr::device_memory_resource* managed_memory) { auto stream = resource::get_cuda_stream(handle); auto device_memory = 
resource::get_workspace_resource(handle); rmm::device_uvector pq_centers_tmp(index.pq_centers().size(), stream, device_memory); - // Subsampling the train set for codebook generation based on pq_codebook_trainset_fraction. - auto pq_n_rows = uint32_t(n_rows * pq_codebook_trainset_fraction); - rmm::device_uvector sub_trainset(pq_n_rows * size_t(index.pq_len()), stream, device_memory); - rmm::device_uvector sub_labels(pq_n_rows, stream, device_memory); + rmm::device_uvector sub_trainset(n_rows * size_t(index.pq_len()), stream, device_memory); + rmm::device_uvector sub_labels(n_rows, stream, device_memory); rmm::device_uvector pq_cluster_sizes(index.pq_book_size(), stream, device_memory); @@ -374,7 +371,7 @@ void train_per_subset(raft::resources const& handle, // Get the rotated cluster centers for each training vector. // This will be subtracted from the input vectors afterwards. utils::copy_selected( - pq_n_rows, + n_rows, index.pq_len(), index.centers_rot().data_handle() + index.pq_len() * j, labels, @@ -390,7 +387,7 @@ void train_per_subset(raft::resources const& handle, true, false, index.pq_len(), - pq_n_rows, + n_rows, index.dim(), &alpha, index.rotation_matrix().data_handle() + index.dim() * index.pq_len() * j, @@ -404,12 +401,12 @@ void train_per_subset(raft::resources const& handle, // train PQ codebook for this subspace auto sub_trainset_view = - raft::make_device_matrix_view(sub_trainset.data(), pq_n_rows, index.pq_len()); + raft::make_device_matrix_view(sub_trainset.data(), n_rows, index.pq_len()); auto centers_tmp_view = raft::make_device_matrix_view( pq_centers_tmp.data() + index.pq_book_size() * index.pq_len() * j, index.pq_book_size(), index.pq_len()); - auto sub_labels_view = raft::make_device_vector_view(sub_labels.data(), pq_n_rows); + auto sub_labels_view = raft::make_device_vector_view(sub_labels.data(), n_rows); auto cluster_sizes_view = raft::make_device_vector_view(pq_cluster_sizes.data(), index.pq_book_size()); 
raft::cluster::kmeans_balanced_params kmeans_params; @@ -1791,7 +1788,6 @@ auto build(raft::resources const& handle, trainset.data_handle(), labels.data(), params.kmeans_n_iters, - params.pq_codebook_trainset_fraction, &managed_mr); break; case codebook_gen::PER_CLUSTER: diff --git a/cpp/include/raft/neighbors/ivf_pq_types.hpp b/cpp/include/raft/neighbors/ivf_pq_types.hpp index 8340bfde67..45ab18c84f 100644 --- a/cpp/include/raft/neighbors/ivf_pq_types.hpp +++ b/cpp/include/raft/neighbors/ivf_pq_types.hpp @@ -105,12 +105,6 @@ struct index_params : ann::index_params { * flag to `true` if you prefer to use as little GPU memory for the database as possible. */ bool conservative_memory_allocation = false; - /** - * The fraction of data to use during PQ codebook generation on top of the subsampled data - * controlled by kmeans_trainset_fraction. The parameter is only used when PQ codebook generation - * kind is PER_SUBSPACE and ignored when PQ codebook generation kind is PER_CLUSTER. - */ - double pq_codebook_trainset_fraction = 1; }; struct search_params : ann::search_params { diff --git a/python/pylibraft/pylibraft/neighbors/ivf_pq/cpp/c_ivf_pq.pxd b/python/pylibraft/pylibraft/neighbors/ivf_pq/cpp/c_ivf_pq.pxd index 81f96df9cd..531c2428e9 100644 --- a/python/pylibraft/pylibraft/neighbors/ivf_pq/cpp/c_ivf_pq.pxd +++ b/python/pylibraft/pylibraft/neighbors/ivf_pq/cpp/c_ivf_pq.pxd @@ -78,8 +78,7 @@ cdef extern from "raft/neighbors/ivf_pq_types.hpp" \ codebook_gen codebook_kind bool force_random_rotation bool conservative_memory_allocation - double pq_codebook_trainset_fraction - + cdef cppclass index[IdxT](ann_index): index(const device_resources& handle, DistanceType metric, diff --git a/python/pylibraft/pylibraft/neighbors/ivf_pq/ivf_pq.pyx b/python/pylibraft/pylibraft/neighbors/ivf_pq/ivf_pq.pyx index 5b0675d069..0c1bbf6b9c 100644 --- a/python/pylibraft/pylibraft/neighbors/ivf_pq/ivf_pq.pyx +++ b/python/pylibraft/pylibraft/neighbors/ivf_pq/ivf_pq.pyx @@ -156,13 +156,6 
@@ cdef class IndexParams: repeated calls to `extend` (extending the database). To disable this behavior and use as little GPU memory for the database as possible, set this flat to `True`. - pq_codebook_trainset_fraction : int, default = 0.5 - If pq_codebook_trainset_fraction is less than 1, then the dataset is - subsampled for PQ codebook generation, and only n_samples * - pq_codebook_trainset_fraction rows are used for PQ codebook generation. - This subsampling is applied after kmeans_trainset subsampling, - controlled by kmeans_trainset_fraction and only used when codebook_kind - is PER_SUBSPACE. """ def __init__(self, *, n_lists=1024, @@ -174,8 +167,7 @@ cdef class IndexParams: codebook_kind="subspace", force_random_rotation=False, add_data_on_build=True, - conservative_memory_allocation=False, - pq_codebook_trainset_fraction=0.5): + conservative_memory_allocation=False): self.params.n_lists = n_lists self.params.metric = _get_metric(metric) self.params.metric_arg = 0 @@ -193,8 +185,6 @@ cdef class IndexParams: self.params.add_data_on_build = add_data_on_build self.params.conservative_memory_allocation = \ conservative_memory_allocation - self.params.pq_codebook_trainset_fraction = \ - pq_codebook_trainset_fraction @property def n_lists(self): @@ -236,9 +226,6 @@ cdef class IndexParams: def conservative_memory_allocation(self): return self.params.conservative_memory_allocation - @property - def pq_codebook_trainset_fraction(self): - return self.params.pq_codebook_trainset_fraction cdef class Index: # We store a pointer to the index because it dose not have a trivial From 49112ddd1969aa1bd36caf4edadd4350f016b3b8 Mon Sep 17 00:00:00 2001 From: Rui Lan Date: Tue, 23 Jan 2024 15:04:49 -0800 Subject: [PATCH 08/14] Revert "Merge pull request #2116 from rapidsai/branch-24.02" This reverts commit c90cdfa3b45744e0e6d90666755355fb7e90da43, reversing changes made to 0586fc324598cc25e4d8c9f4f6dc3db2eb3ab521. 
--- .../cuda11.8-conda/devcontainer.json | 4 +-- .devcontainer/cuda11.8-pip/devcontainer.json | 6 ++--- .../cuda12.0-conda/devcontainer.json | 4 +-- .devcontainer/cuda12.0-pip/devcontainer.json | 6 ++--- .github/workflows/build.yaml | 16 ++++++------ .github/workflows/pr.yaml | 26 +++++++++---------- .github/workflows/test.yaml | 10 +++---- README.md | 2 +- VERSION | 2 +- ci/build_docs.sh | 2 +- .../all_cuda-118_arch-aarch64.yaml | 8 +++--- .../all_cuda-118_arch-x86_64.yaml | 8 +++--- .../all_cuda-120_arch-aarch64.yaml | 8 +++--- .../all_cuda-120_arch-x86_64.yaml | 8 +++--- .../bench_ann_cuda-118_arch-aarch64.yaml | 2 +- .../bench_ann_cuda-118_arch-x86_64.yaml | 2 +- .../bench_ann_cuda-120_arch-aarch64.yaml | 2 +- .../bench_ann_cuda-120_arch-x86_64.yaml | 2 +- .../recipes/raft-dask/conda_build_config.yaml | 2 +- cpp/CMakeLists.txt | 4 +-- cpp/doxygen/Doxyfile | 2 +- .../cmake/thirdparty/fetch_rapids.cmake | 2 +- dependencies.yaml | 24 ++++++++--------- docs/source/build.md | 2 +- docs/source/conf.py | 4 +-- docs/source/developer_guide.md | 8 +++--- docs/source/raft_ann_benchmarks.md | 12 ++++----- fetch_rapids.cmake | 2 +- python/pylibraft/CMakeLists.txt | 2 +- python/pylibraft/pyproject.toml | 4 +-- python/raft-ann-bench/pyproject.toml | 2 +- python/raft-dask/CMakeLists.txt | 2 +- python/raft-dask/pyproject.toml | 8 +++--- 33 files changed, 99 insertions(+), 99 deletions(-) diff --git a/.devcontainer/cuda11.8-conda/devcontainer.json b/.devcontainer/cuda11.8-conda/devcontainer.json index 2682510ed1..4c2161cab7 100644 --- a/.devcontainer/cuda11.8-conda/devcontainer.json +++ b/.devcontainer/cuda11.8-conda/devcontainer.json @@ -5,12 +5,12 @@ "args": { "CUDA": "11.8", "PYTHON_PACKAGE_MANAGER": "conda", - "BASE": "rapidsai/devcontainers:24.04-cpp-llvm16-cuda11.8-mambaforge-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.02-cpp-llvm16-cuda11.8-mambaforge-ubuntu22.04" } }, "hostRequirements": {"gpu": "optional"}, "features": { - 
"ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.4": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.2": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.devcontainer/cuda11.8-pip/devcontainer.json b/.devcontainer/cuda11.8-pip/devcontainer.json index f96fef205f..8bbc016620 100644 --- a/.devcontainer/cuda11.8-pip/devcontainer.json +++ b/.devcontainer/cuda11.8-pip/devcontainer.json @@ -5,13 +5,13 @@ "args": { "CUDA": "11.8", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:24.04-cpp-llvm16-cuda11.8-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.02-cpp-llvm16-cuda11.8-ubuntu22.04" } }, "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/ucx:24.4": {"version": "1.14.1"}, - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.4": {} + "ghcr.io/rapidsai/devcontainers/features/ucx:24.2": {"version": "1.14.1"}, + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.2": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/ucx", diff --git a/.devcontainer/cuda12.0-conda/devcontainer.json b/.devcontainer/cuda12.0-conda/devcontainer.json index 7ce4ea20d3..ebb101d9d5 100644 --- a/.devcontainer/cuda12.0-conda/devcontainer.json +++ b/.devcontainer/cuda12.0-conda/devcontainer.json @@ -5,12 +5,12 @@ "args": { "CUDA": "12.0", "PYTHON_PACKAGE_MANAGER": "conda", - "BASE": "rapidsai/devcontainers:24.04-cpp-mambaforge-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.02-cpp-mambaforge-ubuntu22.04" } }, "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.4": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.2": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.devcontainer/cuda12.0-pip/devcontainer.json 
b/.devcontainer/cuda12.0-pip/devcontainer.json index 4f97af830a..3e2efb2f37 100644 --- a/.devcontainer/cuda12.0-pip/devcontainer.json +++ b/.devcontainer/cuda12.0-pip/devcontainer.json @@ -5,13 +5,13 @@ "args": { "CUDA": "12.0", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:24.04-cpp-llvm16-cuda12.0-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.02-cpp-llvm16-cuda12.0-ubuntu22.04" } }, "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/ucx:24.4": {"version": "1.14.1"}, - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.4": {} + "ghcr.io/rapidsai/devcontainers/features/ucx:24.2": {"version": "1.14.1"}, + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.2": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/ucx", diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index bd8b13d21e..4c6e3d0ed4 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: cpp-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.02 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -37,7 +37,7 @@ jobs: python-build: needs: [cpp-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.02 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -46,7 +46,7 @@ jobs: upload-conda: needs: [cpp-build, python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.02 with: build_type: ${{ inputs.build_type || 
'branch' }} branch: ${{ inputs.branch }} @@ -57,7 +57,7 @@ jobs: if: github.ref_type == 'branch' needs: python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.02 with: arch: "amd64" branch: ${{ inputs.branch }} @@ -69,7 +69,7 @@ jobs: sha: ${{ inputs.sha }} wheel-build-pylibraft: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.02 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -79,7 +79,7 @@ jobs: wheel-publish-pylibraft: needs: wheel-build-pylibraft secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.02 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -89,7 +89,7 @@ jobs: wheel-build-raft-dask: needs: wheel-publish-pylibraft secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.02 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -99,7 +99,7 @@ jobs: wheel-publish-raft-dask: needs: wheel-build-raft-dask secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.02 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 82594909a9..fb12d7efae 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -25,29 +25,29 @@ jobs: - wheel-tests-raft-dask - devcontainer secrets: inherit - uses: 
rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.02 checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.02 with: enable_check_generated_files: false conda-cpp-build: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.02 with: build_type: pull-request node_type: cpu16 conda-cpp-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.02 with: build_type: pull-request conda-cpp-checks: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.02 with: build_type: pull-request enable_check_symbols: true @@ -55,19 +55,19 @@ jobs: conda-python-build: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.02 with: build_type: pull-request conda-python-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.02 with: build_type: pull-request docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.02 
with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -77,34 +77,34 @@ jobs: wheel-build-pylibraft: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.02 with: build_type: pull-request script: ci/build_wheel_pylibraft.sh wheel-tests-pylibraft: needs: wheel-build-pylibraft secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.02 with: build_type: pull-request script: ci/test_wheel_pylibraft.sh wheel-build-raft-dask: needs: wheel-tests-pylibraft secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.02 with: build_type: pull-request script: "ci/build_wheel_raft_dask.sh" wheel-tests-raft-dask: needs: wheel-build-raft-dask secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.02 with: build_type: pull-request script: ci/test_wheel_raft_dask.sh devcontainer: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.02 with: build_command: | sccache -z; diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 2a557a8b84..2472021375 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: conda-cpp-checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.02 with: build_type: nightly branch: ${{ 
inputs.branch }} @@ -26,7 +26,7 @@ jobs: symbol_exclusions: _ZN\d+raft_cutlass conda-cpp-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.02 with: build_type: nightly branch: ${{ inputs.branch }} @@ -34,7 +34,7 @@ jobs: sha: ${{ inputs.sha }} conda-python-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.02 with: build_type: nightly branch: ${{ inputs.branch }} @@ -42,7 +42,7 @@ jobs: sha: ${{ inputs.sha }} wheel-tests-pylibraft: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.02 with: build_type: nightly branch: ${{ inputs.branch }} @@ -51,7 +51,7 @@ jobs: script: ci/test_wheel_pylibraft.sh wheel-tests-raft-dask: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.04 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.02 with: build_type: nightly branch: ${{ inputs.branch }} diff --git a/README.md b/README.md index 55b8fca8f6..26ddc30ed4 100755 --- a/README.md +++ b/README.md @@ -287,7 +287,7 @@ You can also install the conda packages individually using the `mamba` command a mamba install -c rapidsai -c conda-forge -c nvidia libraft libraft-headers cuda-version=12.0 ``` -If installing the C++ APIs please see [using libraft](https://docs.rapids.ai/api/raft/nightly/using_libraft/) for more information on using the pre-compiled shared library. 
You can also refer to the [example C++ template project](https://github.com/rapidsai/raft/tree/branch-24.04/cpp/template) for a ready-to-go CMake configuration that you can drop into your project and build against installed RAFT development artifacts above. +If installing the C++ APIs please see [using libraft](https://docs.rapids.ai/api/raft/nightly/using_libraft/) for more information on using the pre-compiled shared library. You can also refer to the [example C++ template project](https://github.com/rapidsai/raft/tree/branch-24.02/cpp/template) for a ready-to-go CMake configuration that you can drop into your project and build against installed RAFT development artifacts above. ### Installing Python through Pip diff --git a/VERSION b/VERSION index 4a2fe8aa57..3c6c5e2b70 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -24.04.00 +24.02.00 diff --git a/ci/build_docs.sh b/ci/build_docs.sh index 4c07683642..b7c8c7a3e0 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -28,7 +28,7 @@ rapids-mamba-retry install \ pylibraft \ raft-dask -export RAPIDS_VERSION_NUMBER="24.04" +export RAPIDS_VERSION_NUMBER="24.02" export RAPIDS_DOCS_DIR="$(mktemp -d)" rapids-logger "Build CPP docs" diff --git a/conda/environments/all_cuda-118_arch-aarch64.yaml b/conda/environments/all_cuda-118_arch-aarch64.yaml index 96c10a763a..ac076f5505 100644 --- a/conda/environments/all_cuda-118_arch-aarch64.yaml +++ b/conda/environments/all_cuda-118_arch-aarch64.yaml @@ -20,7 +20,7 @@ dependencies: - cupy>=12.0.0 - cxx-compiler - cython>=3.0.0 -- dask-cuda==24.4.* +- dask-cuda==24.2.* - doxygen>=1.8.20 - gcc_linux-aarch64=11.* - gmock>=1.13.0 @@ -46,9 +46,9 @@ dependencies: - pydata-sphinx-theme - pytest - pytest-cov -- rapids-dask-dependency==24.4.* +- rapids-dask-dependency==24.2.* - recommonmark -- rmm==24.4.* +- rmm==24.2.* - scikit-build-core>=0.7.0 - scikit-learn - scipy @@ -56,6 +56,6 @@ dependencies: - sphinx-markdown-tables - sysroot_linux-aarch64==2.17 - ucx-proc=*=gpu -- 
ucx-py==0.37.* +- ucx-py==0.36.* - ucx>=1.13.0 name: all_cuda-118_arch-aarch64 diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index aea5bd1961..b3ded51bb5 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -20,7 +20,7 @@ dependencies: - cupy>=12.0.0 - cxx-compiler - cython>=3.0.0 -- dask-cuda==24.4.* +- dask-cuda==24.2.* - doxygen>=1.8.20 - gcc_linux-64=11.* - gmock>=1.13.0 @@ -46,9 +46,9 @@ dependencies: - pydata-sphinx-theme - pytest - pytest-cov -- rapids-dask-dependency==24.4.* +- rapids-dask-dependency==24.2.* - recommonmark -- rmm==24.4.* +- rmm==24.2.* - scikit-build-core>=0.7.0 - scikit-learn - scipy @@ -56,6 +56,6 @@ dependencies: - sphinx-markdown-tables - sysroot_linux-64==2.17 - ucx-proc=*=gpu -- ucx-py==0.37.* +- ucx-py==0.36.* - ucx>=1.13.0 name: all_cuda-118_arch-x86_64 diff --git a/conda/environments/all_cuda-120_arch-aarch64.yaml b/conda/environments/all_cuda-120_arch-aarch64.yaml index 0ea4a979d6..c0eede1389 100644 --- a/conda/environments/all_cuda-120_arch-aarch64.yaml +++ b/conda/environments/all_cuda-120_arch-aarch64.yaml @@ -21,7 +21,7 @@ dependencies: - cupy>=12.0.0 - cxx-compiler - cython>=3.0.0 -- dask-cuda==24.4.* +- dask-cuda==24.2.* - doxygen>=1.8.20 - gcc_linux-aarch64=11.* - gmock>=1.13.0 @@ -42,9 +42,9 @@ dependencies: - pydata-sphinx-theme - pytest - pytest-cov -- rapids-dask-dependency==24.4.* +- rapids-dask-dependency==24.2.* - recommonmark -- rmm==24.4.* +- rmm==24.2.* - scikit-build-core>=0.7.0 - scikit-learn - scipy @@ -52,6 +52,6 @@ dependencies: - sphinx-markdown-tables - sysroot_linux-aarch64==2.17 - ucx-proc=*=gpu -- ucx-py==0.37.* +- ucx-py==0.36.* - ucx>=1.13.0 name: all_cuda-120_arch-aarch64 diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml index 48a463bc6f..cebaf96493 100644 --- 
a/conda/environments/all_cuda-120_arch-x86_64.yaml +++ b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -21,7 +21,7 @@ dependencies: - cupy>=12.0.0 - cxx-compiler - cython>=3.0.0 -- dask-cuda==24.4.* +- dask-cuda==24.2.* - doxygen>=1.8.20 - gcc_linux-64=11.* - gmock>=1.13.0 @@ -42,9 +42,9 @@ dependencies: - pydata-sphinx-theme - pytest - pytest-cov -- rapids-dask-dependency==24.4.* +- rapids-dask-dependency==24.2.* - recommonmark -- rmm==24.4.* +- rmm==24.2.* - scikit-build-core>=0.7.0 - scikit-learn - scipy @@ -52,6 +52,6 @@ dependencies: - sphinx-markdown-tables - sysroot_linux-64==2.17 - ucx-proc=*=gpu -- ucx-py==0.37.* +- ucx-py==0.36.* - ucx>=1.13.0 name: all_cuda-120_arch-x86_64 diff --git a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml index 0e0385ceeb..87e3942e6a 100644 --- a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml +++ b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml @@ -38,7 +38,7 @@ dependencies: - openblas - pandas - pyyaml -- rmm==24.4.* +- rmm==24.2.* - scikit-build-core>=0.7.0 - sysroot_linux-aarch64==2.17 name: bench_ann_cuda-118_arch-aarch64 diff --git a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml index dfe76a2948..a4ac253a85 100644 --- a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml +++ b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml @@ -38,7 +38,7 @@ dependencies: - openblas - pandas - pyyaml -- rmm==24.4.* +- rmm==24.2.* - scikit-build-core>=0.7.0 - sysroot_linux-64==2.17 name: bench_ann_cuda-118_arch-x86_64 diff --git a/conda/environments/bench_ann_cuda-120_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-120_arch-aarch64.yaml index 0a6567c646..9ef9799363 100644 --- a/conda/environments/bench_ann_cuda-120_arch-aarch64.yaml +++ b/conda/environments/bench_ann_cuda-120_arch-aarch64.yaml @@ -34,7 +34,7 @@ dependencies: - openblas - pandas - pyyaml -- rmm==24.4.* 
+- rmm==24.2.* - scikit-build-core>=0.7.0 - sysroot_linux-aarch64==2.17 name: bench_ann_cuda-120_arch-aarch64 diff --git a/conda/environments/bench_ann_cuda-120_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-120_arch-x86_64.yaml index a89d5317b6..5fa09096ba 100644 --- a/conda/environments/bench_ann_cuda-120_arch-x86_64.yaml +++ b/conda/environments/bench_ann_cuda-120_arch-x86_64.yaml @@ -34,7 +34,7 @@ dependencies: - openblas - pandas - pyyaml -- rmm==24.4.* +- rmm==24.2.* - scikit-build-core>=0.7.0 - sysroot_linux-64==2.17 name: bench_ann_cuda-120_arch-x86_64 diff --git a/conda/recipes/raft-dask/conda_build_config.yaml b/conda/recipes/raft-dask/conda_build_config.yaml index 483e53026a..c12c35af3b 100644 --- a/conda/recipes/raft-dask/conda_build_config.yaml +++ b/conda/recipes/raft-dask/conda_build_config.yaml @@ -17,7 +17,7 @@ ucx_version: - ">=1.14.1,<1.16.0" ucx_py_version: - - "0.37.*" + - "0.36.*" cmake_version: - ">=3.26.4" diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index ac75d7d83b..0d71026e17 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -10,8 +10,8 @@ # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. See the License for the specific language governing permissions and limitations under # the License. -set(RAPIDS_VERSION "24.04") -set(RAFT_VERSION "24.04.00") +set(RAPIDS_VERSION "24.02") +set(RAFT_VERSION "24.02.00") cmake_minimum_required(VERSION 3.26.4 FATAL_ERROR) include(../fetch_rapids.cmake) diff --git a/cpp/doxygen/Doxyfile b/cpp/doxygen/Doxyfile index 779472d880..3eb0763eaf 100644 --- a/cpp/doxygen/Doxyfile +++ b/cpp/doxygen/Doxyfile @@ -38,7 +38,7 @@ PROJECT_NAME = "RAFT C++ API" # could be handy for archiving the generated documentation or if some version # control system is used. 
-PROJECT_NUMBER = "24.04" +PROJECT_NUMBER = "24.02" # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a diff --git a/cpp/template/cmake/thirdparty/fetch_rapids.cmake b/cpp/template/cmake/thirdparty/fetch_rapids.cmake index aadfdb0028..15b6c43a6f 100644 --- a/cpp/template/cmake/thirdparty/fetch_rapids.cmake +++ b/cpp/template/cmake/thirdparty/fetch_rapids.cmake @@ -12,7 +12,7 @@ # the License. # Use this variable to update RAPIDS and RAFT versions -set(RAPIDS_VERSION "24.04") +set(RAPIDS_VERSION "24.02") if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/RAFT_RAPIDS.cmake) file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-${RAPIDS_VERSION}/RAPIDS.cmake diff --git a/dependencies.yaml b/dependencies.yaml index 37404a1e37..0e4d6d4693 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -180,7 +180,7 @@ dependencies: common: - output_types: [conda] packages: - - &rmm_conda rmm==24.4.* + - &rmm_conda rmm==24.2.* - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -201,12 +201,12 @@ dependencies: matrices: - matrix: {cuda: "12.2"} packages: &build_pylibraft_packages_cu12 - - &rmm_cu12 rmm-cu12==24.4.* + - &rmm_cu12 rmm-cu12==24.2.* - {matrix: {cuda: "12.1"}, packages: *build_pylibraft_packages_cu12} - {matrix: {cuda: "12.0"}, packages: *build_pylibraft_packages_cu12} - matrix: {cuda: "11.8"} packages: &build_pylibraft_packages_cu11 - - &rmm_cu11 rmm-cu11==24.4.* + - &rmm_cu11 rmm-cu11==24.2.* - {matrix: {cuda: "11.5"}, packages: *build_pylibraft_packages_cu11} - {matrix: {cuda: "11.4"}, packages: *build_pylibraft_packages_cu11} - {matrix: {cuda: "11.2"}, packages: *build_pylibraft_packages_cu11} @@ -459,20 +459,20 @@ dependencies: common: - output_types: [conda, pyproject] packages: - - dask-cuda==24.4.* + - dask-cuda==24.2.* - joblib>=0.11 - numba>=0.57 - *numpy - - 
rapids-dask-dependency==24.4.* - - ucx-py==0.37.* + - rapids-dask-dependency==24.2.* + - ucx-py==0.36.* - output_types: conda packages: - ucx>=1.13.0 - ucx-proc=*=gpu - - &ucx_py_conda ucx-py==0.37.* + - &ucx_py_conda ucx-py==0.36.* - output_types: pyproject packages: - - &pylibraft_conda pylibraft==24.4.* + - &pylibraft_conda pylibraft==24.2.* - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file @@ -484,14 +484,14 @@ dependencies: matrices: - matrix: {cuda: "12.2"} packages: &run_raft_dask_packages_cu12 - - &pylibraft_cu12 pylibraft-cu12==24.4.* - - &ucx_py_cu12 ucx-py-cu12==0.37.* + - &pylibraft_cu12 pylibraft-cu12==24.2.* + - &ucx_py_cu12 ucx-py-cu12==0.36.* - {matrix: {cuda: "12.1"}, packages: *run_raft_dask_packages_cu12} - {matrix: {cuda: "12.0"}, packages: *run_raft_dask_packages_cu12} - matrix: {cuda: "11.8"} packages: &run_raft_dask_packages_cu11 - - &pylibraft_cu11 pylibraft-cu11==24.4.* - - &ucx_py_cu11 ucx-py-cu11==0.37.* + - &pylibraft_cu11 pylibraft-cu11==24.2.* + - &ucx_py_cu11 ucx-py-cu11==0.36.* - {matrix: {cuda: "11.5"}, packages: *run_raft_dask_packages_cu11} - {matrix: {cuda: "11.4"}, packages: *run_raft_dask_packages_cu11} - {matrix: {cuda: "11.2"}, packages: *run_raft_dask_packages_cu11} diff --git a/docs/source/build.md b/docs/source/build.md index e76512d16f..ae7734d0ed 100644 --- a/docs/source/build.md +++ b/docs/source/build.md @@ -56,7 +56,7 @@ You can also install the conda packages individually using the `mamba` command a mamba install -c rapidsai -c conda-forge -c nvidia libraft libraft-headers cuda-version=12.0 ``` -If installing the C++ APIs Please see [using libraft](https://docs.rapids.ai/api/raft/nightly/using_libraft/) for more information on using the pre-compiled shared library. 
You can also refer to the [example C++ template project](https://github.com/rapidsai/raft/tree/branch-24.04/cpp/template) for a ready-to-go CMake configuration that you can drop into your project and build against installed RAFT development artifacts above. +If installing the C++ APIs Please see [using libraft](https://docs.rapids.ai/api/raft/nightly/using_libraft/) for more information on using the pre-compiled shared library. You can also refer to the [example C++ template project](https://github.com/rapidsai/raft/tree/branch-24.02/cpp/template) for a ready-to-go CMake configuration that you can drop into your project and build against installed RAFT development artifacts above. ## Installing Python through Pip diff --git a/docs/source/conf.py b/docs/source/conf.py index 07dd4825fa..2a2c700926 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -67,9 +67,9 @@ # built documents. # # The short X.Y version. -version = '24.04' +version = '24.02' # The full version, including alpha/beta/rc tags. -release = '24.04.00' +release = '24.02.00' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/docs/source/developer_guide.md b/docs/source/developer_guide.md index d29130add0..c5bcd03f69 100644 --- a/docs/source/developer_guide.md +++ b/docs/source/developer_guide.md @@ -187,7 +187,7 @@ RAFT relies on `clang-format` to enforce code style across all C++ and CUDA sour 1. Do not split empty functions/records/namespaces. 2. Two-space indentation everywhere, including the line continuations. 3. Disable reflowing of comments. - The reasons behind these deviations from the Google style guide are given in comments [here](https://github.com/rapidsai/raft/blob/branch-24.04/cpp/.clang-format). + The reasons behind these deviations from the Google style guide are given in comments [here](https://github.com/rapidsai/raft/blob/branch-24.02/cpp/.clang-format). 
[`doxygen`](https://doxygen.nl/) is used as documentation generator and also as a documentation linter. In order to run doxygen as a linter on C++/CUDA code, run @@ -205,7 +205,7 @@ you can run `codespell -i 3 -w .` from the repository root directory. This will bring up an interactive prompt to select which spelling fixes to apply. ### #include style -[include_checker.py](https://github.com/rapidsai/raft/blob/branch-24.04/cpp/scripts/include_checker.py) is used to enforce the include style as follows: +[include_checker.py](https://github.com/rapidsai/raft/blob/branch-24.02/cpp/scripts/include_checker.py) is used to enforce the include style as follows: 1. `#include "..."` should be used for referencing local files only. It is acceptable to be used for referencing files in a sub-folder/parent-folder of the same algorithm, but should never be used to include files in other algorithms or between algorithms and the primitives or other dependencies. 2. `#include <...>` should be used for referencing everything else @@ -215,7 +215,7 @@ python ./cpp/scripts/include_checker.py --inplace [cpp/include cpp/test ... list ``` ### Copyright header -[copyright.py](https://github.com/rapidsai/raft/blob/branch-24.04/ci/checks/copyright.py) checks the Copyright header for all git-modified files +[copyright.py](https://github.com/rapidsai/raft/blob/branch-24.02/ci/checks/copyright.py) checks the Copyright header for all git-modified files Manually, you can run the following to bulk-fix the header if only the years need to be updated: ```bash @@ -229,7 +229,7 @@ Call CUDA APIs via the provided helper macros `RAFT_CUDA_TRY`, `RAFT_CUBLAS_TRY` ## Logging ### Introduction -Anything and everything about logging is defined inside [logger.hpp](https://github.com/rapidsai/raft/blob/branch-24.04/cpp/include/raft/core/logger.hpp). It uses [spdlog](https://github.com/gabime/spdlog) underneath, but this information is transparent to all. 
+Anything and everything about logging is defined inside [logger.hpp](https://github.com/rapidsai/raft/blob/branch-24.02/cpp/include/raft/core/logger.hpp). It uses [spdlog](https://github.com/gabime/spdlog) underneath, but this information is transparent to all. ### Usage ```cpp diff --git a/docs/source/raft_ann_benchmarks.md b/docs/source/raft_ann_benchmarks.md index 68fe80f9ce..3789c3f01f 100644 --- a/docs/source/raft_ann_benchmarks.md +++ b/docs/source/raft_ann_benchmarks.md @@ -62,7 +62,7 @@ Nightly images are located in [dockerhub](https://hub.docker.com/r/rapidsai/raft - The following command pulls the nightly container for python version 10, cuda version 12, and RAFT version 23.10: ```bash -docker pull rapidsai/raft-ann-bench:24.04a-cuda12.0-py3.10 #substitute raft-ann-bench for the exact desired container. +docker pull rapidsai/raft-ann-bench:24.02a-cuda12.0-py3.10 #substitute raft-ann-bench for the exact desired container. ``` The CUDA and python versions can be changed for the supported values: @@ -83,7 +83,7 @@ You can see the exact versions as well in the dockerhub site: [//]: # () [//]: # (```bash) -[//]: # (docker pull nvcr.io/nvidia/rapidsai/raft-ann-bench:24.04-cuda11.8-py3.10 #substitute raft-ann-bench for the exact desired container.) +[//]: # (docker pull nvcr.io/nvidia/rapidsai/raft-ann-bench:24.02-cuda11.8-py3.10 #substitute raft-ann-bench for the exact desired container.) 
[//]: # (```) @@ -344,7 +344,7 @@ For GPU-enabled systems, the `DATA_FOLDER` variable should be a local folder whe export DATA_FOLDER=path/to/store/datasets/and/results docker run --gpus all --rm -it -u $(id -u) \ -v $DATA_FOLDER:/data/benchmarks \ - rapidsai/raft-ann-bench:24.04a-cuda11.8-py3.10 \ + rapidsai/raft-ann-bench:24.02a-cuda11.8-py3.10 \ "--dataset deep-image-96-angular" \ "--normalize" \ "--algorithms raft_cagra,raft_ivf_pq --batch-size 10 -k 10" \ @@ -355,7 +355,7 @@ Usage of the above command is as follows: | Argument | Description | |-----------------------------------------------------------|----------------------------------------------------------------------------------------------------| -| `rapidsai/raft-ann-bench:24.04a-cuda11.8-py3.10` | Image to use. Can be either `raft-ann-bench` or `raft-ann-bench-datasets` | +| `rapidsai/raft-ann-bench:24.02a-cuda11.8-py3.10` | Image to use. Can be either `raft-ann-bench` or `raft-ann-bench-datasets` | | `"--dataset deep-image-96-angular"` | Dataset name | | `"--normalize"` | Whether to normalize the dataset | | `"--algorithms raft_cagra,hnswlib --batch-size 10 -k 10"` | Arguments passed to the `run` script, such as the algorithms to benchmark, the batch size, and `k` | @@ -372,7 +372,7 @@ The container arguments in the above section also be used for the CPU-only conta export DATA_FOLDER=path/to/store/datasets/and/results docker run --rm -it -u $(id -u) \ -v $DATA_FOLDER:/data/benchmarks \ - rapidsai/raft-ann-bench-cpu:24.04a-py3.10 \ + rapidsai/raft-ann-bench-cpu:24.02a-py3.10 \ "--dataset deep-image-96-angular" \ "--normalize" \ "--algorithms hnswlib --batch-size 10 -k 10" \ @@ -389,7 +389,7 @@ docker run --gpus all --rm -it -u $(id -u) \ --entrypoint /bin/bash \ --workdir /data/benchmarks \ -v $DATA_FOLDER:/data/benchmarks \ - rapidsai/raft-ann-bench:24.04a-cuda11.8-py3.10 + rapidsai/raft-ann-bench:24.02a-cuda11.8-py3.10 ``` This will drop you into a command line in the container, with the 
`raft-ann-bench` python package ready to use, as described in the [Running the benchmarks](#running-the-benchmarks) section above: diff --git a/fetch_rapids.cmake b/fetch_rapids.cmake index 1dca136c97..ca871c5759 100644 --- a/fetch_rapids.cmake +++ b/fetch_rapids.cmake @@ -12,7 +12,7 @@ # the License. # ============================================================================= if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/RAFT_RAPIDS.cmake) - file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-24.04/RAPIDS.cmake + file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-24.02/RAPIDS.cmake ${CMAKE_CURRENT_BINARY_DIR}/RAFT_RAPIDS.cmake ) endif() diff --git a/python/pylibraft/CMakeLists.txt b/python/pylibraft/CMakeLists.txt index c17243728e..f7e114ae66 100644 --- a/python/pylibraft/CMakeLists.txt +++ b/python/pylibraft/CMakeLists.txt @@ -16,7 +16,7 @@ cmake_minimum_required(VERSION 3.26.4 FATAL_ERROR) include(../../fetch_rapids.cmake) -set(pylibraft_version 24.04.00) +set(pylibraft_version 24.02.00) # We always need CUDA for pylibraft because the raft dependency brings in a header-only cuco # dependency that enables CUDA unconditionally. diff --git a/python/pylibraft/pyproject.toml b/python/pylibraft/pyproject.toml index ba45a6d33d..5070d6cf6f 100644 --- a/python/pylibraft/pyproject.toml +++ b/python/pylibraft/pyproject.toml @@ -19,7 +19,7 @@ requires = [ "cuda-python>=11.7.1,<12.0a0", "cython>=3.0.0", "ninja", - "rmm==24.4.*", + "rmm==24.2.*", "scikit-build-core[pyproject]>=0.7.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. build-backend = "scikit_build_core.build" @@ -37,7 +37,7 @@ requires-python = ">=3.9" dependencies = [ "cuda-python>=11.7.1,<12.0a0", "numpy>=1.21", - "rmm==24.4.*", + "rmm==24.2.*", ] # This list was generated by `rapids-dependency-file-generator`. 
To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ "Intended Audience :: Developers", diff --git a/python/raft-ann-bench/pyproject.toml b/python/raft-ann-bench/pyproject.toml index 4a185b22ca..e3ae36ef62 100644 --- a/python/raft-ann-bench/pyproject.toml +++ b/python/raft-ann-bench/pyproject.toml @@ -9,7 +9,7 @@ requires = [ [project] name = "raft-ann-bench" -version = "24.04.00" +version = "24.02.00" description = "RAFT ANN benchmarks" authors = [ { name = "NVIDIA Corporation" }, diff --git a/python/raft-dask/CMakeLists.txt b/python/raft-dask/CMakeLists.txt index ff441e343e..1d27e49583 100644 --- a/python/raft-dask/CMakeLists.txt +++ b/python/raft-dask/CMakeLists.txt @@ -14,7 +14,7 @@ cmake_minimum_required(VERSION 3.26.4 FATAL_ERROR) -set(raft_dask_version 24.04.00) +set(raft_dask_version 24.02.00) include(../../fetch_rapids.cmake) include(rapids-cuda) diff --git a/python/raft-dask/pyproject.toml b/python/raft-dask/pyproject.toml index 33e0c15684..37d94be7f9 100644 --- a/python/raft-dask/pyproject.toml +++ b/python/raft-dask/pyproject.toml @@ -33,13 +33,13 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.9" dependencies = [ - "dask-cuda==24.4.*", + "dask-cuda==24.2.*", "joblib>=0.11", "numba>=0.57", "numpy>=1.21", - "pylibraft==24.4.*", - "rapids-dask-dependency==24.4.*", - "ucx-py==0.37.*", + "pylibraft==24.2.*", + "rapids-dask-dependency==24.2.*", + "ucx-py==0.36.*", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ "Intended Audience :: Developers", From a70a745dbe3430b3d7f63823b65b0157a591eeff Mon Sep 17 00:00:00 2001 From: Rui Lan Date: Tue, 23 Jan 2024 17:34:25 -0800 Subject: [PATCH 09/14] Change to max point based subsampling. 
--- .../src/raft/raft_ann_bench_param_parser.h | 3 +++ .../raft/neighbors/detail/ivf_pq_build.cuh | 24 ++++++++++++------- cpp/include/raft/neighbors/ivf_pq_types.hpp | 8 +++++++ .../neighbors/ivf_pq/cpp/c_ivf_pq.pxd | 1 + .../pylibraft/neighbors/ivf_pq/ivf_pq.pyx | 16 ++++++++++++- 5 files changed, 43 insertions(+), 9 deletions(-) diff --git a/cpp/bench/ann/src/raft/raft_ann_bench_param_parser.h b/cpp/bench/ann/src/raft/raft_ann_bench_param_parser.h index 2a021a8a12..6324e11200 100644 --- a/cpp/bench/ann/src/raft/raft_ann_bench_param_parser.h +++ b/cpp/bench/ann/src/raft/raft_ann_bench_param_parser.h @@ -87,6 +87,9 @@ void parse_build_param(const nlohmann::json& conf, "', should be either 'cluster' or 'subspace'"); } } + if (conf.contains("max_train_points_per_pq_code")) { + param.max_train_points_per_pq_code = 1.0 / (double)conf.at("max_train_points_per_pq_code"); + } } template diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh index cc94511fe7..2bf588df8c 100644 --- a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh +++ b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh @@ -353,14 +353,18 @@ void train_per_subset(raft::resources const& handle, const float* trainset, // [n_rows, dim] const uint32_t* labels, // [n_rows] uint32_t kmeans_n_iters, + uint32_t max_train_points_per_pq_code, rmm::mr::device_memory_resource* managed_memory) { auto stream = resource::get_cuda_stream(handle); auto device_memory = resource::get_workspace_resource(handle); rmm::device_uvector pq_centers_tmp(index.pq_centers().size(), stream, device_memory); - rmm::device_uvector sub_trainset(n_rows * size_t(index.pq_len()), stream, device_memory); - rmm::device_uvector sub_labels(n_rows, stream, device_memory); + // Subsampling the train set for codebook generation based on max_train_points_per_pq_code. 
+ size_t big_enough = max_train_points_per_pq_code * size_t(index.pq_book_size()); + auto pq_n_rows = uint32_t(std::min(big_enough, n_rows)); + rmm::device_uvector sub_trainset(pq_n_rows * size_t(index.pq_len()), stream, device_memory); + rmm::device_uvector sub_labels(pq_n_rows, stream, device_memory); rmm::device_uvector pq_cluster_sizes(index.pq_book_size(), stream, device_memory); @@ -371,7 +375,7 @@ void train_per_subset(raft::resources const& handle, // Get the rotated cluster centers for each training vector. // This will be subtracted from the input vectors afterwards. utils::copy_selected( - n_rows, + pq_n_rows, index.pq_len(), index.centers_rot().data_handle() + index.pq_len() * j, labels, @@ -387,7 +391,7 @@ void train_per_subset(raft::resources const& handle, true, false, index.pq_len(), - n_rows, + pq_n_rows, index.dim(), &alpha, index.rotation_matrix().data_handle() + index.dim() * index.pq_len() * j, @@ -401,12 +405,12 @@ void train_per_subset(raft::resources const& handle, // train PQ codebook for this subspace auto sub_trainset_view = - raft::make_device_matrix_view(sub_trainset.data(), n_rows, index.pq_len()); + raft::make_device_matrix_view(sub_trainset.data(), pq_n_rows, index.pq_len()); auto centers_tmp_view = raft::make_device_matrix_view( pq_centers_tmp.data() + index.pq_book_size() * index.pq_len() * j, index.pq_book_size(), index.pq_len()); - auto sub_labels_view = raft::make_device_vector_view(sub_labels.data(), n_rows); + auto sub_labels_view = raft::make_device_vector_view(sub_labels.data(), pq_n_rows); auto cluster_sizes_view = raft::make_device_vector_view(pq_cluster_sizes.data(), index.pq_book_size()); raft::cluster::kmeans_balanced_params kmeans_params; @@ -430,6 +434,7 @@ void train_per_cluster(raft::resources const& handle, const float* trainset, // [n_rows, dim] const uint32_t* labels, // [n_rows] uint32_t kmeans_n_iters, + uint32_t max_train_points_per_pq_code, rmm::mr::device_memory_resource* managed_memory) { auto stream = 
resource::get_cuda_stream(handle); @@ -477,9 +482,10 @@ void train_per_cluster(raft::resources const& handle, indices + cluster_offsets[l], device_memory); - // limit the cluster size to bound the training time. + // limit the cluster size to bound the training time based on max_train_points_per_pq_code + // If pq_book_size is less than pq_dim, use max_train_points_per_pq_code per pq_dim instead // [sic] we interpret the data as pq_len-dimensional - size_t big_enough = 256ul * std::max(index.pq_book_size(), index.pq_dim()); + size_t big_enough = max_train_points_per_pq_code * std::max(index.pq_book_size(), index.pq_dim()); size_t available_rows = size_t(cluster_size) * size_t(index.pq_dim()); auto pq_n_rows = uint32_t(std::min(big_enough, available_rows)); // train PQ codebook for this cluster @@ -1788,6 +1794,7 @@ auto build(raft::resources const& handle, trainset.data_handle(), labels.data(), params.kmeans_n_iters, + params.max_train_points_per_pq_code, &managed_mr); break; case codebook_gen::PER_CLUSTER: @@ -1797,6 +1804,7 @@ auto build(raft::resources const& handle, trainset.data_handle(), labels.data(), params.kmeans_n_iters, + params.max_train_points_per_pq_code, &managed_mr); break; default: RAFT_FAIL("Unreachable code"); diff --git a/cpp/include/raft/neighbors/ivf_pq_types.hpp b/cpp/include/raft/neighbors/ivf_pq_types.hpp index 45ab18c84f..c3de6020c5 100644 --- a/cpp/include/raft/neighbors/ivf_pq_types.hpp +++ b/cpp/include/raft/neighbors/ivf_pq_types.hpp @@ -105,6 +105,14 @@ struct index_params : ann::index_params { * flag to `true` if you prefer to use as little GPU memory for the database as possible. */ bool conservative_memory_allocation = false; + /** + * The max number of data points to use per PQ code during PQ codebook training. Using more data + * points per PQ code may increase the quality of PQ codebook but may also increase the build time. 
+ * The parameter is applied to both PQ codebook generation methods, i.e., PER_SUBSPACE and + * PER_CLUSTER. For PER_CLUSTER, max_train_points_per_pq_code per pq_dim numbers of points would be + * used if pq_dim is more than pq_book_size. + */ + uint32_t max_train_points_per_pq_code = 256; }; struct search_params : ann::search_params { diff --git a/python/pylibraft/pylibraft/neighbors/ivf_pq/cpp/c_ivf_pq.pxd b/python/pylibraft/pylibraft/neighbors/ivf_pq/cpp/c_ivf_pq.pxd index 531c2428e9..8798b8bf60 100644 --- a/python/pylibraft/pylibraft/neighbors/ivf_pq/cpp/c_ivf_pq.pxd +++ b/python/pylibraft/pylibraft/neighbors/ivf_pq/cpp/c_ivf_pq.pxd @@ -78,6 +78,7 @@ cdef extern from "raft/neighbors/ivf_pq_types.hpp" \ codebook_gen codebook_kind bool force_random_rotation bool conservative_memory_allocation + uint32_t max_train_points_per_pq_code cdef cppclass index[IdxT](ann_index): index(const device_resources& handle, diff --git a/python/pylibraft/pylibraft/neighbors/ivf_pq/ivf_pq.pyx b/python/pylibraft/pylibraft/neighbors/ivf_pq/ivf_pq.pyx index 0c1bbf6b9c..4ee1785ca2 100644 --- a/python/pylibraft/pylibraft/neighbors/ivf_pq/ivf_pq.pyx +++ b/python/pylibraft/pylibraft/neighbors/ivf_pq/ivf_pq.pyx @@ -156,6 +156,14 @@ cdef class IndexParams: repeated calls to `extend` (extending the database). To disable this behavior and use as little GPU memory for the database as possible, set this flat to `True`. + max_train_points_per_pq_code : int, default = 256 + The max number of data points to use per PQ code during PQ codebook + training. Using more data points per PQ code may increase the + quality of PQ codebook but may also increase the build time. The + parameter is applied to both PQ codebook generation methods, i.e., + PER_SUBSPACE and PER_CLUSTER. For PER_CLUSTER, + max_train_points_per_pq_code per pq_dim numbers of points would be + used if pq_dim is more than pq_book_size. 
""" def __init__(self, *, n_lists=1024, @@ -167,7 +175,8 @@ cdef class IndexParams: codebook_kind="subspace", force_random_rotation=False, add_data_on_build=True, - conservative_memory_allocation=False): + conservative_memory_allocation=False, + max_train_points_per_pq_code=256): self.params.n_lists = n_lists self.params.metric = _get_metric(metric) self.params.metric_arg = 0 @@ -185,6 +194,8 @@ cdef class IndexParams: self.params.add_data_on_build = add_data_on_build self.params.conservative_memory_allocation = \ conservative_memory_allocation + self.params.max_train_points_per_pq_code = \ + max_train_points_per_pq_code @property def n_lists(self): @@ -226,6 +237,9 @@ cdef class IndexParams: def conservative_memory_allocation(self): return self.params.conservative_memory_allocation + @property + def max_train_points_per_pq_code(self): + return self.params.max_train_points_per_pq_code cdef class Index: # We store a pointer to the index because it dose not have a trivial From f480c13bf4b23d6ed616791e41409b2d1a97b894 Mon Sep 17 00:00:00 2001 From: abc99lr Date: Tue, 23 Jan 2024 21:58:57 -0800 Subject: [PATCH 10/14] Add max_train_points_per_pq_code row in benchmark tuning guide. --- docs/source/ann_benchmarks_param_tuning.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/ann_benchmarks_param_tuning.md b/docs/source/ann_benchmarks_param_tuning.md index afb4ed18ea..2620e67761 100644 --- a/docs/source/ann_benchmarks_param_tuning.md +++ b/docs/source/ann_benchmarks_param_tuning.md @@ -38,13 +38,13 @@ IVF-pq is an inverted-file index, which partitions the vectors into a series of | `pq_bits` | `build` | N | Positive Integer. [4-8] | 8 | Bit length of the vector element after quantization. | | `codebook_kind` | `build` | N | ["cluster", "subspace"] | "subspace" | Type of codebook. 
See the [API docs](https://docs.rapids.ai/api/raft/nightly/cpp_api/neighbors_ivf_pq/#_CPPv412codebook_gen) for more detail |
| `dataset_memory_type` | `build` | N | ["device", "host", "mmap"] | "host" | What memory type should the dataset reside? |
+| `max_train_points_per_pq_code` | `build` | N | Positive Integer >=1 | 256 | Max number of data points per PQ code used for PQ codebook creation. Depending on the input dataset size, fewer data points than the user specifies may be used. |
| `query_memory_type` | `search` | N | ["device", "host", "mmap"] | "device | What memory type should the queries reside? |
| `nprobe` | `search` | Y | Positive Integer >0 | | The closest number of clusters to search for each query vector. Larger values will improve recall but will search more points in the index. |
| `internalDistanceDtype` | `search` | N | [`float`, `half`] | `half` | The precision to use for the distance computations. Lower precision can increase performance at the cost of accuracy. |
| `smemLutDtype` | `search` | N | [`float`, `half`, `fp8`] | `half` | The precision to use for the lookup table in shared memory. Lower precision can increase performance at the cost of accuracy. |
| `refine_ratio` | `search` | N| Positive Number >=1 | 1 | `refine_ratio * k` nearest neighbors are queried from the index initially and an additional refinement step improves recall by selecting only the best `k` neighbors. |
-

### `raft_cagra`

CAGRA uses a graph-based index, which creates an intermediate, approximate kNN graph using IVF-PQ and then further refining and optimizing to create a final kNN graph. This kNN graph is used by CAGRA as an index for search.

From c2b271599cdd22b4aebe7f29a032f417a4df54c6 Mon Sep 17 00:00:00 2001
From: abc99lr
Date: Wed, 24 Jan 2024 11:18:06 -0800
Subject: [PATCH 11/14] Address comments.
--- cpp/bench/ann/src/raft/raft_ann_bench_param_parser.h | 2 +- cpp/include/raft/neighbors/ivf_pq_types.hpp | 4 ++-- docs/source/ann_benchmarks_param_tuning.md | 1 + python/pylibraft/pylibraft/neighbors/ivf_pq/ivf_pq.pyx | 6 +++--- 4 files changed, 7 insertions(+), 6 deletions(-) diff --git a/cpp/bench/ann/src/raft/raft_ann_bench_param_parser.h b/cpp/bench/ann/src/raft/raft_ann_bench_param_parser.h index 6324e11200..feef5ee995 100644 --- a/cpp/bench/ann/src/raft/raft_ann_bench_param_parser.h +++ b/cpp/bench/ann/src/raft/raft_ann_bench_param_parser.h @@ -88,7 +88,7 @@ void parse_build_param(const nlohmann::json& conf, } } if (conf.contains("max_train_points_per_pq_code")) { - param.max_train_points_per_pq_code = 1.0 / (double)conf.at("max_train_points_per_pq_code"); + param.max_train_points_per_pq_code = conf.at("max_train_points_per_pq_code"); } } diff --git a/cpp/include/raft/neighbors/ivf_pq_types.hpp b/cpp/include/raft/neighbors/ivf_pq_types.hpp index c3de6020c5..35fb5fac69 100644 --- a/cpp/include/raft/neighbors/ivf_pq_types.hpp +++ b/cpp/include/raft/neighbors/ivf_pq_types.hpp @@ -109,8 +109,8 @@ struct index_params : ann::index_params { * The max number of data points to use per PQ code during PQ codebook training. Using more data * points per PQ code may increase the quality of PQ codebook but may also increase the build time. * The parameter is applied to both PQ codebook generation methods, i.e., PER_SUBSPACE and - * PER_CLUSTER. For PER_CLUSTER, max_train_points_per_pq_code per pq_dim numbers of points would be - * used if pq_dim is more than pq_book_size. + * PER_CLUSTER. In both cases, we will use `pq_book_size * max_train_points_per_pq_code` training + * points to train each codebook. 
*/ uint32_t max_train_points_per_pq_code = 256; }; diff --git a/docs/source/ann_benchmarks_param_tuning.md b/docs/source/ann_benchmarks_param_tuning.md index 2620e67761..e003aa879c 100644 --- a/docs/source/ann_benchmarks_param_tuning.md +++ b/docs/source/ann_benchmarks_param_tuning.md @@ -45,6 +45,7 @@ IVF-pq is an inverted-file index, which partitions the vectors into a series of | `smemLutDtype` | `search` | N | [`float`, `half`, `fp8`] | `half` | The precision to use for the lookup table in shared memory. Lower precision can increase performance at the cost of accuracy. | | `refine_ratio` | `search` | N| Positive Number >=1 | 1 | `refine_ratio * k` nearest neighbors are queried from the index initially and an additional refinement step improves recall by selecting only the best `k` neighbors. | + ### `raft_cagra` CAGRA uses a graph-based index, which creates an intermediate, approximate kNN graph using IVF-PQ and then further refining and optimizing to create a final kNN graph. This kNN graph is used by CAGRA as an index for search. diff --git a/python/pylibraft/pylibraft/neighbors/ivf_pq/ivf_pq.pyx b/python/pylibraft/pylibraft/neighbors/ivf_pq/ivf_pq.pyx index 4ee1785ca2..eb9ecb1de2 100644 --- a/python/pylibraft/pylibraft/neighbors/ivf_pq/ivf_pq.pyx +++ b/python/pylibraft/pylibraft/neighbors/ivf_pq/ivf_pq.pyx @@ -161,9 +161,9 @@ cdef class IndexParams: training. Using more data points per PQ code may increase the quality of PQ codebook but may also increase the build time. The parameter is applied to both PQ codebook generation methods, i.e., - PER_SUBSPACE and PER_CLUSTER. For PER_CLUSTER, - max_train_points_per_pq_code per pq_dim numbers of points would be - used if pq_dim is more than pq_book_size. + PER_SUBSPACE and PER_CLUSTER. In both cases, we will use + pq_book_size * max_train_points_per_pq_code training points to + train each codebook. 
""" def __init__(self, *, n_lists=1024, From aa1a3e5a72cc531d32effd80d8e1dda7cbc0b645 Mon Sep 17 00:00:00 2001 From: abc99lr Date: Wed, 24 Jan 2024 13:15:13 -0800 Subject: [PATCH 12/14] Run format checker. From cc88715ed62b4bb7ba3c17bbd81b9c4f5e4034d4 Mon Sep 17 00:00:00 2001 From: abc99lr Date: Wed, 24 Jan 2024 14:53:16 -0800 Subject: [PATCH 13/14] Fix formatting issue. --- cpp/bench/ann/src/raft/raft_ann_bench_param_parser.h | 2 +- cpp/include/raft/neighbors/ivf_pq_types.hpp | 2 +- python/pylibraft/pylibraft/neighbors/ivf_pq/cpp/c_ivf_pq.pxd | 2 +- python/pylibraft/pylibraft/neighbors/ivf_pq/ivf_pq.pyx | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/bench/ann/src/raft/raft_ann_bench_param_parser.h b/cpp/bench/ann/src/raft/raft_ann_bench_param_parser.h index feef5ee995..ae40deb50c 100644 --- a/cpp/bench/ann/src/raft/raft_ann_bench_param_parser.h +++ b/cpp/bench/ann/src/raft/raft_ann_bench_param_parser.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/neighbors/ivf_pq_types.hpp b/cpp/include/raft/neighbors/ivf_pq_types.hpp index 35fb5fac69..c8d1d18d8b 100644 --- a/cpp/include/raft/neighbors/ivf_pq_types.hpp +++ b/cpp/include/raft/neighbors/ivf_pq_types.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
diff --git a/python/pylibraft/pylibraft/neighbors/ivf_pq/cpp/c_ivf_pq.pxd b/python/pylibraft/pylibraft/neighbors/ivf_pq/cpp/c_ivf_pq.pxd index 8798b8bf60..930c3245f1 100644 --- a/python/pylibraft/pylibraft/neighbors/ivf_pq/cpp/c_ivf_pq.pxd +++ b/python/pylibraft/pylibraft/neighbors/ivf_pq/cpp/c_ivf_pq.pxd @@ -1,5 +1,5 @@ # -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/python/pylibraft/pylibraft/neighbors/ivf_pq/ivf_pq.pyx b/python/pylibraft/pylibraft/neighbors/ivf_pq/ivf_pq.pyx index eb9ecb1de2..7081b65ce3 100644 --- a/python/pylibraft/pylibraft/neighbors/ivf_pq/ivf_pq.pyx +++ b/python/pylibraft/pylibraft/neighbors/ivf_pq/ivf_pq.pyx @@ -1,5 +1,5 @@ # -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From 67205f92d57aaae6ecb2f6cd56c18ae460cd71c9 Mon Sep 17 00:00:00 2001 From: abc99lr Date: Wed, 24 Jan 2024 16:32:38 -0800 Subject: [PATCH 14/14] More format changes. --- .../raft/neighbors/detail/ivf_pq_build.cuh | 17 ++++++++++------- cpp/include/raft/neighbors/ivf_pq_types.hpp | 4 ++-- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh index 2bf588df8c..0ef6cb13fb 100644 --- a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh +++ b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh @@ -361,9 +361,10 @@ void train_per_subset(raft::resources const& handle, rmm::device_uvector pq_centers_tmp(index.pq_centers().size(), stream, device_memory); // Subsampling the train set for codebook generation based on max_train_points_per_pq_code. 
- size_t big_enough = max_train_points_per_pq_code * size_t(index.pq_book_size()); - auto pq_n_rows = uint32_t(std::min(big_enough, n_rows)); - rmm::device_uvector sub_trainset(pq_n_rows * size_t(index.pq_len()), stream, device_memory); + size_t big_enough = max_train_points_per_pq_code * size_t(index.pq_book_size()); + auto pq_n_rows = uint32_t(std::min(big_enough, n_rows)); + rmm::device_uvector sub_trainset( + pq_n_rows * size_t(index.pq_len()), stream, device_memory); rmm::device_uvector sub_labels(pq_n_rows, stream, device_memory); rmm::device_uvector pq_cluster_sizes(index.pq_book_size(), stream, device_memory); @@ -404,13 +405,14 @@ void train_per_subset(raft::resources const& handle, stream); // train PQ codebook for this subspace - auto sub_trainset_view = - raft::make_device_matrix_view(sub_trainset.data(), pq_n_rows, index.pq_len()); + auto sub_trainset_view = raft::make_device_matrix_view( + sub_trainset.data(), pq_n_rows, index.pq_len()); auto centers_tmp_view = raft::make_device_matrix_view( pq_centers_tmp.data() + index.pq_book_size() * index.pq_len() * j, index.pq_book_size(), index.pq_len()); - auto sub_labels_view = raft::make_device_vector_view(sub_labels.data(), pq_n_rows); + auto sub_labels_view = + raft::make_device_vector_view(sub_labels.data(), pq_n_rows); auto cluster_sizes_view = raft::make_device_vector_view(pq_cluster_sizes.data(), index.pq_book_size()); raft::cluster::kmeans_balanced_params kmeans_params; @@ -485,7 +487,8 @@ void train_per_cluster(raft::resources const& handle, // limit the cluster size to bound the training time based on max_train_points_per_pq_code // If pq_book_size is less than pq_dim, use max_train_points_per_pq_code per pq_dim instead // [sic] we interpret the data as pq_len-dimensional - size_t big_enough = max_train_points_per_pq_code * std::max(index.pq_book_size(), index.pq_dim()); + size_t big_enough = + max_train_points_per_pq_code * std::max(index.pq_book_size(), index.pq_dim()); size_t available_rows = 
size_t(cluster_size) * size_t(index.pq_dim()); auto pq_n_rows = uint32_t(std::min(big_enough, available_rows)); // train PQ codebook for this cluster diff --git a/cpp/include/raft/neighbors/ivf_pq_types.hpp b/cpp/include/raft/neighbors/ivf_pq_types.hpp index c8d1d18d8b..04c2354fbc 100644 --- a/cpp/include/raft/neighbors/ivf_pq_types.hpp +++ b/cpp/include/raft/neighbors/ivf_pq_types.hpp @@ -107,8 +107,8 @@ struct index_params : ann::index_params { bool conservative_memory_allocation = false; /** * The max number of data points to use per PQ code during PQ codebook training. Using more data - * points per PQ code may increase the quality of PQ codebook but may also increase the build time. - * The parameter is applied to both PQ codebook generation methods, i.e., PER_SUBSPACE and + * points per PQ code may increase the quality of PQ codebook but may also increase the build + * time. The parameter is applied to both PQ codebook generation methods, i.e., PER_SUBSPACE and * PER_CLUSTER. In both cases, we will use `pq_book_size * max_train_points_per_pq_code` training * points to train each codebook. */