From 3f5e149b7af0b9ec65c1f86272138f510516646c Mon Sep 17 00:00:00 2001
From: Tamas Bela Feher <tfeher@nvidia.com>
Date: Mon, 5 Feb 2024 11:51:36 +0100
Subject: [PATCH 01/16] Make subsampling use less memory

---
 .../raft/neighbors/detail/ivf_flat_build.cuh  |  5 +--
 .../raft/neighbors/detail/ivf_pq_build.cuh    | 11 ++++--
 cpp/include/raft/random/detail/rng_impl.cuh   | 11 ++++++
 .../raft/spatial/knn/detail/ann_utils.cuh     | 38 ++++++++++++++++++-
 4 files changed, 57 insertions(+), 8 deletions(-)
diff --git a/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh b/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh
index ab30b4009d..8bf9842466 100644
--- a/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh
+++ b/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh
@@ -365,9 +365,8 @@ inline auto build(raft::resources const& handle,
     auto trainset_ratio = std::max<size_t>(
       1, n_rows / std::max<size_t>(params.kmeans_trainset_fraction * n_rows, index.n_lists()));
     auto n_rows_train = n_rows / trainset_ratio;
-    auto trainset     = make_device_matrix<T, IdxT>(handle, n_rows_train, index.dim());
-    raft::spatial::knn::detail::utils::subsample(
-      handle, dataset, n_rows, trainset.view(), random_seed);
+    auto trainset     = raft::spatial::knn::detail::utils::subsample<T, IdxT>(
+      handle, dataset, n_rows, n_rows_train, dim, random_seed);
     auto centers_view = raft::make_device_matrix_view<float, IdxT>(
       index.centers().data_handle(), index.n_lists(), index.dim());
     raft::cluster::kmeans_balanced_params kmeans_params;
diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh
index 0ef6cb13fb..f4486c6188 100644
--- a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh
+++ b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh
@@ -1730,12 +1730,13 @@ auto build(raft::resources const& handle,
 
     // Besides just sampling, we transform the input dataset into floats to make it easier
     // to use gemm operations from cublas.
-    auto trainset =
-      make_device_mdarray<float>(handle, device_mr, make_extents<IdxT>(n_rows_train, dim));
+    auto trainset = make_device_mdarray<float>(handle, device_mr, make_extents<IdxT>(0, 0));
 
     if constexpr (std::is_same_v<T, float>) {
-      raft::spatial::knn::detail::utils::subsample(
-        handle, dataset, n_rows, trainset.view(), random_seed);
+      // raft::spatial::knn::detail::utils::subsample(
+      //   handle, dataset, n_rows, trainset.view(), random_seed);
+      trainset = raft::spatial::knn::detail::utils::subsample<T, IdxT>(
+        handle, dataset, n_rows, n_rows_train, dim, random_seed);
     } else {
       // TODO(tfeher): Enable codebook generation with any type T, and then remove
       // trainset tmp.
@@ -1744,6 +1745,8 @@ auto build(raft::resources const& handle,
       raft::spatial::knn::detail::utils::subsample(
         handle, dataset, n_rows, trainset_tmp.view(), random_seed);
       cudaDeviceSynchronize();
+      trainset =
+        make_device_mdarray<float>(handle, device_mr, make_extents<IdxT>(n_rows_train, dim));
       raft::linalg::unaryOp(trainset.data_handle(),
                             trainset_tmp.data_handle(),
                             trainset.size(),
diff --git a/cpp/include/raft/random/detail/rng_impl.cuh b/cpp/include/raft/random/detail/rng_impl.cuh
index 57f4c8d33d..ace98e6d3f 100644
--- a/cpp/include/raft/random/detail/rng_impl.cuh
+++ b/cpp/include/raft/random/detail/rng_impl.cuh
@@ -290,10 +290,21 @@ void sampleWithoutReplacement(RngState& rng_state,
 {
   ASSERT(sampledLen <= len, "sampleWithoutReplacement: 'sampledLen' cant be more than 'len'.");
 
+  // size_t free, total;
+  // float GiB = 1073741824.0f;
+  // cudaMemGetInfo(&free, &total);
+  // RAFT_LOG_INFO("sampleWithoutReplacement::start free mem %6.1f, used mem %6.1f",
+  //               free / GiB,
+  //               (total - free) / GiB);
   rmm::device_uvector<WeightsT> expWts(len, stream);
   rmm::device_uvector<WeightsT> sortedWts(len, stream);
   rmm::device_uvector<IdxT> inIdx(len, stream);
   rmm::device_uvector<IdxT> outIdxBuff(len, stream);
+
+  // cudaMemGetInfo(&free, &total);
+  // RAFT_LOG_INFO("sampleWithoutReplacement::buffers free mem %6.1f, used mem %6.1f",
+  //               free / GiB,
+  //               (total - free) / GiB);
   auto* inIdxPtr = inIdx.data();
   // generate modified weights
   SamplingParams<WeightsT, IdxT> params;
diff --git a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
index e55dc82f5d..294c3097ad 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
@@ -588,11 +588,22 @@ auto get_subsample_indices(raft::resources const& res, IdxT n_samples, IdxT n_su
   -> raft::device_vector<IdxT, IdxT>
 {
   RAFT_EXPECTS(n_subsamples <= n_samples, "Cannot have more training samples than dataset vectors");
+  // size_t free, total;
+  // float GiB = 1073741824.0f;
+  // cudaMemGetInfo(&free, &total);
+  // RAFT_LOG_INFO(
+  //   "get_subsample_indices::data free mem %6.1f, used mem %6.1f", free / GiB, (total - free) /
+  //   GiB);
 
   auto data_indices = raft::make_device_vector<IdxT, IdxT>(res, n_samples);
+  // cudaMemGetInfo(&free, &total);
+  // RAFT_LOG_INFO("get_subsample_indices::train free mem %6.1f, used mem %6.1f",
+  //               free / GiB,
+  //               (total - free) / GiB);
+
+  auto train_indices = raft::make_device_vector<IdxT, IdxT>(res, n_subsamples);
   raft::linalg::map_offset(res, data_indices.view(), identity_op());
   raft::random::RngState rng(seed);
-  auto train_indices = raft::make_device_vector<IdxT, IdxT>(res, n_subsamples);
   raft::random::sample_without_replacement(res,
                                            rng,
                                            raft::make_const_mdspan(data_indices.view()),
@@ -629,4 +640,29 @@ void subsample(raft::resources const& res,
     raft::matrix::detail::gather(res, dataset, make_const_mdspan(train_indices.view()), output);
   }
 }
+
+/** Subsample the dataset to create a training set*/
+template <typename T, typename IdxT = int64_t>
+raft::device_matrix<T, IdxT> subsample(
+  raft::resources const& res, const T* input, IdxT n_samples, IdxT n_train, IdxT n_dim, int seed)
+{
+  raft::device_vector<IdxT, IdxT> train_indices =
+    get_subsample_indices<IdxT>(res, n_samples, n_train, seed);
+
+  auto output = raft::make_device_matrix<T, IdxT>(res, n_train, n_dim);
+  cudaPointerAttributes attr;
+  RAFT_CUDA_TRY(cudaPointerGetAttributes(&attr, input));
+  T* ptr = reinterpret_cast<T*>(attr.devicePointer);
+  if (ptr != nullptr) {
+    raft::matrix::gather(res,
+                         raft::make_device_matrix_view<const T, IdxT>(ptr, n_samples, n_dim),
+                         raft::make_const_mdspan(train_indices.view()),
+                         output.view());
+  } else {
+    auto dataset = raft::make_host_matrix_view<const T, IdxT>(input, n_samples, n_dim);
+    raft::matrix::detail::gather(
+      res, dataset, make_const_mdspan(train_indices.view()), output.view());
+  }
+  return output;
+}
 }  // namespace raft::spatial::knn::detail::utils

From 1d2a68140e98cf31186e5cb70be856490e0716cb Mon Sep 17 00:00:00 2001
From: Tamas Bela Feher <tfeher@nvidia.com>
Date: Mon, 5 Feb 2024 12:01:55 +0100
Subject: [PATCH 02/16] Add subsample benchmark

---
 cpp/bench/prims/random/subsample.cu | 197 ++++++++++++++++++++++++++++
 1 file changed, 197 insertions(+)
 create mode 100644 cpp/bench/prims/random/subsample.cu

diff --git a/cpp/bench/prims/random/subsample.cu b/cpp/bench/prims/random/subsample.cu
new file mode 100644
index 0000000000..a89b1b1650
--- /dev/null
+++ b/cpp/bench/prims/random/subsample.cu
@@ -0,0 +1,197 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <common/benchmark.hpp>
+#include <cub/cub.cuh>
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/device_resources.hpp>
+#include <raft/core/host_mdarray.hpp>
+#include <raft/core/operators.hpp>
+#include <raft/random/permute.cuh>
+#include <raft/random/rng.cuh>
+#include <raft/random/sample_without_replacement.cuh>
+#include <raft/spatial/knn/detail/ann_utils.cuh>
+#include <raft/util/cudart_utils.hpp>
+#include <rmm/device_scalar.hpp>
+
+namespace raft::bench::random {
+
+struct sample_inputs {
+  int n_samples;
+  int n_train;
+  int method;
+};  // struct sample_inputs
+
+template <typename IdxT>
+auto excess_subsample(raft::resources const& res, IdxT n_samples, IdxT n_subsamples, int seed)
+  -> raft::device_vector<IdxT, IdxT>
+{
+  RAFT_EXPECTS(n_subsamples <= n_samples, "Cannot have more training samples than dataset vectors");
+  auto stream = resource::get_cuda_stream(res);
+
+  auto rnd_idx =
+    raft::make_device_vector<IdxT, IdxT>(res, std::min<IdxT>(1.5 * n_subsamples, n_samples));
+  auto linear_idx = raft::make_device_vector<IdxT, IdxT>(res, rnd_idx.size());
+  raft::linalg::map_offset(res, linear_idx.view(), identity_op());
+
+  raft::random::RngState state(137ULL);
+  raft::random::uniformInt(
+    res, state, rnd_idx.data_handle(), rnd_idx.size(), IdxT(0), IdxT(n_samples));
+
+  // Sort indices according to rnd keys
+  size_t workspace_size = 0;
+  cub::DeviceMergeSort::SortPairs(nullptr,
+                                  workspace_size,
+                                  rnd_idx.data_handle(),
+                                  linear_idx.data_handle(),
+                                  rnd_idx.size(),
+                                  raft::less_op{});
+  float GiB = 1073741824.0f;
+  RAFT_LOG_INFO("worksize sort %6.1f GiB", workspace_size / GiB);
+  auto workspace = raft::make_device_vector<char, IdxT>(res, workspace_size);
+  cub::DeviceMergeSort::SortPairs(nullptr,
+                                  workspace_size,
+                                  rnd_idx.data_handle(),
+                                  linear_idx.data_handle(),
+                                  rnd_idx.size(),
+                                  raft::less_op{});
+
+  if (rnd_idx.size() == static_cast<size_t>(n_samples)) {
+    // We shuffled the linear_idx array by sorting it according to rnd_idx.
+    // We return the first n_subsamples elements.
+    if (n_subsamples == n_samples) { return linear_idx; }
+    rnd_idx = raft::make_device_vector<IdxT, IdxT>(res, n_subsamples);
+    raft::copy(rnd_idx.data_handle(), linear_idx.data_handle(), n_subsamples, stream);
+    return rnd_idx;
+  }
+  // Else we do a rejection sampling (or excess sampling): we generated more random indices than
+  // needed and reject the duplicates.
+  auto keys_out   = raft::make_device_vector<IdxT, IdxT>(res, rnd_idx.size());
+  auto values_out = raft::make_device_vector<IdxT, IdxT>(res, rnd_idx.size());
+  rmm::device_scalar<IdxT> num_selected(stream);
+  size_t worksize2 = 0;
+  cub::DeviceSelect::UniqueByKey(nullptr,
+                                 worksize2,
+                                 rnd_idx.data_handle(),
+                                 linear_idx.data_handle(),
+                                 keys_out.data_handle(),
+                                 values_out.data_handle(),
+                                 num_selected.data(),
+                                 rnd_idx.size(),
+                                 stream);
+
+  RAFT_LOG_INFO("worksize unique %6.1f GiB", worksize2 / GiB);
+
+  if (worksize2 > workspace.size()) {
+    workspace = raft::make_device_vector<char, IdxT>(res, worksize2);
+  }
+
+  cub::DeviceSelect::UniqueByKey(workspace.data_handle(),
+                                 worksize2,
+                                 rnd_idx.data_handle(),
+                                 linear_idx.data_handle(),
+                                 keys_out.data_handle(),
+                                 values_out.data_handle(),
+                                 num_selected.data(),
+                                 rnd_idx.size(),
+                                 stream);
+
+  IdxT selected = num_selected.value(stream);
+
+  if (selected < n_subsamples) {
+    RAFT_LOG_WARN("Subsampling returned with less unique indices (%zu) than requested (%zu)",
+                  (size_t)selected,
+                  (size_t)n_subsamples);
+
+  } else {
+    RAFT_LOG_INFO(
+      "Subsampling unique indices (%zu) requested (%zu)", (size_t)selected, (size_t)n_subsamples);
+  }
+
+  // need to shuffle again
+  cub::DeviceMergeSort::SortPairs(workspace.data_handle(),
+                                  worksize2,
+                                  linear_idx.data_handle(),
+                                  rnd_idx.data_handle(),
+                                  n_samples,
+                                  raft::less_op{});
+
+  if (n_subsamples == n_samples) { return linear_idx; }
+  values_out = raft::make_device_vector<IdxT, IdxT>(res, n_subsamples);
+  raft::copy(values_out.data_handle(), rnd_idx.data_handle(), n_subsamples, stream);
+  return values_out;
+}
+
+template <typename IdxT>
+auto bernoulli_subsample(raft::resources const& res, IdxT n_samples, IdxT n_subsamples, int seed)
+  -> raft::device_vector<IdxT, IdxT>
+{
+  RAFT_EXPECTS(n_subsamples <= n_samples, "Cannot have more training samples than dataset vectors");
+
+  auto indices = raft::make_device_vector<IdxT, IdxT>(res, n_subsamples);
+  raft::random::RngState state(123456ULL);
+  raft::random::uniformInt(
+    res, state, indices.data_handle(), n_subsamples, IdxT(0), IdxT(n_samples));
+  return indices;
+}
+
+template <typename T>
+struct sample : public fixture {
+  sample(const sample_inputs& p)
+    : params(p),
+      in(make_device_vector<T, int64_t>(res, p.n_samples)),
+      out(make_device_vector<T, int64_t>(res, p.n_train))
+  {
+    raft::random::RngState r(123456ULL);
+  }
+
+  void run_benchmark(::benchmark::State& state) override
+  {
+    raft::random::RngState r(123456ULL);
+    loop_on_state(state, [this, &r]() {
+      if (params.method == 0) {
+        this->out = raft::spatial::knn::detail::utils::get_subsample_indices<T>(
+          this->res, this->params.n_samples, this->params.n_train, 137);
+      } else if (params.method == 1) {
+        this->out =
+          bernoulli_subsample<T>(this->res, this->params.n_samples, this->params.n_train, 137);
+      } else if (params.method == 2) {
+        this->out =
+          excess_subsample<T>(this->res, this->params.n_samples, this->params.n_train, 137);
+      }
+      //   raft::random::permute(
+      //     perms.data(), out.data(), in.data(), params.cols, params.rows, params.rowMajor,
+      //     stream);
+    });
+  }
+
+ private:
+  raft::device_resources res;
+  sample_inputs params;
+  raft::device_vector<T, int64_t> out, in;
+};  // struct sample
+
+const std::vector<sample_inputs> input_vecs = {{10000000, 1000000, 0},
+                                               {10000000, 10000000, 0},
+                                               {100000000, 10000000, 1},
+                                               {100000000, 100000000, 1},
+                                               {100000000, 10000000, 2},
+                                               {100000000, 50000000, 2},
+                                               {100000000, 100000000, 2}};
+
+RAFT_BENCH_REGISTER(sample<int64_t>, "", input_vecs);
+
+}  // namespace raft::bench::random

From 4040a9623134b88bfeff4e6507760f8145928f55 Mon Sep 17 00:00:00 2001
From: Tamas Bela Feher <tfeher@nvidia.com>
Date: Tue, 12 Mar 2024 01:14:32 +0100
Subject: [PATCH 03/16] Debug subsample benchmark

---
 cpp/bench/prims/CMakeLists.txt                | 172 +++++++++---------
 cpp/bench/prims/random/subsample.cu           |  65 +++++--
 .../raft/spatial/knn/detail/ann_utils.cuh     |   4 +
 3 files changed, 142 insertions(+), 99 deletions(-)

diff --git a/cpp/bench/prims/CMakeLists.txt b/cpp/bench/prims/CMakeLists.txt
index 3a2431cd34..18936317f6 100644
--- a/cpp/bench/prims/CMakeLists.txt
+++ b/cpp/bench/prims/CMakeLists.txt
@@ -74,94 +74,96 @@ function(ConfigureBench)
 endfunction()
 
 if(BUILD_PRIMS_BENCH)
-  ConfigureBench(
-    NAME CORE_BENCH PATH bench/prims/core/bitset.cu bench/prims/core/copy.cu bench/prims/main.cpp
-  )
-
-  ConfigureBench(
-    NAME CLUSTER_BENCH PATH bench/prims/cluster/kmeans_balanced.cu bench/prims/cluster/kmeans.cu
-    bench/prims/main.cpp OPTIONAL LIB EXPLICIT_INSTANTIATE_ONLY
-  )
-
-  ConfigureBench(
-    NAME TUNE_DISTANCE PATH bench/prims/distance/tune_pairwise/kernel.cu
-    bench/prims/distance/tune_pairwise/bench.cu bench/prims/main.cpp
-  )
-
-  ConfigureBench(
-    NAME
-    DISTANCE_BENCH
-    PATH
-    bench/prims/distance/distance_cosine.cu
-    bench/prims/distance/distance_exp_l2.cu
-    bench/prims/distance/distance_l1.cu
-    bench/prims/distance/distance_unexp_l2.cu
-    bench/prims/distance/fused_l2_nn.cu
-    bench/prims/distance/masked_nn.cu
-    bench/prims/distance/kernels.cu
-    bench/prims/main.cpp
-    OPTIONAL
-    LIB
-    EXPLICIT_INSTANTIATE_ONLY
-  )
-
-  ConfigureBench(
-    NAME
-    LINALG_BENCH
-    PATH
-    bench/prims/linalg/add.cu
-    bench/prims/linalg/map_then_reduce.cu
-    bench/prims/linalg/matrix_vector_op.cu
-    bench/prims/linalg/norm.cu
-    bench/prims/linalg/normalize.cu
-    bench/prims/linalg/reduce_cols_by_key.cu
-    bench/prims/linalg/reduce_rows_by_key.cu
-    bench/prims/linalg/reduce.cu
-    bench/prims/linalg/sddmm.cu
-    bench/prims/main.cpp
-  )
-
-  ConfigureBench(
-    NAME
-    MATRIX_BENCH
-    PATH
-    bench/prims/matrix/argmin.cu
-    bench/prims/matrix/gather.cu
-    bench/prims/matrix/select_k.cu
-    bench/prims/matrix/main.cpp
-    OPTIONAL
-    LIB
-    EXPLICIT_INSTANTIATE_ONLY
-  )
+  # ConfigureBench(
+  #   NAME CORE_BENCH PATH bench/prims/core/bitset.cu bench/prims/core/copy.cu bench/prims/main.cpp
+  # )
+
+  # ConfigureBench(
+  #   NAME CLUSTER_BENCH PATH bench/prims/cluster/kmeans_balanced.cu bench/prims/cluster/kmeans.cu
+  #   bench/prims/main.cpp OPTIONAL LIB EXPLICIT_INSTANTIATE_ONLY
+  # )
+
+  # ConfigureBench(
+  #   NAME TUNE_DISTANCE PATH bench/prims/distance/tune_pairwise/kernel.cu
+  #   bench/prims/distance/tune_pairwise/bench.cu bench/prims/main.cpp
+  # )
+
+  # ConfigureBench(
+  #   NAME
+  #   DISTANCE_BENCH
+  #   PATH
+  #   bench/prims/distance/distance_cosine.cu
+  #   bench/prims/distance/distance_exp_l2.cu
+  #   bench/prims/distance/distance_l1.cu
+  #   bench/prims/distance/distance_unexp_l2.cu
+  #   bench/prims/distance/fused_l2_nn.cu
+  #   bench/prims/distance/masked_nn.cu
+  #   bench/prims/distance/kernels.cu
+  #   bench/prims/main.cpp
+  #   OPTIONAL
+  #   LIB
+  #   EXPLICIT_INSTANTIATE_ONLY
+  # )
+
+  # ConfigureBench(
+  #   NAME
+  #   LINALG_BENCH
+  #   PATH
+  #   bench/prims/linalg/add.cu
+  #   bench/prims/linalg/map_then_reduce.cu
+  #   bench/prims/linalg/matrix_vector_op.cu
+  #   bench/prims/linalg/norm.cu
+  #   bench/prims/linalg/normalize.cu
+  #   bench/prims/linalg/reduce_cols_by_key.cu
+  #   bench/prims/linalg/reduce_rows_by_key.cu
+  #   bench/prims/linalg/reduce.cu
+  #   bench/prims/linalg/sddmm.cu
+  #   bench/prims/main.cpp
+  # )
+
+  # ConfigureBench(
+  #   NAME
+  #   MATRIX_BENCH
+  #   PATH
+  #   bench/prims/matrix/argmin.cu
+  #   bench/prims/matrix/gather.cu
+  #   bench/prims/matrix/select_k.cu
+  #   bench/prims/matrix/main.cpp
+  #   OPTIONAL
+  #   LIB
+  #   EXPLICIT_INSTANTIATE_ONLY
+  # )
 
   ConfigureBench(
-    NAME RANDOM_BENCH PATH bench/prims/random/make_blobs.cu bench/prims/random/permute.cu
-    bench/prims/random/rng.cu bench/prims/main.cpp
+    NAME RANDOM_BENCH PATH 
+    # bench/prims/random/make_blobs.cu bench/prims/random/permute.cu
+    # bench/prims/random/rng.cu
+     bench/prims/random/subsample.cu bench/prims/main.cpp
   )
 
-  ConfigureBench(NAME SPARSE_BENCH PATH bench/prims/sparse/convert_csr.cu bench/prims/main.cpp)
-
-  ConfigureBench(
-    NAME
-    NEIGHBORS_BENCH
-    PATH
-    bench/prims/neighbors/knn/brute_force_float_int64_t.cu
-    bench/prims/neighbors/knn/brute_force_float_uint32_t.cu
-    bench/prims/neighbors/knn/cagra_float_uint32_t.cu
-    bench/prims/neighbors/knn/ivf_flat_filter_float_int64_t.cu
-    bench/prims/neighbors/knn/ivf_flat_float_int64_t.cu
-    bench/prims/neighbors/knn/ivf_flat_int8_t_int64_t.cu
-    bench/prims/neighbors/knn/ivf_flat_uint8_t_int64_t.cu
-    bench/prims/neighbors/knn/ivf_pq_float_int64_t.cu
-    bench/prims/neighbors/knn/ivf_pq_filter_float_int64_t.cu
-    bench/prims/neighbors/knn/ivf_pq_int8_t_int64_t.cu
-    bench/prims/neighbors/knn/ivf_pq_uint8_t_int64_t.cu
-    bench/prims/neighbors/refine_float_int64_t.cu
-    bench/prims/neighbors/refine_uint8_t_int64_t.cu
-    bench/prims/main.cpp
-    OPTIONAL
-    LIB
-    EXPLICIT_INSTANTIATE_ONLY
-  )
+  # ConfigureBench(NAME SPARSE_BENCH PATH bench/prims/sparse/convert_csr.cu bench/prims/main.cpp)
+
+  # ConfigureBench(
+  #   NAME
+  #   NEIGHBORS_BENCH
+  #   PATH
+  #   bench/prims/neighbors/knn/brute_force_float_int64_t.cu
+  #   bench/prims/neighbors/knn/brute_force_float_uint32_t.cu
+  #   bench/prims/neighbors/knn/cagra_float_uint32_t.cu
+  #   bench/prims/neighbors/knn/ivf_flat_filter_float_int64_t.cu
+  #   bench/prims/neighbors/knn/ivf_flat_float_int64_t.cu
+  #   bench/prims/neighbors/knn/ivf_flat_int8_t_int64_t.cu
+  #   bench/prims/neighbors/knn/ivf_flat_uint8_t_int64_t.cu
+  #   bench/prims/neighbors/knn/ivf_pq_float_int64_t.cu
+  #   bench/prims/neighbors/knn/ivf_pq_filter_float_int64_t.cu
+  #   bench/prims/neighbors/knn/ivf_pq_int8_t_int64_t.cu
+  #   bench/prims/neighbors/knn/ivf_pq_uint8_t_int64_t.cu
+  #   bench/prims/neighbors/refine_float_int64_t.cu
+  #   bench/prims/neighbors/refine_uint8_t_int64_t.cu
+  #   bench/prims/main.cpp
+  #   OPTIONAL
+  #   LIB
+  #   EXPLICIT_INSTANTIATE_ONLY
+  # )
 
 endif()
diff --git a/cpp/bench/prims/random/subsample.cu b/cpp/bench/prims/random/subsample.cu
index a89b1b1650..03b22db95a 100644
--- a/cpp/bench/prims/random/subsample.cu
+++ b/cpp/bench/prims/random/subsample.cu
@@ -15,7 +15,7 @@
  */
 
 #include <common/benchmark.hpp>
-#include <cub/cub.cuh>
+
 #include <raft/core/device_mdarray.hpp>
 #include <raft/core/device_resources.hpp>
 #include <raft/core/host_mdarray.hpp>
@@ -25,8 +25,11 @@
 #include <raft/random/sample_without_replacement.cuh>
 #include <raft/spatial/knn/detail/ann_utils.cuh>
 #include <raft/util/cudart_utils.hpp>
+
 #include <rmm/device_scalar.hpp>
 
+#include <cub/cub.cuh>
+
 namespace raft::bench::random {
 
 struct sample_inputs {
@@ -42,8 +45,15 @@ auto excess_subsample(raft::resources const& res, IdxT n_samples, IdxT n_subsamp
   RAFT_EXPECTS(n_subsamples <= n_samples, "Cannot have more training samples than dataset vectors");
   auto stream = resource::get_cuda_stream(res);
 
+  // number of samples we'll need to sample (with replacement), to expect 'k'
+  // unique samples from 'n' is given by the following equation: log(1 - k/n)/log(1 - 1/n) ref:
+  // https://stats.stackexchange.com/questions/296005/the-expected-number-of-unique-elements-drawn-with-replacement
+  IdxT n_excess_samples = std::ceil(raft::log(1 - double(n_subsamples) / double(n_samples)) /
+                                    (raft::log(1 - 1 / double(n_samples))));
   auto rnd_idx =
-    raft::make_device_vector<IdxT, IdxT>(res, std::min<IdxT>(1.5 * n_subsamples, n_samples));
+    raft::make_device_vector<IdxT, IdxT>(res, std::min<IdxT>(n_excess_samples, n_samples));
+
+  RAFT_LOG_INFO("We will draw %zu random samples", (size_t)rnd_idx.size());
   auto linear_idx = raft::make_device_vector<IdxT, IdxT>(res, rnd_idx.size());
   raft::linalg::map_offset(res, linear_idx.view(), identity_op());
 
@@ -51,6 +61,9 @@ auto excess_subsample(raft::resources const& res, IdxT n_samples, IdxT n_subsamp
   raft::random::uniformInt(
     res, state, rnd_idx.data_handle(), rnd_idx.size(), IdxT(0), IdxT(n_samples));
 
+  if (rnd_idx.size() <= 100) {
+    print_vector("rnd_idx", rnd_idx.data_handle(), rnd_idx.size(), std::cout);
+  }
   // Sort indices according to rnd keys
   size_t workspace_size = 0;
   cub::DeviceMergeSort::SortPairs(nullptr,
@@ -62,13 +75,19 @@ auto excess_subsample(raft::resources const& res, IdxT n_samples, IdxT n_subsamp
   float GiB = 1073741824.0f;
   RAFT_LOG_INFO("worksize sort %6.1f GiB", workspace_size / GiB);
   auto workspace = raft::make_device_vector<char, IdxT>(res, workspace_size);
-  cub::DeviceMergeSort::SortPairs(nullptr,
+  cub::DeviceMergeSort::SortPairs(workspace.data_handle(),
                                   workspace_size,
                                   rnd_idx.data_handle(),
                                   linear_idx.data_handle(),
                                   rnd_idx.size(),
                                   raft::less_op{});
 
+  if (rnd_idx.size() <= 100) {
+    print_vector("rnd   _idx sorted", rnd_idx.data_handle(), rnd_idx.size(), std::cout);
+  }
+  if (rnd_idx.size() <= 100) {
+    print_vector("linear_idx sorted", linear_idx.data_handle(), linear_idx.size(), std::cout);
+  }
   if (rnd_idx.size() == static_cast<size_t>(n_samples)) {
     // We shuffled the linear_idx array by sorting it according to rnd_idx.
     // We return the first n_subsamples elements.
@@ -111,12 +130,18 @@ auto excess_subsample(raft::resources const& res, IdxT n_samples, IdxT n_subsamp
 
   IdxT selected = num_selected.value(stream);
 
+  if (rnd_idx.size() <= 100) {
+    print_vector("unique keys (rnd_idx)", keys_out.data_handle(), selected, std::cout);
+    print_vector("unique vals (linear idx)", values_out.data_handle(), selected, std::cout);
+  }
   if (selected < n_subsamples) {
     RAFT_LOG_WARN("Subsampling returned with less unique indices (%zu) than requested (%zu)",
                   (size_t)selected,
                   (size_t)n_subsamples);
 
   } else {
+    RAFT_LOG_INFO(
+      "We have %zu unique idices out of %zu samples", (size_t)selected, (size_t)rnd_idx.size());
     RAFT_LOG_INFO(
       "Subsampling unique indices (%zu) requested (%zu)", (size_t)selected, (size_t)n_subsamples);
   }
@@ -124,14 +149,18 @@ auto excess_subsample(raft::resources const& res, IdxT n_samples, IdxT n_subsamp
   // need to shuffle again
   cub::DeviceMergeSort::SortPairs(workspace.data_handle(),
                                   worksize2,
-                                  linear_idx.data_handle(),
-                                  rnd_idx.data_handle(),
-                                  n_samples,
+                                  values_out.data_handle(),
+                                  keys_out.data_handle(),
+                                  n_subsamples,
                                   raft::less_op{});
 
+  if (rnd_idx.size() <= 100) {
+    print_vector("re sorted keys ", keys_out.data_handle(), selected, std::cout);
+    print_vector("re sorted vals ", values_out.data_handle(), selected, std::cout);
+  }
   if (n_subsamples == n_samples) { return linear_idx; }
   values_out = raft::make_device_vector<IdxT, IdxT>(res, n_subsamples);
-  raft::copy(values_out.data_handle(), rnd_idx.data_handle(), n_subsamples, stream);
+  raft::copy(values_out.data_handle(), keys_out.data_handle(), n_subsamples, stream);
   return values_out;
 }
 
@@ -176,6 +205,9 @@ struct sample : public fixture {
       //     perms.data(), out.data(), in.data(), params.cols, params.rows, params.rowMajor,
       //     stream);
     });
+    if (this->params.n_train <= 100) {
+      print_vector("samples", this->out.data_handle(), this->params.n_train, std::cout);
+    }
   }
 
  private:
@@ -184,13 +216,18 @@ struct sample : public fixture {
   raft::device_vector<T, int64_t> out, in;
 };  // struct sample
 
-const std::vector<sample_inputs> input_vecs = {{10000000, 1000000, 0},
-                                               {10000000, 10000000, 0},
-                                               {100000000, 10000000, 1},
-                                               {100000000, 100000000, 1},
-                                               {100000000, 10000000, 2},
-                                               {100000000, 50000000, 2},
-                                               {100000000, 100000000, 2}};
+const std::vector<sample_inputs> input_vecs = {
+  {100, 20, 2}, {10, 5, 2},
+  //{100, 50, 2},
+  // {10000000, 1000000, 0},
+  // {10000000, 10000000, 0},
+  // {100000000, 10000000, 1},
+  // {100000000, 100000000, 1},
+  // {100000000, 10000000, 2},
+  // {100000000, 50000000, 2},
+  // {1000, 900, 2}
+  //{100000000, 100000000, 2}
+};
 
 RAFT_BENCH_REGISTER(sample<int64_t>, "", input_vecs);
 
diff --git a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
index e7e9ec6c08..d7f4651b56 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
@@ -18,6 +18,10 @@
 
 #include <raft/core/logger.hpp>
 #include <raft/distance/distance_types.hpp>
+#include <raft/linalg/map.cuh>
+#include <raft/matrix/gather.cuh>
+#include <raft/random/rng.cuh>
+#include <raft/random/sample_without_replacement.cuh>
 #include <raft/util/cuda_utils.cuh>
 #include <raft/util/cudart_utils.hpp>
 #include <raft/util/integer_utils.hpp>

From e09c9f7b5381ba159135fe37e2a50bbb5add2ebf Mon Sep 17 00:00:00 2001
From: Tamas Bela Feher <tfeher@nvidia.com>
Date: Tue, 12 Mar 2024 23:39:02 +0100
Subject: [PATCH 04/16] Fix bug

---
 cpp/bench/prims/random/subsample.cu           | 159 +---
 cpp/include/raft/random/detail/rng_device.cuh |   1 +
 cpp/include/raft/random/detail/rng_impl.cuh   | 159 ++++
 cpp/include/raft/random/rng.cuh               |  24 +
 cpp/test/CMakeLists.txt                       | 753 +++++++++---------
 5 files changed, 577 insertions(+), 519 deletions(-)

diff --git a/cpp/bench/prims/random/subsample.cu b/cpp/bench/prims/random/subsample.cu
index 03b22db95a..64a5e32669 100644
--- a/cpp/bench/prims/random/subsample.cu
+++ b/cpp/bench/prims/random/subsample.cu
@@ -38,132 +38,6 @@ struct sample_inputs {
   int method;
 };  // struct sample_inputs
 
-template <typename IdxT>
-auto excess_subsample(raft::resources const& res, IdxT n_samples, IdxT n_subsamples, int seed)
-  -> raft::device_vector<IdxT, IdxT>
-{
-  RAFT_EXPECTS(n_subsamples <= n_samples, "Cannot have more training samples than dataset vectors");
-  auto stream = resource::get_cuda_stream(res);
-
-  // number of samples we'll need to sample (with replacement), to expect 'k'
-  // unique samples from 'n' is given by the following equation: log(1 - k/n)/log(1 - 1/n) ref:
-  // https://stats.stackexchange.com/questions/296005/the-expected-number-of-unique-elements-drawn-with-replacement
-  IdxT n_excess_samples = std::ceil(raft::log(1 - double(n_subsamples) / double(n_samples)) /
-                                    (raft::log(1 - 1 / double(n_samples))));
-  auto rnd_idx =
-    raft::make_device_vector<IdxT, IdxT>(res, std::min<IdxT>(n_excess_samples, n_samples));
-
-  RAFT_LOG_INFO("We will draw %zu random samples", (size_t)rnd_idx.size());
-  auto linear_idx = raft::make_device_vector<IdxT, IdxT>(res, rnd_idx.size());
-  raft::linalg::map_offset(res, linear_idx.view(), identity_op());
-
-  raft::random::RngState state(137ULL);
-  raft::random::uniformInt(
-    res, state, rnd_idx.data_handle(), rnd_idx.size(), IdxT(0), IdxT(n_samples));
-
-  if (rnd_idx.size() <= 100) {
-    print_vector("rnd_idx", rnd_idx.data_handle(), rnd_idx.size(), std::cout);
-  }
-  // Sort indices according to rnd keys
-  size_t workspace_size = 0;
-  cub::DeviceMergeSort::SortPairs(nullptr,
-                                  workspace_size,
-                                  rnd_idx.data_handle(),
-                                  linear_idx.data_handle(),
-                                  rnd_idx.size(),
-                                  raft::less_op{});
-  float GiB = 1073741824.0f;
-  RAFT_LOG_INFO("worksize sort %6.1f GiB", workspace_size / GiB);
-  auto workspace = raft::make_device_vector<char, IdxT>(res, workspace_size);
-  cub::DeviceMergeSort::SortPairs(workspace.data_handle(),
-                                  workspace_size,
-                                  rnd_idx.data_handle(),
-                                  linear_idx.data_handle(),
-                                  rnd_idx.size(),
-                                  raft::less_op{});
-
-  if (rnd_idx.size() <= 100) {
-    print_vector("rnd   _idx sorted", rnd_idx.data_handle(), rnd_idx.size(), std::cout);
-  }
-  if (rnd_idx.size() <= 100) {
-    print_vector("linear_idx sorted", linear_idx.data_handle(), linear_idx.size(), std::cout);
-  }
-  if (rnd_idx.size() == static_cast<size_t>(n_samples)) {
-    // We shuffled the linear_idx array by sorting it according to rnd_idx.
-    // We return the first n_subsamples elements.
-    if (n_subsamples == n_samples) { return linear_idx; }
-    rnd_idx = raft::make_device_vector<IdxT, IdxT>(res, n_subsamples);
-    raft::copy(rnd_idx.data_handle(), linear_idx.data_handle(), n_subsamples, stream);
-    return rnd_idx;
-  }
-  // Else we do a rejection sampling (or excess sampling): we generated more random indices than
-  // needed and reject the duplicates.
-  auto keys_out   = raft::make_device_vector<IdxT, IdxT>(res, rnd_idx.size());
-  auto values_out = raft::make_device_vector<IdxT, IdxT>(res, rnd_idx.size());
-  rmm::device_scalar<IdxT> num_selected(stream);
-  size_t worksize2 = 0;
-  cub::DeviceSelect::UniqueByKey(nullptr,
-                                 worksize2,
-                                 rnd_idx.data_handle(),
-                                 linear_idx.data_handle(),
-                                 keys_out.data_handle(),
-                                 values_out.data_handle(),
-                                 num_selected.data(),
-                                 rnd_idx.size(),
-                                 stream);
-
-  RAFT_LOG_INFO("worksize unique %6.1f GiB", worksize2 / GiB);
-
-  if (worksize2 > workspace.size()) {
-    workspace = raft::make_device_vector<char, IdxT>(res, worksize2);
-  }
-
-  cub::DeviceSelect::UniqueByKey(workspace.data_handle(),
-                                 worksize2,
-                                 rnd_idx.data_handle(),
-                                 linear_idx.data_handle(),
-                                 keys_out.data_handle(),
-                                 values_out.data_handle(),
-                                 num_selected.data(),
-                                 rnd_idx.size(),
-                                 stream);
-
-  IdxT selected = num_selected.value(stream);
-
-  if (rnd_idx.size() <= 100) {
-    print_vector("unique keys (rnd_idx)", keys_out.data_handle(), selected, std::cout);
-    print_vector("unique vals (linear idx)", values_out.data_handle(), selected, std::cout);
-  }
-  if (selected < n_subsamples) {
-    RAFT_LOG_WARN("Subsampling returned with less unique indices (%zu) than requested (%zu)",
-                  (size_t)selected,
-                  (size_t)n_subsamples);
-
-  } else {
-    RAFT_LOG_INFO(
-      "We have %zu unique idices out of %zu samples", (size_t)selected, (size_t)rnd_idx.size());
-    RAFT_LOG_INFO(
-      "Subsampling unique indices (%zu) requested (%zu)", (size_t)selected, (size_t)n_subsamples);
-  }
-
-  // need to shuffle again
-  cub::DeviceMergeSort::SortPairs(workspace.data_handle(),
-                                  worksize2,
-                                  values_out.data_handle(),
-                                  keys_out.data_handle(),
-                                  n_subsamples,
-                                  raft::less_op{});
-
-  if (rnd_idx.size() <= 100) {
-    print_vector("re sorted keys ", keys_out.data_handle(), selected, std::cout);
-    print_vector("re sorted vals ", values_out.data_handle(), selected, std::cout);
-  }
-  if (n_subsamples == n_samples) { return linear_idx; }
-  values_out = raft::make_device_vector<IdxT, IdxT>(res, n_subsamples);
-  raft::copy(values_out.data_handle(), keys_out.data_handle(), n_subsamples, stream);
-  return values_out;
-}
-
 template <typename IdxT>
 auto bernoulli_subsample(raft::resources const& res, IdxT n_samples, IdxT n_subsamples, int seed)
   -> raft::device_vector<IdxT, IdxT>
@@ -198,12 +72,9 @@ struct sample : public fixture {
         this->out =
           bernoulli_subsample<T>(this->res, this->params.n_samples, this->params.n_train, 137);
       } else if (params.method == 2) {
-        this->out =
-          excess_subsample<T>(this->res, this->params.n_samples, this->params.n_train, 137);
+        this->out = raft::random::excess_subsample<T, int64_t>(
+          this->res, r, this->params.n_samples, this->params.n_train);
       }
-      //   raft::random::permute(
-      //     perms.data(), out.data(), in.data(), params.cols, params.rows, params.rowMajor,
-      //     stream);
     });
     if (this->params.n_train <= 100) {
       print_vector("samples", this->out.data_handle(), this->params.n_train, std::cout);
@@ -216,18 +87,20 @@ struct sample : public fixture {
   raft::device_vector<T, int64_t> out, in;
 };  // struct sample
 
-const std::vector<sample_inputs> input_vecs = {
-  {100, 20, 2}, {10, 5, 2},
-  //{100, 50, 2},
-  // {10000000, 1000000, 0},
-  // {10000000, 10000000, 0},
-  // {100000000, 10000000, 1},
-  // {100000000, 100000000, 1},
-  // {100000000, 10000000, 2},
-  // {100000000, 50000000, 2},
-  // {1000, 900, 2}
-  //{100000000, 100000000, 2}
-};
+const std::vector<sample_inputs> input_vecs = {{100, 20, 2},  // method 2: excess_subsample
+                                               {10, 5, 2},  // assumed {n_samples, n_train, method}
+                                               {20, 10, 2},
+                                               {20, 15, 2},
+                                               {100, 50, 2},
+                                               {1000, 500, 2},
+                                               {1000, 600, 2},
+                                               {1000, 700, 2},
+                                               {10000, 5000, 2},
+                                               {100000, 50000, 2},
+                                               {100000000, 10000000, 2},
+                                               {100000000, 50000000, 2},
+                                               {1000, 900, 2},
+                                               {100000000, 100000000, 2}};
 
 RAFT_BENCH_REGISTER(sample<int64_t>, "", input_vecs);
 
diff --git a/cpp/include/raft/random/detail/rng_device.cuh b/cpp/include/raft/random/detail/rng_device.cuh
index 12c67679ba..5e962fc982 100644
--- a/cpp/include/raft/random/detail/rng_device.cuh
+++ b/cpp/include/raft/random/detail/rng_device.cuh
@@ -16,6 +16,7 @@
 
 #pragma once
 
+#include <raft/linalg/map.cuh>
 #include <raft/random/rng_state.hpp>
 #include <raft/util/cuda_utils.cuh>
 #include <raft/util/integer_utils.hpp>
diff --git a/cpp/include/raft/random/detail/rng_impl.cuh b/cpp/include/raft/random/detail/rng_impl.cuh
index ace98e6d3f..08be3f6a98 100644
--- a/cpp/include/raft/random/detail/rng_impl.cuh
+++ b/cpp/include/raft/random/detail/rng_impl.cuh
@@ -17,12 +17,18 @@
 #pragma once
 
 #include <raft/core/detail/macros.hpp>
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/math.hpp>
 #include <raft/random/rng_device.cuh>
 #include <raft/random/rng_state.hpp>
 #include <raft/util/cudart_utils.hpp>
 #include <raft/util/detail/cub_wrappers.cuh>
 #include <raft/util/scatter.cuh>
 
+#include <rmm/device_scalar.hpp>
+
+#include <cub/cub.cuh>
+
 namespace raft {
 namespace random {
 namespace detail {
@@ -278,6 +284,7 @@ std::enable_if_t<std::is_integral_v<OutType>> discrete(RngState& rng_state,
                      len);
 }
 
+/** Note the memory space requirements are O(4*len) */
 template <typename DataT, typename WeightsT, typename IdxT = int>
 void sampleWithoutReplacement(RngState& rng_state,
                               DataT* out,
@@ -339,6 +346,158 @@ void affine_transform_params(RngState const& rng_state, IdxT n, IdxT& a, IdxT& b
   b = mt_rng() % n;
 }
 
+/** @brief Sample without replacement from range 0..N-1.
+ *
+ * Elements are sampled uniformly.
+ * The algorithm will allocate a workspace of size O(4*n_samples) internally.
+ *
+ * We use at most N random numbers. Depending on how large n_samples is w.r.t. N, we
+ * either use rejection sampling, or sort the [0..N-1] values using random keys.
+ *
+ * @tparam IdxT type of indices that we sample
+ * @tparam MatIdxT extent type of the returned mdarray
+ *
+ * @param res RAFT resource handle
+ * @param state random number generator state
+ * @param N number of elements to sample from. We will sample values in range 0..N-1
+ * @param n_samples number of samples to return, must not exceed N
+ *
+ * @return device mdarray with the random samples
+ */
+template <typename IdxT, typename MatIdxT = IdxT>
+auto excess_subsample(raft::resources const& res, RngState& state, IdxT N, IdxT n_samples)
+  -> raft::device_vector<IdxT, MatIdxT>
+{
+  RAFT_EXPECTS(n_samples <= N, "Cannot have more training samples than dataset vectors");
+
+  // Number of samples we'll need to sample (with replacement), to expect 'k'
+  // unique samples from 'n' is given by the following equation: log(1 - k/n)/log(1 - 1/n) ref:
+  // https://stats.stackexchange.com/questions/296005/the-expected-number-of-unique-elements-drawn-with-replacement
+  IdxT n_excess_samples =
+    n_samples < N
+      ? std::ceil(raft::log(1 - double(n_samples) / double(N)) / (raft::log(1 - 1 / double(N))))
+      : N;
+
+  // There is a variance of n_excess_samples, we take 10% more elements (at least 100).
+  n_excess_samples += std::max<IdxT>(0.1 * n_samples, 100);
+
+  // n_excess_samples will be larger than N around k = 0.64*N. When we reach N, then instead of
+  // doing rejection sampling, we simply shuffle the range [0..N-1] using N random numbers.
+  n_excess_samples = std::min<IdxT>(n_excess_samples, N);
+  auto rnd_idx     = raft::make_device_vector<IdxT, MatIdxT>(res, n_excess_samples);
+
+  RAFT_LOG_INFO("We will draw %zu random samples", (size_t)rnd_idx.size());
+  auto linear_idx = raft::make_device_vector<IdxT, MatIdxT>(res, rnd_idx.size());
+  raft::linalg::map_offset(res, linear_idx.view(), identity_op());
+
+  uniformInt(res, state, rnd_idx.data_handle(), rnd_idx.size(), IdxT(0), IdxT(N));
+
+  if (rnd_idx.size() <= 100) {
+    print_vector("rnd_idx", rnd_idx.data_handle(), rnd_idx.size(), std::cout);
+  }
+  // Sort indices according to rnd keys
+  size_t workspace_size = 0;
+  auto stream           = resource::get_cuda_stream(res);
+  cub::DeviceMergeSort::SortPairs(nullptr,
+                                  workspace_size,
+                                  rnd_idx.data_handle(),
+                                  linear_idx.data_handle(),
+                                  rnd_idx.size(),
+                                  raft::less_op{},
+                                  stream);
+  float GiB = 1073741824.0f;
+  RAFT_LOG_INFO("worksize sort %6.1f GiB", workspace_size / GiB);
+  auto workspace = raft::make_device_vector<char, IdxT>(res, workspace_size);
+  cub::DeviceMergeSort::SortPairs(workspace.data_handle(),
+                                  workspace_size,
+                                  rnd_idx.data_handle(),
+                                  linear_idx.data_handle(),
+                                  rnd_idx.size(),
+                                  raft::less_op{},
+                                  stream);
+
+  if (rnd_idx.size() <= 100) {
+    print_vector("rnd   _idx sorted", rnd_idx.data_handle(), rnd_idx.size(), std::cout);
+  }
+  if (rnd_idx.size() <= 100) {
+    print_vector("linear_idx sorted", linear_idx.data_handle(), linear_idx.size(), std::cout);
+  }
+  if (rnd_idx.size() == static_cast<size_t>(N)) {
+    // We shuffled the linear_idx array by sorting it according to rnd_idx.
+    // We return the first n_samples elements.
+    if (n_samples == N) { return linear_idx; }
+    rnd_idx = raft::make_device_vector<IdxT, MatIdxT>(res, n_samples);
+    raft::copy(rnd_idx.data_handle(), linear_idx.data_handle(), n_samples, stream);
+    return rnd_idx;
+  }
+  // Else we do a rejection sampling (or excess sampling): we generated more random indices than
+  // needed and reject the duplicates.
+  auto keys_out   = raft::make_device_vector<IdxT, IdxT>(res, rnd_idx.size());
+  auto values_out = raft::make_device_vector<IdxT, MatIdxT>(res, rnd_idx.size());
+  rmm::device_scalar<IdxT> num_selected(stream);
+  size_t worksize2 = 0;
+  cub::DeviceSelect::UniqueByKey(nullptr,
+                                 worksize2,
+                                 rnd_idx.data_handle(),
+                                 linear_idx.data_handle(),
+                                 keys_out.data_handle(),
+                                 values_out.data_handle(),
+                                 num_selected.data(),
+                                 rnd_idx.size(),
+                                 stream);
+
+  RAFT_LOG_INFO("worksize unique %6.1f GiB", worksize2 / GiB);
+
+  if (worksize2 > workspace.size()) {
+    workspace      = raft::make_device_vector<char, IdxT>(res, worksize2);
+    workspace_size = workspace.size();
+  }
+
+  cub::DeviceSelect::UniqueByKey(workspace.data_handle(),
+                                 workspace_size,
+                                 rnd_idx.data_handle(),
+                                 linear_idx.data_handle(),
+                                 keys_out.data_handle(),
+                                 values_out.data_handle(),
+                                 num_selected.data(),
+                                 rnd_idx.size(),
+                                 stream);
+
+  IdxT selected = num_selected.value(stream);
+
+  if (rnd_idx.size() <= 100) {
+    print_vector("unique keys (rnd_idx)", keys_out.data_handle(), selected, std::cout);
+    print_vector("unique vals (linear idx)", values_out.data_handle(), selected, std::cout);
+  }
+  if (selected < n_samples) {
+    RAFT_LOG_WARN("Subsampling returned with less unique indices (%zu) than requested (%zu)",
+                  (size_t)selected,
+                  (size_t)n_samples);
+  }
+  RAFT_LOG_INFO(
+    "We have %zu unique idices out of %zu samples", (size_t)selected, (size_t)rnd_idx.size());
+  RAFT_LOG_INFO(
+    "Subsampling unique indices (%zu) requested (%zu)", (size_t)selected, (size_t)n_samples);
+
+  // After duplicates are removed, shuffle all unique pairs back to draw order. keys_out is
+  // ascending here, so sorting only a prefix would keep just the smallest sampled indices.
+  cub::DeviceMergeSort::SortPairs(workspace.data_handle(),
+                                  workspace_size,
+                                  values_out.data_handle(),
+                                  keys_out.data_handle(),
+                                  selected,
+                                  raft::less_op{},
+                                  stream);
+  if (rnd_idx.size() <= 100) {
+    print_vector("re sorted keys ", keys_out.data_handle(), selected, std::cout);
+    print_vector("re sorted vals ", values_out.data_handle(), selected, std::cout);
+  }
+
+  values_out = raft::make_device_vector<IdxT, MatIdxT>(res, n_samples);
+  raft::copy(values_out.data_handle(), keys_out.data_handle(), n_samples, stream);
+  return values_out;
+}
+
 };  // end namespace detail
 };  // end namespace random
 };  // end namespace raft
diff --git a/cpp/include/raft/random/rng.cuh b/cpp/include/raft/random/rng.cuh
index 4e63669f98..10d809d3b8 100644
--- a/cpp/include/raft/random/rng.cuh
+++ b/cpp/include/raft/random/rng.cuh
@@ -813,6 +813,30 @@ void sampleWithoutReplacement(raft::resources const& handle,
     rng_state, out, outIdx, in, wts, sampledLen, len, resource::get_cuda_stream(handle));
 }
 
+/** @brief Sample without replacement from range 0..N-1.
+ *
+ * Elements are sampled uniformly.
+ * The algorithm will allocate a workspace of size O(4*n_samples) internally.
+ *
+ * We use at most N random numbers. Depending on how large n_samples is w.r.t. N, we
+ * either use rejection sampling, or sort the [0..N-1] values using random keys.
+ *
+ * @tparam IdxT type of indices that we sample
+ * @tparam MatIdxT extent type of the returned mdarray. NOTE(review): MatIdxT is not passed on to
+ *         detail::excess_subsample, so the result currently uses IdxT extents.
+ * @param res RAFT resource handle
+ * @param state random number generator state
+ * @param N number of elements to sample from. We will sample values in range 0..N-1
+ * @param n_samples number of samples to return
+ *
+ * @return device mdarray with the random samples
+ */
+template <typename IdxT, typename MatIdxT = IdxT>
+auto excess_subsample(raft::resources const& res, RngState& state, IdxT N, IdxT n_samples)
+{
+  return detail::excess_subsample(res, state, N, n_samples);
+}
+
 /**
  * @brief Generates the 'a' and 'b' parameters for a modulo affine
  *        transformation equation: `(ax + b) % n`
diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt
index dd7eb839ab..28ef83af34 100644
--- a/cpp/test/CMakeLists.txt
+++ b/cpp/test/CMakeLists.txt
@@ -95,389 +95,390 @@ endfunction()
 # * distance tests -------------------------------------------------------------------------
 
 if(BUILD_TESTS)
-  ConfigureTest(
-    NAME
-    CLUSTER_TEST
-    PATH
-    test/cluster/kmeans.cu
-    test/cluster/kmeans_balanced.cu
-    test/cluster/kmeans_find_k.cu
-    test/cluster/cluster_solvers.cu
-    test/cluster/linkage.cu
-    test/cluster/spectral.cu
-    LIB
-    EXPLICIT_INSTANTIATE_ONLY
-  )
-
-  ConfigureTest(
-    NAME
-    CORE_TEST
-    PATH
-    test/core/bitset.cu
-    test/core/device_resources_manager.cpp
-    test/core/device_setter.cpp
-    test/core/logger.cpp
-    test/core/math_device.cu
-    test/core/math_host.cpp
-    test/core/operators_device.cu
-    test/core/operators_host.cpp
-    test/core/handle.cpp
-    test/core/interruptible.cu
-    test/core/nvtx.cpp
-    test/core/mdarray.cu
-    test/core/mdbuffer.cu
-    test/core/mdspan_copy.cpp
-    test/core/mdspan_copy.cu
-    test/core/mdspan_utils.cu
-    test/core/numpy_serializer.cu
-    test/core/memory_type.cpp
-    test/core/sparse_matrix.cu
-    test/core/sparse_matrix.cpp
-    test/core/span.cpp
-    test/core/span.cu
-    test/core/stream_view.cpp
-    test/core/temporary_device_buffer.cu
-    test/test.cpp
-    LIB
-    EXPLICIT_INSTANTIATE_ONLY
-  )
-
-  ConfigureTest(
-    NAME CORE_TEST PATH test/core/stream_view.cpp test/core/mdspan_copy.cpp LIB
-    EXPLICIT_INSTANTIATE_ONLY NOCUDA
-  )
-
-  ConfigureTest(
-    NAME
-    DISTANCE_TEST
-    PATH
-    test/distance/dist_adj.cu
-    test/distance/dist_adj_distance_instance.cu
-    test/distance/dist_canberra.cu
-    test/distance/dist_correlation.cu
-    test/distance/dist_cos.cu
-    test/distance/dist_hamming.cu
-    test/distance/dist_hellinger.cu
-    test/distance/dist_inner_product.cu
-    test/distance/dist_jensen_shannon.cu
-    test/distance/dist_kl_divergence.cu
-    test/distance/dist_l1.cu
-    test/distance/dist_l2_exp.cu
-    test/distance/dist_l2_unexp.cu
-    test/distance/dist_l2_sqrt_exp.cu
-    test/distance/dist_l_inf.cu
-    test/distance/dist_lp_unexp.cu
-    test/distance/dist_russell_rao.cu
-    test/distance/masked_nn.cu
-    test/distance/masked_nn_compress_to_bits.cu
-    test/distance/fused_l2_nn.cu
-    test/distance/gram.cu
-    LIB
-    EXPLICIT_INSTANTIATE_ONLY
-  )
-
-  list(
-    APPEND
-    EXT_HEADER_TEST_SOURCES
-    test/ext_headers/raft_neighbors_brute_force.cu
-    test/ext_headers/raft_distance_distance.cu
-    test/ext_headers/raft_distance_detail_pairwise_matrix_dispatch.cu
-    test/ext_headers/raft_matrix_detail_select_k.cu
-    test/ext_headers/raft_neighbors_ball_cover.cu
-    test/ext_headers/raft_spatial_knn_detail_fused_l2_knn.cu
-    test/ext_headers/raft_distance_fused_l2_nn.cu
-    test/ext_headers/raft_neighbors_ivf_pq.cu
-    test/ext_headers/raft_util_memory_pool.cpp
-    test/ext_headers/raft_neighbors_ivf_flat.cu
-    test/ext_headers/raft_core_logger.cpp
-    test/ext_headers/raft_neighbors_refine.cu
-    test/ext_headers/raft_neighbors_detail_ivf_flat_search.cu
-    test/ext_headers/raft_linalg_detail_coalesced_reduction.cu
-    test/ext_headers/raft_spatial_knn_detail_ball_cover_registers.cu
-    test/ext_headers/raft_neighbors_detail_ivf_flat_interleaved_scan.cu
-    test/ext_headers/raft_neighbors_detail_ivf_pq_compute_similarity.cu
-  )
-
-  # Test that the split headers compile in isolation with:
-  #
-  # * EXT_HEADERS_TEST_COMPILED_EXPLICIT: RAFT_COMPILED, RAFT_EXPLICIT_INSTANTIATE_ONLY defined
-  # * EXT_HEADERS_TEST_COMPILED_IMPLICIT: RAFT_COMPILED defined
-  # * EXT_HEADERS_TEST_IMPLICIT:          no macros defined.
-  ConfigureTest(
-    NAME EXT_HEADERS_TEST_COMPILED_EXPLICIT PATH ${EXT_HEADER_TEST_SOURCES} LIB
-    EXPLICIT_INSTANTIATE_ONLY
-  )
-  ConfigureTest(NAME EXT_HEADERS_TEST_COMPILED_IMPLICIT PATH ${EXT_HEADER_TEST_SOURCES} LIB)
-  ConfigureTest(NAME EXT_HEADERS_TEST_IMPLICIT PATH ${EXT_HEADER_TEST_SOURCES})
-
-  ConfigureTest(NAME LABEL_TEST PATH test/label/label.cu test/label/merge_labels.cu)
-
-  ConfigureTest(
-    NAME
-    LINALG_TEST
-    PATH
-    test/linalg/add.cu
-    test/linalg/axpy.cu
-    test/linalg/binary_op.cu
-    test/linalg/cholesky_r1.cu
-    test/linalg/coalesced_reduction.cu
-    test/linalg/divide.cu
-    test/linalg/dot.cu
-    test/linalg/eig.cu
-    test/linalg/eig_sel.cu
-    test/linalg/gemm_layout.cu
-    test/linalg/gemv.cu
-    test/linalg/map.cu
-    test/linalg/map_then_reduce.cu
-    test/linalg/matrix_vector.cu
-    test/linalg/matrix_vector_op.cu
-    test/linalg/mean_squared_error.cu
-    test/linalg/multiply.cu
-    test/linalg/norm.cu
-    test/linalg/normalize.cu
-    test/linalg/power.cu
-    test/linalg/randomized_svd.cu
-    test/linalg/reduce.cu
-    test/linalg/reduce_cols_by_key.cu
-    test/linalg/reduce_rows_by_key.cu
-    test/linalg/rsvd.cu
-    test/linalg/sqrt.cu
-    test/linalg/strided_reduction.cu
-    test/linalg/subtract.cu
-    test/linalg/svd.cu
-    test/linalg/ternary_op.cu
-    test/linalg/transpose.cu
-    test/linalg/unary_op.cu
-  )
-
-  ConfigureTest(
-    NAME
-    MATRIX_TEST
-    PATH
-    test/matrix/argmax.cu
-    test/matrix/argmin.cu
-    test/matrix/columnSort.cu
-    test/matrix/diagonal.cu
-    test/matrix/gather.cu
-    test/matrix/scatter.cu
-    test/matrix/eye.cu
-    test/matrix/linewise_op.cu
-    test/matrix/math.cu
-    test/matrix/matrix.cu
-    test/matrix/norm.cu
-    test/matrix/reverse.cu
-    test/matrix/slice.cu
-    test/matrix/triangular.cu
-    test/sparse/spectral_matrix.cu
-    LIB
-    EXPLICIT_INSTANTIATE_ONLY
-  )
-
-  ConfigureTest(NAME MATRIX_SELECT_TEST PATH test/matrix/select_k.cu LIB EXPLICIT_INSTANTIATE_ONLY)
-
-  ConfigureTest(
-    NAME MATRIX_SELECT_LARGE_TEST PATH test/matrix/select_large_k.cu LIB EXPLICIT_INSTANTIATE_ONLY
-  )
+  # ConfigureTest(
+  #   NAME
+  #   CLUSTER_TEST
+  #   PATH
+  #   test/cluster/kmeans.cu
+  #   test/cluster/kmeans_balanced.cu
+  #   test/cluster/kmeans_find_k.cu
+  #   test/cluster/cluster_solvers.cu
+  #   test/cluster/linkage.cu
+  #   test/cluster/spectral.cu
+  #   LIB
+  #   EXPLICIT_INSTANTIATE_ONLY
+  # )
+
+  # ConfigureTest(
+  #   NAME
+  #   CORE_TEST
+  #   PATH
+  #   test/core/bitset.cu
+  #   test/core/device_resources_manager.cpp
+  #   test/core/device_setter.cpp
+  #   test/core/logger.cpp
+  #   test/core/math_device.cu
+  #   test/core/math_host.cpp
+  #   test/core/operators_device.cu
+  #   test/core/operators_host.cpp
+  #   test/core/handle.cpp
+  #   test/core/interruptible.cu
+  #   test/core/nvtx.cpp
+  #   test/core/mdarray.cu
+  #   test/core/mdbuffer.cu
+  #   test/core/mdspan_copy.cpp
+  #   test/core/mdspan_copy.cu
+  #   test/core/mdspan_utils.cu
+  #   test/core/numpy_serializer.cu
+  #   test/core/memory_type.cpp
+  #   test/core/sparse_matrix.cu
+  #   test/core/sparse_matrix.cpp
+  #   test/core/span.cpp
+  #   test/core/span.cu
+  #   test/core/stream_view.cpp
+  #   test/core/temporary_device_buffer.cu
+  #   test/test.cpp
+  #   LIB
+  #   EXPLICIT_INSTANTIATE_ONLY
+  # )
+
+  # ConfigureTest(
+  #   NAME CORE_TEST PATH test/core/stream_view.cpp test/core/mdspan_copy.cpp LIB
+  #   EXPLICIT_INSTANTIATE_ONLY NOCUDA
+  # )
+
+  # ConfigureTest(
+  #   NAME
+  #   DISTANCE_TEST
+  #   PATH
+  #   test/distance/dist_adj.cu
+  #   test/distance/dist_adj_distance_instance.cu
+  #   test/distance/dist_canberra.cu
+  #   test/distance/dist_correlation.cu
+  #   test/distance/dist_cos.cu
+  #   test/distance/dist_hamming.cu
+  #   test/distance/dist_hellinger.cu
+  #   test/distance/dist_inner_product.cu
+  #   test/distance/dist_jensen_shannon.cu
+  #   test/distance/dist_kl_divergence.cu
+  #   test/distance/dist_l1.cu
+  #   test/distance/dist_l2_exp.cu
+  #   test/distance/dist_l2_unexp.cu
+  #   test/distance/dist_l2_sqrt_exp.cu
+  #   test/distance/dist_l_inf.cu
+  #   test/distance/dist_lp_unexp.cu
+  #   test/distance/dist_russell_rao.cu
+  #   test/distance/masked_nn.cu
+  #   test/distance/masked_nn_compress_to_bits.cu
+  #   test/distance/fused_l2_nn.cu
+  #   test/distance/gram.cu
+  #   LIB
+  #   EXPLICIT_INSTANTIATE_ONLY
+  # )
+
+  # list(
+  #   APPEND
+  #   EXT_HEADER_TEST_SOURCES
+  #   test/ext_headers/raft_neighbors_brute_force.cu
+  #   test/ext_headers/raft_distance_distance.cu
+  #   test/ext_headers/raft_distance_detail_pairwise_matrix_dispatch.cu
+  #   test/ext_headers/raft_matrix_detail_select_k.cu
+  #   test/ext_headers/raft_neighbors_ball_cover.cu
+  #   test/ext_headers/raft_spatial_knn_detail_fused_l2_knn.cu
+  #   test/ext_headers/raft_distance_fused_l2_nn.cu
+  #   test/ext_headers/raft_neighbors_ivf_pq.cu
+  #   test/ext_headers/raft_util_memory_pool.cpp
+  #   test/ext_headers/raft_neighbors_ivf_flat.cu
+  #   test/ext_headers/raft_core_logger.cpp
+  #   test/ext_headers/raft_neighbors_refine.cu
+  #   test/ext_headers/raft_neighbors_detail_ivf_flat_search.cu
+  #   test/ext_headers/raft_linalg_detail_coalesced_reduction.cu
+  #   test/ext_headers/raft_spatial_knn_detail_ball_cover_registers.cu
+  #   test/ext_headers/raft_neighbors_detail_ivf_flat_interleaved_scan.cu
+  #   test/ext_headers/raft_neighbors_detail_ivf_pq_compute_similarity.cu
+  # )
+
+  # # Test that the split headers compile in isolation with:
+  # #
+  # # * EXT_HEADERS_TEST_COMPILED_EXPLICIT: RAFT_COMPILED, RAFT_EXPLICIT_INSTANTIATE_ONLY defined
+  # # * EXT_HEADERS_TEST_COMPILED_IMPLICIT: RAFT_COMPILED defined
+  # # * EXT_HEADERS_TEST_IMPLICIT:          no macros defined.
+  # ConfigureTest(
+  #   NAME EXT_HEADERS_TEST_COMPILED_EXPLICIT PATH ${EXT_HEADER_TEST_SOURCES} LIB
+  #   EXPLICIT_INSTANTIATE_ONLY
+  # )
+  # ConfigureTest(NAME EXT_HEADERS_TEST_COMPILED_IMPLICIT PATH ${EXT_HEADER_TEST_SOURCES} LIB)
+  # ConfigureTest(NAME EXT_HEADERS_TEST_IMPLICIT PATH ${EXT_HEADER_TEST_SOURCES})
+
+  # ConfigureTest(NAME LABEL_TEST PATH test/label/label.cu test/label/merge_labels.cu)
+
+  # ConfigureTest(
+  #   NAME
+  #   LINALG_TEST
+  #   PATH
+  #   test/linalg/add.cu
+  #   test/linalg/axpy.cu
+  #   test/linalg/binary_op.cu
+  #   test/linalg/cholesky_r1.cu
+  #   test/linalg/coalesced_reduction.cu
+  #   test/linalg/divide.cu
+  #   test/linalg/dot.cu
+  #   test/linalg/eig.cu
+  #   test/linalg/eig_sel.cu
+  #   test/linalg/gemm_layout.cu
+  #   test/linalg/gemv.cu
+  #   test/linalg/map.cu
+  #   test/linalg/map_then_reduce.cu
+  #   test/linalg/matrix_vector.cu
+  #   test/linalg/matrix_vector_op.cu
+  #   test/linalg/mean_squared_error.cu
+  #   test/linalg/multiply.cu
+  #   test/linalg/norm.cu
+  #   test/linalg/normalize.cu
+  #   test/linalg/power.cu
+  #   test/linalg/randomized_svd.cu
+  #   test/linalg/reduce.cu
+  #   test/linalg/reduce_cols_by_key.cu
+  #   test/linalg/reduce_rows_by_key.cu
+  #   test/linalg/rsvd.cu
+  #   test/linalg/sqrt.cu
+  #   test/linalg/strided_reduction.cu
+  #   test/linalg/subtract.cu
+  #   test/linalg/svd.cu
+  #   test/linalg/ternary_op.cu
+  #   test/linalg/transpose.cu
+  #   test/linalg/unary_op.cu
+  # )
+
+  # ConfigureTest(
+  #   NAME
+  #   MATRIX_TEST
+  #   PATH
+  #   test/matrix/argmax.cu
+  #   test/matrix/argmin.cu
+  #   test/matrix/columnSort.cu
+  #   test/matrix/diagonal.cu
+  #   test/matrix/gather.cu
+  #   test/matrix/scatter.cu
+  #   test/matrix/eye.cu
+  #   test/matrix/linewise_op.cu
+  #   test/matrix/math.cu
+  #   test/matrix/matrix.cu
+  #   test/matrix/norm.cu
+  #   test/matrix/reverse.cu
+  #   test/matrix/slice.cu
+  #   test/matrix/triangular.cu
+  #   test/sparse/spectral_matrix.cu
+  #   LIB
+  #   EXPLICIT_INSTANTIATE_ONLY
+  # )
+
+  # ConfigureTest(NAME MATRIX_SELECT_TEST PATH test/matrix/select_k.cu LIB EXPLICIT_INSTANTIATE_ONLY)
+
+  # ConfigureTest(
+  #   NAME MATRIX_SELECT_LARGE_TEST PATH test/matrix/select_large_k.cu LIB EXPLICIT_INSTANTIATE_ONLY
+  # )
 
   ConfigureTest(
     NAME
     RANDOM_TEST
     PATH
-    test/random/make_blobs.cu
-    test/random/make_regression.cu
-    test/random/multi_variable_gaussian.cu
-    test/random/rng_pcg_host_api.cu
-    test/random/permute.cu
-    test/random/rng.cu
-    test/random/rng_discrete.cu
-    test/random/rng_int.cu
-    test/random/rmat_rectangular_generator.cu
-    test/random/sample_without_replacement.cu
-  )
-
-  ConfigureTest(
-    NAME SOLVERS_TEST PATH test/cluster/cluster_solvers_deprecated.cu test/linalg/eigen_solvers.cu
-    test/lap/lap.cu test/sparse/mst.cu LIB EXPLICIT_INSTANTIATE_ONLY
+    # test/random/make_blobs.cu
+    # test/random/make_regression.cu
+    # test/random/multi_variable_gaussian.cu
+    # test/random/rng_pcg_host_api.cu
+    # test/random/permute.cu
+    # test/random/rng.cu
+    # test/random/rng_discrete.cu
+    # test/random/rng_int.cu
+    # test/random/rmat_rectangular_generator.cu
+    # test/random/sample_without_replacement.cu
+    test/random/excess_sampling.cu
   )
 
-  ConfigureTest(
-    NAME
-    SPARSE_TEST
-    PATH
-    test/sparse/add.cu
-    test/sparse/convert_coo.cu
-    test/sparse/convert_csr.cu
-    test/sparse/csr_row_slice.cu
-    test/sparse/csr_to_dense.cu
-    test/sparse/csr_transpose.cu
-    test/sparse/degree.cu
-    test/sparse/filter.cu
-    test/sparse/norm.cu
-    test/sparse/normalize.cu
-    test/sparse/reduce.cu
-    test/sparse/row_op.cu
-    test/sparse/sddmm.cu
-    test/sparse/sort.cu
-    test/sparse/spgemmi.cu
-    test/sparse/spmm.cu
-    test/sparse/symmetrize.cu
-  )
-
-  ConfigureTest(
-    NAME SPARSE_DIST_TEST PATH test/sparse/dist_coo_spmv.cu test/sparse/distance.cu
-    test/sparse/gram.cu LIB EXPLICIT_INSTANTIATE_ONLY
-  )
-
-  ConfigureTest(
-    NAME
-    SPARSE_NEIGHBORS_TEST
-    PATH
-    test/sparse/neighbors/cross_component_nn.cu
-    test/sparse/neighbors/brute_force.cu
-    test/sparse/neighbors/knn_graph.cu
-    LIB
-    EXPLICIT_INSTANTIATE_ONLY
-  )
-
-  ConfigureTest(
-    NAME
-    NEIGHBORS_TEST
-    PATH
-    test/neighbors/knn.cu
-    test/neighbors/fused_l2_knn.cu
-    test/neighbors/tiled_knn.cu
-    test/neighbors/haversine.cu
-    test/neighbors/ball_cover.cu
-    test/neighbors/epsilon_neighborhood.cu
-    test/neighbors/refine.cu
-    LIB
-    EXPLICIT_INSTANTIATE_ONLY
-  )
-
-  ConfigureTest(
-    NAME NEIGHBORS_ANN_BRUTE_FORCE_TEST PATH test/neighbors/ann_brute_force/test_float.cu LIB
-    EXPLICIT_INSTANTIATE_ONLY GPUS 1 PERCENT 100
-  )
-
-  ConfigureTest(
-    NAME
-    NEIGHBORS_ANN_CAGRA_TEST
-    PATH
-    test/neighbors/ann_cagra/test_float_uint32_t.cu
-    test/neighbors/ann_cagra/test_half_uint32_t.cu
-    test/neighbors/ann_cagra/test_int8_t_uint32_t.cu
-    test/neighbors/ann_cagra/test_uint8_t_uint32_t.cu
-    test/neighbors/ann_cagra/test_float_int64_t.cu
-    test/neighbors/ann_cagra/test_half_int64_t.cu
-    src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim128_t8.cu
-    src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim256_t16.cu
-    src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim512_t32.cu
-    src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim1024_t32.cu
-    src/neighbors/detail/cagra/search_single_cta_float_uint64_dim128_t8.cu
-    src/neighbors/detail/cagra/search_single_cta_float_uint64_dim256_t16.cu
-    src/neighbors/detail/cagra/search_single_cta_float_uint64_dim512_t32.cu
-    src/neighbors/detail/cagra/search_single_cta_float_uint64_dim1024_t32.cu
-    src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim128_t8.cu
-    src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim256_t16.cu
-    src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim512_t32.cu
-    src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim1024_t32.cu
-    src/neighbors/detail/cagra/search_single_cta_half_uint64_dim128_t8.cu
-    src/neighbors/detail/cagra/search_single_cta_half_uint64_dim256_t16.cu
-    src/neighbors/detail/cagra/search_single_cta_half_uint64_dim512_t32.cu
-    src/neighbors/detail/cagra/search_single_cta_half_uint64_dim1024_t32.cu
-    LIB
-    EXPLICIT_INSTANTIATE_ONLY
-    GPUS
-    1
-    PERCENT
-    100
-  )
-
-  ConfigureTest(
-    NAME
-    NEIGHBORS_ANN_IVF_TEST
-    PATH
-    test/neighbors/ann_ivf_flat/test_filter_float_int64_t.cu
-    test/neighbors/ann_ivf_flat/test_float_int64_t.cu
-    test/neighbors/ann_ivf_flat/test_int8_t_int64_t.cu
-    test/neighbors/ann_ivf_flat/test_uint8_t_int64_t.cu
-    test/neighbors/ann_ivf_pq/test_float_uint32_t.cu
-    test/neighbors/ann_ivf_pq/test_float_int64_t.cu
-    test/neighbors/ann_ivf_pq/test_int8_t_int64_t.cu
-    test/neighbors/ann_ivf_pq/test_uint8_t_int64_t.cu
-    test/neighbors/ann_ivf_pq/test_filter_float_int64_t.cu
-    test/neighbors/ann_ivf_pq/test_filter_int8_t_int64_t.cu
-    LIB
-    EXPLICIT_INSTANTIATE_ONLY
-    GPUS
-    1
-    PERCENT
-    100
-  )
-
-  ConfigureTest(
-    NAME
-    NEIGHBORS_ANN_NN_DESCENT_TEST
-    PATH
-    test/neighbors/ann_nn_descent/test_float_uint32_t.cu
-    test/neighbors/ann_nn_descent/test_int8_t_uint32_t.cu
-    test/neighbors/ann_nn_descent/test_uint8_t_uint32_t.cu
-    LIB
-    EXPLICIT_INSTANTIATE_ONLY
-    GPUS
-    1
-    PERCENT
-    100
-  )
-
-  ConfigureTest(
-    NAME
-    STATS_TEST
-    PATH
-    test/stats/accuracy.cu
-    test/stats/adjusted_rand_index.cu
-    test/stats/completeness_score.cu
-    test/stats/contingencyMatrix.cu
-    test/stats/cov.cu
-    test/stats/dispersion.cu
-    test/stats/entropy.cu
-    test/stats/histogram.cu
-    test/stats/homogeneity_score.cu
-    test/stats/information_criterion.cu
-    test/stats/kl_divergence.cu
-    test/stats/mean.cu
-    test/stats/meanvar.cu
-    test/stats/mean_center.cu
-    test/stats/minmax.cu
-    test/stats/mutual_info_score.cu
-    test/stats/neighborhood_recall.cu
-    test/stats/r2_score.cu
-    test/stats/rand_index.cu
-    test/stats/regression_metrics.cu
-    test/stats/silhouette_score.cu
-    test/stats/stddev.cu
-    test/stats/sum.cu
-    test/stats/trustworthiness.cu
-    test/stats/weighted_mean.cu
-    test/stats/v_measure.cu
-    LIB
-    EXPLICIT_INSTANTIATE_ONLY
-  )
-
-  ConfigureTest(
-    NAME
-    UTILS_TEST
-    PATH
-    test/core/seive.cu
-    test/util/bitonic_sort.cu
-    test/util/cudart_utils.cpp
-    test/util/device_atomics.cu
-    test/util/integer_utils.cpp
-    test/util/integer_utils.cu
-    test/util/memory_type_dispatcher.cu
-    test/util/pow2_utils.cu
-    test/util/reduction.cu
-  )
+  # ConfigureTest(
+  #   NAME SOLVERS_TEST PATH test/cluster/cluster_solvers_deprecated.cu test/linalg/eigen_solvers.cu
+  #   test/lap/lap.cu test/sparse/mst.cu LIB EXPLICIT_INSTANTIATE_ONLY
+  # )
+
+  # ConfigureTest(
+  #   NAME
+  #   SPARSE_TEST
+  #   PATH
+  #   test/sparse/add.cu
+  #   test/sparse/convert_coo.cu
+  #   test/sparse/convert_csr.cu
+  #   test/sparse/csr_row_slice.cu
+  #   test/sparse/csr_to_dense.cu
+  #   test/sparse/csr_transpose.cu
+  #   test/sparse/degree.cu
+  #   test/sparse/filter.cu
+  #   test/sparse/norm.cu
+  #   test/sparse/normalize.cu
+  #   test/sparse/reduce.cu
+  #   test/sparse/row_op.cu
+  #   test/sparse/sddmm.cu
+  #   test/sparse/sort.cu
+  #   test/sparse/spgemmi.cu
+  #   test/sparse/spmm.cu
+  #   test/sparse/symmetrize.cu
+  # )
+
+  # ConfigureTest(
+  #   NAME SPARSE_DIST_TEST PATH test/sparse/dist_coo_spmv.cu test/sparse/distance.cu
+  #   test/sparse/gram.cu LIB EXPLICIT_INSTANTIATE_ONLY
+  # )
+
+  # ConfigureTest(
+  #   NAME
+  #   SPARSE_NEIGHBORS_TEST
+  #   PATH
+  #   test/sparse/neighbors/cross_component_nn.cu
+  #   test/sparse/neighbors/brute_force.cu
+  #   test/sparse/neighbors/knn_graph.cu
+  #   LIB
+  #   EXPLICIT_INSTANTIATE_ONLY
+  # )
+
+  # ConfigureTest(
+  #   NAME
+  #   NEIGHBORS_TEST
+  #   PATH
+  #   test/neighbors/knn.cu
+  #   test/neighbors/fused_l2_knn.cu
+  #   test/neighbors/tiled_knn.cu
+  #   test/neighbors/haversine.cu
+  #   test/neighbors/ball_cover.cu
+  #   test/neighbors/epsilon_neighborhood.cu
+  #   test/neighbors/refine.cu
+  #   LIB
+  #   EXPLICIT_INSTANTIATE_ONLY
+  # )
+
+  # ConfigureTest(
+  #   NAME NEIGHBORS_ANN_BRUTE_FORCE_TEST PATH test/neighbors/ann_brute_force/test_float.cu LIB
+  #   EXPLICIT_INSTANTIATE_ONLY GPUS 1 PERCENT 100
+  # )
+
+  # ConfigureTest(
+  #   NAME
+  #   NEIGHBORS_ANN_CAGRA_TEST
+  #   PATH
+  #   test/neighbors/ann_cagra/test_float_uint32_t.cu
+  #   test/neighbors/ann_cagra/test_half_uint32_t.cu
+  #   test/neighbors/ann_cagra/test_int8_t_uint32_t.cu
+  #   test/neighbors/ann_cagra/test_uint8_t_uint32_t.cu
+  #   test/neighbors/ann_cagra/test_float_int64_t.cu
+  #   test/neighbors/ann_cagra/test_half_int64_t.cu
+  #   src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim128_t8.cu
+  #   src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim256_t16.cu
+  #   src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim512_t32.cu
+  #   src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim1024_t32.cu
+  #   src/neighbors/detail/cagra/search_single_cta_float_uint64_dim128_t8.cu
+  #   src/neighbors/detail/cagra/search_single_cta_float_uint64_dim256_t16.cu
+  #   src/neighbors/detail/cagra/search_single_cta_float_uint64_dim512_t32.cu
+  #   src/neighbors/detail/cagra/search_single_cta_float_uint64_dim1024_t32.cu
+  #   src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim128_t8.cu
+  #   src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim256_t16.cu
+  #   src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim512_t32.cu
+  #   src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim1024_t32.cu
+  #   src/neighbors/detail/cagra/search_single_cta_half_uint64_dim128_t8.cu
+  #   src/neighbors/detail/cagra/search_single_cta_half_uint64_dim256_t16.cu
+  #   src/neighbors/detail/cagra/search_single_cta_half_uint64_dim512_t32.cu
+  #   src/neighbors/detail/cagra/search_single_cta_half_uint64_dim1024_t32.cu
+  #   LIB
+  #   EXPLICIT_INSTANTIATE_ONLY
+  #   GPUS
+  #   1
+  #   PERCENT
+  #   100
+  # )
+
+  # ConfigureTest(
+  #   NAME
+  #   NEIGHBORS_ANN_IVF_TEST
+  #   PATH
+  #   test/neighbors/ann_ivf_flat/test_filter_float_int64_t.cu
+  #   test/neighbors/ann_ivf_flat/test_float_int64_t.cu
+  #   test/neighbors/ann_ivf_flat/test_int8_t_int64_t.cu
+  #   test/neighbors/ann_ivf_flat/test_uint8_t_int64_t.cu
+  #   test/neighbors/ann_ivf_pq/test_float_uint32_t.cu
+  #   test/neighbors/ann_ivf_pq/test_float_int64_t.cu
+  #   test/neighbors/ann_ivf_pq/test_int8_t_int64_t.cu
+  #   test/neighbors/ann_ivf_pq/test_uint8_t_int64_t.cu
+  #   test/neighbors/ann_ivf_pq/test_filter_float_int64_t.cu
+  #   test/neighbors/ann_ivf_pq/test_filter_int8_t_int64_t.cu
+  #   LIB
+  #   EXPLICIT_INSTANTIATE_ONLY
+  #   GPUS
+  #   1
+  #   PERCENT
+  #   100
+  # )
+
+  # ConfigureTest(
+  #   NAME
+  #   NEIGHBORS_ANN_NN_DESCENT_TEST
+  #   PATH
+  #   test/neighbors/ann_nn_descent/test_float_uint32_t.cu
+  #   test/neighbors/ann_nn_descent/test_int8_t_uint32_t.cu
+  #   test/neighbors/ann_nn_descent/test_uint8_t_uint32_t.cu
+  #   LIB
+  #   EXPLICIT_INSTANTIATE_ONLY
+  #   GPUS
+  #   1
+  #   PERCENT
+  #   100
+  # )
+
+  # ConfigureTest(
+  #   NAME
+  #   STATS_TEST
+  #   PATH
+  #   test/stats/accuracy.cu
+  #   test/stats/adjusted_rand_index.cu
+  #   test/stats/completeness_score.cu
+  #   test/stats/contingencyMatrix.cu
+  #   test/stats/cov.cu
+  #   test/stats/dispersion.cu
+  #   test/stats/entropy.cu
+  #   test/stats/histogram.cu
+  #   test/stats/homogeneity_score.cu
+  #   test/stats/information_criterion.cu
+  #   test/stats/kl_divergence.cu
+  #   test/stats/mean.cu
+  #   test/stats/meanvar.cu
+  #   test/stats/mean_center.cu
+  #   test/stats/minmax.cu
+  #   test/stats/mutual_info_score.cu
+  #   test/stats/neighborhood_recall.cu
+  #   test/stats/r2_score.cu
+  #   test/stats/rand_index.cu
+  #   test/stats/regression_metrics.cu
+  #   test/stats/silhouette_score.cu
+  #   test/stats/stddev.cu
+  #   test/stats/sum.cu
+  #   test/stats/trustworthiness.cu
+  #   test/stats/weighted_mean.cu
+  #   test/stats/v_measure.cu
+  #   LIB
+  #   EXPLICIT_INSTANTIATE_ONLY
+  # )
+
+  # ConfigureTest(
+  #   NAME
+  #   UTILS_TEST
+  #   PATH
+  #   test/core/seive.cu
+  #   test/util/bitonic_sort.cu
+  #   test/util/cudart_utils.cpp
+  #   test/util/device_atomics.cu
+  #   test/util/integer_utils.cpp
+  #   test/util/integer_utils.cu
+  #   test/util/memory_type_dispatcher.cu
+  #   test/util/pow2_utils.cu
+  #   test/util/reduction.cu
+  # )
 endif()
 
 # ##################################################################################################

From a6f9083abf1bed20e1f38b20dbfe65b7dc395bcd Mon Sep 17 00:00:00 2001
From: Tamas Bela Feher <tfeher@nvidia.com>
Date: Tue, 12 Mar 2024 23:39:29 +0100
Subject: [PATCH 05/16] add tests

---
 cpp/test/random/excess_sampling.cu | 113 +++++++++++++++++++++++++++++
 1 file changed, 113 insertions(+)
 create mode 100644 cpp/test/random/excess_sampling.cu

diff --git a/cpp/test/random/excess_sampling.cu b/cpp/test/random/excess_sampling.cu
new file mode 100644
index 0000000000..fec515900e
--- /dev/null
+++ b/cpp/test/random/excess_sampling.cu
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../test_utils.cuh"
+
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/host_mdarray.hpp>
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resources.hpp>
+#include <raft/random/rng.cuh>
+#include <raft/util/cuda_utils.cuh>
+#include <raft/util/cudart_utils.hpp>
+
+#include <gtest/gtest.h>
+
+#include <unordered_set>
+#include <vector>
+
+namespace raft {
+namespace random {
+
+using namespace raft::random;
+
+struct inputs {
+  int N;          // sampled indices are checked to lie in [0, N)
+  int n_samples;  // number of indices requested from excess_subsample
+};
+
+// Prints a test case as "N/n_samples". Non-template on purpose so `os << p` can resolve to it.
+::std::ostream& operator<<(::std::ostream& os, const inputs p)
+{
+  os << p.N << "/" << p.n_samples;
+  return os;
+}
+
+// Verifies that excess_subsample returns n_samples unique indices in [0, N). T = index type.
+template <typename T>
+class ExcessSamplingTest : public ::testing::TestWithParam<inputs> {
+ public:
+  ExcessSamplingTest()
+    : params(::testing::TestWithParam<inputs>::GetParam()),
+      stream(resource::get_cuda_stream(res)),  // use the stream owned by `res`
+      state{137ULL},
+      out(make_device_vector<T, int64_t>(res, 0)),
+      in(make_device_vector<T, int64_t>(res, params.n_samples)),
+      h_out(make_host_vector<T, int64_t>(res, params.n_samples))
+  {
+  }
+
+  // Draws one sample and checks its size, value range, uniqueness and (if large) its mean.
+  void check()
+  {
+    out = raft::random::excess_subsample<T, int64_t>(res, state, params.N, params.n_samples);
+    ASSERT_EQ(out.extent(0), params.n_samples);
+    raft::copy(h_out.data_handle(), out.data_handle(), out.size(), stream);
+    resource::sync_stream(res, stream);  // wait for the copy before reading h_out
+    std::unordered_set<T> occurrence;
+    size_t sum = 0;
+    for (int i = 0; i < params.n_samples; ++i) {
+      T val = h_out(i);
+      sum += val;
+      ASSERT_TRUE(0 <= val && val < params.N)
+        << "out-of-range index @i=" << i << " val=" << val << " n_samples=" << params.n_samples;
+      ASSERT_TRUE(occurrence.insert(val).second) << "repeated index @i=" << i << " idx=" << val;
+    }
+    float avg = params.n_samples > 0 ? sum / (float)params.n_samples : 0.0f;  // no 0/0 if empty
+    std::cout << "samples " << params.n_samples << ", average " << avg << std::endl;
+    // Large samples only: the mean is expected to be close to N / 2.
+    if (params.n_samples >= 100) {
+      ASSERT_TRUE(raft::match(avg, params.N / 2.0, raft::CompareApprox<float>(0.1)));
+    }
+  }
+
+ protected:
+  inputs params;
+  raft::resources res;
+  cudaStream_t stream;
+  RngState state;
+  device_vector<T, int64_t> out, in;
+  host_vector<T, int64_t> h_out;
+};
+
+const std::vector<inputs> input1 = {{1, 0},  // each entry is {N, n_samples}
+                                    {1, 1},
+                                    {10, 0},
+                                    {10, 1},
+                                    {10, 2},
+                                    {10, 10},
+                                    {200, 0},
+                                    {200, 1},
+                                    {200, 100},  // n_samples >= 100: the mean check also runs
+                                    {200, 130},
+                                    {200, 200}};
+
+using ExcessSamplingTestInt64 = ExcessSamplingTest<int64_t>;  // sampled indices are int64_t
+TEST_P(ExcessSamplingTestInt64, SamplingTest) { check(); }
+INSTANTIATE_TEST_SUITE_P(ExcessSamplingTests, ExcessSamplingTestInt64, ::testing::ValuesIn(input1));
+
+}  // namespace random
+}  // namespace raft

From 941e165be4f0992b7c0e82ae4aecc3489dd89228 Mon Sep 17 00:00:00 2001
From: Tamas Bela Feher <tfeher@nvidia.com>
Date: Wed, 13 Mar 2024 01:16:39 +0100
Subject: [PATCH 06/16] cleanup

---
 cpp/bench/prims/CMakeLists.txt              | 172 +++--
 cpp/bench/prims/random/subsample.cu         |  25 +-
 cpp/include/raft/random/detail/rng_impl.cuh |  46 +-
 cpp/test/CMakeLists.txt                     | 752 ++++++++++----------
 cpp/test/random/excess_sampling.cu          |  19 +-
 5 files changed, 482 insertions(+), 532 deletions(-)

diff --git a/cpp/bench/prims/CMakeLists.txt b/cpp/bench/prims/CMakeLists.txt
index 18936317f6..02b94cc4ab 100644
--- a/cpp/bench/prims/CMakeLists.txt
+++ b/cpp/bench/prims/CMakeLists.txt
@@ -74,96 +74,94 @@ function(ConfigureBench)
 endfunction()
 
 if(BUILD_PRIMS_BENCH)
-  # ConfigureBench(
-  #   NAME CORE_BENCH PATH bench/prims/core/bitset.cu bench/prims/core/copy.cu bench/prims/main.cpp
-  # )
-
-  # ConfigureBench(
-  #   NAME CLUSTER_BENCH PATH bench/prims/cluster/kmeans_balanced.cu bench/prims/cluster/kmeans.cu
-  #   bench/prims/main.cpp OPTIONAL LIB EXPLICIT_INSTANTIATE_ONLY
-  # )
-
-  # ConfigureBench(
-  #   NAME TUNE_DISTANCE PATH bench/prims/distance/tune_pairwise/kernel.cu
-  #   bench/prims/distance/tune_pairwise/bench.cu bench/prims/main.cpp
-  # )
-
-  # ConfigureBench(
-  #   NAME
-  #   DISTANCE_BENCH
-  #   PATH
-  #   bench/prims/distance/distance_cosine.cu
-  #   bench/prims/distance/distance_exp_l2.cu
-  #   bench/prims/distance/distance_l1.cu
-  #   bench/prims/distance/distance_unexp_l2.cu
-  #   bench/prims/distance/fused_l2_nn.cu
-  #   bench/prims/distance/masked_nn.cu
-  #   bench/prims/distance/kernels.cu
-  #   bench/prims/main.cpp
-  #   OPTIONAL
-  #   LIB
-  #   EXPLICIT_INSTANTIATE_ONLY
-  # )
-
-  # ConfigureBench(
-  #   NAME
-  #   LINALG_BENCH
-  #   PATH
-  #   bench/prims/linalg/add.cu
-  #   bench/prims/linalg/map_then_reduce.cu
-  #   bench/prims/linalg/matrix_vector_op.cu
-  #   bench/prims/linalg/norm.cu
-  #   bench/prims/linalg/normalize.cu
-  #   bench/prims/linalg/reduce_cols_by_key.cu
-  #   bench/prims/linalg/reduce_rows_by_key.cu
-  #   bench/prims/linalg/reduce.cu
-  #   bench/prims/linalg/sddmm.cu
-  #   bench/prims/main.cpp
-  # )
-
-  # ConfigureBench(
-  #   NAME
-  #   MATRIX_BENCH
-  #   PATH
-  #   bench/prims/matrix/argmin.cu
-  #   bench/prims/matrix/gather.cu
-  #   bench/prims/matrix/select_k.cu
-  #   bench/prims/matrix/main.cpp
-  #   OPTIONAL
-  #   LIB
-  #   EXPLICIT_INSTANTIATE_ONLY
-  # )
+  ConfigureBench(
+    NAME CORE_BENCH PATH bench/prims/core/bitset.cu bench/prims/core/copy.cu bench/prims/main.cpp
+  )
+
+  ConfigureBench(
+    NAME CLUSTER_BENCH PATH bench/prims/cluster/kmeans_balanced.cu bench/prims/cluster/kmeans.cu
+    bench/prims/main.cpp OPTIONAL LIB EXPLICIT_INSTANTIATE_ONLY
+  )
+
+  ConfigureBench(
+    NAME TUNE_DISTANCE PATH bench/prims/distance/tune_pairwise/kernel.cu
+    bench/prims/distance/tune_pairwise/bench.cu bench/prims/main.cpp
+  )
+
+  ConfigureBench(
+    NAME
+    DISTANCE_BENCH
+    PATH
+    bench/prims/distance/distance_cosine.cu
+    bench/prims/distance/distance_exp_l2.cu
+    bench/prims/distance/distance_l1.cu
+    bench/prims/distance/distance_unexp_l2.cu
+    bench/prims/distance/fused_l2_nn.cu
+    bench/prims/distance/masked_nn.cu
+    bench/prims/distance/kernels.cu
+    bench/prims/main.cpp
+    OPTIONAL
+    LIB
+    EXPLICIT_INSTANTIATE_ONLY
+  )
+
+  ConfigureBench(
+    NAME
+    LINALG_BENCH
+    PATH
+    bench/prims/linalg/add.cu
+    bench/prims/linalg/map_then_reduce.cu
+    bench/prims/linalg/matrix_vector_op.cu
+    bench/prims/linalg/norm.cu
+    bench/prims/linalg/normalize.cu
+    bench/prims/linalg/reduce_cols_by_key.cu
+    bench/prims/linalg/reduce_rows_by_key.cu
+    bench/prims/linalg/reduce.cu
+    bench/prims/linalg/sddmm.cu
+    bench/prims/main.cpp
+  )
+
+  ConfigureBench(
+    NAME
+    MATRIX_BENCH
+    PATH
+    bench/prims/matrix/argmin.cu
+    bench/prims/matrix/gather.cu
+    bench/prims/matrix/select_k.cu
+    bench/prims/matrix/main.cpp
+    OPTIONAL
+    LIB
+    EXPLICIT_INSTANTIATE_ONLY
+  )
 
   ConfigureBench(
-    NAME RANDOM_BENCH PATH 
-    # bench/prims/random/make_blobs.cu bench/prims/random/permute.cu
-    # bench/prims/random/rng.cu
-     bench/prims/random/subsample.cu bench/prims/main.cpp
+    NAME RANDOM_BENCH PATH bench/prims/random/make_blobs.cu bench/prims/random/permute.cu
+    bench/prims/random/rng.cu bench/prims/random/subsample.cu bench/prims/main.cpp
   )
 
-  # ConfigureBench(NAME SPARSE_BENCH PATH bench/prims/sparse/convert_csr.cu bench/prims/main.cpp)
-
-  # ConfigureBench(
-  #   NAME
-  #   NEIGHBORS_BENCH
-  #   PATH
-  #   bench/prims/neighbors/knn/brute_force_float_int64_t.cu
-  #   bench/prims/neighbors/knn/brute_force_float_uint32_t.cu
-  #   bench/prims/neighbors/knn/cagra_float_uint32_t.cu
-  #   bench/prims/neighbors/knn/ivf_flat_filter_float_int64_t.cu
-  #   bench/prims/neighbors/knn/ivf_flat_float_int64_t.cu
-  #   bench/prims/neighbors/knn/ivf_flat_int8_t_int64_t.cu
-  #   bench/prims/neighbors/knn/ivf_flat_uint8_t_int64_t.cu
-  #   bench/prims/neighbors/knn/ivf_pq_float_int64_t.cu
-  #   bench/prims/neighbors/knn/ivf_pq_filter_float_int64_t.cu
-  #   bench/prims/neighbors/knn/ivf_pq_int8_t_int64_t.cu
-  #   bench/prims/neighbors/knn/ivf_pq_uint8_t_int64_t.cu
-  #   bench/prims/neighbors/refine_float_int64_t.cu
-  #   bench/prims/neighbors/refine_uint8_t_int64_t.cu
-  #   bench/prims/main.cpp
-  #   OPTIONAL
-  #   LIB
-  #   EXPLICIT_INSTANTIATE_ONLY
-  # )
+  ConfigureBench(NAME SPARSE_BENCH PATH bench/prims/sparse/convert_csr.cu bench/prims/main.cpp)
+
+  ConfigureBench(
+    NAME
+    NEIGHBORS_BENCH
+    PATH
+    bench/prims/neighbors/knn/brute_force_float_int64_t.cu
+    bench/prims/neighbors/knn/brute_force_float_uint32_t.cu
+    bench/prims/neighbors/knn/cagra_float_uint32_t.cu
+    bench/prims/neighbors/knn/ivf_flat_filter_float_int64_t.cu
+    bench/prims/neighbors/knn/ivf_flat_float_int64_t.cu
+    bench/prims/neighbors/knn/ivf_flat_int8_t_int64_t.cu
+    bench/prims/neighbors/knn/ivf_flat_uint8_t_int64_t.cu
+    bench/prims/neighbors/knn/ivf_pq_float_int64_t.cu
+    bench/prims/neighbors/knn/ivf_pq_filter_float_int64_t.cu
+    bench/prims/neighbors/knn/ivf_pq_int8_t_int64_t.cu
+    bench/prims/neighbors/knn/ivf_pq_uint8_t_int64_t.cu
+    bench/prims/neighbors/refine_float_int64_t.cu
+    bench/prims/neighbors/refine_uint8_t_int64_t.cu
+    bench/prims/main.cpp
+    OPTIONAL
+    LIB
+    EXPLICIT_INSTANTIATE_ONLY
+  )
 
 endif()
diff --git a/cpp/bench/prims/random/subsample.cu b/cpp/bench/prims/random/subsample.cu
index 64a5e32669..1c384f9a03 100644
--- a/cpp/bench/prims/random/subsample.cu
+++ b/cpp/bench/prims/random/subsample.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -38,6 +38,7 @@ struct sample_inputs {
   int method;
 };  // struct sample_inputs
 
+// Sample with replacement. We use this as a baseline.
 template <typename IdxT>
 auto bernoulli_subsample(raft::resources const& res, IdxT n_samples, IdxT n_subsamples, int seed)
   -> raft::device_vector<IdxT, IdxT>
@@ -65,10 +66,7 @@ struct sample : public fixture {
   {
     raft::random::RngState r(123456ULL);
     loop_on_state(state, [this, &r]() {
-      if (params.method == 0) {
-        this->out = raft::spatial::knn::detail::utils::get_subsample_indices<T>(
-          this->res, this->params.n_samples, this->params.n_train, 137);
-      } else if (params.method == 1) {
+      if (params.method == 1) {
         this->out =
           bernoulli_subsample<T>(this->res, this->params.n_samples, this->params.n_train, 137);
       } else if (params.method == 2) {
@@ -76,9 +74,6 @@ struct sample : public fixture {
           this->res, r, this->params.n_samples, this->params.n_train);
       }
     });
-    if (this->params.n_train <= 100) {
-      print_vector("samples", this->out.data_handle(), this->params.n_train, std::cout);
-    }
   }
 
  private:
@@ -87,19 +82,11 @@ struct sample : public fixture {
   raft::device_vector<T, int64_t> out, in;
 };  // struct sample
 
-const std::vector<sample_inputs> input_vecs = {{100, 20, 2},
-                                               {10, 5, 2},
-                                               {20, 10, 2},
-                                               {20, 15, 2},
-                                               {100, 50, 2},
-                                               {1000, 500, 2},
-                                               {1000, 600, 2},
-                                               {1000, 700, 2},
-                                               {10000, 5000, 2},
-                                               {100000, 50000, 2},
+const std::vector<sample_inputs> input_vecs = {{100000000, 10000000, 1},
+                                               {100000000, 50000000, 1},
+                                               {100000000, 100000000, 1},
                                                {100000000, 10000000, 2},
                                                {100000000, 50000000, 2},
-                                               {1000, 900, 2},
                                                {100000000, 100000000, 2}};
 
 RAFT_BENCH_REGISTER(sample<int64_t>, "", input_vecs);
diff --git a/cpp/include/raft/random/detail/rng_impl.cuh b/cpp/include/raft/random/detail/rng_impl.cuh
index 08be3f6a98..08a57e17c0 100644
--- a/cpp/include/raft/random/detail/rng_impl.cuh
+++ b/cpp/include/raft/random/detail/rng_impl.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -297,21 +297,10 @@ void sampleWithoutReplacement(RngState& rng_state,
 {
   ASSERT(sampledLen <= len, "sampleWithoutReplacement: 'sampledLen' cant be more than 'len'.");
 
-  // size_t free, total;
-  // float GiB = 1073741824.0f;
-  // cudaMemGetInfo(&free, &total);
-  // RAFT_LOG_INFO("sampleWithoutReplacement::start free mem %6.1f, used mem %6.1f",
-  //               free / GiB,
-  //               (total - free) / GiB);
   rmm::device_uvector<WeightsT> expWts(len, stream);
   rmm::device_uvector<WeightsT> sortedWts(len, stream);
   rmm::device_uvector<IdxT> inIdx(len, stream);
   rmm::device_uvector<IdxT> outIdxBuff(len, stream);
-
-  // cudaMemGetInfo(&free, &total);
-  // RAFT_LOG_INFO("sampleWithoutReplacement::buffers free mem %6.1f, used mem %6.1f",
-  //               free / GiB,
-  //               (total - free) / GiB);
   auto* inIdxPtr = inIdx.data();
   // generate modified weights
   SamplingParams<WeightsT, IdxT> params;
@@ -386,15 +375,11 @@ auto excess_subsample(raft::resources const& res, RngState& state, IdxT N, IdxT
   n_excess_samples = std::min<IdxT>(n_excess_samples, N);
   auto rnd_idx     = raft::make_device_vector<IdxT, IdxT>(res, n_excess_samples);
 
-  RAFT_LOG_INFO("We will draw %zu random samples", (size_t)rnd_idx.size());
   auto linear_idx = raft::make_device_vector<IdxT, IdxT>(res, rnd_idx.size());
   raft::linalg::map_offset(res, linear_idx.view(), identity_op());
 
   uniformInt(res, state, rnd_idx.data_handle(), rnd_idx.size(), IdxT(0), IdxT(N));
 
-  if (rnd_idx.size() <= 100) {
-    print_vector("rnd_idx", rnd_idx.data_handle(), rnd_idx.size(), std::cout);
-  }
   // Sort indices according to rnd keys
   size_t workspace_size = 0;
   auto stream           = resource::get_cuda_stream(res);
@@ -405,8 +390,6 @@ auto excess_subsample(raft::resources const& res, RngState& state, IdxT N, IdxT
                                   rnd_idx.size(),
                                   raft::less_op{},
                                   stream);
-  float GiB = 1073741824.0f;
-  RAFT_LOG_INFO("worksize sort %6.1f GiB", workspace_size / GiB);
   auto workspace = raft::make_device_vector<char, IdxT>(res, workspace_size);
   cub::DeviceMergeSort::SortPairs(workspace.data_handle(),
                                   workspace_size,
@@ -416,12 +399,6 @@ auto excess_subsample(raft::resources const& res, RngState& state, IdxT N, IdxT
                                   raft::less_op{},
                                   stream);
 
-  if (rnd_idx.size() <= 100) {
-    print_vector("rnd   _idx sorted", rnd_idx.data_handle(), rnd_idx.size(), std::cout);
-  }
-  if (rnd_idx.size() <= 100) {
-    print_vector("linear_idx sorted", linear_idx.data_handle(), linear_idx.size(), std::cout);
-  }
   if (rnd_idx.size() == static_cast<size_t>(N)) {
     // We shuffled the linear_idx array by sorting it according to rnd_idx.
     // We return the first n_samples elements.
@@ -446,8 +423,6 @@ auto excess_subsample(raft::resources const& res, RngState& state, IdxT N, IdxT
                                  rnd_idx.size(),
                                  stream);
 
-  RAFT_LOG_INFO("worksize unique %6.1f GiB", worksize2 / GiB);
-
   if (worksize2 > workspace.size()) {
     workspace      = raft::make_device_vector<char, IdxT>(res, worksize2);
     workspace_size = workspace.size();
@@ -465,22 +440,13 @@ auto excess_subsample(raft::resources const& res, RngState& state, IdxT N, IdxT
 
   IdxT selected = num_selected.value(stream);
 
-  if (rnd_idx.size() <= 100) {
-    print_vector("unique keys (rnd_idx)", keys_out.data_handle(), selected, std::cout);
-    print_vector("unique vals (linear idx)", values_out.data_handle(), selected, std::cout);
-  }
   if (selected < n_samples) {
-    RAFT_LOG_WARN("Subsampling returned with less unique indices (%zu) than requested (%zu)",
-                  (size_t)selected,
-                  (size_t)n_samples);
+    RAFT_LOG_DEBUG("Subsampling returned with less unique indices (%zu) than requested (%zu)",
+                   (size_t)selected,
+                   (size_t)n_samples);
   }
-  RAFT_LOG_INFO(
-    "We have %zu unique idices out of %zu samples", (size_t)selected, (size_t)rnd_idx.size());
-  RAFT_LOG_INFO(
-    "Subsampling unique indices (%zu) requested (%zu)", (size_t)selected, (size_t)n_samples);
 
   // After duplicates are removed, we need to shuffle back to random order
-
   cub::DeviceMergeSort::SortPairs(workspace.data_handle(),
                                   workspace_size,
                                   values_out.data_handle(),
@@ -488,10 +454,6 @@ auto excess_subsample(raft::resources const& res, RngState& state, IdxT N, IdxT
                                   n_samples,
                                   raft::less_op{},
                                   stream);
-  if (rnd_idx.size() <= 100) {
-    print_vector("re sorted keys ", keys_out.data_handle(), selected, std::cout);
-    print_vector("re sorted vals ", values_out.data_handle(), selected, std::cout);
-  }
 
   values_out = raft::make_device_vector<IdxT, IdxT>(res, n_samples);
   raft::copy(values_out.data_handle(), keys_out.data_handle(), n_samples, stream);
diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt
index 28ef83af34..037f85698c 100644
--- a/cpp/test/CMakeLists.txt
+++ b/cpp/test/CMakeLists.txt
@@ -95,390 +95,390 @@ endfunction()
 # * distance tests -------------------------------------------------------------------------
 
 if(BUILD_TESTS)
-  # ConfigureTest(
-  #   NAME
-  #   CLUSTER_TEST
-  #   PATH
-  #   test/cluster/kmeans.cu
-  #   test/cluster/kmeans_balanced.cu
-  #   test/cluster/kmeans_find_k.cu
-  #   test/cluster/cluster_solvers.cu
-  #   test/cluster/linkage.cu
-  #   test/cluster/spectral.cu
-  #   LIB
-  #   EXPLICIT_INSTANTIATE_ONLY
-  # )
-
-  # ConfigureTest(
-  #   NAME
-  #   CORE_TEST
-  #   PATH
-  #   test/core/bitset.cu
-  #   test/core/device_resources_manager.cpp
-  #   test/core/device_setter.cpp
-  #   test/core/logger.cpp
-  #   test/core/math_device.cu
-  #   test/core/math_host.cpp
-  #   test/core/operators_device.cu
-  #   test/core/operators_host.cpp
-  #   test/core/handle.cpp
-  #   test/core/interruptible.cu
-  #   test/core/nvtx.cpp
-  #   test/core/mdarray.cu
-  #   test/core/mdbuffer.cu
-  #   test/core/mdspan_copy.cpp
-  #   test/core/mdspan_copy.cu
-  #   test/core/mdspan_utils.cu
-  #   test/core/numpy_serializer.cu
-  #   test/core/memory_type.cpp
-  #   test/core/sparse_matrix.cu
-  #   test/core/sparse_matrix.cpp
-  #   test/core/span.cpp
-  #   test/core/span.cu
-  #   test/core/stream_view.cpp
-  #   test/core/temporary_device_buffer.cu
-  #   test/test.cpp
-  #   LIB
-  #   EXPLICIT_INSTANTIATE_ONLY
-  # )
-
-  # ConfigureTest(
-  #   NAME CORE_TEST PATH test/core/stream_view.cpp test/core/mdspan_copy.cpp LIB
-  #   EXPLICIT_INSTANTIATE_ONLY NOCUDA
-  # )
-
-  # ConfigureTest(
-  #   NAME
-  #   DISTANCE_TEST
-  #   PATH
-  #   test/distance/dist_adj.cu
-  #   test/distance/dist_adj_distance_instance.cu
-  #   test/distance/dist_canberra.cu
-  #   test/distance/dist_correlation.cu
-  #   test/distance/dist_cos.cu
-  #   test/distance/dist_hamming.cu
-  #   test/distance/dist_hellinger.cu
-  #   test/distance/dist_inner_product.cu
-  #   test/distance/dist_jensen_shannon.cu
-  #   test/distance/dist_kl_divergence.cu
-  #   test/distance/dist_l1.cu
-  #   test/distance/dist_l2_exp.cu
-  #   test/distance/dist_l2_unexp.cu
-  #   test/distance/dist_l2_sqrt_exp.cu
-  #   test/distance/dist_l_inf.cu
-  #   test/distance/dist_lp_unexp.cu
-  #   test/distance/dist_russell_rao.cu
-  #   test/distance/masked_nn.cu
-  #   test/distance/masked_nn_compress_to_bits.cu
-  #   test/distance/fused_l2_nn.cu
-  #   test/distance/gram.cu
-  #   LIB
-  #   EXPLICIT_INSTANTIATE_ONLY
-  # )
-
-  # list(
-  #   APPEND
-  #   EXT_HEADER_TEST_SOURCES
-  #   test/ext_headers/raft_neighbors_brute_force.cu
-  #   test/ext_headers/raft_distance_distance.cu
-  #   test/ext_headers/raft_distance_detail_pairwise_matrix_dispatch.cu
-  #   test/ext_headers/raft_matrix_detail_select_k.cu
-  #   test/ext_headers/raft_neighbors_ball_cover.cu
-  #   test/ext_headers/raft_spatial_knn_detail_fused_l2_knn.cu
-  #   test/ext_headers/raft_distance_fused_l2_nn.cu
-  #   test/ext_headers/raft_neighbors_ivf_pq.cu
-  #   test/ext_headers/raft_util_memory_pool.cpp
-  #   test/ext_headers/raft_neighbors_ivf_flat.cu
-  #   test/ext_headers/raft_core_logger.cpp
-  #   test/ext_headers/raft_neighbors_refine.cu
-  #   test/ext_headers/raft_neighbors_detail_ivf_flat_search.cu
-  #   test/ext_headers/raft_linalg_detail_coalesced_reduction.cu
-  #   test/ext_headers/raft_spatial_knn_detail_ball_cover_registers.cu
-  #   test/ext_headers/raft_neighbors_detail_ivf_flat_interleaved_scan.cu
-  #   test/ext_headers/raft_neighbors_detail_ivf_pq_compute_similarity.cu
-  # )
-
-  # # Test that the split headers compile in isolation with:
-  # #
-  # # * EXT_HEADERS_TEST_COMPILED_EXPLICIT: RAFT_COMPILED, RAFT_EXPLICIT_INSTANTIATE_ONLY defined
-  # # * EXT_HEADERS_TEST_COMPILED_IMPLICIT: RAFT_COMPILED defined
-  # # * EXT_HEADERS_TEST_IMPLICIT:          no macros defined.
-  # ConfigureTest(
-  #   NAME EXT_HEADERS_TEST_COMPILED_EXPLICIT PATH ${EXT_HEADER_TEST_SOURCES} LIB
-  #   EXPLICIT_INSTANTIATE_ONLY
-  # )
-  # ConfigureTest(NAME EXT_HEADERS_TEST_COMPILED_IMPLICIT PATH ${EXT_HEADER_TEST_SOURCES} LIB)
-  # ConfigureTest(NAME EXT_HEADERS_TEST_IMPLICIT PATH ${EXT_HEADER_TEST_SOURCES})
-
-  # ConfigureTest(NAME LABEL_TEST PATH test/label/label.cu test/label/merge_labels.cu)
-
-  # ConfigureTest(
-  #   NAME
-  #   LINALG_TEST
-  #   PATH
-  #   test/linalg/add.cu
-  #   test/linalg/axpy.cu
-  #   test/linalg/binary_op.cu
-  #   test/linalg/cholesky_r1.cu
-  #   test/linalg/coalesced_reduction.cu
-  #   test/linalg/divide.cu
-  #   test/linalg/dot.cu
-  #   test/linalg/eig.cu
-  #   test/linalg/eig_sel.cu
-  #   test/linalg/gemm_layout.cu
-  #   test/linalg/gemv.cu
-  #   test/linalg/map.cu
-  #   test/linalg/map_then_reduce.cu
-  #   test/linalg/matrix_vector.cu
-  #   test/linalg/matrix_vector_op.cu
-  #   test/linalg/mean_squared_error.cu
-  #   test/linalg/multiply.cu
-  #   test/linalg/norm.cu
-  #   test/linalg/normalize.cu
-  #   test/linalg/power.cu
-  #   test/linalg/randomized_svd.cu
-  #   test/linalg/reduce.cu
-  #   test/linalg/reduce_cols_by_key.cu
-  #   test/linalg/reduce_rows_by_key.cu
-  #   test/linalg/rsvd.cu
-  #   test/linalg/sqrt.cu
-  #   test/linalg/strided_reduction.cu
-  #   test/linalg/subtract.cu
-  #   test/linalg/svd.cu
-  #   test/linalg/ternary_op.cu
-  #   test/linalg/transpose.cu
-  #   test/linalg/unary_op.cu
-  # )
-
-  # ConfigureTest(
-  #   NAME
-  #   MATRIX_TEST
-  #   PATH
-  #   test/matrix/argmax.cu
-  #   test/matrix/argmin.cu
-  #   test/matrix/columnSort.cu
-  #   test/matrix/diagonal.cu
-  #   test/matrix/gather.cu
-  #   test/matrix/scatter.cu
-  #   test/matrix/eye.cu
-  #   test/matrix/linewise_op.cu
-  #   test/matrix/math.cu
-  #   test/matrix/matrix.cu
-  #   test/matrix/norm.cu
-  #   test/matrix/reverse.cu
-  #   test/matrix/slice.cu
-  #   test/matrix/triangular.cu
-  #   test/sparse/spectral_matrix.cu
-  #   LIB
-  #   EXPLICIT_INSTANTIATE_ONLY
-  # )
-
-  # ConfigureTest(NAME MATRIX_SELECT_TEST PATH test/matrix/select_k.cu LIB EXPLICIT_INSTANTIATE_ONLY)
-
-  # ConfigureTest(
-  #   NAME MATRIX_SELECT_LARGE_TEST PATH test/matrix/select_large_k.cu LIB EXPLICIT_INSTANTIATE_ONLY
-  # )
+  ConfigureTest(
+    NAME
+    CLUSTER_TEST
+    PATH
+    test/cluster/kmeans.cu
+    test/cluster/kmeans_balanced.cu
+    test/cluster/kmeans_find_k.cu
+    test/cluster/cluster_solvers.cu
+    test/cluster/linkage.cu
+    test/cluster/spectral.cu
+    LIB
+    EXPLICIT_INSTANTIATE_ONLY
+  )
+
+  ConfigureTest(
+    NAME
+    CORE_TEST
+    PATH
+    test/core/bitset.cu
+    test/core/device_resources_manager.cpp
+    test/core/device_setter.cpp
+    test/core/logger.cpp
+    test/core/math_device.cu
+    test/core/math_host.cpp
+    test/core/operators_device.cu
+    test/core/operators_host.cpp
+    test/core/handle.cpp
+    test/core/interruptible.cu
+    test/core/nvtx.cpp
+    test/core/mdarray.cu
+    test/core/mdbuffer.cu
+    test/core/mdspan_copy.cpp
+    test/core/mdspan_copy.cu
+    test/core/mdspan_utils.cu
+    test/core/numpy_serializer.cu
+    test/core/memory_type.cpp
+    test/core/sparse_matrix.cu
+    test/core/sparse_matrix.cpp
+    test/core/span.cpp
+    test/core/span.cu
+    test/core/stream_view.cpp
+    test/core/temporary_device_buffer.cu
+    test/test.cpp
+    LIB
+    EXPLICIT_INSTANTIATE_ONLY
+  )
+
+  ConfigureTest(
+    NAME CORE_TEST PATH test/core/stream_view.cpp test/core/mdspan_copy.cpp LIB
+    EXPLICIT_INSTANTIATE_ONLY NOCUDA
+  )
+
+  ConfigureTest(
+    NAME
+    DISTANCE_TEST
+    PATH
+    test/distance/dist_adj.cu
+    test/distance/dist_adj_distance_instance.cu
+    test/distance/dist_canberra.cu
+    test/distance/dist_correlation.cu
+    test/distance/dist_cos.cu
+    test/distance/dist_hamming.cu
+    test/distance/dist_hellinger.cu
+    test/distance/dist_inner_product.cu
+    test/distance/dist_jensen_shannon.cu
+    test/distance/dist_kl_divergence.cu
+    test/distance/dist_l1.cu
+    test/distance/dist_l2_exp.cu
+    test/distance/dist_l2_unexp.cu
+    test/distance/dist_l2_sqrt_exp.cu
+    test/distance/dist_l_inf.cu
+    test/distance/dist_lp_unexp.cu
+    test/distance/dist_russell_rao.cu
+    test/distance/masked_nn.cu
+    test/distance/masked_nn_compress_to_bits.cu
+    test/distance/fused_l2_nn.cu
+    test/distance/gram.cu
+    LIB
+    EXPLICIT_INSTANTIATE_ONLY
+  )
+
+  list(
+    APPEND
+    EXT_HEADER_TEST_SOURCES
+    test/ext_headers/raft_neighbors_brute_force.cu
+    test/ext_headers/raft_distance_distance.cu
+    test/ext_headers/raft_distance_detail_pairwise_matrix_dispatch.cu
+    test/ext_headers/raft_matrix_detail_select_k.cu
+    test/ext_headers/raft_neighbors_ball_cover.cu
+    test/ext_headers/raft_spatial_knn_detail_fused_l2_knn.cu
+    test/ext_headers/raft_distance_fused_l2_nn.cu
+    test/ext_headers/raft_neighbors_ivf_pq.cu
+    test/ext_headers/raft_util_memory_pool.cpp
+    test/ext_headers/raft_neighbors_ivf_flat.cu
+    test/ext_headers/raft_core_logger.cpp
+    test/ext_headers/raft_neighbors_refine.cu
+    test/ext_headers/raft_neighbors_detail_ivf_flat_search.cu
+    test/ext_headers/raft_linalg_detail_coalesced_reduction.cu
+    test/ext_headers/raft_spatial_knn_detail_ball_cover_registers.cu
+    test/ext_headers/raft_neighbors_detail_ivf_flat_interleaved_scan.cu
+    test/ext_headers/raft_neighbors_detail_ivf_pq_compute_similarity.cu
+  )
+
+  # Test that the split headers compile in isolation with:
+  #
+  # * EXT_HEADERS_TEST_COMPILED_EXPLICIT: RAFT_COMPILED, RAFT_EXPLICIT_INSTANTIATE_ONLY defined
+  # * EXT_HEADERS_TEST_COMPILED_IMPLICIT: RAFT_COMPILED defined
+  # * EXT_HEADERS_TEST_IMPLICIT:          no macros defined.
+  ConfigureTest(
+    NAME EXT_HEADERS_TEST_COMPILED_EXPLICIT PATH ${EXT_HEADER_TEST_SOURCES} LIB
+    EXPLICIT_INSTANTIATE_ONLY
+  )
+  ConfigureTest(NAME EXT_HEADERS_TEST_COMPILED_IMPLICIT PATH ${EXT_HEADER_TEST_SOURCES} LIB)
+  ConfigureTest(NAME EXT_HEADERS_TEST_IMPLICIT PATH ${EXT_HEADER_TEST_SOURCES})
+
+  ConfigureTest(NAME LABEL_TEST PATH test/label/label.cu test/label/merge_labels.cu)
+
+  ConfigureTest(
+    NAME
+    LINALG_TEST
+    PATH
+    test/linalg/add.cu
+    test/linalg/axpy.cu
+    test/linalg/binary_op.cu
+    test/linalg/cholesky_r1.cu
+    test/linalg/coalesced_reduction.cu
+    test/linalg/divide.cu
+    test/linalg/dot.cu
+    test/linalg/eig.cu
+    test/linalg/eig_sel.cu
+    test/linalg/gemm_layout.cu
+    test/linalg/gemv.cu
+    test/linalg/map.cu
+    test/linalg/map_then_reduce.cu
+    test/linalg/matrix_vector.cu
+    test/linalg/matrix_vector_op.cu
+    test/linalg/mean_squared_error.cu
+    test/linalg/multiply.cu
+    test/linalg/norm.cu
+    test/linalg/normalize.cu
+    test/linalg/power.cu
+    test/linalg/randomized_svd.cu
+    test/linalg/reduce.cu
+    test/linalg/reduce_cols_by_key.cu
+    test/linalg/reduce_rows_by_key.cu
+    test/linalg/rsvd.cu
+    test/linalg/sqrt.cu
+    test/linalg/strided_reduction.cu
+    test/linalg/subtract.cu
+    test/linalg/svd.cu
+    test/linalg/ternary_op.cu
+    test/linalg/transpose.cu
+    test/linalg/unary_op.cu
+  )
+
+  ConfigureTest(
+    NAME
+    MATRIX_TEST
+    PATH
+    test/matrix/argmax.cu
+    test/matrix/argmin.cu
+    test/matrix/columnSort.cu
+    test/matrix/diagonal.cu
+    test/matrix/gather.cu
+    test/matrix/scatter.cu
+    test/matrix/eye.cu
+    test/matrix/linewise_op.cu
+    test/matrix/math.cu
+    test/matrix/matrix.cu
+    test/matrix/norm.cu
+    test/matrix/reverse.cu
+    test/matrix/slice.cu
+    test/matrix/triangular.cu
+    test/sparse/spectral_matrix.cu
+    LIB
+    EXPLICIT_INSTANTIATE_ONLY
+  )
+
+  ConfigureTest(NAME MATRIX_SELECT_TEST PATH test/matrix/select_k.cu LIB EXPLICIT_INSTANTIATE_ONLY)
+
+  ConfigureTest(
+    NAME MATRIX_SELECT_LARGE_TEST PATH test/matrix/select_large_k.cu LIB EXPLICIT_INSTANTIATE_ONLY
+  )
 
   ConfigureTest(
     NAME
     RANDOM_TEST
     PATH
-    # test/random/make_blobs.cu
-    # test/random/make_regression.cu
-    # test/random/multi_variable_gaussian.cu
-    # test/random/rng_pcg_host_api.cu
-    # test/random/permute.cu
-    # test/random/rng.cu
-    # test/random/rng_discrete.cu
-    # test/random/rng_int.cu
-    # test/random/rmat_rectangular_generator.cu
-    # test/random/sample_without_replacement.cu
+    test/random/make_blobs.cu
+    test/random/make_regression.cu
+    test/random/multi_variable_gaussian.cu
+    test/random/rng_pcg_host_api.cu
+    test/random/permute.cu
+    test/random/rng.cu
+    test/random/rng_discrete.cu
+    test/random/rng_int.cu
+    test/random/rmat_rectangular_generator.cu
+    test/random/sample_without_replacement.cu
     test/random/excess_sampling.cu
   )
 
-  # ConfigureTest(
-  #   NAME SOLVERS_TEST PATH test/cluster/cluster_solvers_deprecated.cu test/linalg/eigen_solvers.cu
-  #   test/lap/lap.cu test/sparse/mst.cu LIB EXPLICIT_INSTANTIATE_ONLY
-  # )
-
-  # ConfigureTest(
-  #   NAME
-  #   SPARSE_TEST
-  #   PATH
-  #   test/sparse/add.cu
-  #   test/sparse/convert_coo.cu
-  #   test/sparse/convert_csr.cu
-  #   test/sparse/csr_row_slice.cu
-  #   test/sparse/csr_to_dense.cu
-  #   test/sparse/csr_transpose.cu
-  #   test/sparse/degree.cu
-  #   test/sparse/filter.cu
-  #   test/sparse/norm.cu
-  #   test/sparse/normalize.cu
-  #   test/sparse/reduce.cu
-  #   test/sparse/row_op.cu
-  #   test/sparse/sddmm.cu
-  #   test/sparse/sort.cu
-  #   test/sparse/spgemmi.cu
-  #   test/sparse/spmm.cu
-  #   test/sparse/symmetrize.cu
-  # )
-
-  # ConfigureTest(
-  #   NAME SPARSE_DIST_TEST PATH test/sparse/dist_coo_spmv.cu test/sparse/distance.cu
-  #   test/sparse/gram.cu LIB EXPLICIT_INSTANTIATE_ONLY
-  # )
-
-  # ConfigureTest(
-  #   NAME
-  #   SPARSE_NEIGHBORS_TEST
-  #   PATH
-  #   test/sparse/neighbors/cross_component_nn.cu
-  #   test/sparse/neighbors/brute_force.cu
-  #   test/sparse/neighbors/knn_graph.cu
-  #   LIB
-  #   EXPLICIT_INSTANTIATE_ONLY
-  # )
-
-  # ConfigureTest(
-  #   NAME
-  #   NEIGHBORS_TEST
-  #   PATH
-  #   test/neighbors/knn.cu
-  #   test/neighbors/fused_l2_knn.cu
-  #   test/neighbors/tiled_knn.cu
-  #   test/neighbors/haversine.cu
-  #   test/neighbors/ball_cover.cu
-  #   test/neighbors/epsilon_neighborhood.cu
-  #   test/neighbors/refine.cu
-  #   LIB
-  #   EXPLICIT_INSTANTIATE_ONLY
-  # )
-
-  # ConfigureTest(
-  #   NAME NEIGHBORS_ANN_BRUTE_FORCE_TEST PATH test/neighbors/ann_brute_force/test_float.cu LIB
-  #   EXPLICIT_INSTANTIATE_ONLY GPUS 1 PERCENT 100
-  # )
-
-  # ConfigureTest(
-  #   NAME
-  #   NEIGHBORS_ANN_CAGRA_TEST
-  #   PATH
-  #   test/neighbors/ann_cagra/test_float_uint32_t.cu
-  #   test/neighbors/ann_cagra/test_half_uint32_t.cu
-  #   test/neighbors/ann_cagra/test_int8_t_uint32_t.cu
-  #   test/neighbors/ann_cagra/test_uint8_t_uint32_t.cu
-  #   test/neighbors/ann_cagra/test_float_int64_t.cu
-  #   test/neighbors/ann_cagra/test_half_int64_t.cu
-  #   src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim128_t8.cu
-  #   src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim256_t16.cu
-  #   src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim512_t32.cu
-  #   src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim1024_t32.cu
-  #   src/neighbors/detail/cagra/search_single_cta_float_uint64_dim128_t8.cu
-  #   src/neighbors/detail/cagra/search_single_cta_float_uint64_dim256_t16.cu
-  #   src/neighbors/detail/cagra/search_single_cta_float_uint64_dim512_t32.cu
-  #   src/neighbors/detail/cagra/search_single_cta_float_uint64_dim1024_t32.cu
-  #   src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim128_t8.cu
-  #   src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim256_t16.cu
-  #   src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim512_t32.cu
-  #   src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim1024_t32.cu
-  #   src/neighbors/detail/cagra/search_single_cta_half_uint64_dim128_t8.cu
-  #   src/neighbors/detail/cagra/search_single_cta_half_uint64_dim256_t16.cu
-  #   src/neighbors/detail/cagra/search_single_cta_half_uint64_dim512_t32.cu
-  #   src/neighbors/detail/cagra/search_single_cta_half_uint64_dim1024_t32.cu
-  #   LIB
-  #   EXPLICIT_INSTANTIATE_ONLY
-  #   GPUS
-  #   1
-  #   PERCENT
-  #   100
-  # )
-
-  # ConfigureTest(
-  #   NAME
-  #   NEIGHBORS_ANN_IVF_TEST
-  #   PATH
-  #   test/neighbors/ann_ivf_flat/test_filter_float_int64_t.cu
-  #   test/neighbors/ann_ivf_flat/test_float_int64_t.cu
-  #   test/neighbors/ann_ivf_flat/test_int8_t_int64_t.cu
-  #   test/neighbors/ann_ivf_flat/test_uint8_t_int64_t.cu
-  #   test/neighbors/ann_ivf_pq/test_float_uint32_t.cu
-  #   test/neighbors/ann_ivf_pq/test_float_int64_t.cu
-  #   test/neighbors/ann_ivf_pq/test_int8_t_int64_t.cu
-  #   test/neighbors/ann_ivf_pq/test_uint8_t_int64_t.cu
-  #   test/neighbors/ann_ivf_pq/test_filter_float_int64_t.cu
-  #   test/neighbors/ann_ivf_pq/test_filter_int8_t_int64_t.cu
-  #   LIB
-  #   EXPLICIT_INSTANTIATE_ONLY
-  #   GPUS
-  #   1
-  #   PERCENT
-  #   100
-  # )
-
-  # ConfigureTest(
-  #   NAME
-  #   NEIGHBORS_ANN_NN_DESCENT_TEST
-  #   PATH
-  #   test/neighbors/ann_nn_descent/test_float_uint32_t.cu
-  #   test/neighbors/ann_nn_descent/test_int8_t_uint32_t.cu
-  #   test/neighbors/ann_nn_descent/test_uint8_t_uint32_t.cu
-  #   LIB
-  #   EXPLICIT_INSTANTIATE_ONLY
-  #   GPUS
-  #   1
-  #   PERCENT
-  #   100
-  # )
-
-  # ConfigureTest(
-  #   NAME
-  #   STATS_TEST
-  #   PATH
-  #   test/stats/accuracy.cu
-  #   test/stats/adjusted_rand_index.cu
-  #   test/stats/completeness_score.cu
-  #   test/stats/contingencyMatrix.cu
-  #   test/stats/cov.cu
-  #   test/stats/dispersion.cu
-  #   test/stats/entropy.cu
-  #   test/stats/histogram.cu
-  #   test/stats/homogeneity_score.cu
-  #   test/stats/information_criterion.cu
-  #   test/stats/kl_divergence.cu
-  #   test/stats/mean.cu
-  #   test/stats/meanvar.cu
-  #   test/stats/mean_center.cu
-  #   test/stats/minmax.cu
-  #   test/stats/mutual_info_score.cu
-  #   test/stats/neighborhood_recall.cu
-  #   test/stats/r2_score.cu
-  #   test/stats/rand_index.cu
-  #   test/stats/regression_metrics.cu
-  #   test/stats/silhouette_score.cu
-  #   test/stats/stddev.cu
-  #   test/stats/sum.cu
-  #   test/stats/trustworthiness.cu
-  #   test/stats/weighted_mean.cu
-  #   test/stats/v_measure.cu
-  #   LIB
-  #   EXPLICIT_INSTANTIATE_ONLY
-  # )
-
-  # ConfigureTest(
-  #   NAME
-  #   UTILS_TEST
-  #   PATH
-  #   test/core/seive.cu
-  #   test/util/bitonic_sort.cu
-  #   test/util/cudart_utils.cpp
-  #   test/util/device_atomics.cu
-  #   test/util/integer_utils.cpp
-  #   test/util/integer_utils.cu
-  #   test/util/memory_type_dispatcher.cu
-  #   test/util/pow2_utils.cu
-  #   test/util/reduction.cu
-  # )
+  ConfigureTest(
+    NAME SOLVERS_TEST PATH test/cluster/cluster_solvers_deprecated.cu test/linalg/eigen_solvers.cu
+    test/lap/lap.cu test/sparse/mst.cu LIB EXPLICIT_INSTANTIATE_ONLY
+  )
+
+  ConfigureTest(
+    NAME
+    SPARSE_TEST
+    PATH
+    test/sparse/add.cu
+    test/sparse/convert_coo.cu
+    test/sparse/convert_csr.cu
+    test/sparse/csr_row_slice.cu
+    test/sparse/csr_to_dense.cu
+    test/sparse/csr_transpose.cu
+    test/sparse/degree.cu
+    test/sparse/filter.cu
+    test/sparse/norm.cu
+    test/sparse/normalize.cu
+    test/sparse/reduce.cu
+    test/sparse/row_op.cu
+    test/sparse/sddmm.cu
+    test/sparse/sort.cu
+    test/sparse/spgemmi.cu
+    test/sparse/spmm.cu
+    test/sparse/symmetrize.cu
+  )
+
+  ConfigureTest(
+    NAME SPARSE_DIST_TEST PATH test/sparse/dist_coo_spmv.cu test/sparse/distance.cu
+    test/sparse/gram.cu LIB EXPLICIT_INSTANTIATE_ONLY
+  )
+
+  ConfigureTest(
+    NAME
+    SPARSE_NEIGHBORS_TEST
+    PATH
+    test/sparse/neighbors/cross_component_nn.cu
+    test/sparse/neighbors/brute_force.cu
+    test/sparse/neighbors/knn_graph.cu
+    LIB
+    EXPLICIT_INSTANTIATE_ONLY
+  )
+
+  ConfigureTest(
+    NAME
+    NEIGHBORS_TEST
+    PATH
+    test/neighbors/knn.cu
+    test/neighbors/fused_l2_knn.cu
+    test/neighbors/tiled_knn.cu
+    test/neighbors/haversine.cu
+    test/neighbors/ball_cover.cu
+    test/neighbors/epsilon_neighborhood.cu
+    test/neighbors/refine.cu
+    LIB
+    EXPLICIT_INSTANTIATE_ONLY
+  )
+
+  ConfigureTest(
+    NAME NEIGHBORS_ANN_BRUTE_FORCE_TEST PATH test/neighbors/ann_brute_force/test_float.cu LIB
+    EXPLICIT_INSTANTIATE_ONLY GPUS 1 PERCENT 100
+  )
+
+  ConfigureTest(
+    NAME
+    NEIGHBORS_ANN_CAGRA_TEST
+    PATH
+    test/neighbors/ann_cagra/test_float_uint32_t.cu
+    test/neighbors/ann_cagra/test_half_uint32_t.cu
+    test/neighbors/ann_cagra/test_int8_t_uint32_t.cu
+    test/neighbors/ann_cagra/test_uint8_t_uint32_t.cu
+    test/neighbors/ann_cagra/test_float_int64_t.cu
+    test/neighbors/ann_cagra/test_half_int64_t.cu
+    src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim128_t8.cu
+    src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim256_t16.cu
+    src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim512_t32.cu
+    src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim1024_t32.cu
+    src/neighbors/detail/cagra/search_single_cta_float_uint64_dim128_t8.cu
+    src/neighbors/detail/cagra/search_single_cta_float_uint64_dim256_t16.cu
+    src/neighbors/detail/cagra/search_single_cta_float_uint64_dim512_t32.cu
+    src/neighbors/detail/cagra/search_single_cta_float_uint64_dim1024_t32.cu
+    src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim128_t8.cu
+    src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim256_t16.cu
+    src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim512_t32.cu
+    src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim1024_t32.cu
+    src/neighbors/detail/cagra/search_single_cta_half_uint64_dim128_t8.cu
+    src/neighbors/detail/cagra/search_single_cta_half_uint64_dim256_t16.cu
+    src/neighbors/detail/cagra/search_single_cta_half_uint64_dim512_t32.cu
+    src/neighbors/detail/cagra/search_single_cta_half_uint64_dim1024_t32.cu
+    LIB
+    EXPLICIT_INSTANTIATE_ONLY
+    GPUS
+    1
+    PERCENT
+    100
+  )
+
+  ConfigureTest(
+    NAME
+    NEIGHBORS_ANN_IVF_TEST
+    PATH
+    test/neighbors/ann_ivf_flat/test_filter_float_int64_t.cu
+    test/neighbors/ann_ivf_flat/test_float_int64_t.cu
+    test/neighbors/ann_ivf_flat/test_int8_t_int64_t.cu
+    test/neighbors/ann_ivf_flat/test_uint8_t_int64_t.cu
+    test/neighbors/ann_ivf_pq/test_float_uint32_t.cu
+    test/neighbors/ann_ivf_pq/test_float_int64_t.cu
+    test/neighbors/ann_ivf_pq/test_int8_t_int64_t.cu
+    test/neighbors/ann_ivf_pq/test_uint8_t_int64_t.cu
+    test/neighbors/ann_ivf_pq/test_filter_float_int64_t.cu
+    test/neighbors/ann_ivf_pq/test_filter_int8_t_int64_t.cu
+    LIB
+    EXPLICIT_INSTANTIATE_ONLY
+    GPUS
+    1
+    PERCENT
+    100
+  )
+
+  ConfigureTest(
+    NAME
+    NEIGHBORS_ANN_NN_DESCENT_TEST
+    PATH
+    test/neighbors/ann_nn_descent/test_float_uint32_t.cu
+    test/neighbors/ann_nn_descent/test_int8_t_uint32_t.cu
+    test/neighbors/ann_nn_descent/test_uint8_t_uint32_t.cu
+    LIB
+    EXPLICIT_INSTANTIATE_ONLY
+    GPUS
+    1
+    PERCENT
+    100
+  )
+
+  ConfigureTest(
+    NAME
+    STATS_TEST
+    PATH
+    test/stats/accuracy.cu
+    test/stats/adjusted_rand_index.cu
+    test/stats/completeness_score.cu
+    test/stats/contingencyMatrix.cu
+    test/stats/cov.cu
+    test/stats/dispersion.cu
+    test/stats/entropy.cu
+    test/stats/histogram.cu
+    test/stats/homogeneity_score.cu
+    test/stats/information_criterion.cu
+    test/stats/kl_divergence.cu
+    test/stats/mean.cu
+    test/stats/meanvar.cu
+    test/stats/mean_center.cu
+    test/stats/minmax.cu
+    test/stats/mutual_info_score.cu
+    test/stats/neighborhood_recall.cu
+    test/stats/r2_score.cu
+    test/stats/rand_index.cu
+    test/stats/regression_metrics.cu
+    test/stats/silhouette_score.cu
+    test/stats/stddev.cu
+    test/stats/sum.cu
+    test/stats/trustworthiness.cu
+    test/stats/weighted_mean.cu
+    test/stats/v_measure.cu
+    LIB
+    EXPLICIT_INSTANTIATE_ONLY
+  )
+
+  ConfigureTest(
+    NAME
+    UTILS_TEST
+    PATH
+    test/core/seive.cu
+    test/util/bitonic_sort.cu
+    test/util/cudart_utils.cpp
+    test/util/device_atomics.cu
+    test/util/integer_utils.cpp
+    test/util/integer_utils.cu
+    test/util/memory_type_dispatcher.cu
+    test/util/pow2_utils.cu
+    test/util/reduction.cu
+  )
 endif()
 
 # ##################################################################################################
diff --git a/cpp/test/random/excess_sampling.cu b/cpp/test/random/excess_sampling.cu
index fec515900e..8c788c491b 100644
--- a/cpp/test/random/excess_sampling.cu
+++ b/cpp/test/random/excess_sampling.cu
@@ -35,8 +35,8 @@ namespace random {
 using namespace raft::random;
 
 struct inputs {
-  int N;
-  int n_samples;
+  int64_t N;
+  int64_t n_samples;
 };
 
 template <typename T>
@@ -67,8 +67,8 @@ class ExcessSamplingTest : public ::testing::TestWithParam<inputs> {
 
     resource::sync_stream(res, stream);
     std::unordered_set<int> occurrence;
-    size_t sum = 0;
-    for (int i = 0; i < params.n_samples; ++i) {
+    int64_t sum = 0;
+    for (int64_t i = 0; i < params.n_samples; ++i) {
       T val = h_out(i);
       sum += val;
       ASSERT_TRUE(0 <= val && val < params.N)
@@ -78,9 +78,9 @@ class ExcessSamplingTest : public ::testing::TestWithParam<inputs> {
       occurrence.insert(val);
     }
     float avg = sum / (float)params.n_samples;
-    std::cout << "samples " << params.n_samples << ", average" << avg << std::endl;
-    if (params.n_samples >= 100) {
-      ASSERT_TRUE(raft::match(avg, params.N / 2.0, raft::CompareApprox<float>(0.1)));
+    if (params.n_samples >= 100 && params.N / params.n_samples < 100) {
+      ASSERT_TRUE(raft::match(avg, (params.N - 1) / 2.0f, raft::CompareApprox<float>(0.2)))
+        << "non-uniform sample";
     }
   }
 
@@ -99,11 +99,14 @@ const std::vector<inputs> input1 = {{1, 0},
                                     {10, 1},
                                     {10, 2},
                                     {10, 10},
+                                    {137, 42},
                                     {200, 0},
                                     {200, 1},
                                     {200, 100},
                                     {200, 130},
-                                    {200, 200}};
+                                    {200, 200},
+                                    {10000, 893},
+                                    {10000000000, 1023}};
 
 using ExcessSamplingTestInt64 = ExcessSamplingTest<int64_t>;
 TEST_P(ExcessSamplingTestInt64, SamplingTest) { check(); }

From eb73ef5d336edb84e4109c63f2da23093af1d2ca Mon Sep 17 00:00:00 2001
From: Tamas Bela Feher <tfeher@nvidia.com>
Date: Wed, 13 Mar 2024 09:06:20 +0100
Subject: [PATCH 07/16] added sample_rows to matrix namespace

---
 cpp/bench/prims/random/subsample.cu           | 18 +++++
 cpp/include/raft/matrix/detail/gather.cuh     | 72 +++++++++++++++++++
 .../raft/matrix/detail/sample_rows.cuh        | 54 ++++++++++++++
 cpp/include/raft/matrix/sample_rows.cuh       | 51 +++++++++++++
 cpp/include/raft/random/detail/rng_impl.cuh   |  2 +-
 cpp/include/raft/random/rng.cuh               |  4 +-
 6 files changed, 198 insertions(+), 3 deletions(-)
 create mode 100644 cpp/include/raft/matrix/detail/sample_rows.cuh
 create mode 100644 cpp/include/raft/matrix/sample_rows.cuh

diff --git a/cpp/bench/prims/random/subsample.cu b/cpp/bench/prims/random/subsample.cu
index 1c384f9a03..4c8ca2bf31 100644
--- a/cpp/bench/prims/random/subsample.cu
+++ b/cpp/bench/prims/random/subsample.cu
@@ -27,6 +27,7 @@
 #include <raft/util/cudart_utils.hpp>
 
 #include <rmm/device_scalar.hpp>
+#include <rmm/mr/device/pool_memory_resource.hpp>
 
 #include <cub/cub.cuh>
 
@@ -38,6 +39,12 @@ struct sample_inputs {
   int method;
 };  // struct sample_inputs
 
+// Print the benchmark inputs as "n_samples#n_train#method".
+inline auto operator<<(std::ostream& os, const sample_inputs& p) -> std::ostream&
+{
+  return os << p.n_samples << "#" << p.n_train << "#" << p.method;
+}
+
 // Sample with replacement. We use this as a baseline.
 template <typename IdxT>
 auto bernoulli_subsample(raft::resources const& res, IdxT n_samples, IdxT n_subsamples, int seed)
@@ -56,14 +63,22 @@ template <typename T>
 struct sample : public fixture {
   sample(const sample_inputs& p)
     : params(p),
+      old_mr(rmm::mr::get_current_device_resource()),
+      pool_mr(rmm::mr::get_current_device_resource(), 2 * GiB),
       in(make_device_vector<T, int64_t>(res, p.n_samples)),
       out(make_device_vector<T, int64_t>(res, p.n_train))
   {
+    rmm::mr::set_current_device_resource(&pool_mr);
     raft::random::RngState r(123456ULL);
   }
 
+  ~sample() { rmm::mr::set_current_device_resource(old_mr); }
   void run_benchmark(::benchmark::State& state) override
   {
+    std::ostringstream label_stream;
+    label_stream << params;
+    state.SetLabel(label_stream.str());
+
     raft::random::RngState r(123456ULL);
     loop_on_state(state, [this, &r]() {
       if (params.method == 1) {
@@ -77,7 +92,10 @@ struct sample : public fixture {
   }
 
  private:
+  float GiB = 1073741824.0f;
   raft::device_resources res;
+  rmm::mr::device_memory_resource* old_mr;
+  rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource> pool_mr;
   sample_inputs params;
   raft::device_vector<T, int64_t> out, in;
 };  // struct sample
diff --git a/cpp/include/raft/matrix/detail/gather.cuh b/cpp/include/raft/matrix/detail/gather.cuh
index 651fec81c3..553f2d71f1 100644
--- a/cpp/include/raft/matrix/detail/gather.cuh
+++ b/cpp/include/raft/matrix/detail/gather.cuh
@@ -16,7 +16,15 @@
 
 #pragma once
 
+#include <raft/common/nvtx.hpp>
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/device_mdspan.hpp>
+#include <raft/core/host_mdarray.hpp>
+#include <raft/core/host_mdspan.hpp>
 #include <raft/core/operators.hpp>
+#include <raft/core/pinned_mdarray.hpp>
+#include <raft/core/pinned_mdspan.hpp>
+#include <raft/util/cuda_dev_essentials.cuh>
 #include <raft/util/cudart_utils.hpp>
 
 #include <functional>
@@ -336,6 +344,70 @@ void gather_if(const InputIteratorT in,
   gatherImpl(in, D, N, map, stencil, map_length, out, pred_op, transform_op, stream);
 }
 
+/** Copy rows dataset(indices(offset + i), :) into buff(i, :); at most buff.extent(0) rows. */
+template <typename T, typename IdxT = int64_t>
+void gather_buff(host_matrix_view<const T, IdxT> dataset,
+                 host_vector_view<const IdxT, IdxT> indices,
+                 IdxT offset,
+                 pinned_matrix_view<T, IdxT> buff)
+{
+  raft::common::nvtx::range<common::nvtx::domain::raft> fun_scope("gather_host_buff");
+  const IdxT n_cols = buff.extent(1);
+  const IdxT n_rows = std::min<IdxT>(buff.extent(0), indices.extent(0) - offset);
+  // Orphaned omp for: rows are shared among the threads of an enclosing parallel region.
+#pragma omp for
+  for (IdxT row = 0; row < n_rows; row++) {
+    const IdxT src_row = indices(offset + row);
+    for (IdxT col = 0; col < n_cols; col++) { buff(row, col) = dataset(src_row, col); }
+  }
+}
+
+/** Gather rows of a host `dataset`, selected by device `indices`, into the device `output`.
+ *  Rows are staged through two pinned buffers so host gathering overlaps the H2D copy. */
+template <typename T, typename IdxT>
+void gather(raft::resources const& res,
+            host_matrix_view<const T, IdxT> dataset,
+            device_vector_view<const IdxT, IdxT> indices,
+            raft::device_matrix_view<T, IdxT> output)
+{
+  raft::common::nvtx::range<common::nvtx::domain::raft> fun_scope("gather");
+  IdxT n_dim        = output.extent(1);
+  IdxT n_train      = output.extent(0);
+  auto stream       = resource::get_cuda_stream(res);
+  auto indices_host = raft::make_host_vector<IdxT, IdxT>(n_train);
+  raft::copy(indices_host.data_handle(), indices.data_handle(), n_train, stream);
+  resource::sync_stream(res);
+
+  const size_t max_batch_size = 32768;
+  raft::common::nvtx::push_range("gather::alloc_buffers");
+  auto out_tmp1 = raft::make_pinned_matrix<T, IdxT>(res, max_batch_size, n_dim);
+  auto out_tmp2 = raft::make_pinned_matrix<T, IdxT>(res, max_batch_size, n_dim);
+  auto view1    = out_tmp1.view();  // batch that is copied to the device
+  auto view2    = out_tmp2.view();  // batch that is gathered on the host meanwhile
+  raft::common::nvtx::pop_range();
+
+  gather_buff(dataset, make_const_mdspan(indices_host.view()), (IdxT)0, view1);
+#pragma omp parallel
+  for (IdxT device_offset = 0; device_offset < n_train; device_offset += max_batch_size) {
+    IdxT batch_size = std::min<IdxT>(max_batch_size, n_train - device_offset);
+    T* batch_out    = output.data_handle() + device_offset * n_dim;
+#pragma omp master
+    raft::copy(batch_out, view1.data_handle(), batch_size * n_dim, stream);
+    // Start gathering the next batch on the host.
+    IdxT host_offset = device_offset + batch_size;
+    batch_size       = std::min<IdxT>(max_batch_size, n_train - host_offset);
+    if (batch_size > 0) {
+      gather_buff(dataset, make_const_mdspan(indices_host.view()), host_offset, view2);
+    }
+#pragma omp master
+    resource::sync_stream(res);
+#pragma omp barrier
+    // Swap the shared views exactly once; the barrier at the end of `single` publishes the result.
+#pragma omp single
+    std::swap(view1, view2);
+  }
+}
+
 }  // namespace detail
 }  // namespace matrix
 }  // namespace raft
diff --git a/cpp/include/raft/matrix/detail/sample_rows.cuh b/cpp/include/raft/matrix/detail/sample_rows.cuh
new file mode 100644
index 0000000000..c8120c9ab2
--- /dev/null
+++ b/cpp/include/raft/matrix/detail/sample_rows.cuh
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/core/logger.hpp>
+#include <raft/matrix/gather.cuh>
+#include <raft/random/rng.cuh>
+#include <raft/util/cuda_utils.cuh>
+#include <raft/util/cudart_utils.hpp>
+
+namespace raft::matrix {
+
+/** Select rows randomly from input and copy to output. */
+template <typename T, typename IdxT = int64_t>
+void sample_rows(raft::resources const& res,
+                 const T* input,
+                 IdxT n_rows_input,
+                 raft::device_matrix_view<T, IdxT> output,
+                 RngState random_state)
+{
+  IdxT n_dim     = output.extent(1);
+  IdxT n_samples = output.extent(0);
+
+  raft::device_vector<IdxT, IdxT> train_indices =
+    raft::random::excess_subsample<IdxT, int64_t>(res, random_state, n_rows_input, n_samples);
+
+  cudaPointerAttributes attr;
+  RAFT_CUDA_TRY(cudaPointerGetAttributes(&attr, input));
+  T* ptr = reinterpret_cast<T*>(attr.devicePointer);
+  if (ptr != nullptr) {
+    raft::matrix::gather(res,
+                         raft::make_device_matrix_view<const T, IdxT>(ptr, n_rows_input, n_dim),
+                         raft::make_const_mdspan(train_indices.view()),
+                         output);
+  } else {
+    auto dataset = raft::make_host_matrix_view<const T, IdxT>(input, n_rows_input, n_dim);
+    raft::matrix::detail::gather(res, dataset, make_const_mdspan(train_indices.view()), output);
+  }
+}
+}  // namespace raft::matrix
diff --git a/cpp/include/raft/matrix/sample_rows.cuh b/cpp/include/raft/matrix/sample_rows.cuh
new file mode 100644
index 0000000000..2f8b8e6248
--- /dev/null
+++ b/cpp/include/raft/matrix/sample_rows.cuh
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/core/logger.hpp>
+#include <raft/matrix/gather.cuh>
+#include <raft/random/rng.cuh>
+#include <raft/util/cuda_utils.cuh>
+#include <raft/util/cudart_utils.hpp>
+
+namespace raft::matrix {
+
+/** Select rows randomly from input and copy to output. */
+template <typename T, typename IdxT = int64_t>
+void sample_rows(raft::resources const& res,
+                 const T* input,
+                 IdxT n_rows_input,
+                 raft::device_matrix_view<T, IdxT> output,
+                 RngState random_state)
+{
+  detail::sample_rows(res, input, n_rows_input, output, random_state);
+}
+
+/** Subsample the dataset to create a training set*/
+template <typename T, typename IdxT = int64_t>
+raft::device_matrix<T, IdxT> sample_rows(raft::resources const& res,
+                                         const T* input,
+                                         IdxT n_rows_input,
+                                         IdxT n_train,
+                                         IdxT n_dim,
+                                         RngState random_state)
+{
+  auto output = raft::make_device_matrix<T, IdxT>(res, n_train, n_dim);
+  detail::sample_rows(res, input, n_rows_input, output, random_state);
+  return output;
+}
+}  // namespace raft::matrix
diff --git a/cpp/include/raft/random/detail/rng_impl.cuh b/cpp/include/raft/random/detail/rng_impl.cuh
index 08a57e17c0..70ef1bbfcc 100644
--- a/cpp/include/raft/random/detail/rng_impl.cuh
+++ b/cpp/include/raft/random/detail/rng_impl.cuh
@@ -341,7 +341,7 @@ void affine_transform_params(RngState const& rng_state, IdxT n, IdxT& a, IdxT& b
  * The algorithm will allocate a workspace of size O(4*n_samples) internally.
  *
  * We use max N random numbers. Depending on how large n_samples is w.r.t to N, we
- * either use rejection sampling, sort the [0..N-1] values using random keys.
+ * either use rejection sampling, or sort the [0..N-1] values using random keys.
  *
  * @tparam IdxT type of indices that we sample
  * @tparam MatIdxT extent type of the returned mdarray
diff --git a/cpp/include/raft/random/rng.cuh b/cpp/include/raft/random/rng.cuh
index 10d809d3b8..977d82830b 100644
--- a/cpp/include/raft/random/rng.cuh
+++ b/cpp/include/raft/random/rng.cuh
@@ -816,10 +816,10 @@ void sampleWithoutReplacement(raft::resources const& handle,
 /** @brief Sample without replacement from range 0..N-1.
  *
  * Elements are sampled uniformly.
- * The algorithm will allocate a workspace of size O(4*n_samples) internally.
+ * The algorithm will allocate a workspace of size 4*n_samples*sizeof(IdxT) internally.
  *
  * We use max N random numbers. Depending on how large n_samples is w.r.t to N, we
- * either use rejection sampling, sort the [0..N-1] values using random keys.
+ * either use rejection sampling, or sort the [0..N-1] values using random keys.
  *
  * @tparam IdxT type of indices that we sample
  * @tparam MatIdxT extent type of the returned mdarray

From cc2cf2409cc4fe692a66a84724fbf4b37eb89cdd Mon Sep 17 00:00:00 2001
From: Tamas Bela Feher <tfeher@nvidia.com>
Date: Wed, 13 Mar 2024 09:19:17 +0100
Subject: [PATCH 08/16] add test for sample rows

---
 .../raft/spatial/knn/detail/ann_utils.cuh     |  82 --
 cpp/test/CMakeLists.txt                       | 733 +++++++++---------
 cpp/test/matrix/sample_rows.cu                |  79 ++
 3 files changed, 446 insertions(+), 448 deletions(-)
 create mode 100644 cpp/test/matrix/sample_rows.cu

diff --git a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
index d7f4651b56..78e63f756d 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
@@ -577,86 +577,4 @@ struct batch_load_iterator {
   size_type cur_pos_;
 };
 
-template <typename IdxT>
-auto get_subsample_indices(raft::resources const& res, IdxT n_samples, IdxT n_subsamples, int seed)
-  -> raft::device_vector<IdxT, IdxT>
-{
-  RAFT_EXPECTS(n_subsamples <= n_samples, "Cannot have more training samples than dataset vectors");
-  // size_t free, total;
-  // float GiB = 1073741824.0f;
-  // cudaMemGetInfo(&free, &total);
-  // RAFT_LOG_INFO(
-  //   "get_subsample_indices::data free mem %6.1f, used mem %6.1f", free / GiB, (total - free) /
-  //   GiB);
-
-  auto data_indices = raft::make_device_vector<IdxT, IdxT>(res, n_samples);
-  // cudaMemGetInfo(&free, &total);
-  // RAFT_LOG_INFO("get_subsample_indices::train free mem %6.1f, used mem %6.1f",
-  //               free / GiB,
-  //               (total - free) / GiB);
-
-  auto train_indices = raft::make_device_vector<IdxT, IdxT>(res, n_subsamples);
-  raft::linalg::map_offset(res, data_indices.view(), identity_op());
-  raft::random::RngState rng(seed);
-  raft::random::sample_without_replacement(res,
-                                           rng,
-                                           raft::make_const_mdspan(data_indices.view()),
-                                           std::nullopt,
-                                           train_indices.view(),
-                                           std::nullopt);
-  return train_indices;
-}
-
-/** Subsample the dataset to create a training set*/
-template <typename T, typename IdxT = int64_t>
-void subsample(raft::resources const& res,
-               const T* input,
-               IdxT n_samples,
-               raft::device_matrix_view<T, IdxT> output,
-               int seed)
-{
-  IdxT n_dim   = output.extent(1);
-  IdxT n_train = output.extent(0);
-
-  raft::device_vector<IdxT, IdxT> train_indices =
-    get_subsample_indices<IdxT>(res, n_samples, n_train, seed);
-
-  cudaPointerAttributes attr;
-  RAFT_CUDA_TRY(cudaPointerGetAttributes(&attr, input));
-  T* ptr = reinterpret_cast<T*>(attr.devicePointer);
-  if (ptr != nullptr) {
-    raft::matrix::gather(res,
-                         raft::make_device_matrix_view<const T, IdxT>(ptr, n_samples, n_dim),
-                         raft::make_const_mdspan(train_indices.view()),
-                         output);
-  } else {
-    auto dataset = raft::make_host_matrix_view<const T, IdxT>(input, n_samples, n_dim);
-    raft::matrix::detail::gather(res, dataset, make_const_mdspan(train_indices.view()), output);
-  }
-}
-
-/** Subsample the dataset to create a training set*/
-template <typename T, typename IdxT = int64_t>
-raft::device_matrix<T, IdxT> subsample(
-  raft::resources const& res, const T* input, IdxT n_samples, IdxT n_train, IdxT n_dim, int seed)
-{
-  raft::device_vector<IdxT, IdxT> train_indices =
-    get_subsample_indices<IdxT>(res, n_samples, n_train, seed);
-
-  auto output = raft::make_device_matrix<T, IdxT>(res, n_train, n_dim);
-  cudaPointerAttributes attr;
-  RAFT_CUDA_TRY(cudaPointerGetAttributes(&attr, input));
-  T* ptr = reinterpret_cast<T*>(attr.devicePointer);
-  if (ptr != nullptr) {
-    raft::matrix::gather(res,
-                         raft::make_device_matrix_view<const T, IdxT>(ptr, n_samples, n_dim),
-                         raft::make_const_mdspan(train_indices.view()),
-                         output.view());
-  } else {
-    auto dataset = raft::make_host_matrix_view<const T, IdxT>(input, n_samples, n_dim);
-    raft::matrix::detail::gather(
-      res, dataset, make_const_mdspan(train_indices.view()), output.view());
-  }
-  return output;
-}
 }  // namespace raft::spatial::knn::detail::utils
diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt
index 037f85698c..cda9ca69e8 100644
--- a/cpp/test/CMakeLists.txt
+++ b/cpp/test/CMakeLists.txt
@@ -95,390 +95,391 @@ endfunction()
 # * distance tests -------------------------------------------------------------------------
 
 if(BUILD_TESTS)
-  ConfigureTest(
-    NAME
-    CLUSTER_TEST
-    PATH
-    test/cluster/kmeans.cu
-    test/cluster/kmeans_balanced.cu
-    test/cluster/kmeans_find_k.cu
-    test/cluster/cluster_solvers.cu
-    test/cluster/linkage.cu
-    test/cluster/spectral.cu
-    LIB
-    EXPLICIT_INSTANTIATE_ONLY
-  )
-
-  ConfigureTest(
-    NAME
-    CORE_TEST
-    PATH
-    test/core/bitset.cu
-    test/core/device_resources_manager.cpp
-    test/core/device_setter.cpp
-    test/core/logger.cpp
-    test/core/math_device.cu
-    test/core/math_host.cpp
-    test/core/operators_device.cu
-    test/core/operators_host.cpp
-    test/core/handle.cpp
-    test/core/interruptible.cu
-    test/core/nvtx.cpp
-    test/core/mdarray.cu
-    test/core/mdbuffer.cu
-    test/core/mdspan_copy.cpp
-    test/core/mdspan_copy.cu
-    test/core/mdspan_utils.cu
-    test/core/numpy_serializer.cu
-    test/core/memory_type.cpp
-    test/core/sparse_matrix.cu
-    test/core/sparse_matrix.cpp
-    test/core/span.cpp
-    test/core/span.cu
-    test/core/stream_view.cpp
-    test/core/temporary_device_buffer.cu
-    test/test.cpp
-    LIB
-    EXPLICIT_INSTANTIATE_ONLY
-  )
-
-  ConfigureTest(
-    NAME CORE_TEST PATH test/core/stream_view.cpp test/core/mdspan_copy.cpp LIB
-    EXPLICIT_INSTANTIATE_ONLY NOCUDA
-  )
-
-  ConfigureTest(
-    NAME
-    DISTANCE_TEST
-    PATH
-    test/distance/dist_adj.cu
-    test/distance/dist_adj_distance_instance.cu
-    test/distance/dist_canberra.cu
-    test/distance/dist_correlation.cu
-    test/distance/dist_cos.cu
-    test/distance/dist_hamming.cu
-    test/distance/dist_hellinger.cu
-    test/distance/dist_inner_product.cu
-    test/distance/dist_jensen_shannon.cu
-    test/distance/dist_kl_divergence.cu
-    test/distance/dist_l1.cu
-    test/distance/dist_l2_exp.cu
-    test/distance/dist_l2_unexp.cu
-    test/distance/dist_l2_sqrt_exp.cu
-    test/distance/dist_l_inf.cu
-    test/distance/dist_lp_unexp.cu
-    test/distance/dist_russell_rao.cu
-    test/distance/masked_nn.cu
-    test/distance/masked_nn_compress_to_bits.cu
-    test/distance/fused_l2_nn.cu
-    test/distance/gram.cu
-    LIB
-    EXPLICIT_INSTANTIATE_ONLY
-  )
-
-  list(
-    APPEND
-    EXT_HEADER_TEST_SOURCES
-    test/ext_headers/raft_neighbors_brute_force.cu
-    test/ext_headers/raft_distance_distance.cu
-    test/ext_headers/raft_distance_detail_pairwise_matrix_dispatch.cu
-    test/ext_headers/raft_matrix_detail_select_k.cu
-    test/ext_headers/raft_neighbors_ball_cover.cu
-    test/ext_headers/raft_spatial_knn_detail_fused_l2_knn.cu
-    test/ext_headers/raft_distance_fused_l2_nn.cu
-    test/ext_headers/raft_neighbors_ivf_pq.cu
-    test/ext_headers/raft_util_memory_pool.cpp
-    test/ext_headers/raft_neighbors_ivf_flat.cu
-    test/ext_headers/raft_core_logger.cpp
-    test/ext_headers/raft_neighbors_refine.cu
-    test/ext_headers/raft_neighbors_detail_ivf_flat_search.cu
-    test/ext_headers/raft_linalg_detail_coalesced_reduction.cu
-    test/ext_headers/raft_spatial_knn_detail_ball_cover_registers.cu
-    test/ext_headers/raft_neighbors_detail_ivf_flat_interleaved_scan.cu
-    test/ext_headers/raft_neighbors_detail_ivf_pq_compute_similarity.cu
-  )
-
-  # Test that the split headers compile in isolation with:
-  #
-  # * EXT_HEADERS_TEST_COMPILED_EXPLICIT: RAFT_COMPILED, RAFT_EXPLICIT_INSTANTIATE_ONLY defined
-  # * EXT_HEADERS_TEST_COMPILED_IMPLICIT: RAFT_COMPILED defined
-  # * EXT_HEADERS_TEST_IMPLICIT:          no macros defined.
-  ConfigureTest(
-    NAME EXT_HEADERS_TEST_COMPILED_EXPLICIT PATH ${EXT_HEADER_TEST_SOURCES} LIB
-    EXPLICIT_INSTANTIATE_ONLY
-  )
-  ConfigureTest(NAME EXT_HEADERS_TEST_COMPILED_IMPLICIT PATH ${EXT_HEADER_TEST_SOURCES} LIB)
-  ConfigureTest(NAME EXT_HEADERS_TEST_IMPLICIT PATH ${EXT_HEADER_TEST_SOURCES})
-
-  ConfigureTest(NAME LABEL_TEST PATH test/label/label.cu test/label/merge_labels.cu)
-
-  ConfigureTest(
-    NAME
-    LINALG_TEST
-    PATH
-    test/linalg/add.cu
-    test/linalg/axpy.cu
-    test/linalg/binary_op.cu
-    test/linalg/cholesky_r1.cu
-    test/linalg/coalesced_reduction.cu
-    test/linalg/divide.cu
-    test/linalg/dot.cu
-    test/linalg/eig.cu
-    test/linalg/eig_sel.cu
-    test/linalg/gemm_layout.cu
-    test/linalg/gemv.cu
-    test/linalg/map.cu
-    test/linalg/map_then_reduce.cu
-    test/linalg/matrix_vector.cu
-    test/linalg/matrix_vector_op.cu
-    test/linalg/mean_squared_error.cu
-    test/linalg/multiply.cu
-    test/linalg/norm.cu
-    test/linalg/normalize.cu
-    test/linalg/power.cu
-    test/linalg/randomized_svd.cu
-    test/linalg/reduce.cu
-    test/linalg/reduce_cols_by_key.cu
-    test/linalg/reduce_rows_by_key.cu
-    test/linalg/rsvd.cu
-    test/linalg/sqrt.cu
-    test/linalg/strided_reduction.cu
-    test/linalg/subtract.cu
-    test/linalg/svd.cu
-    test/linalg/ternary_op.cu
-    test/linalg/transpose.cu
-    test/linalg/unary_op.cu
-  )
+  # ConfigureTest(
+  #   NAME
+  #   CLUSTER_TEST
+  #   PATH
+  #   test/cluster/kmeans.cu
+  #   test/cluster/kmeans_balanced.cu
+  #   test/cluster/kmeans_find_k.cu
+  #   test/cluster/cluster_solvers.cu
+  #   test/cluster/linkage.cu
+  #   test/cluster/spectral.cu
+  #   LIB
+  #   EXPLICIT_INSTANTIATE_ONLY
+  # )
+
+  # ConfigureTest(
+  #   NAME
+  #   CORE_TEST
+  #   PATH
+  #   test/core/bitset.cu
+  #   test/core/device_resources_manager.cpp
+  #   test/core/device_setter.cpp
+  #   test/core/logger.cpp
+  #   test/core/math_device.cu
+  #   test/core/math_host.cpp
+  #   test/core/operators_device.cu
+  #   test/core/operators_host.cpp
+  #   test/core/handle.cpp
+  #   test/core/interruptible.cu
+  #   test/core/nvtx.cpp
+  #   test/core/mdarray.cu
+  #   test/core/mdbuffer.cu
+  #   test/core/mdspan_copy.cpp
+  #   test/core/mdspan_copy.cu
+  #   test/core/mdspan_utils.cu
+  #   test/core/numpy_serializer.cu
+  #   test/core/memory_type.cpp
+  #   test/core/sparse_matrix.cu
+  #   test/core/sparse_matrix.cpp
+  #   test/core/span.cpp
+  #   test/core/span.cu
+  #   test/core/stream_view.cpp
+  #   test/core/temporary_device_buffer.cu
+  #   test/test.cpp
+  #   LIB
+  #   EXPLICIT_INSTANTIATE_ONLY
+  # )
+
+  # ConfigureTest(
+  #   NAME CORE_TEST PATH test/core/stream_view.cpp test/core/mdspan_copy.cpp LIB
+  #   EXPLICIT_INSTANTIATE_ONLY NOCUDA
+  # )
+
+  # ConfigureTest(
+  #   NAME
+  #   DISTANCE_TEST
+  #   PATH
+  #   test/distance/dist_adj.cu
+  #   test/distance/dist_adj_distance_instance.cu
+  #   test/distance/dist_canberra.cu
+  #   test/distance/dist_correlation.cu
+  #   test/distance/dist_cos.cu
+  #   test/distance/dist_hamming.cu
+  #   test/distance/dist_hellinger.cu
+  #   test/distance/dist_inner_product.cu
+  #   test/distance/dist_jensen_shannon.cu
+  #   test/distance/dist_kl_divergence.cu
+  #   test/distance/dist_l1.cu
+  #   test/distance/dist_l2_exp.cu
+  #   test/distance/dist_l2_unexp.cu
+  #   test/distance/dist_l2_sqrt_exp.cu
+  #   test/distance/dist_l_inf.cu
+  #   test/distance/dist_lp_unexp.cu
+  #   test/distance/dist_russell_rao.cu
+  #   test/distance/masked_nn.cu
+  #   test/distance/masked_nn_compress_to_bits.cu
+  #   test/distance/fused_l2_nn.cu
+  #   test/distance/gram.cu
+  #   LIB
+  #   EXPLICIT_INSTANTIATE_ONLY
+  # )
+
+  # list(
+  #   APPEND
+  #   EXT_HEADER_TEST_SOURCES
+  #   test/ext_headers/raft_neighbors_brute_force.cu
+  #   test/ext_headers/raft_distance_distance.cu
+  #   test/ext_headers/raft_distance_detail_pairwise_matrix_dispatch.cu
+  #   test/ext_headers/raft_matrix_detail_select_k.cu
+  #   test/ext_headers/raft_neighbors_ball_cover.cu
+  #   test/ext_headers/raft_spatial_knn_detail_fused_l2_knn.cu
+  #   test/ext_headers/raft_distance_fused_l2_nn.cu
+  #   test/ext_headers/raft_neighbors_ivf_pq.cu
+  #   test/ext_headers/raft_util_memory_pool.cpp
+  #   test/ext_headers/raft_neighbors_ivf_flat.cu
+  #   test/ext_headers/raft_core_logger.cpp
+  #   test/ext_headers/raft_neighbors_refine.cu
+  #   test/ext_headers/raft_neighbors_detail_ivf_flat_search.cu
+  #   test/ext_headers/raft_linalg_detail_coalesced_reduction.cu
+  #   test/ext_headers/raft_spatial_knn_detail_ball_cover_registers.cu
+  #   test/ext_headers/raft_neighbors_detail_ivf_flat_interleaved_scan.cu
+  #   test/ext_headers/raft_neighbors_detail_ivf_pq_compute_similarity.cu
+  # )
+
+  # # Test that the split headers compile in isolation with:
+  # #
+  # # * EXT_HEADERS_TEST_COMPILED_EXPLICIT: RAFT_COMPILED, RAFT_EXPLICIT_INSTANTIATE_ONLY defined
+  # # * EXT_HEADERS_TEST_COMPILED_IMPLICIT: RAFT_COMPILED defined
+  # # * EXT_HEADERS_TEST_IMPLICIT:          no macros defined.
+  # ConfigureTest(
+  #   NAME EXT_HEADERS_TEST_COMPILED_EXPLICIT PATH ${EXT_HEADER_TEST_SOURCES} LIB
+  #   EXPLICIT_INSTANTIATE_ONLY
+  # )
+  # ConfigureTest(NAME EXT_HEADERS_TEST_COMPILED_IMPLICIT PATH ${EXT_HEADER_TEST_SOURCES} LIB)
+  # ConfigureTest(NAME EXT_HEADERS_TEST_IMPLICIT PATH ${EXT_HEADER_TEST_SOURCES})
+
+  # ConfigureTest(NAME LABEL_TEST PATH test/label/label.cu test/label/merge_labels.cu)
+
+  # ConfigureTest(
+  #   NAME
+  #   LINALG_TEST
+  #   PATH
+  #   test/linalg/add.cu
+  #   test/linalg/axpy.cu
+  #   test/linalg/binary_op.cu
+  #   test/linalg/cholesky_r1.cu
+  #   test/linalg/coalesced_reduction.cu
+  #   test/linalg/divide.cu
+  #   test/linalg/dot.cu
+  #   test/linalg/eig.cu
+  #   test/linalg/eig_sel.cu
+  #   test/linalg/gemm_layout.cu
+  #   test/linalg/gemv.cu
+  #   test/linalg/map.cu
+  #   test/linalg/map_then_reduce.cu
+  #   test/linalg/matrix_vector.cu
+  #   test/linalg/matrix_vector_op.cu
+  #   test/linalg/mean_squared_error.cu
+  #   test/linalg/multiply.cu
+  #   test/linalg/norm.cu
+  #   test/linalg/normalize.cu
+  #   test/linalg/power.cu
+  #   test/linalg/randomized_svd.cu
+  #   test/linalg/reduce.cu
+  #   test/linalg/reduce_cols_by_key.cu
+  #   test/linalg/reduce_rows_by_key.cu
+  #   test/linalg/rsvd.cu
+  #   test/linalg/sqrt.cu
+  #   test/linalg/strided_reduction.cu
+  #   test/linalg/subtract.cu
+  #   test/linalg/svd.cu
+  #   test/linalg/ternary_op.cu
+  #   test/linalg/transpose.cu
+  #   test/linalg/unary_op.cu
+  # )
 
   ConfigureTest(
     NAME
     MATRIX_TEST
     PATH
-    test/matrix/argmax.cu
-    test/matrix/argmin.cu
-    test/matrix/columnSort.cu
-    test/matrix/diagonal.cu
-    test/matrix/gather.cu
-    test/matrix/scatter.cu
-    test/matrix/eye.cu
-    test/matrix/linewise_op.cu
-    test/matrix/math.cu
-    test/matrix/matrix.cu
-    test/matrix/norm.cu
-    test/matrix/reverse.cu
-    test/matrix/slice.cu
-    test/matrix/triangular.cu
-    test/sparse/spectral_matrix.cu
+  #   test/matrix/argmax.cu
+  #   test/matrix/argmin.cu
+  #   test/matrix/columnSort.cu
+  #   test/matrix/diagonal.cu
+  #   test/matrix/gather.cu
+  #   test/matrix/scatter.cu
+  #   test/matrix/eye.cu
+  #   test/matrix/linewise_op.cu
+  #   test/matrix/math.cu
+  #   test/matrix/matrix.cu
+  #   test/matrix/norm.cu
+  #   test/matrix/reverse.cu
+    test/matrix/sample_rows.cu
+  #   test/matrix/slice.cu
+  #   test/matrix/triangular.cu
+  #   test/sparse/spectral_matrix.cu
     LIB
     EXPLICIT_INSTANTIATE_ONLY
   )
 
-  ConfigureTest(NAME MATRIX_SELECT_TEST PATH test/matrix/select_k.cu LIB EXPLICIT_INSTANTIATE_ONLY)
+  # ConfigureTest(NAME MATRIX_SELECT_TEST PATH test/matrix/select_k.cu LIB EXPLICIT_INSTANTIATE_ONLY)
 
-  ConfigureTest(
-    NAME MATRIX_SELECT_LARGE_TEST PATH test/matrix/select_large_k.cu LIB EXPLICIT_INSTANTIATE_ONLY
-  )
+  # ConfigureTest(
+  #   NAME MATRIX_SELECT_LARGE_TEST PATH test/matrix/select_large_k.cu LIB EXPLICIT_INSTANTIATE_ONLY
+  # )
 
   ConfigureTest(
     NAME
     RANDOM_TEST
     PATH
-    test/random/make_blobs.cu
-    test/random/make_regression.cu
-    test/random/multi_variable_gaussian.cu
-    test/random/rng_pcg_host_api.cu
-    test/random/permute.cu
-    test/random/rng.cu
-    test/random/rng_discrete.cu
-    test/random/rng_int.cu
-    test/random/rmat_rectangular_generator.cu
-    test/random/sample_without_replacement.cu
+    # test/random/make_blobs.cu
+  #   test/random/make_regression.cu
+  #   test/random/multi_variable_gaussian.cu
+  #   test/random/rng_pcg_host_api.cu
+  #   test/random/permute.cu
+  #   test/random/rng.cu
+  #   test/random/rng_discrete.cu
+  #   test/random/rng_int.cu
+  #   test/random/rmat_rectangular_generator.cu
+  #   test/random/sample_without_replacement.cu
     test/random/excess_sampling.cu
   )
 
-  ConfigureTest(
-    NAME SOLVERS_TEST PATH test/cluster/cluster_solvers_deprecated.cu test/linalg/eigen_solvers.cu
-    test/lap/lap.cu test/sparse/mst.cu LIB EXPLICIT_INSTANTIATE_ONLY
-  )
-
-  ConfigureTest(
-    NAME
-    SPARSE_TEST
-    PATH
-    test/sparse/add.cu
-    test/sparse/convert_coo.cu
-    test/sparse/convert_csr.cu
-    test/sparse/csr_row_slice.cu
-    test/sparse/csr_to_dense.cu
-    test/sparse/csr_transpose.cu
-    test/sparse/degree.cu
-    test/sparse/filter.cu
-    test/sparse/norm.cu
-    test/sparse/normalize.cu
-    test/sparse/reduce.cu
-    test/sparse/row_op.cu
-    test/sparse/sddmm.cu
-    test/sparse/sort.cu
-    test/sparse/spgemmi.cu
-    test/sparse/spmm.cu
-    test/sparse/symmetrize.cu
-  )
-
-  ConfigureTest(
-    NAME SPARSE_DIST_TEST PATH test/sparse/dist_coo_spmv.cu test/sparse/distance.cu
-    test/sparse/gram.cu LIB EXPLICIT_INSTANTIATE_ONLY
-  )
-
-  ConfigureTest(
-    NAME
-    SPARSE_NEIGHBORS_TEST
-    PATH
-    test/sparse/neighbors/cross_component_nn.cu
-    test/sparse/neighbors/brute_force.cu
-    test/sparse/neighbors/knn_graph.cu
-    LIB
-    EXPLICIT_INSTANTIATE_ONLY
-  )
-
-  ConfigureTest(
-    NAME
-    NEIGHBORS_TEST
-    PATH
-    test/neighbors/knn.cu
-    test/neighbors/fused_l2_knn.cu
-    test/neighbors/tiled_knn.cu
-    test/neighbors/haversine.cu
-    test/neighbors/ball_cover.cu
-    test/neighbors/epsilon_neighborhood.cu
-    test/neighbors/refine.cu
-    LIB
-    EXPLICIT_INSTANTIATE_ONLY
-  )
-
-  ConfigureTest(
-    NAME NEIGHBORS_ANN_BRUTE_FORCE_TEST PATH test/neighbors/ann_brute_force/test_float.cu LIB
-    EXPLICIT_INSTANTIATE_ONLY GPUS 1 PERCENT 100
-  )
-
-  ConfigureTest(
-    NAME
-    NEIGHBORS_ANN_CAGRA_TEST
-    PATH
-    test/neighbors/ann_cagra/test_float_uint32_t.cu
-    test/neighbors/ann_cagra/test_half_uint32_t.cu
-    test/neighbors/ann_cagra/test_int8_t_uint32_t.cu
-    test/neighbors/ann_cagra/test_uint8_t_uint32_t.cu
-    test/neighbors/ann_cagra/test_float_int64_t.cu
-    test/neighbors/ann_cagra/test_half_int64_t.cu
-    src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim128_t8.cu
-    src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim256_t16.cu
-    src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim512_t32.cu
-    src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim1024_t32.cu
-    src/neighbors/detail/cagra/search_single_cta_float_uint64_dim128_t8.cu
-    src/neighbors/detail/cagra/search_single_cta_float_uint64_dim256_t16.cu
-    src/neighbors/detail/cagra/search_single_cta_float_uint64_dim512_t32.cu
-    src/neighbors/detail/cagra/search_single_cta_float_uint64_dim1024_t32.cu
-    src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim128_t8.cu
-    src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim256_t16.cu
-    src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim512_t32.cu
-    src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim1024_t32.cu
-    src/neighbors/detail/cagra/search_single_cta_half_uint64_dim128_t8.cu
-    src/neighbors/detail/cagra/search_single_cta_half_uint64_dim256_t16.cu
-    src/neighbors/detail/cagra/search_single_cta_half_uint64_dim512_t32.cu
-    src/neighbors/detail/cagra/search_single_cta_half_uint64_dim1024_t32.cu
-    LIB
-    EXPLICIT_INSTANTIATE_ONLY
-    GPUS
-    1
-    PERCENT
-    100
-  )
-
-  ConfigureTest(
-    NAME
-    NEIGHBORS_ANN_IVF_TEST
-    PATH
-    test/neighbors/ann_ivf_flat/test_filter_float_int64_t.cu
-    test/neighbors/ann_ivf_flat/test_float_int64_t.cu
-    test/neighbors/ann_ivf_flat/test_int8_t_int64_t.cu
-    test/neighbors/ann_ivf_flat/test_uint8_t_int64_t.cu
-    test/neighbors/ann_ivf_pq/test_float_uint32_t.cu
-    test/neighbors/ann_ivf_pq/test_float_int64_t.cu
-    test/neighbors/ann_ivf_pq/test_int8_t_int64_t.cu
-    test/neighbors/ann_ivf_pq/test_uint8_t_int64_t.cu
-    test/neighbors/ann_ivf_pq/test_filter_float_int64_t.cu
-    test/neighbors/ann_ivf_pq/test_filter_int8_t_int64_t.cu
-    LIB
-    EXPLICIT_INSTANTIATE_ONLY
-    GPUS
-    1
-    PERCENT
-    100
-  )
-
-  ConfigureTest(
-    NAME
-    NEIGHBORS_ANN_NN_DESCENT_TEST
-    PATH
-    test/neighbors/ann_nn_descent/test_float_uint32_t.cu
-    test/neighbors/ann_nn_descent/test_int8_t_uint32_t.cu
-    test/neighbors/ann_nn_descent/test_uint8_t_uint32_t.cu
-    LIB
-    EXPLICIT_INSTANTIATE_ONLY
-    GPUS
-    1
-    PERCENT
-    100
-  )
-
-  ConfigureTest(
-    NAME
-    STATS_TEST
-    PATH
-    test/stats/accuracy.cu
-    test/stats/adjusted_rand_index.cu
-    test/stats/completeness_score.cu
-    test/stats/contingencyMatrix.cu
-    test/stats/cov.cu
-    test/stats/dispersion.cu
-    test/stats/entropy.cu
-    test/stats/histogram.cu
-    test/stats/homogeneity_score.cu
-    test/stats/information_criterion.cu
-    test/stats/kl_divergence.cu
-    test/stats/mean.cu
-    test/stats/meanvar.cu
-    test/stats/mean_center.cu
-    test/stats/minmax.cu
-    test/stats/mutual_info_score.cu
-    test/stats/neighborhood_recall.cu
-    test/stats/r2_score.cu
-    test/stats/rand_index.cu
-    test/stats/regression_metrics.cu
-    test/stats/silhouette_score.cu
-    test/stats/stddev.cu
-    test/stats/sum.cu
-    test/stats/trustworthiness.cu
-    test/stats/weighted_mean.cu
-    test/stats/v_measure.cu
-    LIB
-    EXPLICIT_INSTANTIATE_ONLY
-  )
-
-  ConfigureTest(
-    NAME
-    UTILS_TEST
-    PATH
-    test/core/seive.cu
-    test/util/bitonic_sort.cu
-    test/util/cudart_utils.cpp
-    test/util/device_atomics.cu
-    test/util/integer_utils.cpp
-    test/util/integer_utils.cu
-    test/util/memory_type_dispatcher.cu
-    test/util/pow2_utils.cu
-    test/util/reduction.cu
-  )
+  # ConfigureTest(
+  #   NAME SOLVERS_TEST PATH test/cluster/cluster_solvers_deprecated.cu test/linalg/eigen_solvers.cu
+  #   test/lap/lap.cu test/sparse/mst.cu LIB EXPLICIT_INSTANTIATE_ONLY
+  # )
+
+  # ConfigureTest(
+  #   NAME
+  #   SPARSE_TEST
+  #   PATH
+  #   test/sparse/add.cu
+  #   test/sparse/convert_coo.cu
+  #   test/sparse/convert_csr.cu
+  #   test/sparse/csr_row_slice.cu
+  #   test/sparse/csr_to_dense.cu
+  #   test/sparse/csr_transpose.cu
+  #   test/sparse/degree.cu
+  #   test/sparse/filter.cu
+  #   test/sparse/norm.cu
+  #   test/sparse/normalize.cu
+  #   test/sparse/reduce.cu
+  #   test/sparse/row_op.cu
+  #   test/sparse/sddmm.cu
+  #   test/sparse/sort.cu
+  #   test/sparse/spgemmi.cu
+  #   test/sparse/spmm.cu
+  #   test/sparse/symmetrize.cu
+  # )
+
+  # ConfigureTest(
+  #   NAME SPARSE_DIST_TEST PATH test/sparse/dist_coo_spmv.cu test/sparse/distance.cu
+  #   test/sparse/gram.cu LIB EXPLICIT_INSTANTIATE_ONLY
+  # )
+
+  # ConfigureTest(
+  #   NAME
+  #   SPARSE_NEIGHBORS_TEST
+  #   PATH
+  #   test/sparse/neighbors/cross_component_nn.cu
+  #   test/sparse/neighbors/brute_force.cu
+  #   test/sparse/neighbors/knn_graph.cu
+  #   LIB
+  #   EXPLICIT_INSTANTIATE_ONLY
+  # )
+
+  # ConfigureTest(
+  #   NAME
+  #   NEIGHBORS_TEST
+  #   PATH
+  #   test/neighbors/knn.cu
+  #   test/neighbors/fused_l2_knn.cu
+  #   test/neighbors/tiled_knn.cu
+  #   test/neighbors/haversine.cu
+  #   test/neighbors/ball_cover.cu
+  #   test/neighbors/epsilon_neighborhood.cu
+  #   test/neighbors/refine.cu
+  #   LIB
+  #   EXPLICIT_INSTANTIATE_ONLY
+  # )
+
+  # ConfigureTest(
+  #   NAME NEIGHBORS_ANN_BRUTE_FORCE_TEST PATH test/neighbors/ann_brute_force/test_float.cu LIB
+  #   EXPLICIT_INSTANTIATE_ONLY GPUS 1 PERCENT 100
+  # )
+
+  # ConfigureTest(
+  #   NAME
+  #   NEIGHBORS_ANN_CAGRA_TEST
+  #   PATH
+  #   test/neighbors/ann_cagra/test_float_uint32_t.cu
+  #   test/neighbors/ann_cagra/test_half_uint32_t.cu
+  #   test/neighbors/ann_cagra/test_int8_t_uint32_t.cu
+  #   test/neighbors/ann_cagra/test_uint8_t_uint32_t.cu
+  #   test/neighbors/ann_cagra/test_float_int64_t.cu
+  #   test/neighbors/ann_cagra/test_half_int64_t.cu
+  #   src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim128_t8.cu
+  #   src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim256_t16.cu
+  #   src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim512_t32.cu
+  #   src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim1024_t32.cu
+  #   src/neighbors/detail/cagra/search_single_cta_float_uint64_dim128_t8.cu
+  #   src/neighbors/detail/cagra/search_single_cta_float_uint64_dim256_t16.cu
+  #   src/neighbors/detail/cagra/search_single_cta_float_uint64_dim512_t32.cu
+  #   src/neighbors/detail/cagra/search_single_cta_float_uint64_dim1024_t32.cu
+  #   src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim128_t8.cu
+  #   src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim256_t16.cu
+  #   src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim512_t32.cu
+  #   src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim1024_t32.cu
+  #   src/neighbors/detail/cagra/search_single_cta_half_uint64_dim128_t8.cu
+  #   src/neighbors/detail/cagra/search_single_cta_half_uint64_dim256_t16.cu
+  #   src/neighbors/detail/cagra/search_single_cta_half_uint64_dim512_t32.cu
+  #   src/neighbors/detail/cagra/search_single_cta_half_uint64_dim1024_t32.cu
+  #   LIB
+  #   EXPLICIT_INSTANTIATE_ONLY
+  #   GPUS
+  #   1
+  #   PERCENT
+  #   100
+  # )
+
+  # ConfigureTest(
+  #   NAME
+  #   NEIGHBORS_ANN_IVF_TEST
+  #   PATH
+  #   test/neighbors/ann_ivf_flat/test_filter_float_int64_t.cu
+  #   test/neighbors/ann_ivf_flat/test_float_int64_t.cu
+  #   test/neighbors/ann_ivf_flat/test_int8_t_int64_t.cu
+  #   test/neighbors/ann_ivf_flat/test_uint8_t_int64_t.cu
+  #   test/neighbors/ann_ivf_pq/test_float_uint32_t.cu
+  #   test/neighbors/ann_ivf_pq/test_float_int64_t.cu
+  #   test/neighbors/ann_ivf_pq/test_int8_t_int64_t.cu
+  #   test/neighbors/ann_ivf_pq/test_uint8_t_int64_t.cu
+  #   test/neighbors/ann_ivf_pq/test_filter_float_int64_t.cu
+  #   test/neighbors/ann_ivf_pq/test_filter_int8_t_int64_t.cu
+  #   LIB
+  #   EXPLICIT_INSTANTIATE_ONLY
+  #   GPUS
+  #   1
+  #   PERCENT
+  #   100
+  # )
+
+  # ConfigureTest(
+  #   NAME
+  #   NEIGHBORS_ANN_NN_DESCENT_TEST
+  #   PATH
+  #   test/neighbors/ann_nn_descent/test_float_uint32_t.cu
+  #   test/neighbors/ann_nn_descent/test_int8_t_uint32_t.cu
+  #   test/neighbors/ann_nn_descent/test_uint8_t_uint32_t.cu
+  #   LIB
+  #   EXPLICIT_INSTANTIATE_ONLY
+  #   GPUS
+  #   1
+  #   PERCENT
+  #   100
+  # )
+
+  # ConfigureTest(
+  #   NAME
+  #   STATS_TEST
+  #   PATH
+  #   test/stats/accuracy.cu
+  #   test/stats/adjusted_rand_index.cu
+  #   test/stats/completeness_score.cu
+  #   test/stats/contingencyMatrix.cu
+  #   test/stats/cov.cu
+  #   test/stats/dispersion.cu
+  #   test/stats/entropy.cu
+  #   test/stats/histogram.cu
+  #   test/stats/homogeneity_score.cu
+  #   test/stats/information_criterion.cu
+  #   test/stats/kl_divergence.cu
+  #   test/stats/mean.cu
+  #   test/stats/meanvar.cu
+  #   test/stats/mean_center.cu
+  #   test/stats/minmax.cu
+  #   test/stats/mutual_info_score.cu
+  #   test/stats/neighborhood_recall.cu
+  #   test/stats/r2_score.cu
+  #   test/stats/rand_index.cu
+  #   test/stats/regression_metrics.cu
+  #   test/stats/silhouette_score.cu
+  #   test/stats/stddev.cu
+  #   test/stats/sum.cu
+  #   test/stats/trustworthiness.cu
+  #   test/stats/weighted_mean.cu
+  #   test/stats/v_measure.cu
+  #   LIB
+  #   EXPLICIT_INSTANTIATE_ONLY
+  # )
+
+  # ConfigureTest(
+  #   NAME
+  #   UTILS_TEST
+  #   PATH
+  #   test/core/seive.cu
+  #   test/util/bitonic_sort.cu
+  #   test/util/cudart_utils.cpp
+  #   test/util/device_atomics.cu
+  #   test/util/integer_utils.cpp
+  #   test/util/integer_utils.cu
+  #   test/util/memory_type_dispatcher.cu
+  #   test/util/pow2_utils.cu
+  #   test/util/reduction.cu
+  # )
 endif()
 
 # ##################################################################################################
diff --git a/cpp/test/matrix/sample_rows.cu b/cpp/test/matrix/sample_rows.cu
new file mode 100644
index 0000000000..5ca93d0fe5
--- /dev/null
+++ b/cpp/test/matrix/sample_rows.cu
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../test_utils.cuh"
+
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/host_mdarray.hpp>
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resources.hpp>
+#include <raft/matrix/sample_rows.cuh>
+#include <raft/util/cudart_utils.hpp>
+
+#include <gtest/gtest.h>
+
+namespace raft {
+namespace matrix {
+
+struct inputs {
+  int N;
+  int dim;
+  int n_samples;
+};
+
+::std::ostream& operator<<(::std::ostream& os, const inputs p)
+{
+  os << p.N << "#" << p.k << "#" << p.n_samples;
+  return os;
+}
+
+template <typename T>
+class SampleRowsTest : public ::testing::TestWithParam<inputs> {
+ public:
+  SampleRowsTest()
+    : params(::testing::TestWithParam<inputs>::GetParam()),
+      state{137ULL},
+      in(make_device_vector<T, int64_t>(res, params.N, params.dim)),
+      out(make_device_vector<T, int64_t>(res, 0, 0))
+
+  {
+    raft::random::uniform(res, state, in.data_handle(), in.size(), T(-1.0), T(1.0));
+  }
+
+  void check()
+  {
+    out = raft::random::excess_subsample<T, int64_t>(res, state, params.N, params.n_samples);
+    ASSERT_TRUE(out.extent(0) == params.n_samples);
+    ASSERT_TRUE(out.extent(1) == params.dim)
+  }
+
+ protected:
+  inputs params;
+  raft::resources res;
+  cudaStream_t stream;
+  RngState state;
+  device_matrix<T, int64_t> out, in;
+};
+
+const std::vector<inputs> input1 = {
+  {10, 1, 1}, {10, 4, 1}, {10, 4, 10}, {10, 10}, {137, 42, 59}, {10000, 128, 893}};
+
+using SampleRowsTestInt64 = SampleRowsTest<float>;
+TEST_P(SampleRowsTestInt64, SamplingTest) { check(); }
+INSTANTIATE_TEST_SUITE_P(SampleRowsTests, SampleRowsTestInt64, ::testing::ValuesIn(input1));
+
+}  // namespace matrix
+}  // namespace raft

From eb7e6d14c677fa7507527811c92b558ec178fc27 Mon Sep 17 00:00:00 2001
From: Tamas Bela Feher <tfeher@nvidia.com>
Date: Wed, 13 Mar 2024 09:54:30 +0100
Subject: [PATCH 09/16] Add mdspan input API, fix cmakelists

---
 .../raft/matrix/detail/sample_rows.cuh        |   9 +-
 cpp/include/raft/matrix/sample_rows.cuh       |  36 +-
 cpp/test/CMakeLists.txt                       | 732 +++++++++---------
 cpp/test/matrix/sample_rows.cu                |  15 +-
 4 files changed, 400 insertions(+), 392 deletions(-)

diff --git a/cpp/include/raft/matrix/detail/sample_rows.cuh b/cpp/include/raft/matrix/detail/sample_rows.cuh
index c8120c9ab2..6c598551d7 100644
--- a/cpp/include/raft/matrix/detail/sample_rows.cuh
+++ b/cpp/include/raft/matrix/detail/sample_rows.cuh
@@ -16,13 +16,16 @@
 
 #pragma once
 
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/device_mdspan.hpp>
 #include <raft/core/logger.hpp>
+#include <raft/core/resources.hpp>
 #include <raft/matrix/gather.cuh>
 #include <raft/random/rng.cuh>
 #include <raft/util/cuda_utils.cuh>
 #include <raft/util/cudart_utils.hpp>
 
-namespace raft::matrix {
+namespace raft::matrix::detail {
 
 /** Select rows randomly from input and copy to output. */
 template <typename T, typename IdxT = int64_t>
@@ -30,7 +33,7 @@ void sample_rows(raft::resources const& res,
                  const T* input,
                  IdxT n_rows_input,
                  raft::device_matrix_view<T, IdxT> output,
-                 RngState random_state)
+                 random::RngState random_state)
 {
   IdxT n_dim     = output.extent(1);
   IdxT n_samples = output.extent(0);
@@ -51,4 +54,4 @@ void sample_rows(raft::resources const& res,
     raft::matrix::detail::gather(res, dataset, make_const_mdspan(train_indices.view()), output);
   }
 }
-}  // namespace raft::matrix
+}  // namespace raft::matrix::detail
diff --git a/cpp/include/raft/matrix/sample_rows.cuh b/cpp/include/raft/matrix/sample_rows.cuh
index 2f8b8e6248..55b17800c7 100644
--- a/cpp/include/raft/matrix/sample_rows.cuh
+++ b/cpp/include/raft/matrix/sample_rows.cuh
@@ -16,36 +16,38 @@
 
 #pragma once
 
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/device_mdspan.hpp>
 #include <raft/core/logger.hpp>
-#include <raft/matrix/gather.cuh>
+#include <raft/core/resources.hpp>
+#include <raft/matrix/detail/sample_rows.cuh>
 #include <raft/random/rng.cuh>
-#include <raft/util/cuda_utils.cuh>
-#include <raft/util/cudart_utils.hpp>
 
 namespace raft::matrix {
 
 /** Select rows randomly from input and copy to output. */
-template <typename T, typename IdxT = int64_t>
+template <typename T, typename IdxT = int64_t, typename accessor>
 void sample_rows(raft::resources const& res,
-                 const T* input,
-                 IdxT n_rows_input,
-                 raft::device_matrix_view<T, IdxT> output,
-                 RngState random_state)
+                 random::RngState random_state,
+                 mdspan<const T, matrix_extent<int64_t>, row_major, accessor> dataset,
+                 raft::device_matrix_view<T, IdxT> output)
 {
   detail::sample_rows(res, input, n_rows_input, output, random_state);
+
+  detail::sample_rows(res, dataset.data_handle(), dataset.extent(0), output, random_state);
 }
 
 /** Subsample the dataset to create a training set*/
-template <typename T, typename IdxT = int64_t>
-raft::device_matrix<T, IdxT> sample_rows(raft::resources const& res,
-                                         const T* input,
-                                         IdxT n_rows_input,
-                                         IdxT n_train,
-                                         IdxT n_dim,
-                                         RngState random_state)
+template <typename T, typename IdxT = int64_t, typename accessor>
+raft::device_matrix<T, IdxT> sample_rows(
+  raft::resources const& res,
+  random::RngState random_state,
+  mdspan<const T, matrix_extent<int64_t>, row_major, accessor> dataset,
+  IdxT n_samples)
 {
-  auto output = raft::make_device_matrix<T, IdxT>(res, n_train, n_dim);
-  detail::sample_rows(res, input, n_rows_input, output, random_state);
+  auto output = raft::make_device_matrix<T, IdxT>(res, n_samples, dataset.extent(1));
+  detail::sample_rows(res, random_state, dataset.data_handle(), dataset.extent(0), output);
   return output;
 }
+
 }  // namespace raft::matrix
diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt
index cda9ca69e8..7d31903f5e 100644
--- a/cpp/test/CMakeLists.txt
+++ b/cpp/test/CMakeLists.txt
@@ -95,391 +95,391 @@ endfunction()
 # * distance tests -------------------------------------------------------------------------
 
 if(BUILD_TESTS)
-  # ConfigureTest(
-  #   NAME
-  #   CLUSTER_TEST
-  #   PATH
-  #   test/cluster/kmeans.cu
-  #   test/cluster/kmeans_balanced.cu
-  #   test/cluster/kmeans_find_k.cu
-  #   test/cluster/cluster_solvers.cu
-  #   test/cluster/linkage.cu
-  #   test/cluster/spectral.cu
-  #   LIB
-  #   EXPLICIT_INSTANTIATE_ONLY
-  # )
-
-  # ConfigureTest(
-  #   NAME
-  #   CORE_TEST
-  #   PATH
-  #   test/core/bitset.cu
-  #   test/core/device_resources_manager.cpp
-  #   test/core/device_setter.cpp
-  #   test/core/logger.cpp
-  #   test/core/math_device.cu
-  #   test/core/math_host.cpp
-  #   test/core/operators_device.cu
-  #   test/core/operators_host.cpp
-  #   test/core/handle.cpp
-  #   test/core/interruptible.cu
-  #   test/core/nvtx.cpp
-  #   test/core/mdarray.cu
-  #   test/core/mdbuffer.cu
-  #   test/core/mdspan_copy.cpp
-  #   test/core/mdspan_copy.cu
-  #   test/core/mdspan_utils.cu
-  #   test/core/numpy_serializer.cu
-  #   test/core/memory_type.cpp
-  #   test/core/sparse_matrix.cu
-  #   test/core/sparse_matrix.cpp
-  #   test/core/span.cpp
-  #   test/core/span.cu
-  #   test/core/stream_view.cpp
-  #   test/core/temporary_device_buffer.cu
-  #   test/test.cpp
-  #   LIB
-  #   EXPLICIT_INSTANTIATE_ONLY
-  # )
-
-  # ConfigureTest(
-  #   NAME CORE_TEST PATH test/core/stream_view.cpp test/core/mdspan_copy.cpp LIB
-  #   EXPLICIT_INSTANTIATE_ONLY NOCUDA
-  # )
-
-  # ConfigureTest(
-  #   NAME
-  #   DISTANCE_TEST
-  #   PATH
-  #   test/distance/dist_adj.cu
-  #   test/distance/dist_adj_distance_instance.cu
-  #   test/distance/dist_canberra.cu
-  #   test/distance/dist_correlation.cu
-  #   test/distance/dist_cos.cu
-  #   test/distance/dist_hamming.cu
-  #   test/distance/dist_hellinger.cu
-  #   test/distance/dist_inner_product.cu
-  #   test/distance/dist_jensen_shannon.cu
-  #   test/distance/dist_kl_divergence.cu
-  #   test/distance/dist_l1.cu
-  #   test/distance/dist_l2_exp.cu
-  #   test/distance/dist_l2_unexp.cu
-  #   test/distance/dist_l2_sqrt_exp.cu
-  #   test/distance/dist_l_inf.cu
-  #   test/distance/dist_lp_unexp.cu
-  #   test/distance/dist_russell_rao.cu
-  #   test/distance/masked_nn.cu
-  #   test/distance/masked_nn_compress_to_bits.cu
-  #   test/distance/fused_l2_nn.cu
-  #   test/distance/gram.cu
-  #   LIB
-  #   EXPLICIT_INSTANTIATE_ONLY
-  # )
-
-  # list(
-  #   APPEND
-  #   EXT_HEADER_TEST_SOURCES
-  #   test/ext_headers/raft_neighbors_brute_force.cu
-  #   test/ext_headers/raft_distance_distance.cu
-  #   test/ext_headers/raft_distance_detail_pairwise_matrix_dispatch.cu
-  #   test/ext_headers/raft_matrix_detail_select_k.cu
-  #   test/ext_headers/raft_neighbors_ball_cover.cu
-  #   test/ext_headers/raft_spatial_knn_detail_fused_l2_knn.cu
-  #   test/ext_headers/raft_distance_fused_l2_nn.cu
-  #   test/ext_headers/raft_neighbors_ivf_pq.cu
-  #   test/ext_headers/raft_util_memory_pool.cpp
-  #   test/ext_headers/raft_neighbors_ivf_flat.cu
-  #   test/ext_headers/raft_core_logger.cpp
-  #   test/ext_headers/raft_neighbors_refine.cu
-  #   test/ext_headers/raft_neighbors_detail_ivf_flat_search.cu
-  #   test/ext_headers/raft_linalg_detail_coalesced_reduction.cu
-  #   test/ext_headers/raft_spatial_knn_detail_ball_cover_registers.cu
-  #   test/ext_headers/raft_neighbors_detail_ivf_flat_interleaved_scan.cu
-  #   test/ext_headers/raft_neighbors_detail_ivf_pq_compute_similarity.cu
-  # )
-
-  # # Test that the split headers compile in isolation with:
-  # #
-  # # * EXT_HEADERS_TEST_COMPILED_EXPLICIT: RAFT_COMPILED, RAFT_EXPLICIT_INSTANTIATE_ONLY defined
-  # # * EXT_HEADERS_TEST_COMPILED_IMPLICIT: RAFT_COMPILED defined
-  # # * EXT_HEADERS_TEST_IMPLICIT:          no macros defined.
-  # ConfigureTest(
-  #   NAME EXT_HEADERS_TEST_COMPILED_EXPLICIT PATH ${EXT_HEADER_TEST_SOURCES} LIB
-  #   EXPLICIT_INSTANTIATE_ONLY
-  # )
-  # ConfigureTest(NAME EXT_HEADERS_TEST_COMPILED_IMPLICIT PATH ${EXT_HEADER_TEST_SOURCES} LIB)
-  # ConfigureTest(NAME EXT_HEADERS_TEST_IMPLICIT PATH ${EXT_HEADER_TEST_SOURCES})
-
-  # ConfigureTest(NAME LABEL_TEST PATH test/label/label.cu test/label/merge_labels.cu)
-
-  # ConfigureTest(
-  #   NAME
-  #   LINALG_TEST
-  #   PATH
-  #   test/linalg/add.cu
-  #   test/linalg/axpy.cu
-  #   test/linalg/binary_op.cu
-  #   test/linalg/cholesky_r1.cu
-  #   test/linalg/coalesced_reduction.cu
-  #   test/linalg/divide.cu
-  #   test/linalg/dot.cu
-  #   test/linalg/eig.cu
-  #   test/linalg/eig_sel.cu
-  #   test/linalg/gemm_layout.cu
-  #   test/linalg/gemv.cu
-  #   test/linalg/map.cu
-  #   test/linalg/map_then_reduce.cu
-  #   test/linalg/matrix_vector.cu
-  #   test/linalg/matrix_vector_op.cu
-  #   test/linalg/mean_squared_error.cu
-  #   test/linalg/multiply.cu
-  #   test/linalg/norm.cu
-  #   test/linalg/normalize.cu
-  #   test/linalg/power.cu
-  #   test/linalg/randomized_svd.cu
-  #   test/linalg/reduce.cu
-  #   test/linalg/reduce_cols_by_key.cu
-  #   test/linalg/reduce_rows_by_key.cu
-  #   test/linalg/rsvd.cu
-  #   test/linalg/sqrt.cu
-  #   test/linalg/strided_reduction.cu
-  #   test/linalg/subtract.cu
-  #   test/linalg/svd.cu
-  #   test/linalg/ternary_op.cu
-  #   test/linalg/transpose.cu
-  #   test/linalg/unary_op.cu
-  # )
+  ConfigureTest(
+    NAME
+    CLUSTER_TEST
+    PATH
+    test/cluster/kmeans.cu
+    test/cluster/kmeans_balanced.cu
+    test/cluster/kmeans_find_k.cu
+    test/cluster/cluster_solvers.cu
+    test/cluster/linkage.cu
+    test/cluster/spectral.cu
+    LIB
+    EXPLICIT_INSTANTIATE_ONLY
+  )
+
+  ConfigureTest(
+    NAME
+    CORE_TEST
+    PATH
+    test/core/bitset.cu
+    test/core/device_resources_manager.cpp
+    test/core/device_setter.cpp
+    test/core/logger.cpp
+    test/core/math_device.cu
+    test/core/math_host.cpp
+    test/core/operators_device.cu
+    test/core/operators_host.cpp
+    test/core/handle.cpp
+    test/core/interruptible.cu
+    test/core/nvtx.cpp
+    test/core/mdarray.cu
+    test/core/mdbuffer.cu
+    test/core/mdspan_copy.cpp
+    test/core/mdspan_copy.cu
+    test/core/mdspan_utils.cu
+    test/core/numpy_serializer.cu
+    test/core/memory_type.cpp
+    test/core/sparse_matrix.cu
+    test/core/sparse_matrix.cpp
+    test/core/span.cpp
+    test/core/span.cu
+    test/core/stream_view.cpp
+    test/core/temporary_device_buffer.cu
+    test/test.cpp
+    LIB
+    EXPLICIT_INSTANTIATE_ONLY
+  )
+
+  ConfigureTest(
+    NAME CORE_TEST PATH test/core/stream_view.cpp test/core/mdspan_copy.cpp LIB
+    EXPLICIT_INSTANTIATE_ONLY NOCUDA
+  )
+
+  ConfigureTest(
+    NAME
+    DISTANCE_TEST
+    PATH
+    test/distance/dist_adj.cu
+    test/distance/dist_adj_distance_instance.cu
+    test/distance/dist_canberra.cu
+    test/distance/dist_correlation.cu
+    test/distance/dist_cos.cu
+    test/distance/dist_hamming.cu
+    test/distance/dist_hellinger.cu
+    test/distance/dist_inner_product.cu
+    test/distance/dist_jensen_shannon.cu
+    test/distance/dist_kl_divergence.cu
+    test/distance/dist_l1.cu
+    test/distance/dist_l2_exp.cu
+    test/distance/dist_l2_unexp.cu
+    test/distance/dist_l2_sqrt_exp.cu
+    test/distance/dist_l_inf.cu
+    test/distance/dist_lp_unexp.cu
+    test/distance/dist_russell_rao.cu
+    test/distance/masked_nn.cu
+    test/distance/masked_nn_compress_to_bits.cu
+    test/distance/fused_l2_nn.cu
+    test/distance/gram.cu
+    LIB
+    EXPLICIT_INSTANTIATE_ONLY
+  )
+
+  list(
+    APPEND
+    EXT_HEADER_TEST_SOURCES
+    test/ext_headers/raft_neighbors_brute_force.cu
+    test/ext_headers/raft_distance_distance.cu
+    test/ext_headers/raft_distance_detail_pairwise_matrix_dispatch.cu
+    test/ext_headers/raft_matrix_detail_select_k.cu
+    test/ext_headers/raft_neighbors_ball_cover.cu
+    test/ext_headers/raft_spatial_knn_detail_fused_l2_knn.cu
+    test/ext_headers/raft_distance_fused_l2_nn.cu
+    test/ext_headers/raft_neighbors_ivf_pq.cu
+    test/ext_headers/raft_util_memory_pool.cpp
+    test/ext_headers/raft_neighbors_ivf_flat.cu
+    test/ext_headers/raft_core_logger.cpp
+    test/ext_headers/raft_neighbors_refine.cu
+    test/ext_headers/raft_neighbors_detail_ivf_flat_search.cu
+    test/ext_headers/raft_linalg_detail_coalesced_reduction.cu
+    test/ext_headers/raft_spatial_knn_detail_ball_cover_registers.cu
+    test/ext_headers/raft_neighbors_detail_ivf_flat_interleaved_scan.cu
+    test/ext_headers/raft_neighbors_detail_ivf_pq_compute_similarity.cu
+  )
+
+  # Test that the split headers compile in isolation with:
+  #
+  # * EXT_HEADERS_TEST_COMPILED_EXPLICIT: RAFT_COMPILED, RAFT_EXPLICIT_INSTANTIATE_ONLY defined
+  # * EXT_HEADERS_TEST_COMPILED_IMPLICIT: RAFT_COMPILED defined
+  # * EXT_HEADERS_TEST_IMPLICIT:          no macros defined.
+  ConfigureTest(
+    NAME EXT_HEADERS_TEST_COMPILED_EXPLICIT PATH ${EXT_HEADER_TEST_SOURCES} LIB
+    EXPLICIT_INSTANTIATE_ONLY
+  )
+  ConfigureTest(NAME EXT_HEADERS_TEST_COMPILED_IMPLICIT PATH ${EXT_HEADER_TEST_SOURCES} LIB)
+  ConfigureTest(NAME EXT_HEADERS_TEST_IMPLICIT PATH ${EXT_HEADER_TEST_SOURCES})
+
+  ConfigureTest(NAME LABEL_TEST PATH test/label/label.cu test/label/merge_labels.cu)
+
+  ConfigureTest(
+    NAME
+    LINALG_TEST
+    PATH
+    test/linalg/add.cu
+    test/linalg/axpy.cu
+    test/linalg/binary_op.cu
+    test/linalg/cholesky_r1.cu
+    test/linalg/coalesced_reduction.cu
+    test/linalg/divide.cu
+    test/linalg/dot.cu
+    test/linalg/eig.cu
+    test/linalg/eig_sel.cu
+    test/linalg/gemm_layout.cu
+    test/linalg/gemv.cu
+    test/linalg/map.cu
+    test/linalg/map_then_reduce.cu
+    test/linalg/matrix_vector.cu
+    test/linalg/matrix_vector_op.cu
+    test/linalg/mean_squared_error.cu
+    test/linalg/multiply.cu
+    test/linalg/norm.cu
+    test/linalg/normalize.cu
+    test/linalg/power.cu
+    test/linalg/randomized_svd.cu
+    test/linalg/reduce.cu
+    test/linalg/reduce_cols_by_key.cu
+    test/linalg/reduce_rows_by_key.cu
+    test/linalg/rsvd.cu
+    test/linalg/sqrt.cu
+    test/linalg/strided_reduction.cu
+    test/linalg/subtract.cu
+    test/linalg/svd.cu
+    test/linalg/ternary_op.cu
+    test/linalg/transpose.cu
+    test/linalg/unary_op.cu
+  )
 
   ConfigureTest(
     NAME
     MATRIX_TEST
     PATH
-  #   test/matrix/argmax.cu
-  #   test/matrix/argmin.cu
-  #   test/matrix/columnSort.cu
-  #   test/matrix/diagonal.cu
-  #   test/matrix/gather.cu
-  #   test/matrix/scatter.cu
-  #   test/matrix/eye.cu
-  #   test/matrix/linewise_op.cu
-  #   test/matrix/math.cu
-  #   test/matrix/matrix.cu
-  #   test/matrix/norm.cu
-  #   test/matrix/reverse.cu
+    test/matrix/argmax.cu
+    test/matrix/argmin.cu
+    test/matrix/columnSort.cu
+    test/matrix/diagonal.cu
+    test/matrix/gather.cu
+    test/matrix/scatter.cu
+    test/matrix/eye.cu
+    test/matrix/linewise_op.cu
+    test/matrix/math.cu
+    test/matrix/matrix.cu
+    test/matrix/norm.cu
+    test/matrix/reverse.cu
     test/matrix/sample_rows.cu
-  #   test/matrix/slice.cu
-  #   test/matrix/triangular.cu
-  #   test/sparse/spectral_matrix.cu
+    test/matrix/slice.cu
+    test/matrix/triangular.cu
+    test/sparse/spectral_matrix.cu
     LIB
     EXPLICIT_INSTANTIATE_ONLY
   )
 
-  # ConfigureTest(NAME MATRIX_SELECT_TEST PATH test/matrix/select_k.cu LIB EXPLICIT_INSTANTIATE_ONLY)
+  ConfigureTest(NAME MATRIX_SELECT_TEST PATH test/matrix/select_k.cu LIB EXPLICIT_INSTANTIATE_ONLY)
 
-  # ConfigureTest(
-  #   NAME MATRIX_SELECT_LARGE_TEST PATH test/matrix/select_large_k.cu LIB EXPLICIT_INSTANTIATE_ONLY
-  # )
+  ConfigureTest(
+    NAME MATRIX_SELECT_LARGE_TEST PATH test/matrix/select_large_k.cu LIB EXPLICIT_INSTANTIATE_ONLY
+  )
 
   ConfigureTest(
     NAME
     RANDOM_TEST
     PATH
-    # test/random/make_blobs.cu
-  #   test/random/make_regression.cu
-  #   test/random/multi_variable_gaussian.cu
-  #   test/random/rng_pcg_host_api.cu
-  #   test/random/permute.cu
-  #   test/random/rng.cu
-  #   test/random/rng_discrete.cu
-  #   test/random/rng_int.cu
-  #   test/random/rmat_rectangular_generator.cu
-  #   test/random/sample_without_replacement.cu
+    test/random/make_blobs.cu
+    test/random/make_regression.cu
+    test/random/multi_variable_gaussian.cu
+    test/random/rng_pcg_host_api.cu
+    test/random/permute.cu
+    test/random/rng.cu
+    test/random/rng_discrete.cu
+    test/random/rng_int.cu
+    test/random/rmat_rectangular_generator.cu
+    test/random/sample_without_replacement.cu
     test/random/excess_sampling.cu
   )
 
-  # ConfigureTest(
-  #   NAME SOLVERS_TEST PATH test/cluster/cluster_solvers_deprecated.cu test/linalg/eigen_solvers.cu
-  #   test/lap/lap.cu test/sparse/mst.cu LIB EXPLICIT_INSTANTIATE_ONLY
-  # )
-
-  # ConfigureTest(
-  #   NAME
-  #   SPARSE_TEST
-  #   PATH
-  #   test/sparse/add.cu
-  #   test/sparse/convert_coo.cu
-  #   test/sparse/convert_csr.cu
-  #   test/sparse/csr_row_slice.cu
-  #   test/sparse/csr_to_dense.cu
-  #   test/sparse/csr_transpose.cu
-  #   test/sparse/degree.cu
-  #   test/sparse/filter.cu
-  #   test/sparse/norm.cu
-  #   test/sparse/normalize.cu
-  #   test/sparse/reduce.cu
-  #   test/sparse/row_op.cu
-  #   test/sparse/sddmm.cu
-  #   test/sparse/sort.cu
-  #   test/sparse/spgemmi.cu
-  #   test/sparse/spmm.cu
-  #   test/sparse/symmetrize.cu
-  # )
-
-  # ConfigureTest(
-  #   NAME SPARSE_DIST_TEST PATH test/sparse/dist_coo_spmv.cu test/sparse/distance.cu
-  #   test/sparse/gram.cu LIB EXPLICIT_INSTANTIATE_ONLY
-  # )
-
-  # ConfigureTest(
-  #   NAME
-  #   SPARSE_NEIGHBORS_TEST
-  #   PATH
-  #   test/sparse/neighbors/cross_component_nn.cu
-  #   test/sparse/neighbors/brute_force.cu
-  #   test/sparse/neighbors/knn_graph.cu
-  #   LIB
-  #   EXPLICIT_INSTANTIATE_ONLY
-  # )
-
-  # ConfigureTest(
-  #   NAME
-  #   NEIGHBORS_TEST
-  #   PATH
-  #   test/neighbors/knn.cu
-  #   test/neighbors/fused_l2_knn.cu
-  #   test/neighbors/tiled_knn.cu
-  #   test/neighbors/haversine.cu
-  #   test/neighbors/ball_cover.cu
-  #   test/neighbors/epsilon_neighborhood.cu
-  #   test/neighbors/refine.cu
-  #   LIB
-  #   EXPLICIT_INSTANTIATE_ONLY
-  # )
-
-  # ConfigureTest(
-  #   NAME NEIGHBORS_ANN_BRUTE_FORCE_TEST PATH test/neighbors/ann_brute_force/test_float.cu LIB
-  #   EXPLICIT_INSTANTIATE_ONLY GPUS 1 PERCENT 100
-  # )
-
-  # ConfigureTest(
-  #   NAME
-  #   NEIGHBORS_ANN_CAGRA_TEST
-  #   PATH
-  #   test/neighbors/ann_cagra/test_float_uint32_t.cu
-  #   test/neighbors/ann_cagra/test_half_uint32_t.cu
-  #   test/neighbors/ann_cagra/test_int8_t_uint32_t.cu
-  #   test/neighbors/ann_cagra/test_uint8_t_uint32_t.cu
-  #   test/neighbors/ann_cagra/test_float_int64_t.cu
-  #   test/neighbors/ann_cagra/test_half_int64_t.cu
-  #   src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim128_t8.cu
-  #   src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim256_t16.cu
-  #   src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim512_t32.cu
-  #   src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim1024_t32.cu
-  #   src/neighbors/detail/cagra/search_single_cta_float_uint64_dim128_t8.cu
-  #   src/neighbors/detail/cagra/search_single_cta_float_uint64_dim256_t16.cu
-  #   src/neighbors/detail/cagra/search_single_cta_float_uint64_dim512_t32.cu
-  #   src/neighbors/detail/cagra/search_single_cta_float_uint64_dim1024_t32.cu
-  #   src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim128_t8.cu
-  #   src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim256_t16.cu
-  #   src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim512_t32.cu
-  #   src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim1024_t32.cu
-  #   src/neighbors/detail/cagra/search_single_cta_half_uint64_dim128_t8.cu
-  #   src/neighbors/detail/cagra/search_single_cta_half_uint64_dim256_t16.cu
-  #   src/neighbors/detail/cagra/search_single_cta_half_uint64_dim512_t32.cu
-  #   src/neighbors/detail/cagra/search_single_cta_half_uint64_dim1024_t32.cu
-  #   LIB
-  #   EXPLICIT_INSTANTIATE_ONLY
-  #   GPUS
-  #   1
-  #   PERCENT
-  #   100
-  # )
-
-  # ConfigureTest(
-  #   NAME
-  #   NEIGHBORS_ANN_IVF_TEST
-  #   PATH
-  #   test/neighbors/ann_ivf_flat/test_filter_float_int64_t.cu
-  #   test/neighbors/ann_ivf_flat/test_float_int64_t.cu
-  #   test/neighbors/ann_ivf_flat/test_int8_t_int64_t.cu
-  #   test/neighbors/ann_ivf_flat/test_uint8_t_int64_t.cu
-  #   test/neighbors/ann_ivf_pq/test_float_uint32_t.cu
-  #   test/neighbors/ann_ivf_pq/test_float_int64_t.cu
-  #   test/neighbors/ann_ivf_pq/test_int8_t_int64_t.cu
-  #   test/neighbors/ann_ivf_pq/test_uint8_t_int64_t.cu
-  #   test/neighbors/ann_ivf_pq/test_filter_float_int64_t.cu
-  #   test/neighbors/ann_ivf_pq/test_filter_int8_t_int64_t.cu
-  #   LIB
-  #   EXPLICIT_INSTANTIATE_ONLY
-  #   GPUS
-  #   1
-  #   PERCENT
-  #   100
-  # )
-
-  # ConfigureTest(
-  #   NAME
-  #   NEIGHBORS_ANN_NN_DESCENT_TEST
-  #   PATH
-  #   test/neighbors/ann_nn_descent/test_float_uint32_t.cu
-  #   test/neighbors/ann_nn_descent/test_int8_t_uint32_t.cu
-  #   test/neighbors/ann_nn_descent/test_uint8_t_uint32_t.cu
-  #   LIB
-  #   EXPLICIT_INSTANTIATE_ONLY
-  #   GPUS
-  #   1
-  #   PERCENT
-  #   100
-  # )
-
-  # ConfigureTest(
-  #   NAME
-  #   STATS_TEST
-  #   PATH
-  #   test/stats/accuracy.cu
-  #   test/stats/adjusted_rand_index.cu
-  #   test/stats/completeness_score.cu
-  #   test/stats/contingencyMatrix.cu
-  #   test/stats/cov.cu
-  #   test/stats/dispersion.cu
-  #   test/stats/entropy.cu
-  #   test/stats/histogram.cu
-  #   test/stats/homogeneity_score.cu
-  #   test/stats/information_criterion.cu
-  #   test/stats/kl_divergence.cu
-  #   test/stats/mean.cu
-  #   test/stats/meanvar.cu
-  #   test/stats/mean_center.cu
-  #   test/stats/minmax.cu
-  #   test/stats/mutual_info_score.cu
-  #   test/stats/neighborhood_recall.cu
-  #   test/stats/r2_score.cu
-  #   test/stats/rand_index.cu
-  #   test/stats/regression_metrics.cu
-  #   test/stats/silhouette_score.cu
-  #   test/stats/stddev.cu
-  #   test/stats/sum.cu
-  #   test/stats/trustworthiness.cu
-  #   test/stats/weighted_mean.cu
-  #   test/stats/v_measure.cu
-  #   LIB
-  #   EXPLICIT_INSTANTIATE_ONLY
-  # )
-
-  # ConfigureTest(
-  #   NAME
-  #   UTILS_TEST
-  #   PATH
-  #   test/core/seive.cu
-  #   test/util/bitonic_sort.cu
-  #   test/util/cudart_utils.cpp
-  #   test/util/device_atomics.cu
-  #   test/util/integer_utils.cpp
-  #   test/util/integer_utils.cu
-  #   test/util/memory_type_dispatcher.cu
-  #   test/util/pow2_utils.cu
-  #   test/util/reduction.cu
-  # )
+  ConfigureTest(
+    NAME SOLVERS_TEST PATH test/cluster/cluster_solvers_deprecated.cu test/linalg/eigen_solvers.cu
+    test/lap/lap.cu test/sparse/mst.cu LIB EXPLICIT_INSTANTIATE_ONLY
+  )
+
+  ConfigureTest(
+    NAME
+    SPARSE_TEST
+    PATH
+    test/sparse/add.cu
+    test/sparse/convert_coo.cu
+    test/sparse/convert_csr.cu
+    test/sparse/csr_row_slice.cu
+    test/sparse/csr_to_dense.cu
+    test/sparse/csr_transpose.cu
+    test/sparse/degree.cu
+    test/sparse/filter.cu
+    test/sparse/norm.cu
+    test/sparse/normalize.cu
+    test/sparse/reduce.cu
+    test/sparse/row_op.cu
+    test/sparse/sddmm.cu
+    test/sparse/sort.cu
+    test/sparse/spgemmi.cu
+    test/sparse/spmm.cu
+    test/sparse/symmetrize.cu
+  )
+
+  ConfigureTest(
+    NAME SPARSE_DIST_TEST PATH test/sparse/dist_coo_spmv.cu test/sparse/distance.cu
+    test/sparse/gram.cu LIB EXPLICIT_INSTANTIATE_ONLY
+  )
+
+  ConfigureTest(
+    NAME
+    SPARSE_NEIGHBORS_TEST
+    PATH
+    test/sparse/neighbors/cross_component_nn.cu
+    test/sparse/neighbors/brute_force.cu
+    test/sparse/neighbors/knn_graph.cu
+    LIB
+    EXPLICIT_INSTANTIATE_ONLY
+  )
+
+  ConfigureTest(
+    NAME
+    NEIGHBORS_TEST
+    PATH
+    test/neighbors/knn.cu
+    test/neighbors/fused_l2_knn.cu
+    test/neighbors/tiled_knn.cu
+    test/neighbors/haversine.cu
+    test/neighbors/ball_cover.cu
+    test/neighbors/epsilon_neighborhood.cu
+    test/neighbors/refine.cu
+    LIB
+    EXPLICIT_INSTANTIATE_ONLY
+  )
+
+  ConfigureTest(
+    NAME NEIGHBORS_ANN_BRUTE_FORCE_TEST PATH test/neighbors/ann_brute_force/test_float.cu LIB
+    EXPLICIT_INSTANTIATE_ONLY GPUS 1 PERCENT 100
+  )
+
+  ConfigureTest(
+    NAME
+    NEIGHBORS_ANN_CAGRA_TEST
+    PATH
+    test/neighbors/ann_cagra/test_float_uint32_t.cu
+    test/neighbors/ann_cagra/test_half_uint32_t.cu
+    test/neighbors/ann_cagra/test_int8_t_uint32_t.cu
+    test/neighbors/ann_cagra/test_uint8_t_uint32_t.cu
+    test/neighbors/ann_cagra/test_float_int64_t.cu
+    test/neighbors/ann_cagra/test_half_int64_t.cu
+    src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim128_t8.cu
+    src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim256_t16.cu
+    src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim512_t32.cu
+    src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim1024_t32.cu
+    src/neighbors/detail/cagra/search_single_cta_float_uint64_dim128_t8.cu
+    src/neighbors/detail/cagra/search_single_cta_float_uint64_dim256_t16.cu
+    src/neighbors/detail/cagra/search_single_cta_float_uint64_dim512_t32.cu
+    src/neighbors/detail/cagra/search_single_cta_float_uint64_dim1024_t32.cu
+    src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim128_t8.cu
+    src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim256_t16.cu
+    src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim512_t32.cu
+    src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim1024_t32.cu
+    src/neighbors/detail/cagra/search_single_cta_half_uint64_dim128_t8.cu
+    src/neighbors/detail/cagra/search_single_cta_half_uint64_dim256_t16.cu
+    src/neighbors/detail/cagra/search_single_cta_half_uint64_dim512_t32.cu
+    src/neighbors/detail/cagra/search_single_cta_half_uint64_dim1024_t32.cu
+    LIB
+    EXPLICIT_INSTANTIATE_ONLY
+    GPUS
+    1
+    PERCENT
+    100
+  )
+
+  ConfigureTest(
+    NAME
+    NEIGHBORS_ANN_IVF_TEST
+    PATH
+    test/neighbors/ann_ivf_flat/test_filter_float_int64_t.cu
+    test/neighbors/ann_ivf_flat/test_float_int64_t.cu
+    test/neighbors/ann_ivf_flat/test_int8_t_int64_t.cu
+    test/neighbors/ann_ivf_flat/test_uint8_t_int64_t.cu
+    test/neighbors/ann_ivf_pq/test_float_uint32_t.cu
+    test/neighbors/ann_ivf_pq/test_float_int64_t.cu
+    test/neighbors/ann_ivf_pq/test_int8_t_int64_t.cu
+    test/neighbors/ann_ivf_pq/test_uint8_t_int64_t.cu
+    test/neighbors/ann_ivf_pq/test_filter_float_int64_t.cu
+    test/neighbors/ann_ivf_pq/test_filter_int8_t_int64_t.cu
+    LIB
+    EXPLICIT_INSTANTIATE_ONLY
+    GPUS
+    1
+    PERCENT
+    100
+  )
+
+  ConfigureTest(
+    NAME
+    NEIGHBORS_ANN_NN_DESCENT_TEST
+    PATH
+    test/neighbors/ann_nn_descent/test_float_uint32_t.cu
+    test/neighbors/ann_nn_descent/test_int8_t_uint32_t.cu
+    test/neighbors/ann_nn_descent/test_uint8_t_uint32_t.cu
+    LIB
+    EXPLICIT_INSTANTIATE_ONLY
+    GPUS
+    1
+    PERCENT
+    100
+  )
+
+  ConfigureTest(
+    NAME
+    STATS_TEST
+    PATH
+    test/stats/accuracy.cu
+    test/stats/adjusted_rand_index.cu
+    test/stats/completeness_score.cu
+    test/stats/contingencyMatrix.cu
+    test/stats/cov.cu
+    test/stats/dispersion.cu
+    test/stats/entropy.cu
+    test/stats/histogram.cu
+    test/stats/homogeneity_score.cu
+    test/stats/information_criterion.cu
+    test/stats/kl_divergence.cu
+    test/stats/mean.cu
+    test/stats/meanvar.cu
+    test/stats/mean_center.cu
+    test/stats/minmax.cu
+    test/stats/mutual_info_score.cu
+    test/stats/neighborhood_recall.cu
+    test/stats/r2_score.cu
+    test/stats/rand_index.cu
+    test/stats/regression_metrics.cu
+    test/stats/silhouette_score.cu
+    test/stats/stddev.cu
+    test/stats/sum.cu
+    test/stats/trustworthiness.cu
+    test/stats/weighted_mean.cu
+    test/stats/v_measure.cu
+    LIB
+    EXPLICIT_INSTANTIATE_ONLY
+  )
+
+  ConfigureTest(
+    NAME
+    UTILS_TEST
+    PATH
+    test/core/seive.cu
+    test/util/bitonic_sort.cu
+    test/util/cudart_utils.cpp
+    test/util/device_atomics.cu
+    test/util/integer_utils.cpp
+    test/util/integer_utils.cu
+    test/util/memory_type_dispatcher.cu
+    test/util/pow2_utils.cu
+    test/util/reduction.cu
+  )
 endif()
 
 # ##################################################################################################
diff --git a/cpp/test/matrix/sample_rows.cu b/cpp/test/matrix/sample_rows.cu
index 5ca93d0fe5..80abeb7397 100644
--- a/cpp/test/matrix/sample_rows.cu
+++ b/cpp/test/matrix/sample_rows.cu
@@ -21,6 +21,7 @@
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/core/resources.hpp>
 #include <raft/matrix/sample_rows.cuh>
+#include <raft/random/rng.cuh>
 #include <raft/util/cudart_utils.hpp>
 
 #include <gtest/gtest.h>
@@ -36,7 +37,7 @@ struct inputs {
 
 ::std::ostream& operator<<(::std::ostream& os, const inputs p)
 {
-  os << p.N << "#" << p.k << "#" << p.n_samples;
+  os << p.N << "#" << p.dim << "#" << p.n_samples;
   return os;
 }
 
@@ -46,8 +47,8 @@ class SampleRowsTest : public ::testing::TestWithParam<inputs> {
   SampleRowsTest()
     : params(::testing::TestWithParam<inputs>::GetParam()),
       state{137ULL},
-      in(make_device_vector<T, int64_t>(res, params.N, params.dim)),
-      out(make_device_vector<T, int64_t>(res, 0, 0))
+      in(make_device_matrix<T, int64_t>(res, params.N, params.dim)),
+      out(make_device_matrix<T, int64_t>(res, 0, 0))
 
   {
     raft::random::uniform(res, state, in.data_handle(), in.size(), T(-1.0), T(1.0));
@@ -55,16 +56,18 @@ class SampleRowsTest : public ::testing::TestWithParam<inputs> {
 
   void check()
   {
-    out = raft::random::excess_subsample<T, int64_t>(res, state, params.N, params.n_samples);
+    out = raft::matrix::sample_rows<T, int64_t>(res, state, make_const_mdspan(in.view()));
     ASSERT_TRUE(out.extent(0) == params.n_samples);
-    ASSERT_TRUE(out.extent(1) == params.dim)
+    ASSERT_TRUE(out.extent(1) == params.dim);
+    // TODO(tfeher): check sampled values
+    // TODO(tfeher): check host / device input
   }
 
  protected:
   inputs params;
   raft::resources res;
   cudaStream_t stream;
-  RngState state;
+  random::RngState state;
   device_matrix<T, int64_t> out, in;
 };
 

From 7857f2fd433c958d51a533a8ffe5b1e7881b93f0 Mon Sep 17 00:00:00 2001
From: Tamas Bela Feher <tfeher@nvidia.com>
Date: Wed, 13 Mar 2024 10:15:12 +0100
Subject: [PATCH 10/16] corrections

---
 cpp/include/raft/matrix/sample_rows.cuh | 6 +++---
 cpp/test/matrix/sample_rows.cu          | 3 ++-
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/cpp/include/raft/matrix/sample_rows.cuh b/cpp/include/raft/matrix/sample_rows.cuh
index 55b17800c7..67281ff297 100644
--- a/cpp/include/raft/matrix/sample_rows.cuh
+++ b/cpp/include/raft/matrix/sample_rows.cuh
@@ -29,7 +29,7 @@ namespace raft::matrix {
 template <typename T, typename IdxT = int64_t, typename accessor>
 void sample_rows(raft::resources const& res,
                  random::RngState random_state,
-                 mdspan<const T, matrix_extent<int64_t>, row_major, accessor> dataset,
+                 mdspan<const T, matrix_extent<IdxT>, row_major, accessor> dataset,
                  raft::device_matrix_view<T, IdxT> output)
 {
   detail::sample_rows(res, input, n_rows_input, output, random_state);
@@ -42,11 +42,11 @@ template <typename T, typename IdxT = int64_t, typename accessor>
 raft::device_matrix<T, IdxT> sample_rows(
   raft::resources const& res,
   random::RngState random_state,
-  mdspan<const T, matrix_extent<int64_t>, row_major, accessor> dataset,
+  mdspan<const T, matrix_extent<IdxT>, row_major, accessor> dataset,
   IdxT n_samples)
 {
   auto output = raft::make_device_matrix<T, IdxT>(res, n_samples, dataset.extent(1));
-  detail::sample_rows(res, random_state, dataset.data_handle(), dataset.extent(0), output);
+  detail::sample_rows(res, random_state, dataset, output.view());
   return output;
 }
 
diff --git a/cpp/test/matrix/sample_rows.cu b/cpp/test/matrix/sample_rows.cu
index 80abeb7397..8d9be8e1e1 100644
--- a/cpp/test/matrix/sample_rows.cu
+++ b/cpp/test/matrix/sample_rows.cu
@@ -56,7 +56,8 @@ class SampleRowsTest : public ::testing::TestWithParam<inputs> {
 
   void check()
   {
-    out = raft::matrix::sample_rows<T, int64_t>(res, state, make_const_mdspan(in.view()));
+    out = raft::matrix::sample_rows<T, int64_t>(
+      res, state, make_const_mdspan(in.view()), params.n_samples);
     ASSERT_TRUE(out.extent(0) == params.n_samples);
     ASSERT_TRUE(out.extent(1) == params.dim);
     // TODO(tfeher): check sampled values

From 93ff94f936d29796fca2f69cf223ca4e2a16662d Mon Sep 17 00:00:00 2001
From: Tamas Bela Feher <tfeher@nvidia.com>
Date: Wed, 13 Mar 2024 13:21:40 +0100
Subject: [PATCH 11/16] Add test to sample_rows

---
 .../raft/matrix/detail/sample_rows.cuh        |  4 +-
 cpp/include/raft/matrix/sample_rows.cuh       |  6 +-
 cpp/test/matrix/sample_rows.cu                | 78 ++++++++++++++++---
 cpp/test/random/excess_sampling.cu            |  3 +-
 4 files changed, 73 insertions(+), 18 deletions(-)

diff --git a/cpp/include/raft/matrix/detail/sample_rows.cuh b/cpp/include/raft/matrix/detail/sample_rows.cuh
index 6c598551d7..e28ad648da 100644
--- a/cpp/include/raft/matrix/detail/sample_rows.cuh
+++ b/cpp/include/raft/matrix/detail/sample_rows.cuh
@@ -30,10 +30,10 @@ namespace raft::matrix::detail {
 /** Select rows randomly from input and copy to output. */
 template <typename T, typename IdxT = int64_t>
 void sample_rows(raft::resources const& res,
+                 random::RngState random_state,
                  const T* input,
                  IdxT n_rows_input,
-                 raft::device_matrix_view<T, IdxT> output,
-                 random::RngState random_state)
+                 raft::device_matrix_view<T, IdxT> output)
 {
   IdxT n_dim     = output.extent(1);
   IdxT n_samples = output.extent(0);
diff --git a/cpp/include/raft/matrix/sample_rows.cuh b/cpp/include/raft/matrix/sample_rows.cuh
index 67281ff297..7a1f9bf756 100644
--- a/cpp/include/raft/matrix/sample_rows.cuh
+++ b/cpp/include/raft/matrix/sample_rows.cuh
@@ -32,9 +32,7 @@ void sample_rows(raft::resources const& res,
                  mdspan<const T, matrix_extent<IdxT>, row_major, accessor> dataset,
                  raft::device_matrix_view<T, IdxT> output)
 {
-  detail::sample_rows(res, input, n_rows_input, output, random_state);
-
-  detail::sample_rows(res, dataset.data_handle(), dataset.extent(0), output, random_state);
+  detail::sample_rows<T, IdxT>(res, random_state, dataset.data_handle(), dataset.extent(0), output);
 }
 
 /** Subsample the dataset to create a training set*/
@@ -46,7 +44,7 @@ raft::device_matrix<T, IdxT> sample_rows(
   IdxT n_samples)
 {
   auto output = raft::make_device_matrix<T, IdxT>(res, n_samples, dataset.extent(1));
-  detail::sample_rows(res, random_state, dataset, output.view());
+  sample_rows(res, random_state, dataset, output.view());
   return output;
 }
 
diff --git a/cpp/test/matrix/sample_rows.cu b/cpp/test/matrix/sample_rows.cu
index 8d9be8e1e1..048edde2ba 100644
--- a/cpp/test/matrix/sample_rows.cu
+++ b/cpp/test/matrix/sample_rows.cu
@@ -18,14 +18,18 @@
 
 #include <raft/core/device_mdarray.hpp>
 #include <raft/core/host_mdarray.hpp>
+#include <raft/core/mdspan.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/core/resources.hpp>
 #include <raft/matrix/sample_rows.cuh>
 #include <raft/random/rng.cuh>
 #include <raft/util/cudart_utils.hpp>
+#include <raft/util/itertools.hpp>
 
 #include <gtest/gtest.h>
 
+#include <unordered_set>
+
 namespace raft {
 namespace matrix {
 
@@ -33,11 +37,12 @@ struct inputs {
   int N;
   int dim;
   int n_samples;
+  bool host;
 };
 
 ::std::ostream& operator<<(::std::ostream& os, const inputs p)
 {
-  os << p.N << "#" << p.dim << "#" << p.n_samples;
+  os << p.N << "#" << p.dim << "#" << p.n_samples << (p.host ? "#host" : "#device");
   return os;
 }
 
@@ -46,22 +51,50 @@ class SampleRowsTest : public ::testing::TestWithParam<inputs> {
  public:
   SampleRowsTest()
     : params(::testing::TestWithParam<inputs>::GetParam()),
+      stream(resource::get_cuda_stream(res)),
       state{137ULL},
       in(make_device_matrix<T, int64_t>(res, params.N, params.dim)),
-      out(make_device_matrix<T, int64_t>(res, 0, 0))
-
+      out(make_device_matrix<T, int64_t>(res, 0, 0)),
+      in_h(make_host_matrix<T, int64_t>(res, params.N, params.dim)),
+      out_h(make_host_matrix<T, int64_t>(res, params.n_samples, params.dim))
   {
     raft::random::uniform(res, state, in.data_handle(), in.size(), T(-1.0), T(1.0));
+    for (int64_t i = 0; i < params.N; i++) {
+      for (int64_t k = 0; k < params.dim; k++)
+        in_h(i, k) = i * 1000 + k;
+    }
+    raft::copy(in.data_handle(), in_h.data_handle(), in_h.size(), stream);
   }
 
   void check()
   {
-    out = raft::matrix::sample_rows<T, int64_t>(
-      res, state, make_const_mdspan(in.view()), params.n_samples);
+    if (params.host) {
+      out = raft::matrix::sample_rows<T, int64_t>(
+        res, state, make_const_mdspan(in_h.view()), (int64_t)params.n_samples);
+    } else {
+      out = raft::matrix::sample_rows<T, int64_t>(
+        res, state, make_const_mdspan(in.view()), (int64_t)params.n_samples);
+    }
+
+    raft::copy(out_h.data_handle(), out.data_handle(), out.size(), stream);
+    resource::sync_stream(res, stream);
+
     ASSERT_TRUE(out.extent(0) == params.n_samples);
     ASSERT_TRUE(out.extent(1) == params.dim);
-    // TODO(tfeher): check sampled values
-    // TODO(tfeher): check host / device input
+
+    std::unordered_set<int> occurrence;
+
+    for (int64_t i = 0; i < params.n_samples; ++i) {
+      int val = (int)out_h(i, 0) / 1000;
+      ASSERT_TRUE(0 <= val && val < params.N)
+        << "out-of-range index @i=" << i << " val=" << val << " params=" << params;
+      ASSERT_TRUE(occurrence.find(val) == occurrence.end())
+        << "repeated index @i=" << i << " idx=" << val << " params=" << params;
+      occurrence.insert(val);
+      for (int64_t k = 0; k < params.dim; k++) {
+        ASSERT_TRUE(raft::match((int64_t)(out_h(i, k)), val * 1000 + k, raft::Compare<int64_t>()));
+      }
+    }
   }
 
  protected:
@@ -69,15 +102,38 @@ class SampleRowsTest : public ::testing::TestWithParam<inputs> {
   raft::resources res;
   cudaStream_t stream;
   random::RngState state;
-  device_matrix<T, int64_t> out, in;
+  device_matrix<T, int64_t> in, out;
+  host_matrix<T, int64_t> in_h, out_h;
 };
 
-const std::vector<inputs> input1 = {
-  {10, 1, 1}, {10, 4, 1}, {10, 4, 10}, {10, 10}, {137, 42, 59}, {10000, 128, 893}};
+inline std::vector<inputs> generate_inputs()
+{
+  std::vector<inputs> input1 =
+    raft::util::itertools::product<inputs>({10}, {1, 17, 96}, {1, 6, 9, 10}, {false});
+
+  std::vector<inputs> input2 =
+    raft::util::itertools::product<inputs>({137}, {1, 17, 128}, {1, 10, 100, 137}, {false});
+  input1.insert(input1.end(), input2.begin(), input2.end());
+
+  input2 = raft::util::itertools::product<inputs>(
+    {100000}, {1, 42}, {1, 137, 1000, 10000, 100000}, {false});
+  input1.insert(input1.end(), input2.begin(), input2.end());
+
+  int n = input1.size();
+  // Add same tests for host data
+  for (int i = 0; i < n; i++) {
+    inputs x = input1[i];
+    x.host   = true;
+    input1.push_back(x);
+  }
+  return input1;
+}
+
+const std::vector<inputs> inputs1 = generate_inputs();
 
 using SampleRowsTestInt64 = SampleRowsTest<float>;
 TEST_P(SampleRowsTestInt64, SamplingTest) { check(); }
-INSTANTIATE_TEST_SUITE_P(SampleRowsTests, SampleRowsTestInt64, ::testing::ValuesIn(input1));
+INSTANTIATE_TEST_SUITE_P(SampleRowsTests, SampleRowsTestInt64, ::testing::ValuesIn(inputs1));
 
 }  // namespace matrix
 }  // namespace raft
diff --git a/cpp/test/random/excess_sampling.cu b/cpp/test/random/excess_sampling.cu
index 8c788c491b..45ed2a6727 100644
--- a/cpp/test/random/excess_sampling.cu
+++ b/cpp/test/random/excess_sampling.cu
@@ -51,6 +51,7 @@ class ExcessSamplingTest : public ::testing::TestWithParam<inputs> {
  public:
   ExcessSamplingTest()
     : params(::testing::TestWithParam<inputs>::GetParam()),
+      stream(resource::get_cuda_stream(res)),
       state{137ULL},
       in(make_device_vector<T, int64_t>(res, params.n_samples)),
       out(make_device_vector<T, int64_t>(res, 0)),
@@ -89,7 +90,7 @@ class ExcessSamplingTest : public ::testing::TestWithParam<inputs> {
   raft::resources res;
   cudaStream_t stream;
   RngState state;
-  device_vector<T, int64_t> out, in;
+  device_vector<T, int64_t> in, out;
   host_vector<T, int64_t> h_out;
 };
 

From 3f9cbc36fd389dbe35f31dc842c44e5801569082 Mon Sep 17 00:00:00 2001
From: Tamas Bela Feher <tfeher@nvidia.com>
Date: Fri, 15 Mar 2024 12:41:21 +0100
Subject: [PATCH 12/16] Address issues

---
 cpp/include/raft/matrix/detail/gather.cuh     |  2 --
 cpp/include/raft/matrix/sample_rows.cuh       | 28 +++++++++++++++++--
 cpp/include/raft/random/detail/rng_device.cuh |  1 -
 cpp/include/raft/random/detail/rng_impl.cuh   |  4 +++
 cpp/include/raft/random/rng.cuh               |  6 ++--
 .../raft/spatial/knn/detail/ann_utils.cuh     |  4 ---
 cpp/test/matrix/sample_rows.cu                |  9 +++---
 7 files changed, 39 insertions(+), 15 deletions(-)

diff --git a/cpp/include/raft/matrix/detail/gather.cuh b/cpp/include/raft/matrix/detail/gather.cuh
index 553f2d71f1..04541e4378 100644
--- a/cpp/include/raft/matrix/detail/gather.cuh
+++ b/cpp/include/raft/matrix/detail/gather.cuh
@@ -379,12 +379,10 @@ void gather(raft::resources const& res,
   const size_t max_batch_size = 32768;
   // Gather the vector on the host in tmp buffers. We use two buffers to overlap H2D sync
   // and gathering the data.
-  raft::common::nvtx::push_range("gather::alloc_buffers");
   auto out_tmp1 = raft::make_pinned_matrix<T, IdxT>(res, max_batch_size, n_dim);
   auto out_tmp2 = raft::make_pinned_matrix<T, IdxT>(res, max_batch_size, n_dim);
   auto view1    = out_tmp1.view();
   auto view2    = out_tmp2.view();
-  raft::common::nvtx::pop_range();
 
   gather_buff(dataset, make_const_mdspan(indices_host.view()), (IdxT)0, view1);
 #pragma omp parallel
diff --git a/cpp/include/raft/matrix/sample_rows.cuh b/cpp/include/raft/matrix/sample_rows.cuh
index 7a1f9bf756..7925d344e4 100644
--- a/cpp/include/raft/matrix/sample_rows.cuh
+++ b/cpp/include/raft/matrix/sample_rows.cuh
@@ -25,17 +25,41 @@
 
 namespace raft::matrix {
 
-/** Select rows randomly from input and copy to output. */
+/** @brief Select rows randomly from input and copy to output.
+ *
+ * The rows are selected randomly. The random sampling method does not guarantee completely unique
+ * selection of rows, but it is close to being unique.
+ *
+ * @param res RAFT resource handle
+ * @param random_state random number generator state
+ * @param dataset input dataset
+ * @param output subsampled dataset
+ */
 template <typename T, typename IdxT = int64_t, typename accessor>
 void sample_rows(raft::resources const& res,
                  random::RngState random_state,
                  mdspan<const T, matrix_extent<IdxT>, row_major, accessor> dataset,
                  raft::device_matrix_view<T, IdxT> output)
 {
+  RAFT_EXPECTS(dataset.extent(1) == output.extent(1),
+               "dataset dims must match, but received %ld vs %ld",
+               static_cast<long>(dataset.extent(1)),
+               static_cast<long>(output.extent(1)));
   detail::sample_rows<T, IdxT>(res, random_state, dataset.data_handle(), dataset.extent(0), output);
 }
 
-/** Subsample the dataset to create a training set*/
+/** @brief Select rows randomly from input and copy to output.
+ *
+ * The rows are selected randomly. The random sampling method does not guarantee completely unique
+ * selection of rows, but it is close to being unique.
+ *
+ * @param res RAFT resource handle
+ * @param random_state random number generator state
+ * @param dataset input dataset
+ * @param n_samples number of rows in the returned matrix
+ *
+ * @return subsampled dataset
+ * */
 template <typename T, typename IdxT = int64_t, typename accessor>
 raft::device_matrix<T, IdxT> sample_rows(
   raft::resources const& res,
diff --git a/cpp/include/raft/random/detail/rng_device.cuh b/cpp/include/raft/random/detail/rng_device.cuh
index 5e962fc982..12c67679ba 100644
--- a/cpp/include/raft/random/detail/rng_device.cuh
+++ b/cpp/include/raft/random/detail/rng_device.cuh
@@ -16,7 +16,6 @@
 
 #pragma once
 
-#include <raft/linalg/map.cuh>
 #include <raft/random/rng_state.hpp>
 #include <raft/util/cuda_utils.cuh>
 #include <raft/util/integer_utils.hpp>
diff --git a/cpp/include/raft/random/detail/rng_impl.cuh b/cpp/include/raft/random/detail/rng_impl.cuh
index 70ef1bbfcc..98841cdf90 100644
--- a/cpp/include/raft/random/detail/rng_impl.cuh
+++ b/cpp/include/raft/random/detail/rng_impl.cuh
@@ -19,6 +19,8 @@
 #include <raft/core/detail/macros.hpp>
 #include <raft/core/device_mdarray.hpp>
 #include <raft/core/math.hpp>
+#include <raft/core/operators.cuh>
+#include <raft/linalg/map.cuh>
 #include <raft/random/rng_device.cuh>
 #include <raft/random/rng_state.hpp>
 #include <raft/util/cudart_utils.hpp>
@@ -444,6 +446,8 @@ auto excess_subsample(raft::resources const& res, RngState& state, IdxT N, IdxT
     RAFT_LOG_DEBUG("Subsampling returned with less unique indices (%zu) than requested (%zu)",
                    (size_t)selected,
                    (size_t)n_samples);
+
+    // We continue to select n_samples elements, this will now contain a few duplicates.
   }
 
   // After duplicates are removed, we need to shuffle back to random order
diff --git a/cpp/include/raft/random/rng.cuh b/cpp/include/raft/random/rng.cuh
index 977d82830b..6fd1071d30 100644
--- a/cpp/include/raft/random/rng.cuh
+++ b/cpp/include/raft/random/rng.cuh
@@ -813,9 +813,11 @@ void sampleWithoutReplacement(raft::resources const& handle,
     rng_state, out, outIdx, in, wts, sampledLen, len, resource::get_cuda_stream(handle));
 }
 
-/** @brief Sample without replacement from range 0..N-1.
+/** @brief Sample from range 0..N-1.
+ *
+ * Elements are sampled uniformly. The method aims to sample without replacement,
+ * but there is a small probability that a few elements are duplicated.
  *
- * Elements are sampled uniformly.
  * The algorithm will allocate a workspace of size 4*n_samples*sizeof(IdxT) internally.
  *
  * We use max N random numbers. Depending on how large n_samples is w.r.t to N, we
diff --git a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
index 78e63f756d..d862e586e3 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
@@ -18,10 +18,6 @@
 
 #include <raft/core/logger.hpp>
 #include <raft/distance/distance_types.hpp>
-#include <raft/linalg/map.cuh>
-#include <raft/matrix/gather.cuh>
-#include <raft/random/rng.cuh>
-#include <raft/random/sample_without_replacement.cuh>
 #include <raft/util/cuda_utils.cuh>
 #include <raft/util/cudart_utils.hpp>
 #include <raft/util/integer_utils.hpp>
diff --git a/cpp/test/matrix/sample_rows.cu b/cpp/test/matrix/sample_rows.cu
index 048edde2ba..e332a918fe 100644
--- a/cpp/test/matrix/sample_rows.cu
+++ b/cpp/test/matrix/sample_rows.cu
@@ -85,14 +85,14 @@ class SampleRowsTest : public ::testing::TestWithParam<inputs> {
     std::unordered_set<int> occurrence;
 
     for (int64_t i = 0; i < params.n_samples; ++i) {
-      int val = (int)out_h(i, 0) / 1000;
+      T val = out_h(i, 0) / 1000;
       ASSERT_TRUE(0 <= val && val < params.N)
         << "out-of-range index @i=" << i << " val=" << val << " params=" << params;
-      ASSERT_TRUE(occurrence.find(val) == occurrence.end())
+      EXPECT_TRUE(occurrence.find(val) == occurrence.end())
         << "repeated index @i=" << i << " idx=" << val << " params=" << params;
       occurrence.insert(val);
       for (int64_t k = 0; k < params.dim; k++) {
-        ASSERT_TRUE(raft::match((int64_t)(out_h(i, k)), val * 1000 + k, raft::Compare<int64_t>()));
+        ASSERT_TRUE(raft::match(out_h(i, k), val * 1000 + k, raft::CompareApprox<T>(1e-6)));
       }
     }
   }
@@ -116,7 +116,8 @@ inline std::vector<inputs> generate_inputs()
   input1.insert(input1.end(), input2.begin(), input2.end());
 
   input2 = raft::util::itertools::product<inputs>(
-    {100000}, {1, 42}, {1, 137, 1000, 10000, 100000}, {false});
+    {100000}, {1, 42}, {1, 137, 1000, 10000, 50000, 62000, 100000}, {false});
+
   input1.insert(input1.end(), input2.begin(), input2.end());
 
   int n = input1.size();

From 57cb99c1423c215465ff7a1b317a67d90c3d96a4 Mon Sep 17 00:00:00 2001
From: Tamas Bela Feher <tfeher@nvidia.com>
Date: Fri, 15 Mar 2024 22:19:16 +0100
Subject: [PATCH 13/16] change member variables in test to local vars

---
 cpp/test/random/excess_sampling.cu | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/cpp/test/random/excess_sampling.cu b/cpp/test/random/excess_sampling.cu
index 45ed2a6727..e86436fb7d 100644
--- a/cpp/test/random/excess_sampling.cu
+++ b/cpp/test/random/excess_sampling.cu
@@ -52,21 +52,20 @@ class ExcessSamplingTest : public ::testing::TestWithParam<inputs> {
   ExcessSamplingTest()
     : params(::testing::TestWithParam<inputs>::GetParam()),
       stream(resource::get_cuda_stream(res)),
-      state{137ULL},
-      in(make_device_vector<T, int64_t>(res, params.n_samples)),
-      out(make_device_vector<T, int64_t>(res, 0)),
-      h_out(make_host_vector<T, int64_t>(res, params.n_samples))
-
+      state{137ULL}
   {
   }
 
   void check()
   {
-    out = raft::random::excess_subsample<T, int64_t>(res, state, params.N, params.n_samples);
+    device_vector<T, int64_t> out =
+      raft::random::excess_subsample<T, int64_t>(res, state, params.N, params.n_samples);
     ASSERT_TRUE(out.extent(0) == params.n_samples);
-    raft::copy(h_out.data_handle(), out.data_handle(), out.size(), stream);
 
+    auto h_out = make_host_vector<T, int64_t>(res, params.n_samples);
+    raft::copy(h_out.data_handle(), out.data_handle(), out.size(), stream);
     resource::sync_stream(res, stream);
+
     std::unordered_set<int> occurrence;
     int64_t sum = 0;
     for (int64_t i = 0; i < params.n_samples; ++i) {
@@ -90,8 +89,6 @@ class ExcessSamplingTest : public ::testing::TestWithParam<inputs> {
   raft::resources res;
   cudaStream_t stream;
   RngState state;
-  device_vector<T, int64_t> in, out;
-  host_vector<T, int64_t> h_out;
 };
 
 const std::vector<inputs> input1 = {{1, 0},

From 84e307e88109ddffa61be9ceb2cb65cfc9eed2b8 Mon Sep 17 00:00:00 2001
From: Tamas Bela Feher <tfeher@nvidia.com>
Date: Mon, 18 Mar 2024 14:31:01 +0100
Subject: [PATCH 14/16] Fix omp gather and add bench

---
 cpp/bench/prims/matrix/gather.cu          | 38 ++++++++++-
 cpp/include/raft/matrix/detail/gather.cuh | 79 ++++++++++++++---------
 2 files changed, 83 insertions(+), 34 deletions(-)

diff --git a/cpp/bench/prims/matrix/gather.cu b/cpp/bench/prims/matrix/gather.cu
index e6f26ba925..078f9e6198 100644
--- a/cpp/bench/prims/matrix/gather.cu
+++ b/cpp/bench/prims/matrix/gather.cu
@@ -16,34 +16,48 @@
 
 #include <common/benchmark.hpp>
 
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/host_mdarray.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/matrix/gather.cuh>
 #include <raft/random/rng.cuh>
 #include <raft/util/itertools.hpp>
 
 #include <rmm/device_uvector.hpp>
+#include <rmm/mr/device/pool_memory_resource.hpp>
 
 namespace raft::bench::matrix {
 
 template <typename IdxT>
 struct GatherParams {
   IdxT rows, cols, map_length;
+  bool host;
 };
 
 template <typename IdxT>
 inline auto operator<<(std::ostream& os, const GatherParams<IdxT>& p) -> std::ostream&
 {
-  os << p.rows << "#" << p.cols << "#" << p.map_length;
+  os << p.rows << "#" << p.cols << "#" << p.map_length << (p.host ? "#host" : "#device");
   return os;
 }
 
 template <typename T, typename MapT, typename IdxT, bool Conditional = false>
 struct Gather : public fixture {
   Gather(const GatherParams<IdxT>& p)
-    : params(p), matrix(this->handle), map(this->handle), out(this->handle), stencil(this->handle)
+    : params(p),
+      old_mr(rmm::mr::get_current_device_resource()),
+      pool_mr(rmm::mr::get_current_device_resource(), 2 * (1ULL << 30)),
+      matrix(this->handle),
+      map(this->handle),
+      out(this->handle),
+      stencil(this->handle),
+      matrix_h(this->handle)
   {
+    rmm::mr::set_current_device_resource(&pool_mr);
   }
 
+  ~Gather() { rmm::mr::set_current_device_resource(old_mr); }
+
   void allocate_data(const ::benchmark::State& state) override
   {
     matrix  = raft::make_device_matrix<T, IdxT>(handle, params.rows, params.cols);
@@ -59,6 +73,11 @@ struct Gather : public fixture {
     if constexpr (Conditional) {
       raft::random::uniform(handle, rng, stencil.data_handle(), params.map_length, T(-1), T(1));
     }
+
+    if (params.host) {
+      matrix_h = raft::make_host_matrix<T, IdxT>(handle, params.rows, params.cols);
+      raft::copy(matrix_h.data_handle(), matrix.data_handle(), matrix.size(), stream);
+    }
     resource::sync_stream(handle, stream);
   }
 
@@ -77,14 +96,22 @@ struct Gather : public fixture {
         raft::matrix::gather_if(
           handle, matrix_const_view, out.view(), map_const_view, stencil_const_view, pred_op);
       } else {
-        raft::matrix::gather(handle, matrix_const_view, map_const_view, out.view());
+        if (params.host) {
+          raft::matrix::detail::gather(
+            handle, make_const_mdspan(matrix_h.view()), map_const_view, out.view());
+        } else {
+          raft::matrix::gather(handle, matrix_const_view, map_const_view, out.view());
+        }
       }
     });
   }
 
  private:
   GatherParams<IdxT> params;
+  rmm::mr::device_memory_resource* old_mr;
+  rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource> pool_mr;
   raft::device_matrix<T, IdxT> matrix, out;
+  raft::host_matrix<T, IdxT> matrix_h;
   raft::device_vector<T, IdxT> stencil;
   raft::device_vector<MapT, IdxT> map;
 };  // struct Gather
@@ -100,4 +127,9 @@ RAFT_BENCH_REGISTER((Gather<float, uint32_t, int64_t>), "", gather_inputs_i64);
 RAFT_BENCH_REGISTER((Gather<double, uint32_t, int64_t>), "", gather_inputs_i64);
 RAFT_BENCH_REGISTER((GatherIf<float, uint32_t, int64_t>), "", gather_inputs_i64);
 RAFT_BENCH_REGISTER((GatherIf<double, uint32_t, int64_t>), "", gather_inputs_i64);
+
+auto inputs_host = raft::util::itertools::product<GatherParams<int64_t>>(
+  {10000000}, {100}, {1000, 1000000, 10000000}, {true});
+RAFT_BENCH_REGISTER((Gather<float, uint32_t, int64_t>), "Host", inputs_host);
+
 }  // namespace raft::bench::matrix
diff --git a/cpp/include/raft/matrix/detail/gather.cuh b/cpp/include/raft/matrix/detail/gather.cuh
index 04541e4378..05cc9204bf 100644
--- a/cpp/include/raft/matrix/detail/gather.cuh
+++ b/cpp/include/raft/matrix/detail/gather.cuh
@@ -27,6 +27,8 @@
 #include <raft/util/cuda_dev_essentials.cuh>
 #include <raft/util/cudart_utils.hpp>
 
+#include <omp.h>
+
 #include <functional>
 
 namespace raft {
@@ -344,11 +346,14 @@ void gather_if(const InputIteratorT in,
   gatherImpl(in, D, N, map, stencil, map_length, out, pred_op, transform_op, stream);
 }
 
-template <typename T, typename IdxT = int64_t>
-void gather_buff(host_matrix_view<const T, IdxT> dataset,
-                 host_vector_view<const IdxT, IdxT> indices,
-                 IdxT offset,
-                 pinned_matrix_view<T, IdxT> buff)
+/**
+ * Helper function to gather a set of vectors from a (host) dataset.
+ */
+template <typename T, typename IdxT, typename MatIdxT = int64_t>
+void gather_buff(host_matrix_view<const T, MatIdxT> dataset,
+                 host_vector_view<const IdxT, MatIdxT> indices,
+                 MatIdxT offset,
+                 pinned_matrix_view<T, MatIdxT> buff)
 {
   raft::common::nvtx::range<common::nvtx::domain::raft> fun_scope("gather_host_buff");
   IdxT batch_size = std::min<IdxT>(buff.extent(0), indices.extent(0) - offset);
@@ -362,47 +367,59 @@ void gather_buff(host_matrix_view<const T, IdxT> dataset,
   }
 }
 
-template <typename T, typename IdxT>
+template <typename T, typename IdxT, typename MatIdxT = int64_t>
 void gather(raft::resources const& res,
-            host_matrix_view<const T, IdxT> dataset,
-            device_vector_view<const IdxT, IdxT> indices,
-            raft::device_matrix_view<T, IdxT> output)
+            host_matrix_view<const T, MatIdxT> dataset,
+            device_vector_view<const IdxT, MatIdxT> indices,
+            raft::device_matrix_view<T, MatIdxT> output)
 {
   raft::common::nvtx::range<common::nvtx::domain::raft> fun_scope("gather");
   IdxT n_dim        = output.extent(1);
   IdxT n_train      = output.extent(0);
-  auto indices_host = raft::make_host_vector<IdxT, IdxT>(n_train);
+  auto indices_host = raft::make_host_vector<IdxT, MatIdxT>(n_train);
   raft::copy(
     indices_host.data_handle(), indices.data_handle(), n_train, resource::get_cuda_stream(res));
   resource::sync_stream(res);
 
-  const size_t max_batch_size = 32768;
+  const size_t buffer_size = 32768 * 1024;  // bytes
+  const size_t max_batch_size =
+    std::min<size_t>(round_up_safe<size_t>(buffer_size / n_dim, 32), n_train);
+  RAFT_LOG_DEBUG("Gathering data with batch size %zu", max_batch_size);
+
   // Gather the vector on the host in tmp buffers. We use two buffers to overlap H2D sync
   // and gathering the data.
-  auto out_tmp1 = raft::make_pinned_matrix<T, IdxT>(res, max_batch_size, n_dim);
-  auto out_tmp2 = raft::make_pinned_matrix<T, IdxT>(res, max_batch_size, n_dim);
-  auto view1    = out_tmp1.view();
-  auto view2    = out_tmp2.view();
+  auto out_tmp1 = raft::make_pinned_matrix<T, MatIdxT>(res, max_batch_size, n_dim);
+  auto out_tmp2 = raft::make_pinned_matrix<T, MatIdxT>(res, max_batch_size, n_dim);
+
+  // Usually a limited number of threads provide sufficient bandwidth for gathering data.
+  int n_threads = std::min(omp_get_max_threads(), 32);
+
+  // The gather_buff function has a parallel for loop. We start the omp parallel
+  // region here, to avoid repeated overhead within the device_offset loop.
+#pragma omp parallel num_threads(n_threads)
+  {
+    auto view1 = out_tmp1.view();
+    auto view2 = out_tmp2.view();
+    gather_buff(dataset, make_const_mdspan(indices_host.view()), (MatIdxT)0, view1);
+    for (MatIdxT device_offset = 0; device_offset < n_train; device_offset += max_batch_size) {
+      MatIdxT batch_size = std::min<IdxT>(max_batch_size, n_train - device_offset);
 
-  gather_buff(dataset, make_const_mdspan(indices_host.view()), (IdxT)0, view1);
-#pragma omp parallel
-  for (IdxT device_offset = 0; device_offset < n_train; device_offset += max_batch_size) {
-    IdxT batch_size = std::min<IdxT>(max_batch_size, n_train - device_offset);
 #pragma omp master
-    raft::copy(output.data_handle() + device_offset * n_dim,
-               view1.data_handle(),
-               batch_size * n_dim,
-               resource::get_cuda_stream(res));
-    // Start gathering the next batch on the host.
-    IdxT host_offset = device_offset + batch_size;
-    batch_size       = std::min<IdxT>(max_batch_size, n_train - host_offset);
-    if (batch_size > 0) {
-      gather_buff(dataset, make_const_mdspan(indices_host.view()), host_offset, view2);
-    }
+      raft::copy(output.data_handle() + device_offset * n_dim,
+                 view1.data_handle(),
+                 batch_size * n_dim,
+                 resource::get_cuda_stream(res));
+      // Start gathering the next batch on the host.
+      MatIdxT host_offset = device_offset + batch_size;
+      batch_size          = std::min<IdxT>(max_batch_size, n_train - host_offset);
+      if (batch_size > 0) {
+        gather_buff(dataset, make_const_mdspan(indices_host.view()), host_offset, view2);
+      }
 #pragma omp master
-    resource::sync_stream(res);
+      resource::sync_stream(res);
 #pragma omp barrier
-    std::swap(view1, view2);
+      std::swap(view1, view2);
+    }
   }
 }
 

From 65cf725d9af6a6225faa948b046d77601ae5ad36 Mon Sep 17 00:00:00 2001
From: Tamas Bela Feher <tfeher@nvidia.com>
Date: Mon, 18 Mar 2024 19:05:30 +0100
Subject: [PATCH 15/16] Fix docstring

---
 cpp/include/raft/random/detail/rng_impl.cuh | 2 +-
 cpp/include/raft/random/rng.cuh             | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/include/raft/random/detail/rng_impl.cuh b/cpp/include/raft/random/detail/rng_impl.cuh
index 98841cdf90..61a944e9b6 100644
--- a/cpp/include/raft/random/detail/rng_impl.cuh
+++ b/cpp/include/raft/random/detail/rng_impl.cuh
@@ -349,7 +349,7 @@ void affine_transform_params(RngState const& rng_state, IdxT n, IdxT& a, IdxT& b
  * @tparam MatIdxT extent type of the returned mdarray
  *
  * @param res RAFT resource handle
- * @param RngState state random number generator state
+ * @param state random number generator state
  * @param N number of elements to sample from. We will sample values in range 0..N-1
  * @param n_samples number of samples to return
  *
diff --git a/cpp/include/raft/random/rng.cuh b/cpp/include/raft/random/rng.cuh
index 6fd1071d30..b6d9a3c40a 100644
--- a/cpp/include/raft/random/rng.cuh
+++ b/cpp/include/raft/random/rng.cuh
@@ -827,7 +827,7 @@ void sampleWithoutReplacement(raft::resources const& handle,
  * @tparam MatIdxT extent type of the returned mdarray
  *
  * @param res RAFT resource handle
- * @param RngState state random number generator state
+ * @param state random number generator state
  * @param N number of elements to sample from. We will sample values in range 0..N-1
  * @param n_samples number of samples to return
  *

From 0b5d48f17c7dc8e8f7c21b04fd9b48ab1d7b868b Mon Sep 17 00:00:00 2001
From: Tamas Bela Feher <tfeher@nvidia.com>
Date: Tue, 19 Mar 2024 00:46:11 +0100
Subject: [PATCH 16/16] comment

---
 cpp/include/raft/random/rng.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/include/raft/random/rng.cuh b/cpp/include/raft/random/rng.cuh
index b6d9a3c40a..7fd461980f 100644
--- a/cpp/include/raft/random/rng.cuh
+++ b/cpp/include/raft/random/rng.cuh
@@ -828,7 +828,7 @@ void sampleWithoutReplacement(raft::resources const& handle,
  *
  * @param res RAFT resource handle
  * @param state random number generator state
- * @param N number of elements to sample from. We will sample values in range 0..N-1
+ * @param N number of elements to sample from. We will sample values in range 0..N-1.
  * @param n_samples number of samples to return
  *
  * @return device mdarray with the random samples