diff --git a/cpp/bench/CMakeLists.txt b/cpp/bench/CMakeLists.txt
index 5214047571..7a0f1d5201 100644
--- a/cpp/bench/CMakeLists.txt
+++ b/cpp/bench/CMakeLists.txt
@@ -18,7 +18,18 @@ set(RAFT_CPP_BENCH_TARGET "bench_raft")
 
 # (please keep the filenames in alphabetical order)
 add_executable(${RAFT_CPP_BENCH_TARGET}
+  bench/distance/distance_cosine.cu
+  bench/distance/distance_exp_l2.cu
+  bench/distance/distance_l1.cu
+  bench/distance/distance_unexp_l2.cu
+  bench/linalg/add.cu
+  bench/linalg/map_then_reduce.cu
+  bench/linalg/matrix_vector_op.cu
   bench/linalg/reduce.cu
+  bench/random/make_blobs.cu
+  bench/random/permute.cu
+  bench/random/rng.cu
+  bench/spatial/fused_l2_nn.cu
   bench/spatial/selection.cu
   bench/main.cpp
 )
@@ -47,6 +58,8 @@ target_include_directories(${RAFT_CPP_BENCH_TARGET}
 target_link_libraries(${RAFT_CPP_BENCH_TARGET}
   PRIVATE
     raft::raft
+    raft::distance
+    raft::nn
     faiss::faiss
     benchmark::benchmark
     $<TARGET_NAME_IF_EXISTS:OpenMP::OpenMP_CXX>
diff --git a/cpp/bench/distance/distance_common.cuh b/cpp/bench/distance/distance_common.cuh
new file mode 100644
index 0000000000..dae2550326
--- /dev/null
+++ b/cpp/bench/distance/distance_common.cuh
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <common/benchmark.hpp>
+#include <raft/cudart_utils.h>
+#include <raft/distance/distance.hpp>
+#if defined RAFT_DISTANCE_COMPILED
+#include <raft/distance/specializations.hpp>
+#endif
+#include <rmm/device_uvector.hpp>
+
+namespace raft::bench::distance {
+
+struct distance_inputs {
+  int m, n, k;      // buffer sizes: x holds m * k, y holds n * k and out holds m * n elements
+  bool isRowMajor;  // passed through as the last argument of raft::distance::distance
+};  // struct distance_inputs
+// Benchmark fixture that runs raft::distance::distance<DType, T, T, T> on zero-filled buffers.
+template <typename T, raft::distance::DistanceType DType>
+struct distance : public fixture {
+  distance(const distance_inputs& p)
+    : params(p),
+      x(p.m * p.k, stream),
+      y(p.n * p.k, stream),
+      out(p.m * p.n, stream),
+      workspace(0, stream)  // empty for now; resized in the body once worksize is known
+  {
+    RAFT_CUDA_TRY(cudaMemsetAsync(x.data(), 0, x.size() * sizeof(T), stream));
+    RAFT_CUDA_TRY(cudaMemsetAsync(y.data(), 0, y.size() * sizeof(T), stream));
+    RAFT_CUDA_TRY(cudaMemsetAsync(out.data(), 0, out.size() * sizeof(T), stream));
+    worksize = raft::distance::getWorkspaceSize<DType, T, T, T>(
+      x.data(), y.data(), params.m, params.n, params.k);
+    workspace.resize(worksize, stream);
+  }
+  // Hands loop_on_state a callable that issues a single raft::distance::distance call.
+  void run_benchmark(::benchmark::State& state) override
+  {
+    loop_on_state(state, [this]() {
+      raft::distance::distance<DType, T, T, T>(x.data(),
+                                               y.data(),
+                                               out.data(),
+                                               params.m,
+                                               params.n,
+                                               params.k,
+                                               (void*)workspace.data(),
+                                               worksize,
+                                               stream,
+                                               params.isRowMajor);
+    });
+  }
+
+ private:
+  distance_inputs params;
+  rmm::device_uvector<T> x, y, out;
+  rmm::device_uvector<char> workspace;
+  size_t worksize;  // set from getWorkspaceSize in the ctor; also the workspace length
+};  // struct distance
+// Cases are {m, n, k, isRowMajor}: 19 shapes with isRowMajor = true, then the same 19 with false.
+const std::vector<distance_inputs> dist_input_vecs{
+  {32, 16384, 16384, true},    {64, 16384, 16384, true},    {128, 16384, 16384, true},
+  {256, 16384, 16384, true},   {512, 16384, 16384, true},   {1024, 16384, 16384, true},
+  {16384, 32, 16384, true},    {16384, 64, 16384, true},    {16384, 128, 16384, true},
+  {16384, 256, 16384, true},   {16384, 512, 16384, true},   {16384, 1024, 16384, true},
+  {16384, 16384, 32, true},    {16384, 16384, 64, true},    {16384, 16384, 128, true},
+  {16384, 16384, 256, true},   {16384, 16384, 512, true},   {16384, 16384, 1024, true},
+  {16384, 16384, 16384, true}, {32, 16384, 16384, false},   {64, 16384, 16384, false},
+  {128, 16384, 16384, false},  {256, 16384, 16384, false},  {512, 16384, 16384, false},
+  {1024, 16384, 16384, false}, {16384, 32, 16384, false},   {16384, 64, 16384, false},
+  {16384, 128, 16384, false},  {16384, 256, 16384, false},  {16384, 512, 16384, false},
+  {16384, 1024, 16384, false}, {16384, 16384, 32, false},   {16384, 16384, 64, false},
+  {16384, 16384, 128, false},  {16384, 16384, 256, false},  {16384, 16384, 512, false},
+  {16384, 16384, 1024, false}, {16384, 16384, 16384, false}
+
+};
+// Registers distance<float, Metric> as Name##F and distance<double, Metric> as Name##D.
+#define DIST_BENCH_REGISTER(Name, Metric)            \
+  using Name##F = distance<float, Metric>;           \
+  RAFT_BENCH_REGISTER(Name##F, "", dist_input_vecs); \
+  using Name##D = distance<double, Metric>;          \
+  RAFT_BENCH_REGISTER(Name##D, "", dist_input_vecs);
+
+}  // namespace raft::bench::distance
diff --git a/cpp/bench/distance/distance_cosine.cu b/cpp/bench/distance/distance_cosine.cu
new file mode 100644
index 0000000000..20f29ce4ef
--- /dev/null
+++ b/cpp/bench/distance/distance_cosine.cu
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "distance_common.cuh"
+
+namespace raft::bench::distance {
+// Registers DistanceCosineF / DistanceCosineD benchmarks for the CosineExpanded metric.
+DIST_BENCH_REGISTER(DistanceCosine, raft::distance::DistanceType::CosineExpanded);
+
+}  // namespace raft::bench::distance
diff --git a/cpp/bench/distance/distance_exp_l2.cu b/cpp/bench/distance/distance_exp_l2.cu
new file mode 100644
index 0000000000..5a3af17193
--- /dev/null
+++ b/cpp/bench/distance/distance_exp_l2.cu
@@ -0,0 +1,24 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "distance_common.cuh"
+
+namespace raft::bench::distance {
+// Registers float/double benchmarks for the L2Expanded and L2SqrtExpanded metrics.
+DIST_BENCH_REGISTER(DistanceL2Sq, raft::distance::DistanceType::L2Expanded);
+DIST_BENCH_REGISTER(DistanceL2Sqrt, raft::distance::DistanceType::L2SqrtExpanded);
+
+}  // namespace raft::bench::distance
diff --git a/cpp/bench/distance/distance_l1.cu b/cpp/bench/distance/distance_l1.cu
new file mode 100644
index 0000000000..2ad7d5e957
--- /dev/null
+++ b/cpp/bench/distance/distance_l1.cu
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "distance_common.cuh"
+
+namespace raft::bench::distance {
+// Registers DistanceL1F / DistanceL1D benchmarks for the L1 metric.
+DIST_BENCH_REGISTER(DistanceL1, raft::distance::DistanceType::L1);
+
+}  // namespace raft::bench::distance
diff --git a/cpp/bench/distance/distance_unexp_l2.cu b/cpp/bench/distance/distance_unexp_l2.cu
new file mode 100644
index 0000000000..406aca2378
--- /dev/null
+++ b/cpp/bench/distance/distance_unexp_l2.cu
@@ -0,0 +1,24 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "distance_common.cuh"
+
+namespace raft::bench::distance {
+// Registers float/double benchmarks for the L2Unexpanded and L2SqrtUnexpanded metrics.
+DIST_BENCH_REGISTER(DistanceUnexpL2Sq, raft::distance::DistanceType::L2Unexpanded);
+DIST_BENCH_REGISTER(DistanceUnexpL2Sqrt, raft::distance::DistanceType::L2SqrtUnexpanded);
+
+}  // namespace raft::bench::distance
diff --git a/cpp/bench/linalg/add.cu b/cpp/bench/linalg/add.cu
new file mode 100644
index 0000000000..7c651b61ed
--- /dev/null
+++ b/cpp/bench/linalg/add.cu
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <common/benchmark.hpp>
+#include <raft/linalg/add.hpp>
+#include <rmm/device_uvector.hpp>
+
+namespace raft::bench::linalg {
+
+struct add_inputs {
+  int len;  // element count of both buffers and the length passed to raft::linalg::add
+};  // struct add_inputs
+// Benchmark fixture for raft::linalg::add; ptr0 is passed as both its first and second argument.
+template <typename T>
+struct add : public fixture {
+  add(const add_inputs& p) : params(p), ptr0(p.len, stream), ptr1(p.len, stream) {}
+  // NOTE(review): nothing in this struct writes ptr0 or ptr1 before add() reads them -- confirm.
+  void run_benchmark(::benchmark::State& state) override
+  {
+    loop_on_state(state, [this]() {
+      raft::linalg::add(ptr0.data(), ptr0.data(), ptr1.data(), params.len, stream);
+    });
+  }
+
+ private:
+  add_inputs params;
+  rmm::device_uvector<T> ptr0, ptr1;
+};  // struct add
+// Lengths to benchmark: 256 * 1024 * 1024 elements, then that size plus 2 and plus 1.
+const std::vector<add_inputs> add_input_vecs{
+  {256 * 1024 * 1024}, {256 * 1024 * 1024 + 2}, {256 * 1024 * 1024 + 1}
+
+};
+
+RAFT_BENCH_REGISTER(add<float>, "", add_input_vecs);
+RAFT_BENCH_REGISTER(add<double>, "", add_input_vecs);
+
+}  // namespace raft::bench::linalg
diff --git a/cpp/bench/linalg/map_then_reduce.cu b/cpp/bench/linalg/map_then_reduce.cu
new file mode 100644
index 0000000000..7eeb4a79b6
--- /dev/null
+++ b/cpp/bench/linalg/map_then_reduce.cu
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <common/benchmark.hpp>
+#include <raft/linalg/map_then_reduce.hpp>
+#include <rmm/device_uvector.hpp>
+
+namespace raft::bench::linalg {
+
+struct map_then_reduce_inputs {
+  int len;  // size of the input buffer and the length passed to mapThenSumReduce
+};
+// Pass-through map functor; the call operator is const so it also works on const instances.
+template <typename Type>
+struct Identity {
+  HDI Type operator()(Type a) const { return a; }
+};
+// Benchmark fixture for raft::linalg::mapThenSumReduce with the Identity map over `in`.
+template <typename T>
+struct map_then_reduce : public fixture {
+  map_then_reduce(const map_then_reduce_inputs& p) : params(p), out(1, stream), in(p.len, stream) {}
+  // Initializers follow the declaration order (out, in), which is the order C++ actually uses.
+  void run_benchmark(::benchmark::State& state) override
+  {
+    loop_on_state(state, [this]() {
+      raft::linalg::mapThenSumReduce(out.data(), params.len, Identity<T>(), stream, in.data());
+    });
+  }
+
+ private:
+  map_then_reduce_inputs params;
+  rmm::device_uvector<T> out, in;  // out has a single element; in has params.len elements
+};  // struct map_then_reduce
+// Lengths to benchmark: 2^20, 2^25 and 2^30 elements, then each of those plus 2 and plus 1.
+const std::vector<map_then_reduce_inputs> map_then_reduce_input_vecs{
+  {1024 * 1024},
+  {32 * 1024 * 1024},
+  {1024 * 1024 * 1024},
+  {1024 * 1024 + 2},
+  {32 * 1024 * 1024 + 2},
+  {1024 * 1024 * 1024 + 2},
+  {1024 * 1024 + 1},
+  {32 * 1024 * 1024 + 1},
+  {1024 * 1024 * 1024 + 1},
+
+};
+
+RAFT_BENCH_REGISTER(map_then_reduce<float>, "", map_then_reduce_input_vecs);
+RAFT_BENCH_REGISTER(map_then_reduce<double>, "", map_then_reduce_input_vecs);
+
+}  // namespace raft::bench::linalg
diff --git a/cpp/bench/linalg/matrix_vector_op.cu b/cpp/bench/linalg/matrix_vector_op.cu
new file mode 100644
index 0000000000..d3a53ea345
--- /dev/null
+++ b/cpp/bench/linalg/matrix_vector_op.cu
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <common/benchmark.hpp>
+#include <raft/linalg/matrix_vector_op.hpp>
+#include <rmm/device_uvector.hpp>
+
+namespace raft::bench::linalg {
+
+struct mat_vec_op_inputs {
+  int rows, cols;                 // in and out each hold rows * cols elements
+  bool rowMajor, bcastAlongRows;  // vec holds cols elements if bcastAlongRows, else rows
+};  // struct mat_vec_op_inputs
+// Benchmark fixture for raft::linalg::matrixVectorOp with raft::Sum<T> as the operation argument.
+template <typename T>
+struct mat_vec_op : public fixture {
+  mat_vec_op(const mat_vec_op_inputs& p)
+    : params(p),
+      out(p.rows * p.cols, stream),
+      in(p.rows * p.cols, stream),
+      vec(p.bcastAlongRows ? p.cols : p.rows, stream)
+  {
+  }
+
+  void run_benchmark(::benchmark::State& state) override
+  {
+    loop_on_state(state, [this]() {
+      raft::linalg::matrixVectorOp(out.data(),
+                                   in.data(),
+                                   vec.data(),
+                                   params.cols,  // note: cols is passed before rows
+                                   params.rows,
+                                   params.rowMajor,
+                                   params.bcastAlongRows,
+                                   raft::Sum<T>(),
+                                   stream);
+    });
+  }
+
+ private:
+  mat_vec_op_inputs params;
+  rmm::device_uvector<T> out, in, vec;
+};  // struct mat_vec_op
+// Cases are {rows, cols, rowMajor, bcastAlongRows}; each blank-line group fixes the two flags.
+const std::vector<mat_vec_op_inputs> mat_vec_op_input_vecs{
+  {1024, 128, true, true},       {1024 * 1024, 128, true, true},
+  {1024, 128 + 2, true, true},   {1024 * 1024, 128 + 2, true, true},
+  {1024, 128 + 1, true, true},   {1024 * 1024, 128 + 1, true, true},
+
+  {1024, 128, true, false},      {1024 * 1024, 128, true, false},
+  {1024, 128 + 2, true, false},  {1024 * 1024, 128 + 2, true, false},
+  {1024, 128 + 1, true, false},  {1024 * 1024, 128 + 1, true, false},
+
+  {1024, 128, false, false},     {1024 * 1024, 128, false, false},
+  {1024, 128 + 2, false, false}, {1024 * 1024, 128 + 2, false, false},
+  {1024, 128 + 1, false, false}, {1024 * 1024, 128 + 1, false, false},
+
+  {1024, 128, false, true},      {1024 * 1024, 128, false, true},
+  {1024, 128 + 2, false, true},  {1024 * 1024, 128 + 2, false, true},
+  {1024, 128 + 1, false, true},  {1024 * 1024, 128 + 1, false, true},
+
+};
+
+RAFT_BENCH_REGISTER(mat_vec_op<float>, "", mat_vec_op_input_vecs);
+RAFT_BENCH_REGISTER(mat_vec_op<double>, "", mat_vec_op_input_vecs);
+
+}  // namespace raft::bench::linalg
diff --git a/cpp/bench/random/make_blobs.cu b/cpp/bench/random/make_blobs.cu
new file mode 100644
index 0000000000..c449223040
--- /dev/null
+++ b/cpp/bench/random/make_blobs.cu
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <common/benchmark.hpp>
+#include <raft/random/make_blobs.hpp>
+#include <rmm/device_uvector.hpp>
+#include <vector>
+
+namespace raft::bench::random {
+struct make_blobs_inputs {
+  int rows, cols, clusters;  // data holds rows * cols elements and labels holds rows elements
+  bool row_major;            // passed through as the last argument of raft::random::make_blobs
+};  // struct make_blobs_inputs
+// Benchmark fixture for raft::random::make_blobs writing into preallocated data/labels buffers.
+template <typename T>
+struct make_blobs : public fixture {
+  make_blobs(const make_blobs_inputs& p)
+    : params(p), data(p.rows * p.cols, stream), labels(p.rows, stream)
+  {
+  }
+
+  void run_benchmark(::benchmark::State& state) override
+  {
+    loop_on_state(state, [this]() {
+      raft::random::make_blobs(data.data(),
+                               labels.data(),
+                               params.rows,
+                               params.cols,
+                               params.clusters,
+                               this->stream,
+                               params.row_major);
+    });
+  }
+
+ private:
+  make_blobs_inputs params;
+  rmm::device_uvector<T> data;
+  rmm::device_uvector<int> labels;
+};  // struct make_blobs
+// Builds the cases: every rows x cols x clusters combination, with row_major true and false.
+static std::vector<make_blobs_inputs> get_make_blobs_input_vecs()
+{
+  const std::vector<int> row_counts{100000, 1000000};
+  const std::vector<int> col_counts{10, 100};
+  const std::vector<int> cluster_counts{2, 10, 100};
+  std::vector<make_blobs_inputs> cases;
+  for (int rows : row_counts) {
+    for (int cols : col_counts) {
+      for (int clusters : cluster_counts) {
+        // Same shape twice: row_major = true is appended first, then row_major = false.
+        for (bool row_major : {true, false}) {
+          cases.push_back(make_blobs_inputs{rows, cols, clusters, row_major});
+        }
+      }
+    }
+  }
+  // 2 * 2 * 3 * 2 = 24 cases in total.
+  return cases;
+}
+// Register the float and double fixtures; each call builds its own copy of the case list.
+RAFT_BENCH_REGISTER(make_blobs<float>, "", get_make_blobs_input_vecs());
+RAFT_BENCH_REGISTER(make_blobs<double>, "", get_make_blobs_input_vecs());
+
+}  // namespace raft::bench::random
diff --git a/cpp/bench/random/permute.cu b/cpp/bench/random/permute.cu
new file mode 100644
index 0000000000..9ec9fb2cc9
--- /dev/null
+++ b/cpp/bench/random/permute.cu
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <common/benchmark.hpp>
+#include <raft/cudart_utils.h>
+#include <raft/random/permute.hpp>
+#include <raft/random/rng.hpp>
+
+#include <rmm/device_uvector.hpp>
+
+namespace raft::bench::random {
+
+struct permute_inputs {
+  int rows, cols;  // in and out each hold rows * cols elements
+  bool needPerms, needShuffle, rowMajor;  // NOTE(review): needShuffle is never read in this file
+};  // struct permute_inputs
+// Benchmark fixture for raft::random::permute; `in` is filled with uniform values in the ctor.
+template <typename T>
+struct permute : public fixture {
+  permute(const permute_inputs& p)
+    : params(p),
+      out(p.rows * p.cols, stream),
+      in(p.rows * p.cols, stream),
+      perms(p.needPerms ? p.rows : 0, stream)  // last, matching the member declaration order
+  {
+    raft::random::Rng r(123456ULL);
+    // Fill every element of `in`; the count used to be p.rows, leaving most of it uninitialized.
+    r.uniform(in.data(), p.rows * p.cols, T(-1.0), T(1.0), stream);
+  }
+  // Calls raft::random::permute(perms, out, in, cols, rows, rowMajor, stream) once per iteration.
+  void run_benchmark(::benchmark::State& state) override
+  {
+    loop_on_state(state, [this]() {
+      raft::random::permute(
+        perms.data(), out.data(), in.data(), params.cols, params.rows, params.rowMajor, stream);
+    });
+  }
+
+ private:
+  permute_inputs params;
+  rmm::device_uvector<T> out, in;
+  rmm::device_uvector<int> perms;  // empty when params.needPerms is false
+};  // struct permute
+// Cases are {rows, cols, needPerms, needShuffle, rowMajor}: six with rowMajor true, six false.
+const std::vector<permute_inputs> permute_input_vecs = {
+  {32 * 1024, 128, true, true, true},
+  {1024 * 1024, 128, true, true, true},
+  {32 * 1024, 128 + 2, true, true, true},
+  {1024 * 1024, 128 + 2, true, true, true},
+  {32 * 1024, 128 + 1, true, true, true},
+  {1024 * 1024, 128 + 1, true, true, true},
+
+  {32 * 1024, 128, true, true, false},
+  {1024 * 1024, 128, true, true, false},
+  {32 * 1024, 128 + 2, true, true, false},
+  {1024 * 1024, 128 + 2, true, true, false},
+  {32 * 1024, 128 + 1, true, true, false},
+  {1024 * 1024, 128 + 1, true, true, false},
+
+};
+
+RAFT_BENCH_REGISTER(permute<float>, "", permute_input_vecs);
+RAFT_BENCH_REGISTER(permute<double>, "", permute_input_vecs);
+
+}  // namespace raft::bench::random
diff --git a/cpp/bench/random/rng.cu b/cpp/bench/random/rng.cu
new file mode 100644
index 0000000000..942606cddf
--- /dev/null
+++ b/cpp/bench/random/rng.cu
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <common/benchmark.hpp>
+#include <raft/cudart_utils.h>
+#include <raft/random/rng.hpp>
+
+#include <rmm/device_uvector.hpp>
+
+namespace raft::bench::random {
+// Distribution selector; rng::run_benchmark switches on it to pick the raft::random::Rng method.
+enum RandomType {
+  RNG_Normal,
+  RNG_LogNormal,
+  RNG_Uniform,
+  RNG_Gumbel,
+  RNG_Logistic,
+  RNG_Exp,
+  RNG_Rayleigh,
+  RNG_Laplace,
+  RNG_Fill
+};  // enum RandomType
+// Parameters of one rng benchmark case.
+template <typename T>
+struct rng_inputs {
+  int len;                            // number of elements allocated and generated
+  RandomType type;                    // which Rng method run_benchmark calls
+  raft::random::GeneratorType gtype;  // forwarded to the raft::random::Rng constructor
+  T start, end;  // distribution arguments; end is not passed for RNG_Exp, RNG_Rayleigh, RNG_Fill
+};  // struct rng_inputs
+// Benchmark fixture that calls the raft::random::Rng method selected by params.type.
+template <typename T>
+struct rng : public fixture {
+  rng(const rng_inputs<T>& p) : params(p), ptr(p.len, stream) {}
+  // One Rng (seed 123456) is built per run_benchmark call and reused by every iteration.
+  void run_benchmark(::benchmark::State& state) override
+  {
+    raft::random::Rng r(123456ULL, params.gtype);
+    loop_on_state(state, [this, &r]() {
+      switch (params.type) {
+        case RNG_Normal: r.normal(ptr.data(), params.len, params.start, params.end, stream); break;
+        case RNG_LogNormal:
+          r.lognormal(ptr.data(), params.len, params.start, params.end, stream);
+          break;
+        case RNG_Uniform:
+          r.uniform(ptr.data(), params.len, params.start, params.end, stream);
+          break;
+        case RNG_Gumbel: r.gumbel(ptr.data(), params.len, params.start, params.end, stream); break;
+        case RNG_Logistic:
+          r.logistic(ptr.data(), params.len, params.start, params.end, stream);
+          break;
+        case RNG_Exp: r.exponential(ptr.data(), params.len, params.start, stream); break;
+        case RNG_Rayleigh: r.rayleigh(ptr.data(), params.len, params.start, stream); break;
+        case RNG_Laplace:
+          r.laplace(ptr.data(), params.len, params.start, params.end, stream);
+          break;
+        case RNG_Fill: r.fill(ptr.data(), params.len, params.start, stream); break;
+      };
+    });
+  }
+
+ private:
+  rng_inputs<T> params;
+  rmm::device_uvector<T> ptr;  // output buffer of params.len elements
+};  // struct rng
+// Cases are {len, type, gtype, start, end}: Uniform/GenPhilox, Uniform/GenPC, then Fill/GenPhilox.
+template <typename T>
+static std::vector<rng_inputs<T>> get_rng_input_vecs()
+{
+  using namespace raft::random;
+  return {
+    {1024 * 1024, RNG_Uniform, GenPhilox, T(-1.0), T(1.0)},
+    {32 * 1024 * 1024, RNG_Uniform, GenPhilox, T(-1.0), T(1.0)},
+    {1024 * 1024 * 1024, RNG_Uniform, GenPhilox, T(-1.0), T(1.0)},
+    {1024 * 1024 + 2, RNG_Uniform, GenPhilox, T(-1.0), T(1.0)},
+    {32 * 1024 * 1024 + 2, RNG_Uniform, GenPhilox, T(-1.0), T(1.0)},
+    {1024 * 1024 * 1024 + 2, RNG_Uniform, GenPhilox, T(-1.0), T(1.0)},
+    {1024 * 1024 + 1, RNG_Uniform, GenPhilox, T(-1.0), T(1.0)},
+    {32 * 1024 * 1024 + 1, RNG_Uniform, GenPhilox, T(-1.0), T(1.0)},
+    {1024 * 1024 * 1024 + 1, RNG_Uniform, GenPhilox, T(-1.0), T(1.0)},
+
+    {1024 * 1024, RNG_Uniform, GenPC, T(-1.0), T(1.0)},
+    {32 * 1024 * 1024, RNG_Uniform, GenPC, T(-1.0), T(1.0)},
+    {1024 * 1024 * 1024, RNG_Uniform, GenPC, T(-1.0), T(1.0)},
+    {1024 * 1024 + 2, RNG_Uniform, GenPC, T(-1.0), T(1.0)},
+    {32 * 1024 * 1024 + 2, RNG_Uniform, GenPC, T(-1.0), T(1.0)},
+    {1024 * 1024 * 1024 + 2, RNG_Uniform, GenPC, T(-1.0), T(1.0)},
+    {1024 * 1024 + 1, RNG_Uniform, GenPC, T(-1.0), T(1.0)},
+    {32 * 1024 * 1024 + 1, RNG_Uniform, GenPC, T(-1.0), T(1.0)},
+    {1024 * 1024 * 1024 + 1, RNG_Uniform, GenPC, T(-1.0), T(1.0)},
+
+    {1024 * 1024, RNG_Fill, GenPhilox, T(-1.0), T(1.0)},
+    {32 * 1024 * 1024, RNG_Fill, GenPhilox, T(-1.0), T(1.0)},
+    {1024 * 1024 * 1024, RNG_Fill, GenPhilox, T(-1.0), T(1.0)},
+    {1024 * 1024 + 2, RNG_Fill, GenPhilox, T(-1.0), T(1.0)},
+    {32 * 1024 * 1024 + 2, RNG_Fill, GenPhilox, T(-1.0), T(1.0)},
+    {1024 * 1024 * 1024 + 2, RNG_Fill, GenPhilox, T(-1.0), T(1.0)},
+    {1024 * 1024 + 1, RNG_Fill, GenPhilox, T(-1.0), T(1.0)},
+    {32 * 1024 * 1024 + 1, RNG_Fill, GenPhilox, T(-1.0), T(1.0)},
+    {1024 * 1024 * 1024 + 1, RNG_Fill, GenPhilox, T(-1.0), T(1.0)},
+  };
+}
+// Register the float and double fixtures, each with a case list built for its own value type.
+RAFT_BENCH_REGISTER(rng<float>, "", get_rng_input_vecs<float>());
+RAFT_BENCH_REGISTER(rng<double>, "", get_rng_input_vecs<double>());
+
+}  // namespace raft::bench::random
diff --git a/cpp/bench/spatial/fused_l2_nn.cu b/cpp/bench/spatial/fused_l2_nn.cu
new file mode 100644
index 0000000000..2eb2097920
--- /dev/null
+++ b/cpp/bench/spatial/fused_l2_nn.cu
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <common/benchmark.hpp>
+#include <limits>
+#include <raft/cudart_utils.h>
+#include <raft/distance/fused_l2_nn.hpp>
+#include <raft/handle.hpp>
+#include <raft/linalg/norm.hpp>
+#include <raft/random/rng.hpp>
+
+#if defined RAFT_NN_COMPILED
+#include <raft/spatial/knn/specializations.hpp>
+#endif
+
+namespace raft::bench::spatial {
+
+struct fused_l2_nn_inputs {
+  int m, n, k;  // x holds m * k, y holds n * k; out, xn and workspace hold m entries, yn holds n
+};  // struct fused_l2_nn_inputs
+// Benchmark fixture for raft::distance::fusedL2NN on uniform random x and y with row norms.
+template <typename T>
+struct fused_l2_nn : public fixture {
+  fused_l2_nn(const fused_l2_nn_inputs& p)
+    : params(p),
+      x(p.m * p.k, stream),
+      y(p.n * p.k, stream),
+      xn(p.m, stream),
+      yn(p.n, stream),
+      out(p.m, stream),  // listed after yn so the list matches the member declaration order
+      workspace(p.m, stream)
+  {
+    raft::handle_t handle{stream};
+    raft::random::Rng r(123456ULL);
+
+    r.uniform(x.data(), p.m * p.k, T(-1.0), T(1.0), stream);
+    r.uniform(y.data(), p.n * p.k, T(-1.0), T(1.0), stream);
+    raft::linalg::rowNorm(xn.data(), x.data(), p.k, p.m, raft::linalg::L2Norm, true, stream);
+    raft::linalg::rowNorm(yn.data(), y.data(), p.k, p.n, raft::linalg::L2Norm, true, stream);
+    raft::distance::initialize<T, cub::KeyValuePair<int, T>, int>(
+      handle, out.data(), p.m, std::numeric_limits<T>::max(), op);
+  }
+
+  void run_benchmark(::benchmark::State& state) override
+  {
+    loop_on_state(state, [this]() {
+      // it is enough to only benchmark the L2-squared metric
+      raft::distance::fusedL2NN<T, cub::KeyValuePair<int, T>, int>(out.data(),
+                                                                   x.data(),
+                                                                   y.data(),
+                                                                   xn.data(),
+                                                                   yn.data(),
+                                                                   params.m,
+                                                                   params.n,
+                                                                   params.k,
+                                                                   (void*)workspace.data(),
+                                                                   op,
+                                                                   pairRedOp,
+                                                                   false,
+                                                                   false,
+                                                                   stream);
+    });
+  }
+
+ private:
+  fused_l2_nn_inputs params;
+  rmm::device_uvector<T> x, y, xn, yn;
+  rmm::device_uvector<cub::KeyValuePair<int, T>> out;
+  rmm::device_uvector<int> workspace;
+  raft::distance::KVPMinReduce<int, T> pairRedOp;
+  raft::distance::MinAndDistanceReduceOp<int, T> op;
+};  // struct fused_l2_nn
+// Cases are {m, n, k}: sweep m, then n, then k over 32..1024 (others 16384), then all 16384.
+const std::vector<fused_l2_nn_inputs> fused_l2_nn_input_vecs = {
+  {32, 16384, 16384},  {64, 16384, 16384},   {128, 16384, 16384},   {256, 16384, 16384},
+  {512, 16384, 16384}, {1024, 16384, 16384}, {16384, 32, 16384},    {16384, 64, 16384},
+  {16384, 128, 16384}, {16384, 256, 16384},  {16384, 512, 16384},   {16384, 1024, 16384},
+  {16384, 16384, 32},  {16384, 16384, 64},   {16384, 16384, 128},   {16384, 16384, 256},
+  {16384, 16384, 512}, {16384, 16384, 1024}, {16384, 16384, 16384},
+
+};
+
+RAFT_BENCH_REGISTER(fused_l2_nn<float>, "", fused_l2_nn_input_vecs);
+RAFT_BENCH_REGISTER(fused_l2_nn<double>, "", fused_l2_nn_input_vecs);
+
+}  // namespace raft::bench::spatial
diff --git a/cpp/bench/spatial/selection.cu b/cpp/bench/spatial/selection.cu
index 09d02940a5..841c7754ed 100644
--- a/cpp/bench/spatial/selection.cu
+++ b/cpp/bench/spatial/selection.cu
@@ -17,6 +17,10 @@
 #include <common/benchmark.hpp>
 #include <raft/spatial/knn/knn.cuh>
 
+#if defined RAFT_NN_COMPILED
+#include <raft/spatial/knn/specializations.hpp>
+#endif
+
 #include <raft/random/rng.hpp>
 #include <raft/sparse/detail/utils.h>
 
diff --git a/cpp/include/raft/stats/completeness_score.cuh b/cpp/include/raft/stats/completeness_score.cuh
index dbfe6ce430..407986de05 100644
--- a/cpp/include/raft/stats/completeness_score.cuh
+++ b/cpp/include/raft/stats/completeness_score.cuh
@@ -19,7 +19,7 @@
 
 #pragma once
 
-#include <raft/stats/detail/completeness_score.cuh>
+#include <raft/stats/detail/homogeneity_score.cuh>
 
 namespace raft {
 namespace stats {
@@ -42,8 +42,8 @@ double completeness_score(const T* truthClusterArray,
                           T upperLabelRange,
                           cudaStream_t stream)
 {
-  return detail::completeness_score(
-    truthClusterArray, predClusterArray, size, lowerLabelRange, upperLabelRange, stream);
+  return detail::homogeneity_score(
+    predClusterArray, truthClusterArray, size, lowerLabelRange, upperLabelRange, stream);
 }
 
 };  // end namespace stats
diff --git a/cpp/include/raft/stats/completeness_score.hpp b/cpp/include/raft/stats/completeness_score.hpp
index 0dd97e9782..40e60c852c 100644
--- a/cpp/include/raft/stats/completeness_score.hpp
+++ b/cpp/include/raft/stats/completeness_score.hpp
@@ -18,39 +18,6 @@
  * Please use the cuh version instead.
  */
 
-#ifndef __COMPLETENESS_SCORE_H
-#define __COMPLETENESS_SCORE_H
-
 #pragma once
 
-#include <raft/stats/detail/completeness_score.cuh>
-
-namespace raft {
-namespace stats {
-
-/**
- * @brief Function to calculate the completeness score between two clusters
- *
- * @param truthClusterArray: the array of truth classes of type T
- * @param predClusterArray: the array of predicted classes of type T
- * @param size: the size of the data points of type int
- * @param lowerLabelRange: the lower bound of the range of labels
- * @param upperLabelRange: the upper bound of the range of labels
- * @param stream: the cudaStream object
- */
-template <typename T>
-double completeness_score(const T* truthClusterArray,
-                          const T* predClusterArray,
-                          int size,
-                          T lowerLabelRange,
-                          T upperLabelRange,
-                          cudaStream_t stream)
-{
-  return detail::completeness_score(
-    truthClusterArray, predClusterArray, size, lowerLabelRange, upperLabelRange, stream);
-}
-
-};  // end namespace stats
-};  // end namespace raft
-
-#endif
+#include <raft/stats/completeness_score.cuh>
\ No newline at end of file
diff --git a/cpp/include/raft/stats/detail/completeness_score.cuh b/cpp/include/raft/stats/detail/completeness_score.cuh
deleted file mode 100644
index 5e6fb835ef..0000000000
--- a/cpp/include/raft/stats/detail/completeness_score.cuh
+++ /dev/null
@@ -1,71 +0,0 @@
-/*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-/**
- * @file completeness_score.cuh
- *
- * @brief A clustering result satisfies completeness if all the data points
- * that are members of a given class are elements of the same cluster.
- */
-
-#pragma once
-
-#include <raft/stats/entropy.cuh>
-#include <raft/stats/mutual_info_score.cuh>
-
-namespace raft {
-namespace stats {
-namespace detail {
-
-/**
- * @brief Function to calculate the completeness score between two clusters
- *
- * @param truthClusterArray: the array of truth classes of type T
- * @param predClusterArray: the array of predicted classes of type T
- * @param size: the size of the data points of type int
- * @param lowerLabelRange: the lower bound of the range of labels
- * @param upperLabelRange: the upper bound of the range of labels
- * @param stream: the cudaStream object
- */
-template <typename T>
-double completeness_score(const T* truthClusterArray,
-                          const T* predClusterArray,
-                          int size,
-                          T lowerLabelRange,
-                          T upperLabelRange,
-                          cudaStream_t stream)
-{
-  if (size == 0) return 1.0;
-
-  double computedMI, computedEntropy;
-
-  computedMI = raft::stats::mutual_info_score(
-    truthClusterArray, predClusterArray, size, lowerLabelRange, upperLabelRange, stream);
-  computedEntropy =
-    raft::stats::entropy(predClusterArray, size, lowerLabelRange, upperLabelRange, stream);
-
-  double completeness;
-
-  if (computedEntropy) {
-    completeness = computedMI / computedEntropy;
-  } else
-    completeness = 1.0;
-
-  return completeness;
-}
-
-};  // end namespace detail
-};  // end namespace stats
-};  // end namespace raft