diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 9768dc266c..cf4d0ed55e 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -317,8 +317,6 @@ if(RAFT_COMPILE_DIST_LIBRARY)
     src/distance/cluster/kmeans_init_plus_plus_float.cu
     src/distance/distance/specializations/detail/canberra_double_double_double_int.cu
     src/distance/distance/specializations/detail/canberra_float_float_float_int.cu
-    src/distance/distance/specializations/detail/chebyshev_double_double_double_int.cu
-    src/distance/distance/specializations/detail/chebyshev_float_float_float_int.cu
     src/distance/distance/specializations/detail/correlation_double_double_double_int.cu
     src/distance/distance/specializations/detail/correlation_float_float_float_int.cu
     src/distance/distance/specializations/detail/cosine_double_double_double_int.cu
@@ -352,6 +350,8 @@ if(RAFT_COMPILE_DIST_LIBRARY)
     src/distance/distance/specializations/detail/l2_sqrt_unexpanded_double_double_double_int.cu
     src/distance/distance/specializations/detail/l2_unexpanded_double_double_double_int.cu
     src/distance/distance/specializations/detail/l2_unexpanded_float_float_float_int.cu
+    src/distance/distance/specializations/detail/l_inf_double_double_double_int.cu
+    src/distance/distance/specializations/detail/l_inf_float_float_float_int.cu
     src/distance/distance/specializations/detail/lp_unexpanded_double_double_double_int.cu
     src/distance/distance/specializations/detail/lp_unexpanded_float_float_float_int.cu
     src/distance/distance/specializations/detail/russel_rao_double_double_double_int.cu
diff --git a/cpp/include/raft/distance/detail/canberra.cuh b/cpp/include/raft/distance/detail/canberra.cuh
deleted file mode 100644
index 88add4c2b0..0000000000
--- a/cpp/include/raft/distance/detail/canberra.cuh
+++ /dev/null
@@ -1,194 +0,0 @@
-/*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-#include <raft/distance/detail/pairwise_distance_base.cuh>
-
-namespace raft {
-namespace distance {
-namespace detail {
-
-/**
- * @brief the canberra distance matrix calculation implementer
- *  It computes the following equation: cij = max(cij, op(ai-bj))
- * @tparam DataT          input data-type (for A and B matrices)
- * @tparam AccT           accumulation data-type
- * @tparam OutT           output data-type (for C and D matrices)
- * @tparam IdxT           index data-type
- * @tparam Veclen         number of k-elements loaded by each thread
-                          for every LDG call. details in contractions.cuh
- * @tparam FinalLambda    final lambda called on final distance value
- * @tparam isRowMajor     true if input/output is row major,
-                          false for column major
- * @param[in]       x input matrix
- * @param[in]       y input matrix
- * @param[in]       m number of rows of A and C/D
- * @param[in]       n number of rows of B and cols of C/D
- * @param[in]       k number of cols of A and B
- * @param[in]       lda leading dimension of A
- * @param[in]       ldb leading dimension of B
- * @param[in]       ldd leading dimension of C/D
- * @param[output]   dOutput output matrix
- * @param fin_op    the final gemm epilogue lambda
- * @param stream    cuda stream to launch work
- */
-template <typename DataT,
-          typename AccT,
-          typename OutT,
-          typename IdxT,
-          int VecLen,
-          typename FinalLambda,
-          bool isRowMajor>
-static void canberraImpl(const DataT* x,
-                         const DataT* y,
-                         IdxT m,
-                         IdxT n,
-                         IdxT k,
-                         IdxT lda,
-                         IdxT ldb,
-                         IdxT ldd,
-                         OutT* dOutput,
-                         FinalLambda fin_op,
-                         cudaStream_t stream)
-{
-  typedef typename raft::linalg::Policy4x4<DataT, VecLen>::Policy RowPolicy;
-  typedef typename raft::linalg::Policy4x4<DataT, VecLen>::ColPolicy ColPolicy;
-
-  typedef typename std::conditional<isRowMajor, RowPolicy, ColPolicy>::type KPolicy;
-
-  dim3 blk(KPolicy::Nthreads);
-
-  // Accumulation operation lambda
-  auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) {
-    const auto diff = raft::abs(x - y);
-    const auto add  = raft::abs(x) + raft::abs(y);
-    // deal with potential for 0 in denominator by
-    // forcing 1/0 instead
-    acc += ((add != 0) * diff / (add + (add == 0)));
-  };
-
-  // epilogue operation lambda for final value calculation
-  auto epilog_lambda = raft::void_op();
-
-  if constexpr (isRowMajor) {
-    auto canberraRowMajor = pairwiseDistanceMatKernel<false,
-                                                      DataT,
-                                                      AccT,
-                                                      OutT,
-                                                      IdxT,
-                                                      KPolicy,
-                                                      decltype(core_lambda),
-                                                      decltype(epilog_lambda),
-                                                      FinalLambda,
-                                                      true>;
-    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize, canberraRowMajor);
-
-    canberraRowMajor<<<grid, blk, KPolicy::SmemSize, stream>>>(
-      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op);
-  } else {
-    auto canberraColMajor = pairwiseDistanceMatKernel<false,
-                                                      DataT,
-                                                      AccT,
-                                                      OutT,
-                                                      IdxT,
-                                                      KPolicy,
-                                                      decltype(core_lambda),
-                                                      decltype(epilog_lambda),
-                                                      FinalLambda,
-                                                      false>;
-    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize, canberraColMajor);
-    canberraColMajor<<<grid, blk, KPolicy::SmemSize, stream>>>(
-      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op);
-  }
-
-  RAFT_CUDA_TRY(cudaGetLastError());
-}
-
-template <typename DataT,
-          typename AccT,
-          typename OutT,
-          typename IdxT,
-          typename FinalLambda,
-          bool isRowMajor>
-void canberra(IdxT m,
-              IdxT n,
-              IdxT k,
-              IdxT lda,
-              IdxT ldb,
-              IdxT ldd,
-              const DataT* x,
-              const DataT* y,
-              OutT* dOutput,
-              FinalLambda fin_op,
-              cudaStream_t stream)
-{
-  size_t bytesA = sizeof(DataT) * lda;
-  size_t bytesB = sizeof(DataT) * ldb;
-  canberraImpl<DataT, AccT, OutT, IdxT, 1, FinalLambda, isRowMajor>(
-    x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
-}
-
-/**
- * @brief the canberra distance matrix calculation
- *  It computes the following equation: cij = max(cij, op(ai-bj))
- * @tparam InType input data-type (for A and B matrices)
- * @tparam AccType accumulation data-type
- * @tparam OutType output data-type (for C and D matrices)
- * @tparam FinalLambda user-defined epilogue lamba
- * @tparam Index_ Index type
- * @param[in] m number of rows of A and C/D
- * @param[in] n number of rows of B and cols of C/D
- * @param[in] k number of cols of A and B
- * @param[in] pA input matrix
- * @param[in] pB input matrix
- * @param[out] pD output matrix
- * @param[in] fin_op the final element-wise epilogue lambda
- * @param[in] stream cuda stream to launch work
- * @param[in] isRowMajor whether the input and output matrices are row major
- */
-template <typename InType,
-          typename AccType,
-          typename OutType,
-          typename FinalLambda,
-          typename Index_ = int>
-void canberraImpl(int m,
-                  int n,
-                  int k,
-                  const InType* pA,
-                  const InType* pB,
-                  OutType* pD,
-                  FinalLambda fin_op,
-                  cudaStream_t stream,
-                  bool isRowMajor)
-{
-  typedef std::is_same<OutType, bool> is_bool;
-  typedef typename std::conditional<is_bool::value, OutType, AccType>::type canberraOutType;
-  Index_ lda, ldb, ldd;
-  canberraOutType* pDcast = reinterpret_cast<canberraOutType*>(pD);
-  if (isRowMajor) {
-    lda = k, ldb = k, ldd = n;
-    canberra<InType, AccType, canberraOutType, Index_, FinalLambda, true>(
-      m, n, k, lda, ldb, ldd, pA, pB, pDcast, fin_op, stream);
-  } else {
-    lda = n, ldb = m, ldd = m;
-    canberra<InType, AccType, canberraOutType, Index_, FinalLambda, false>(
-      n, m, k, lda, ldb, ldd, pB, pA, pDcast, fin_op, stream);
-  }
-}
-
-}  // namespace detail
-}  // namespace distance
-}  // namespace raft
diff --git a/cpp/include/raft/distance/detail/chebyshev.cuh b/cpp/include/raft/distance/detail/chebyshev.cuh
deleted file mode 100644
index 43b36e7921..0000000000
--- a/cpp/include/raft/distance/detail/chebyshev.cuh
+++ /dev/null
@@ -1,198 +0,0 @@
-/*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-#include <raft/core/operators.hpp>
-#include <raft/distance/detail/pairwise_distance_base.cuh>
-
-namespace raft {
-namespace distance {
-namespace detail {
-/**
- * @brief the Chebyshev distance matrix calculation implementer
- *  It computes the following equation: cij = max(cij, op(ai-bj))
- * @tparam DataT          input data-type (for A and B matrices)
- * @tparam AccT           accumulation data-type
- * @tparam OutT           output data-type (for C and D matrices)
- * @tparam IdxT           index data-type
- * @tparam Veclen         number of k-elements loaded by each thread
-                          for every LDG call. details in contractions.cuh
- * @tparam FinalLambda    final lambda called on final distance value
- * @tparam isRowMajor     true if input/output is row major,
-                          false for column major
- * @param[in]       x input matrix
- * @param[in]       y input matrix
- * @param[in]       m number of rows of A and C/D
- * @param[in]       n number of rows of B and cols of C/D
- * @param[in]       k number of cols of A and B
- * @param[in]       lda leading dimension of A
- * @param[in]       ldb leading dimension of B
- * @param[in]       ldd leading dimension of C/D
- * @param[out]      dOutput output matrix
- * @param[in]       fin_op the final gemm epilogue lambda
- * @param[in]       stream cuda stream to launch work
- */
-template <typename DataT,
-          typename AccT,
-          typename OutT,
-          typename IdxT,
-          int VecLen,
-          typename FinalLambda,
-          bool isRowMajor>
-static void chebyshevImpl(const DataT* x,
-                          const DataT* y,
-                          IdxT m,
-                          IdxT n,
-                          IdxT k,
-                          IdxT lda,
-                          IdxT ldb,
-                          IdxT ldd,
-                          OutT* dOutput,
-                          FinalLambda fin_op,
-                          cudaStream_t stream)
-{
-  typedef typename raft::linalg::Policy4x4<DataT, VecLen>::Policy RowPolicy;
-  typedef typename raft::linalg::Policy4x4<DataT, VecLen>::ColPolicy ColPolicy;
-
-  typedef typename std::conditional<isRowMajor, RowPolicy, ColPolicy>::type KPolicy;
-
-  dim3 blk(KPolicy::Nthreads);
-
-  // Accumulation operation lambda
-  auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) {
-    const auto diff = raft::abs(x - y);
-    acc             = raft::max(acc, diff);
-  };
-
-  // epilogue operation lambda for final value calculation
-  auto epilog_lambda = raft::void_op();
-
-  if (isRowMajor) {
-    auto chebyshevRowMajor = pairwiseDistanceMatKernel<false,
-                                                       DataT,
-                                                       AccT,
-                                                       OutT,
-                                                       IdxT,
-                                                       KPolicy,
-                                                       decltype(core_lambda),
-                                                       decltype(epilog_lambda),
-                                                       FinalLambda,
-                                                       true>;
-    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize, chebyshevRowMajor);
-
-    chebyshevRowMajor<<<grid, blk, KPolicy::SmemSize, stream>>>(
-      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op);
-  } else {
-    auto chebyshevColMajor = pairwiseDistanceMatKernel<false,
-                                                       DataT,
-                                                       AccT,
-                                                       OutT,
-                                                       IdxT,
-                                                       KPolicy,
-                                                       decltype(core_lambda),
-                                                       decltype(epilog_lambda),
-                                                       FinalLambda,
-                                                       false>;
-    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize, chebyshevColMajor);
-    chebyshevColMajor<<<grid, blk, KPolicy::SmemSize, stream>>>(
-      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op);
-  }
-
-  RAFT_CUDA_TRY(cudaGetLastError());
-}
-
-template <typename DataT,
-          typename AccT,
-          typename OutT,
-          typename IdxT,
-          typename FinalLambda,
-          bool isRowMajor>
-void chebyshev(IdxT m,
-               IdxT n,
-               IdxT k,
-               IdxT lda,
-               IdxT ldb,
-               IdxT ldd,
-               const DataT* x,
-               const DataT* y,
-               OutT* dOutput,
-               FinalLambda fin_op,
-               cudaStream_t stream)
-{
-  size_t bytesA = sizeof(DataT) * lda;
-  size_t bytesB = sizeof(DataT) * ldb;
-  if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) {
-    chebyshevImpl<DataT, AccT, OutT, IdxT, 16 / sizeof(DataT), FinalLambda, isRowMajor>(
-      x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
-  } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) {
-    chebyshevImpl<DataT, AccT, OutT, IdxT, 8 / sizeof(DataT), FinalLambda, isRowMajor>(
-      x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
-  } else {
-    chebyshevImpl<DataT, AccT, OutT, IdxT, 1, FinalLambda, isRowMajor>(
-      x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
-  }
-}
-
-/**
- * @brief the chebyshev distance matrix calculation
- *  It computes the following equation: cij = max(cij, op(ai-bj))
- * @tparam InType input data-type (for A and B matrices)
- * @tparam AccType accumulation data-type
- * @tparam OutType output data-type (for C and D matrices)
- * @tparam FinalLambda user-defined epilogue lamba
- * @tparam Index_ Index type
- * @param[in] m number of rows of A and C/D
- * @param[in] n number of rows of B and cols of C/D
- * @param[in] k number of cols of A and B
- * @param[in] pA input matrix
- * @param[in] pB input matrix
- * @param[out] pD output matrix
- * @param[in] fin_op the final element-wise epilogue lambda
- * @param[in] stream cuda stream to launch work
- * @param[in] isRowMajor whether the input and output matrices are row major
- */
-template <typename InType,
-          typename AccType,
-          typename OutType,
-          typename FinalLambda,
-          typename Index_ = int>
-void chebyshevImpl(int m,
-                   int n,
-                   int k,
-                   const InType* pA,
-                   const InType* pB,
-                   OutType* pD,
-                   FinalLambda fin_op,
-                   cudaStream_t stream,
-                   bool isRowMajor)
-{
-  typedef std::is_same<OutType, bool> is_bool;
-  typedef typename std::conditional<is_bool::value, OutType, AccType>::type chebyshevOutType;
-  Index_ lda, ldb, ldd;
-  chebyshevOutType* pDcast = reinterpret_cast<chebyshevOutType*>(pD);
-  if (isRowMajor) {
-    lda = k, ldb = k, ldd = n;
-    chebyshev<InType, AccType, chebyshevOutType, Index_, FinalLambda, true>(
-      m, n, k, lda, ldb, ldd, pA, pB, pDcast, fin_op, stream);
-  } else {
-    lda = n, ldb = m, ldd = m;
-    chebyshev<InType, AccType, chebyshevOutType, Index_, FinalLambda, false>(
-      n, m, k, lda, ldb, ldd, pB, pA, pDcast, fin_op, stream);
-  }
-}
-}  // namespace detail
-}  // namespace distance
-}  // namespace raft
diff --git a/cpp/include/raft/distance/detail/correlation.cuh b/cpp/include/raft/distance/detail/correlation.cuh
deleted file mode 100644
index f7fe3678e6..0000000000
--- a/cpp/include/raft/distance/detail/correlation.cuh
+++ /dev/null
@@ -1,339 +0,0 @@
-/*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-#include <raft/distance/detail/pairwise_distance_base.cuh>
-#include <raft/linalg/reduce.cuh>
-#include <raft/util/cuda_utils.cuh>
-
-namespace raft {
-namespace distance {
-namespace detail {
-
-/**
- * @brief the Correlation distance matrix:
- *
- * @tparam DataT          input data-type (for A and B matrices)
- * @tparam AccT           accumulation data-type
- * @tparam OutT           output data-type (for C and D matrices)
- * @tparam IdxT           index data-type
- * @tparam Veclen         number of k-elements loaded by each thread
-                          for every LDG call. details in contractions.cuh
- * @tparam FinalLambda    final lambda called on final distance value
- * @tparam isRowMajor     true if input/output is row major,
-                          false for column major
- * @param[in]       x input matrix
- * @param[in]       y input matrix
- * @param[in]       m number of rows of A and C/D
- * @param[in]       n number of rows of B and C/D
- * @param[in]       k number of cols of A and B
- * @param[in]       lda leading dimension of A
- * @param[in]       ldb leading dimension of B
- * @param[in]       ldd leading dimension of C/D
- * @param[output]   dOutput output matrix
- * @param[in]       fin_op the final gemm epilogue lambda
- * @param[in]       stream cuda stream to launch work
- */
-template <typename DataT,
-          typename AccT,
-          typename OutT,
-          typename IdxT,
-          int VecLen,
-          typename FinalLambda,
-          bool isRowMajor>
-static void correlationImpl(const DataT* x,
-                            const DataT* y,
-                            const DataT* xn,
-                            const DataT* yn,
-                            const DataT* x2n,
-                            const DataT* y2n,
-                            IdxT m,
-                            IdxT n,
-                            IdxT k,
-                            IdxT lda,
-                            IdxT ldb,
-                            IdxT ldd,
-                            OutT* dOutput,
-                            FinalLambda fin_op,
-                            cudaStream_t stream)
-{
-  typedef typename raft::linalg::Policy4x4<DataT, VecLen>::Policy RowPolicy;
-  typedef typename raft::linalg::Policy4x4<DataT, VecLen>::ColPolicy ColPolicy;
-
-  typedef typename std::conditional<isRowMajor, RowPolicy, ColPolicy>::type KPolicy;
-
-  dim3 blk(KPolicy::Nthreads);
-
-  // Accumulation operation lambda
-  auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { acc += x * y; };
-
-  // epilogue operation lambda for final value calculation
-  auto epilog_lambda = [x2n, y2n, m, n, k] __device__(
-                         AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh],
-                         DataT * regxn,
-                         DataT * regyn,
-                         IdxT gridStrideX,
-                         IdxT gridStrideY) {
-    DataT regx2n[KPolicy::AccRowsPerTh], regy2n[KPolicy::AccColsPerTh];
-
-    extern __shared__ char smem[];
-    DataT* sx2Norm =
-      (DataT*)(&smem[KPolicy::SmemSize + (KPolicy::Mblk + KPolicy::Nblk) * sizeof(DataT)]);
-    DataT* sy2Norm = (&sx2Norm[KPolicy::Mblk]);
-
-    // Load x & y norms required by this threadblock in shmem buffer
-    if (gridStrideX == blockIdx.x * KPolicy::Nblk) {
-      for (int i = threadIdx.x; i < KPolicy::Mblk; i += KPolicy::Nthreads) {
-        auto idx   = gridStrideY + i;
-        sx2Norm[i] = idx < m ? x2n[idx] : 0;
-      }
-    }
-
-    for (int i = threadIdx.x; i < KPolicy::Nblk; i += KPolicy::Nthreads) {
-      auto idx   = gridStrideX + i;
-      sy2Norm[i] = idx < n ? y2n[idx] : 0;
-    }
-    __syncthreads();
-
-#pragma unroll
-    for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) {
-      regx2n[i] = sx2Norm[i * KPolicy::AccThRows + (threadIdx.x / KPolicy::AccThCols)];
-    }
-#pragma unroll
-    for (int i = 0; i < KPolicy::AccColsPerTh; ++i) {
-      regy2n[i] = sy2Norm[i * KPolicy::AccThCols + (threadIdx.x % KPolicy::AccThCols)];
-    }
-
-#pragma unroll
-    for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) {
-#pragma unroll
-      for (int j = 0; j < KPolicy::AccColsPerTh; ++j) {
-        auto numer   = k * acc[i][j] - (regxn[i] * regyn[j]);
-        auto Q_denom = k * regx2n[i] - (regxn[i] * regxn[i]);
-        auto R_denom = k * regy2n[j] - (regyn[j] * regyn[j]);
-
-        acc[i][j] = 1 - (numer / raft::sqrt(Q_denom * R_denom));
-      }
-    }
-  };
-
-  constexpr size_t shmemSize =
-    KPolicy::SmemSize + (2 * (KPolicy::Mblk + KPolicy::Nblk) * sizeof(DataT));
-  if (isRowMajor) {
-    constexpr auto correlationRowMajor = pairwiseDistanceMatKernel<true,
-                                                                   DataT,
-                                                                   AccT,
-                                                                   OutT,
-                                                                   IdxT,
-                                                                   KPolicy,
-                                                                   decltype(core_lambda),
-                                                                   decltype(epilog_lambda),
-                                                                   FinalLambda,
-                                                                   true>;
-    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize, correlationRowMajor);
-    correlationRowMajor<<<grid, blk, shmemSize, stream>>>(
-      x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op);
-  } else {
-    constexpr auto correlationColMajor = pairwiseDistanceMatKernel<true,
-                                                                   DataT,
-                                                                   AccT,
-                                                                   OutT,
-                                                                   IdxT,
-                                                                   KPolicy,
-                                                                   decltype(core_lambda),
-                                                                   decltype(epilog_lambda),
-                                                                   FinalLambda,
-                                                                   false>;
-    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize, correlationColMajor);
-    correlationColMajor<<<grid, blk, shmemSize, stream>>>(
-      x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op);
-  }
-
-  RAFT_CUDA_TRY(cudaGetLastError());
-}
-
-template <typename DataT,
-          typename AccT,
-          typename OutT,
-          typename IdxT,
-          typename FinalLambda,
-          bool isRowMajor>
-void correlation(IdxT m,
-                 IdxT n,
-                 IdxT k,
-                 IdxT lda,
-                 IdxT ldb,
-                 IdxT ldd,
-                 const DataT* x,
-                 const DataT* y,
-                 const DataT* xn,
-                 const DataT* yn,
-                 const DataT* x2n,
-                 const DataT* y2n,
-                 OutT* dOutput,
-                 FinalLambda fin_op,
-                 cudaStream_t stream)
-{
-  size_t bytesA = sizeof(DataT) * lda;
-  size_t bytesB = sizeof(DataT) * ldb;
-  if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) {
-    correlationImpl<DataT, AccT, OutT, IdxT, 16 / sizeof(DataT), FinalLambda, isRowMajor>(
-      x, y, xn, yn, x2n, y2n, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
-  } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) {
-    correlationImpl<DataT, AccT, OutT, IdxT, 8 / sizeof(DataT), FinalLambda, isRowMajor>(
-      x, y, xn, yn, x2n, y2n, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
-  } else {
-    correlationImpl<DataT, AccT, OutT, IdxT, 1, FinalLambda, isRowMajor>(
-      x, y, xn, yn, x2n, y2n, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
-  }
-}
-
-/**
- * @brief the Correlation distance matrix calculation
- *
- * @tparam InType input data-type (for A and B matrices)
- * @tparam AccType accumulation data-type
- * @tparam OutType output data-type (for C and D matrices)
- * @tparam FinalLambda user-defined epilogue lamba
- * @tparam Index_ Index type
- * @param m number of rows of A and C/D
- * @param n number of columns of B and C/D
- * @param k number of cols of A and rows of B
- * @param pA input matrix
- * @param pB input matrix
- * @param pD output matrix
- * @param fin_op the final element-wise epilogue lambda
- * @param stream cuda stream where to launch work
- * @param isRowMajor whether the input and output matrices are row major
- */
-template <typename InType,
-          typename AccType,
-          typename OutType,
-          typename FinalLambda,
-          typename Index_ = int>
-void correlationImpl(int m,
-                     int n,
-                     int k,
-                     const InType* pA,
-                     const InType* pB,
-                     OutType* pD,
-                     AccType* workspace,
-                     size_t& worksize,
-                     FinalLambda fin_op,
-                     cudaStream_t stream,
-                     bool isRowMajor)
-{
-  typedef std::is_same<OutType, bool> is_bool;
-  typedef typename std::conditional<is_bool::value, OutType, AccType>::type correlationOutType;
-  Index_ lda, ldb, ldd;
-  correlationOutType* pDcast = reinterpret_cast<correlationOutType*>(pD);
-
-  ASSERT(!(((pA != pB) && (worksize < 2 * (m + n) * sizeof(AccType))) ||
-           (worksize < 2 * m * sizeof(AccType))),
-         "workspace size error");
-  ASSERT(workspace != nullptr, "workspace is null");
-
-  AccType* norm_col_vec    = workspace;
-  AccType* norm_row_vec    = workspace;
-  AccType* sq_norm_col_vec = workspace;
-  AccType* sq_norm_row_vec = workspace;
-  if (pA != pB) {
-    norm_row_vec += m;
-
-    raft::linalg::reduce(norm_col_vec,
-                         pA,
-                         k,
-                         m,
-                         (AccType)0,
-                         isRowMajor,
-                         true,
-                         stream,
-                         false,
-                         raft::identity_op(),
-                         raft::add_op());
-    raft::linalg::reduce(norm_row_vec,
-                         pB,
-                         k,
-                         n,
-                         (AccType)0,
-                         isRowMajor,
-                         true,
-                         stream,
-                         false,
-                         raft::identity_op(),
-                         raft::add_op());
-
-    sq_norm_col_vec += (m + n);
-    sq_norm_row_vec = sq_norm_col_vec + m;
-    raft::linalg::rowNorm(sq_norm_col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor, stream);
-    raft::linalg::rowNorm(sq_norm_row_vec, pB, k, n, raft::linalg::L2Norm, isRowMajor, stream);
-  } else {
-    raft::linalg::reduce(norm_col_vec,
-                         pA,
-                         k,
-                         m,
-                         (AccType)0,
-                         isRowMajor,
-                         true,
-                         stream,
-                         false,
-                         raft::identity_op(),
-                         raft::add_op());
-    sq_norm_col_vec += m;
-    sq_norm_row_vec = sq_norm_col_vec;
-    raft::linalg::rowNorm(sq_norm_col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor, stream);
-  }
-
-  if (isRowMajor) {
-    lda = k, ldb = k, ldd = n;
-    correlation<InType, AccType, correlationOutType, Index_, FinalLambda, true>(m,
-                                                                                n,
-                                                                                k,
-                                                                                lda,
-                                                                                ldb,
-                                                                                ldd,
-                                                                                pA,
-                                                                                pB,
-                                                                                norm_col_vec,
-                                                                                norm_row_vec,
-                                                                                sq_norm_col_vec,
-                                                                                sq_norm_row_vec,
-                                                                                pDcast,
-                                                                                fin_op,
-                                                                                stream);
-  } else {
-    lda = n, ldb = m, ldd = m;
-    correlation<InType, AccType, correlationOutType, Index_, FinalLambda, false>(n,
-                                                                                 m,
-                                                                                 k,
-                                                                                 lda,
-                                                                                 ldb,
-                                                                                 ldd,
-                                                                                 pB,
-                                                                                 pA,
-                                                                                 norm_row_vec,
-                                                                                 norm_col_vec,
-                                                                                 sq_norm_row_vec,
-                                                                                 sq_norm_col_vec,
-                                                                                 pDcast,
-                                                                                 fin_op,
-                                                                                 stream);
-  }
-}
-
-}  // namespace detail
-}  // namespace distance
-}  // namespace raft
diff --git a/cpp/include/raft/distance/detail/cosine.cuh b/cpp/include/raft/distance/detail/cosine.cuh
deleted file mode 100644
index 46a694aa51..0000000000
--- a/cpp/include/raft/distance/detail/cosine.cuh
+++ /dev/null
@@ -1,272 +0,0 @@
-/*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <raft/distance/detail/pairwise_distance_base.cuh>
-#include <raft/distance/detail/pairwise_distance_cutlass_base.cuh>
-#include <raft/linalg/norm.cuh>
-#include <raft/util/cuda_utils.cuh>
-
-namespace raft {
-namespace distance {
-namespace detail {
-
-template <typename DataT, typename AccT>
-struct CosineOp {
-  __device__ CosineOp() noexcept {}
-  __device__ AccT operator()(DataT& aNorm, const DataT& bNorm, DataT& accVal) const noexcept
-  {
-    return static_cast<AccT>(1.0) - (AccT)(accVal / (aNorm * bNorm));
-  }
-  __device__ AccT operator()(DataT aData) const noexcept { return aData; }
-};
-
-/**
- * @brief the cosine distance matrix calculation implementer
- *  It computes the following equation:
- *    C = 1 - op(A * B / sqrt(A^2) * sqrt(B^2)))
- * @tparam DataT input data-type (for A and B matrices)
- * @tparam AccT   accumulation data-type
- * @tparam OutT   output data-type (for C and D matrices)
- * @tparam IdxT   index data-type
- * @tparam Veclen number of k-elements loaded by each thread for every LDG call
- *                it makes. check contractions.cuh for details.
- * @tparam FinalLambda the final lambda called on final distance value
- * @tparam isRowMajor  true if input/output is row major,
-                       false for column major
- * @param[in]     x input matrix
- * @param[in]     y input matrix
- * @param[in]     xn row norms of input matrix A.
- * @param[in]     yn row norms of input matrix B.
- * @param[in]     m number of rows of A and C/D
- * @param[in]     n number of columns of B and C/D
- * @param[in]     k number of cols of A and rows of B
- * @param[in]     lda leading dimension of A
- * @param[in]     ldb leading dimension of B
- * @param[in]     ldd leading dimension of C/D
- * @param[output] pD output matrix
- * @param fin_op  the final gemm epilogue lambda
-*  @param stream  cuda stream to launch cuda operations.
- */
-template <typename DataT,
-          typename AccT,
-          typename OutT,
-          typename IdxT,
-          int VecLen,
-          typename FinalLambda,
-          bool isRowMajor>
-void cosineImpl(const DataT* x,
-                const DataT* y,
-                const DataT* xn,
-                const DataT* yn,
-                IdxT m,
-                IdxT n,
-                IdxT k,
-                IdxT lda,
-                IdxT ldb,
-                IdxT ldd,
-                OutT* dOutput,
-                FinalLambda fin_op,
-                cudaStream_t stream)
-{
-#if (__CUDACC_VER_MAJOR__ < 12)
-  const auto deviceVersion = getComputeCapability();
-  if (deviceVersion.first >= 8) {
-    using CosineOp_ = CosineOp<DataT, AccT>;
-    CosineOp_ cosine_dist_op;
-
-    cutlassDistanceKernel<DataT, AccT, OutT, IdxT, VecLen, FinalLambda, CosineOp_, isRowMajor>(
-      x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, fin_op, cosine_dist_op, stream);
-
-  } else
-#endif
-  {
-    typedef typename raft::linalg::Policy4x4<DataT, VecLen>::Policy RowPolicy;
-    typedef typename raft::linalg::Policy4x4<DataT, VecLen>::ColPolicy ColPolicy;
-
-    typedef typename std::conditional<isRowMajor, RowPolicy, ColPolicy>::type KPolicy;
-
-    dim3 blk(KPolicy::Nthreads);
-
-    // Accumulation operation lambda
-    auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { acc += x * y; };
-
-    // epilogue operation lambda for final value calculation
-    auto epilog_lambda = [] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh],
-                                       DataT * regxn,
-                                       DataT * regyn,
-                                       IdxT gridStrideX,
-                                       IdxT gridStrideY) {
-#pragma unroll
-      for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) {
-#pragma unroll
-        for (int j = 0; j < KPolicy::AccColsPerTh; ++j) {
-          acc[i][j] = 1.0 - (acc[i][j] / (regxn[i] * regyn[j]));
-        }
-      }
-    };
-
-    constexpr size_t shmemSize =
-      KPolicy::SmemSize + ((KPolicy::Mblk + KPolicy::Nblk) * sizeof(DataT));
-    if (isRowMajor) {
-      auto cosineRowMajor = pairwiseDistanceMatKernelPriorToAmpere<true,
-                                                                   DataT,
-                                                                   AccT,
-                                                                   OutT,
-                                                                   IdxT,
-                                                                   KPolicy,
-                                                                   decltype(core_lambda),
-                                                                   decltype(epilog_lambda),
-                                                                   FinalLambda,
-                                                                   true>;
-      dim3 grid           = launchConfigGenerator<KPolicy>(m, n, shmemSize, cosineRowMajor);
-      cosineRowMajor<<<grid, blk, shmemSize, stream>>>(
-        x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op);
-    } else {
-      auto cosineColMajor = pairwiseDistanceMatKernelPriorToAmpere<true,
-                                                                   DataT,
-                                                                   AccT,
-                                                                   OutT,
-                                                                   IdxT,
-                                                                   KPolicy,
-                                                                   decltype(core_lambda),
-                                                                   decltype(epilog_lambda),
-                                                                   FinalLambda,
-                                                                   false>;
-      dim3 grid           = launchConfigGenerator<KPolicy>(m, n, shmemSize, cosineColMajor);
-      cosineColMajor<<<grid, blk, shmemSize, stream>>>(
-        x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op);
-    }
-  }
-
-  RAFT_CUDA_TRY(cudaGetLastError());
-}
-
-template <typename DataT,
-          typename AccT,
-          typename OutT,
-          typename IdxT,
-          typename FinalLambda,
-          bool isRowMajor>
-void cosine(IdxT m,
-            IdxT n,
-            IdxT k,
-            IdxT lda,
-            IdxT ldb,
-            IdxT ldd,
-            const DataT* x,
-            const DataT* y,
-            const DataT* xn,
-            const DataT* yn,
-            OutT* dOutput,
-            FinalLambda fin_op,
-            cudaStream_t stream)
-{
-  size_t bytesA = sizeof(DataT) * lda;
-  size_t bytesB = sizeof(DataT) * ldb;
-  if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) {
-    cosineImpl<DataT, AccT, OutT, IdxT, 16 / sizeof(DataT), FinalLambda, isRowMajor>(
-      x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
-  } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) {
-    cosineImpl<DataT, AccT, OutT, IdxT, 8 / sizeof(DataT), FinalLambda, isRowMajor>(
-      x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
-  } else {
-    cosineImpl<DataT, AccT, OutT, IdxT, 1, FinalLambda, isRowMajor>(
-      x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
-  }
-}
-
-/**
- * @brief the expanded cosine distance matrix calculation
- *  It computes the following equation:
- *              C = 1 - op(A * B / sqrt(A^2) * sqrt(B^2)))
- * @tparam IType input data-type (for A and B matrices)
- * @tparam AccType accumulation data-type
- * @tparam OType output data-type (for C and D matrices)
- * @tparam OutputTile_ output tile size for the thread block
- * @tparam FinalLambda user-defined epilogue lamba
- * @tparam Index_ Index type
- * @param m number of rows of A and C/D
- * @param n number of columns of B and C/D
- * @param k number of cols of A and rows of B
- * @param pA input matrix
- * @param pB input matrix
- * @param pD output matrix
- * @tparam in_params user-defined input parameter
- * @param workspace temporary workspace needed for computations
- * @param worksize number of bytes of the workspace
- * @param fin_op the final gemm epilogue lambda
- * @param stream cuda stream where to launch work
- * @param isRowMajor whether the input and output matrices are row major
- */
-template <typename InType,
-          typename AccType,
-          typename OutType,
-          typename FinalLambda,
-          typename Index_ = int>
-void cosineAlgo1(Index_ m,
-                 Index_ n,
-                 Index_ k,
-                 const InType* pA,
-                 const InType* pB,
-                 OutType* pD,
-                 AccType* workspace,
-                 size_t worksize,
-                 FinalLambda fin_op,
-                 cudaStream_t stream,
-                 bool isRowMajor)
-{
-  // raft distance support inputs as float/double and output as uint8_t/float/double.
-  static_assert(!((sizeof(OutType) > 1) && (sizeof(AccType) != sizeof(OutType))),
-                "OutType can be uint8_t, float, double,"
-                "if sizeof(OutType) > 1 then sizeof(AccType) == sizeof(OutType).");
-  typedef typename std::conditional<sizeof(OutType) == 1, OutType, AccType>::type CosOutType;
-  CosOutType* pDcast = reinterpret_cast<CosOutType*>(pD);
-
-  ASSERT(
-    !(((pA != pB) && (worksize < (m + n) * sizeof(AccType))) || (worksize < m * sizeof(AccType))),
-    "workspace size error");
-  ASSERT(workspace != nullptr, "workspace is null");
-
-  Index_ lda, ldb, ldd;
-  InType* col_vec = workspace;
-  InType* row_vec = workspace;
-  if (pA != pB) {
-    row_vec += m;
-    raft::linalg::rowNorm(
-      col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor, stream, raft::sqrt_op{});
-    raft::linalg::rowNorm(
-      row_vec, pB, k, n, raft::linalg::L2Norm, isRowMajor, stream, raft::sqrt_op{});
-  } else {
-    raft::linalg::rowNorm(
-      col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor, stream, raft::sqrt_op{});
-  }
-
-  if (isRowMajor) {
-    lda = k, ldb = k, ldd = n;
-    cosine<InType, AccType, CosOutType, Index_, FinalLambda, true>(
-      m, n, k, lda, ldb, ldd, pA, pB, col_vec, row_vec, pDcast, fin_op, stream);
-  } else {
-    lda = n, ldb = m, ldd = m;
-    cosine<InType, AccType, CosOutType, Index_, FinalLambda, false>(
-      n, m, k, lda, ldb, ldd, pB, pA, row_vec, col_vec, pDcast, fin_op, stream);
-  }
-}
-
-};  // end namespace detail
-};  // end namespace distance
-};  // end namespace raft
diff --git a/cpp/include/raft/distance/detail/distance.cuh b/cpp/include/raft/distance/detail/distance.cuh
index d2d2b40283..7887eb96be 100644
--- a/cpp/include/raft/distance/detail/distance.cuh
+++ b/cpp/include/raft/distance/detail/distance.cuh
@@ -17,19 +17,32 @@
 #pragma once
 
 #include <cuda_runtime_api.h>
+#include <type_traits>
+
 #include <raft/core/resource/cuda_stream.hpp>
-#include <raft/distance/detail/canberra.cuh>
-#include <raft/distance/detail/chebyshev.cuh>
-#include <raft/distance/detail/correlation.cuh>
-#include <raft/distance/detail/cosine.cuh>
-#include <raft/distance/detail/euclidean.cuh>
-#include <raft/distance/detail/hamming.cuh>
-#include <raft/distance/detail/hellinger.cuh>
-#include <raft/distance/detail/jensen_shannon.cuh>
-#include <raft/distance/detail/kl_divergence.cuh>
-#include <raft/distance/detail/l1.cuh>
-#include <raft/distance/detail/minkowski.cuh>
-#include <raft/distance/detail/russell_rao.cuh>
+#include <raft/linalg/gemm.cuh>
+#include <raft/linalg/norm.cuh>
+#include <raft/linalg/reduce.cuh>
+#include <raft/linalg/unary_op.cuh>
+
+#include <raft/core/operators.hpp>
+
+#include <raft/distance/detail/distance_ops/canberra.cuh>
+#include <raft/distance/detail/distance_ops/correlation.cuh>
+#include <raft/distance/detail/distance_ops/cosine.cuh>
+#include <raft/distance/detail/distance_ops/hamming.cuh>
+#include <raft/distance/detail/distance_ops/hellinger.cuh>
+#include <raft/distance/detail/distance_ops/jensen_shannon.cuh>
+#include <raft/distance/detail/distance_ops/kl_divergence.cuh>
+#include <raft/distance/detail/distance_ops/l1.cuh>
+#include <raft/distance/detail/distance_ops/l2_exp.cuh>
+#include <raft/distance/detail/distance_ops/l2_unexp.cuh>
+#include <raft/distance/detail/distance_ops/l_inf.cuh>
+#include <raft/distance/detail/distance_ops/lp_unexp.cuh>
+#include <raft/distance/detail/distance_ops/russel_rao.cuh>
+
+#include <raft/distance/detail/pairwise_matrix/dispatch.cuh>
+
 #include <raft/distance/distance_types.hpp>
 #include <raft/linalg/gemm.cuh>
 #include <raft/util/cuda_utils.cuh>
@@ -39,594 +52,673 @@ namespace raft {
 namespace distance {
 namespace detail {
 
-/** enum to tell how to compute distance */
-enum DistanceType : unsigned short {
-
-  /** evaluate as dist_ij = sum(x_ik^2) + sum(y_ij)^2 - 2*sum(x_ik * y_jk) */
-  L2Expanded = 0,
-  /** same as above, but inside the epilogue, perform square root operation */
-  L2SqrtExpanded = 1,
-  /** cosine distance */
-  CosineExpanded = 2,
-  /** L1 distance */
-  L1 = 3,
-  /** evaluate as dist_ij += (x_ik - y-jk)^2 */
-  L2Unexpanded = 4,
-  /** same as above, but inside the epilogue, perform square root operation */
-  L2SqrtUnexpanded = 5,
-  /** basic inner product **/
-  InnerProduct = 6,
-  /** Chebyshev (Linf) distance **/
-  Linf = 7,
-  /** Canberra distance **/
-  Canberra = 8,
-  /** Generalized Minkowski distance **/
-  LpUnexpanded = 9,
-  /** Correlation distance **/
-  CorrelationExpanded = 10,
-  /** Jaccard distance **/
-  JaccardExpanded = 11,
-  /** Hellinger distance **/
-  HellingerExpanded = 12,
-  /** Haversine distance **/
-  Haversine = 13,
-  /** Bray-Curtis distance **/
-  BrayCurtis = 14,
-  /** Jensen-Shannon distance**/
-  JensenShannon = 15,
-  /** Hamming distance **/
-  HammingUnexpanded = 16,
-  /** KLDivergence **/
-  KLDivergence = 17,
-  /** RusselRao **/
-  RusselRaoExpanded = 18,
-  /** Dice-Sorensen distance **/
-  DiceExpanded = 19,
-  /** Precomputed (special value) **/
-  Precomputed = 100
-};
-
-namespace {
-template <raft::distance::DistanceType distanceType,
-          typename InType,
-          typename AccType,
-          typename OutType,
-          typename FinalLambda,
-          typename Index_>
-struct DistanceImpl {
-  void run(raft::resources const& handle,
-           const InType* x,
-           const InType* y,
-           OutType* dist,
-           Index_ m,
-           Index_ n,
-           Index_ k,
-           void* workspace,
-           size_t worksize,
-           FinalLambda fin_op,
-           bool isRowMajor,
-           InType metric_arg = 2.0f)
-  {
-  }
-};
+/**
+ * @brief: A tag type for overload resolution based on DistanceType
+ *
+ * It is not possible to partially specialize function templates on a single
+ * parameter. Instead, it is often easier to use a combination of conventional
+ * method overloading and a parameter with a specific tag type. The following
+ * type is used to help method overloading based on the DistanceType enum.
+ */
+template <DistanceType d>
+using distance_tag = std::integral_constant<DistanceType, d>;
 
-template <typename InType,
-          typename AccType,
-          typename OutType,
-          typename FinalLambda,
-          typename Index_>
-struct DistanceImpl<raft::distance::DistanceType::L2Expanded,
-                    InType,
-                    AccType,
-                    OutType,
-                    FinalLambda,
-                    Index_> {
-  void run(raft::resources const& handle,
-           const InType* x,
-           const InType* y,
-           OutType* dist,
-           Index_ m,
-           Index_ n,
-           Index_ k,
-           void* workspace,
-           size_t worksize,
-           FinalLambda fin_op,
-           bool isRowMajor,
-           InType)
-  {
-    raft::distance::detail::euclideanAlgo1<InType, AccType, OutType, FinalLambda, Index_>(
-      m,
-      n,
-      k,
-      x,
-      y,
-      dist,
-      false,
-      (AccType*)workspace,
-      worksize,
-      fin_op,
-      raft::resource::get_cuda_stream(handle),
-      isRowMajor);
-  }
-};
+/**
+ * @brief Implement pairwise_matrix for specific distance
+ *
+ * There are multiple overloads for this function, one for each distance type.
+ * They are implemented below. The documentation of this function serves as
+ * documentation for all functions. The following overloads are defined:
+ *
+ * - DistanceType::Canberra:
+ * - DistanceType::CorrelationExpanded:
+ * - DistanceType::CosineExpanded:
+ * - DistanceType::HammingUnexpanded:
+ * - DistanceType::HellingerExpanded:
+ * - DistanceType::JensenShannon:
+ * - DistanceType::KLDivergence:
+ * - DistanceType::L1:
+ * - DistanceType::L2Expanded:
+ * - DistanceType::L2SqrtExpanded:
+ * - DistanceType::L2Unexpanded:
+ * - DistanceType::L2SqrtUnexpanded:
+ * - DistanceType::Linf:
+ * - DistanceType::LpUnexpanded:
+ * - DistanceType::RusselRaoExpanded:
+ *
+ * @tparam DataT   Input data type
+ * @tparam AccT    Accumulation data type
+ * @tparam OutT    Output data type
+ * @tparam FinOpT  Type of final operation
+ * @tparam IdxT    Index type
+ *
+ * @param handle        RAFT resources handle
+ * @param distance_type A tag type to indicate which distance is calculated.
+ * @param x             First set of points
+ * @param y             Second set of points
+ * @param out           Output distance matrix
+ * @param m             Number of points in x
+ * @param n             Number of points in y
+ * @param k             Dimensionality of points in x, y
+ * @param workspace     Temporary workspace needed for computations
+ * @param worksize      Number of bytes of the workspace
+ * @param is_row_major  Whether the matrices are row-major or col-major
+ * @param metric_arg    The `p` argument for Lp.
+ */
+template <typename DataT, typename AccT, typename OutT, typename FinOpT, typename IdxT = int>
+void distance_impl(raft::resources const& handle,
+                   distance_tag<DistanceType::Canberra> distance_type,
+                   const DataT* x,
+                   const DataT* y,
+                   OutT* out,
+                   IdxT m,
+                   IdxT n,
+                   IdxT k,
+                   AccT* workspace,  // unused
+                   size_t worksize,  // unused
+                   FinOpT fin_op,
+                   bool is_row_major,
+                   DataT metric_arg)  // unused
+{
+  ops::canberra_distance_op<DataT, AccT, IdxT> distance_op{};
 
-template <typename InType,
-          typename AccType,
-          typename OutType,
-          typename FinalLambda,
-          typename Index_>
-struct DistanceImpl<raft::distance::DistanceType::L2SqrtExpanded,
-                    InType,
-                    AccType,
-                    OutType,
-                    FinalLambda,
-                    Index_> {
-  void run(raft::resources const& handle,
-           const InType* x,
-           const InType* y,
-           OutType* dist,
-           Index_ m,
-           Index_ n,
-           Index_ k,
-           void* workspace,
-           size_t worksize,
-           FinalLambda fin_op,
-           bool isRowMajor,
-           InType)
-  {
-    raft::distance::detail::euclideanAlgo1<InType, AccType, OutType, FinalLambda, Index_>(
-      m,
-      n,
-      k,
-      x,
-      y,
-      dist,
-      true,
-      (AccType*)workspace,
-      worksize,
-      fin_op,
-      raft::resource::get_cuda_stream(handle),
-      isRowMajor);
-  }
-};
+  const DataT* x_norm = nullptr;
+  const DataT* y_norm = nullptr;
 
-template <typename InType,
-          typename AccType,
-          typename OutType,
-          typename FinalLambda,
-          typename Index_>
-struct DistanceImpl<raft::distance::DistanceType::CosineExpanded,
-                    InType,
-                    AccType,
-                    OutType,
-                    FinalLambda,
-                    Index_> {
-  void run(raft::resources const& handle,
-           const InType* x,
-           const InType* y,
-           OutType* dist,
-           Index_ m,
-           Index_ n,
-           Index_ k,
-           void* workspace,
-           size_t worksize,
-           FinalLambda fin_op,
-           bool isRowMajor,
-           InType)
-  {
-    raft::distance::detail::cosineAlgo1<InType, AccType, OutType, FinalLambda, Index_>(
-      m,
-      n,
-      k,
-      x,
-      y,
-      dist,
-      (AccType*)workspace,
-      worksize,
-      fin_op,
-      raft::resource::get_cuda_stream(handle),
-      isRowMajor);
-  }
-};
+  cudaStream_t stream = raft::resource::get_cuda_stream(handle);
+  distance_matrix_dispatch<decltype(distance_op), DataT, AccT, OutT, FinOpT, IdxT>(
+    distance_op, m, n, k, x, y, x_norm, y_norm, out, fin_op, stream, is_row_major);
+}
 
-template <typename InType,
-          typename AccType,
-          typename OutType,
-          typename FinalLambda,
-          typename Index_>
-struct DistanceImpl<raft::distance::DistanceType::InnerProduct,
-                    InType,
-                    AccType,
-                    OutType,
-                    FinalLambda,
-                    Index_> {
-  void run(raft::resources const& handle,
-           const InType* x,
-           const InType* y,
-           OutType* dist,
-           Index_ m,
-           Index_ n,
-           Index_ k,
-           void*,
-           size_t,
-           FinalLambda,
-           bool isRowMajor,
-           InType)
-  {
-    cudaStream_t stream = raft::resource::get_cuda_stream(handle);
-    raft::linalg::gemm(handle,
-                       dist,
-                       const_cast<InType*>(x),
-                       const_cast<InType*>(y),
-                       m,
-                       n,
-                       k,
-                       !isRowMajor,
-                       !isRowMajor,
-                       isRowMajor,
-                       stream);
+template <typename DataT, typename AccT, typename OutT, typename FinOpT, typename IdxT = int>
+void distance_impl(raft::resources const& handle,
+                   distance_tag<DistanceType::CorrelationExpanded> distance_type,
+                   const DataT* x,
+                   const DataT* y,
+                   OutT* out,
+                   IdxT m,
+                   IdxT n,
+                   IdxT k,
+                   AccT* workspace,
+                   size_t worksize,
+                   FinOpT fin_op,
+                   bool is_row_major,
+                   DataT)  // unused
+{
+  ASSERT(
+    !(((x != y) && (worksize < 2 * (m + n) * sizeof(AccT))) || (worksize < 2 * m * sizeof(AccT))),
+    "workspace size error");
+  ASSERT(workspace != nullptr, "workspace is null");
+
+  cudaStream_t stream = raft::resource::get_cuda_stream(handle);
+
+  AccT* norm_col_vec    = workspace;
+  AccT* norm_row_vec    = workspace;
+  AccT* sq_norm_col_vec = workspace;
+  AccT* sq_norm_row_vec = workspace;
+  if (x != y) {
+    norm_row_vec += m;
+
+    raft::linalg::reduce(norm_col_vec,
+                         x,
+                         k,
+                         m,
+                         (AccT)0,
+                         is_row_major,
+                         true,
+                         stream,
+                         false,
+                         raft::identity_op(),
+                         raft::add_op());
+    raft::linalg::reduce(norm_row_vec,
+                         y,
+                         k,
+                         n,
+                         (AccT)0,
+                         is_row_major,
+                         true,
+                         stream,
+                         false,
+                         raft::identity_op(),
+                         raft::add_op());
+
+    sq_norm_col_vec += (m + n);
+    sq_norm_row_vec = sq_norm_col_vec + m;
+    raft::linalg::rowNorm(sq_norm_col_vec, x, k, m, raft::linalg::L2Norm, is_row_major, stream);
+    raft::linalg::rowNorm(sq_norm_row_vec, y, k, n, raft::linalg::L2Norm, is_row_major, stream);
+  } else {
+    raft::linalg::reduce(norm_col_vec,
+                         x,
+                         k,
+                         m,
+                         (AccT)0,
+                         is_row_major,
+                         true,
+                         stream,
+                         false,
+                         raft::identity_op(),
+                         raft::add_op());
+    sq_norm_col_vec += m;
+    sq_norm_row_vec = sq_norm_col_vec;
+    raft::linalg::rowNorm(sq_norm_col_vec, x, k, m, raft::linalg::L2Norm, is_row_major, stream);
   }
-};
 
-template <typename InType,
-          typename AccType,
-          typename OutType,
-          typename FinalLambda,
-          typename Index_>
-struct DistanceImpl<raft::distance::DistanceType::L2Unexpanded,
-                    InType,
-                    AccType,
-                    OutType,
-                    FinalLambda,
-                    Index_> {
-  void run(raft::resources const& handle,
-           const InType* x,
-           const InType* y,
-           OutType* dist,
-           Index_ m,
-           Index_ n,
-           Index_ k,
-           void*,
-           size_t,
-           FinalLambda fin_op,
-           bool isRowMajor,
-           InType)
-  {
-    raft::distance::detail::euclideanAlgo2<InType, AccType, OutType, FinalLambda, Index_>(
-      m, n, k, x, y, dist, false, fin_op, raft::resource::get_cuda_stream(handle), isRowMajor);
-  }
-};
+  using OpT = ops::correlation_distance_op<DataT, AccT, IdxT>;
+  OpT corr_op(is_row_major, sq_norm_col_vec, sq_norm_row_vec, m, n, k);
+  distance_matrix_dispatch<decltype(corr_op), DataT, AccT, OutT, FinOpT, IdxT>(
+    corr_op, m, n, k, x, y, norm_col_vec, norm_row_vec, out, fin_op, stream, is_row_major);
+}
 
-template <typename InType,
-          typename AccType,
-          typename OutType,
-          typename FinalLambda,
-          typename Index_>
-struct DistanceImpl<raft::distance::DistanceType::L2SqrtUnexpanded,
-                    InType,
-                    AccType,
-                    OutType,
-                    FinalLambda,
-                    Index_> {
-  void run(raft::resources const& handle,
-           const InType* x,
-           const InType* y,
-           OutType* dist,
-           Index_ m,
-           Index_ n,
-           Index_ k,
-           void*,
-           size_t,
-           FinalLambda fin_op,
-           bool isRowMajor,
-           InType)
-  {
-    raft::distance::detail::euclideanAlgo2<InType, AccType, OutType, FinalLambda, Index_>(
-      m, n, k, x, y, dist, true, fin_op, raft::resource::get_cuda_stream(handle), isRowMajor);
+template <typename DataT, typename AccT, typename OutT, typename FinOpT, typename IdxT = int>
+void distance_impl(raft::resources const& handle,
+                   distance_tag<DistanceType::CosineExpanded> distance_type,
+                   const DataT* x,
+                   const DataT* y,
+                   OutT* out,
+                   IdxT m,
+                   IdxT n,
+                   IdxT k,
+                   AccT* workspace,
+                   size_t worksize,
+                   FinOpT fin_op,
+                   bool is_row_major,
+                   DataT)  // unused
+{
+  // raft distance support inputs as float/double and output as uint8_t/float/double.
+  static_assert(!((sizeof(OutT) > 1) && (sizeof(AccT) != sizeof(OutT))),
+                "OutT can be uint8_t, float, double,"
+                "if sizeof(OutT) > 1 then sizeof(AccT) == sizeof(OutT).");
+
+  ASSERT(!(((x != y) && (worksize < (m + n) * sizeof(AccT))) || (worksize < m * sizeof(AccT))),
+         "workspace size error");
+  ASSERT(workspace != nullptr, "workspace is null");
+
+  cudaStream_t stream = raft::resource::get_cuda_stream(handle);
+
+  DataT* norm_A = workspace;
+  DataT* norm_B = workspace;
+  if (x != y) {
+    norm_B += m;
+    raft::linalg::rowNorm(
+      norm_A, x, k, m, raft::linalg::L2Norm, is_row_major, stream, raft::sqrt_op{});
+    raft::linalg::rowNorm(
+      norm_B, y, k, n, raft::linalg::L2Norm, is_row_major, stream, raft::sqrt_op{});
+  } else {
+    raft::linalg::rowNorm(
+      norm_A, x, k, m, raft::linalg::L2Norm, is_row_major, stream, raft::sqrt_op{});
   }
-};
 
-template <typename InType,
-          typename AccType,
-          typename OutType,
-          typename FinalLambda,
-          typename Index_>
-struct DistanceImpl<raft::distance::DistanceType::L1,
-                    InType,
-                    AccType,
-                    OutType,
-                    FinalLambda,
-                    Index_> {
-  void run(raft::resources const& handle,
-           const InType* x,
-           const InType* y,
-           OutType* dist,
-           Index_ m,
-           Index_ n,
-           Index_ k,
-           void*,
-           size_t,
-           FinalLambda fin_op,
-           bool isRowMajor,
-           InType)
-  {
-    raft::distance::detail::l1Impl<InType, AccType, OutType, FinalLambda, Index_>(
-      m, n, k, x, y, dist, fin_op, raft::resource::get_cuda_stream(handle), isRowMajor);
+  // On CUDA 12:
+  // - always execute normal kernel
+  //
+  // On CUDA 11 and below:
+  // - execute CUTLASS-based kernel on SM_80 and above
+  // - execute normal kernel otherwise.
+
+  if constexpr (__CUDACC_VER_MAJOR__ == 12) {
+    // Always execute legacy kernels on CUDA 12
+    ops::cosine_distance_op<DataT, AccT, IdxT> distance_op{};
+    distance_matrix_dispatch<decltype(distance_op), DataT, AccT, OutT, FinOpT, IdxT>(
+      distance_op, m, n, k, x, y, norm_A, norm_B, out, fin_op, stream, is_row_major);
+  } else {
+    const auto deviceVersion = getComputeCapability();
+    if (deviceVersion.first >= 8) {
+      // If device is SM_80 or later, use CUTLASS-based kernel.
+      using Op = ops::cosine_cutlass_op<DataT, AccT>;
+      Op distance_op{};
+
+      distance_matrix_cutlass_dispatch<decltype(distance_op), DataT, AccT, OutT, FinOpT, IdxT>(
+        distance_op, m, n, k, x, y, norm_A, norm_B, out, fin_op, stream, is_row_major);
+    } else {
+      // Else use "legacy" cosine kernel
+      ops::cosine_distance_op<DataT, AccT, IdxT> distance_op{};
+      distance_matrix_dispatch<decltype(distance_op), DataT, AccT, OutT, FinOpT, IdxT>(
+        distance_op, m, n, k, x, y, norm_A, norm_B, out, fin_op, stream, is_row_major);
+    }
   }
-};
+}
 
-template <typename InType,
-          typename AccType,
-          typename OutType,
-          typename FinalLambda,
-          typename Index_>
-struct DistanceImpl<raft::distance::DistanceType::Linf,
-                    InType,
-                    AccType,
-                    OutType,
-                    FinalLambda,
-                    Index_> {
-  void run(raft::resources const& handle,
-           const InType* x,
-           const InType* y,
-           OutType* dist,
-           Index_ m,
-           Index_ n,
-           Index_ k,
-           void*,
-           size_t,
-           FinalLambda fin_op,
-           bool isRowMajor,
-           InType)
-  {
-    raft::distance::detail::chebyshevImpl<InType, AccType, OutType, FinalLambda, Index_>(
-      m, n, k, x, y, dist, fin_op, raft::resource::get_cuda_stream(handle), isRowMajor);
-  }
-};
+template <typename DataT, typename AccT, typename OutT, typename FinOpT, typename IdxT = int>
+void distance_impl(raft::resources const& handle,
+                   distance_tag<DistanceType::HammingUnexpanded> distance_type,
+                   const DataT* x,
+                   const DataT* y,
+                   OutT* out,
+                   IdxT m,
+                   IdxT n,
+                   IdxT k,
+                   AccT*,   // workspace unused
+                   size_t,  // worksize unused
+                   FinOpT fin_op,
+                   bool is_row_major,
+                   DataT)  // metric_arg unused
+{
+  ops::hamming_distance_op<DataT, AccT, IdxT> distance_op{k};
 
-template <typename InType,
-          typename AccType,
-          typename OutType,
-          typename FinalLambda,
-          typename Index_>
-struct DistanceImpl<raft::distance::DistanceType::HellingerExpanded,
-                    InType,
-                    AccType,
-                    OutType,
-                    FinalLambda,
-                    Index_> {
-  void run(raft::resources const& handle,
-           const InType* x,
-           const InType* y,
-           OutType* dist,
-           Index_ m,
-           Index_ n,
-           Index_ k,
-           void*,
-           size_t,
-           FinalLambda fin_op,
-           bool isRowMajor,
-           InType)
-  {
-    raft::distance::detail::hellingerImpl<InType, AccType, OutType, FinalLambda, Index_>(
-      m, n, k, x, y, dist, fin_op, raft::resource::get_cuda_stream(handle), isRowMajor);
-  }
-};
+  const DataT* x_norm = nullptr;
+  const DataT* y_norm = nullptr;
 
-template <typename InType,
-          typename AccType,
-          typename OutType,
-          typename FinalLambda,
-          typename Index_>
-struct DistanceImpl<raft::distance::DistanceType::LpUnexpanded,
-                    InType,
-                    AccType,
-                    OutType,
-                    FinalLambda,
-                    Index_> {
-  void run(raft::resources const& handle,
-           const InType* x,
-           const InType* y,
-           OutType* dist,
-           Index_ m,
-           Index_ n,
-           Index_ k,
-           void*,
-           size_t,
-           FinalLambda fin_op,
-           bool isRowMajor,
-           InType metric_arg)
-  {
-    raft::distance::detail::minkowskiImpl<InType, AccType, OutType, FinalLambda, Index_>(
-      m, n, k, x, y, dist, fin_op, raft::resource::get_cuda_stream(handle), isRowMajor, metric_arg);
-  }
-};
+  cudaStream_t stream = raft::resource::get_cuda_stream(handle);
 
-template <typename InType,
-          typename AccType,
-          typename OutType,
-          typename FinalLambda,
-          typename Index_>
-struct DistanceImpl<raft::distance::DistanceType::Canberra,
-                    InType,
-                    AccType,
-                    OutType,
-                    FinalLambda,
-                    Index_> {
-  void run(raft::resources const& handle,
-           const InType* x,
-           const InType* y,
-           OutType* dist,
-           Index_ m,
-           Index_ n,
-           Index_ k,
-           void*,
-           size_t,
-           FinalLambda fin_op,
-           bool isRowMajor,
-           InType)
-  {
-    raft::distance::detail::canberraImpl<InType, AccType, OutType, FinalLambda, Index_>(
-      m, n, k, x, y, dist, fin_op, raft::resource::get_cuda_stream(handle), isRowMajor);
-  }
-};
+  distance_matrix_dispatch<decltype(distance_op), DataT, AccT, OutT, FinOpT, IdxT>(
+    distance_op, m, n, k, x, y, x_norm, y_norm, out, fin_op, stream, is_row_major);
+}
 
-template <typename InType,
-          typename AccType,
-          typename OutType,
-          typename FinalLambda,
-          typename Index_>
-struct DistanceImpl<raft::distance::DistanceType::HammingUnexpanded,
-                    InType,
-                    AccType,
-                    OutType,
-                    FinalLambda,
-                    Index_> {
-  void run(raft::resources const& handle,
-           const InType* x,
-           const InType* y,
-           OutType* dist,
-           Index_ m,
-           Index_ n,
-           Index_ k,
-           void*,
-           size_t,
-           FinalLambda fin_op,
-           bool isRowMajor,
-           InType)
-  {
-    raft::distance::detail::hammingUnexpandedImpl<InType, AccType, OutType, FinalLambda, Index_>(
-      m, n, k, x, y, dist, fin_op, raft::resource::get_cuda_stream(handle), isRowMajor);
-  }
-};
+template <typename DataT, typename AccT, typename OutT, typename FinOpT, typename IdxT = int>
+void distance_impl(raft::resources const& handle,
+                   distance_tag<DistanceType::InnerProduct> distance_type,
+                   const DataT* x,
+                   const DataT* y,
+                   OutT* out,
+                   IdxT m,
+                   IdxT n,
+                   IdxT k,
+                   AccT*,   // workspace unused
+                   size_t,  // worksize unused
+                   FinOpT fin_op,
+                   bool is_row_major,
+                   DataT)  // metric_arg unused
+{
+  cudaStream_t stream = raft::resource::get_cuda_stream(handle);
+  raft::linalg::gemm(handle,
+                     out,
+                     const_cast<DataT*>(x),
+                     const_cast<DataT*>(y),
+                     m,
+                     n,
+                     k,
+                     !is_row_major,
+                     !is_row_major,
+                     is_row_major,
+                     stream);
+}
 
-template <typename InType,
-          typename AccType,
-          typename OutType,
-          typename FinalLambda,
-          typename Index_>
-struct DistanceImpl<raft::distance::DistanceType::JensenShannon,
-                    InType,
-                    AccType,
-                    OutType,
-                    FinalLambda,
-                    Index_> {
-  void run(raft::resources const& handle,
-           const InType* x,
-           const InType* y,
-           OutType* dist,
-           Index_ m,
-           Index_ n,
-           Index_ k,
-           void*,
-           size_t,
-           FinalLambda fin_op,
-           bool isRowMajor,
-           InType)
-  {
-    raft::distance::detail::jensenShannonImpl<InType, AccType, OutType, FinalLambda, Index_>(
-      m, n, k, x, y, dist, fin_op, raft::resource::get_cuda_stream(handle), isRowMajor);
+template <typename DataT, typename AccT, typename OutT, typename FinOpT, typename IdxT = int>
+void distance_impl(raft::resources const& handle,
+                   distance_tag<DistanceType::HellingerExpanded> distance_type,
+                   const DataT* x,
+                   const DataT* y,
+                   OutT* out,
+                   IdxT m,
+                   IdxT n,
+                   IdxT k,
+                   AccT*,   // workspace unused
+                   size_t,  // worksize unused
+                   FinOpT fin_op,
+                   bool is_row_major,
+                   DataT)  // metric_arg unused
+{
+  cudaStream_t stream = raft::resource::get_cuda_stream(handle);
+
+  // First sqrt x and y
+  const auto raft_sqrt = raft::linalg::unaryOp<DataT, raft::sqrt_op, IdxT>;
+
+  raft_sqrt((DataT*)x, x, m * k, raft::sqrt_op{}, stream);
+  if (x != y) { raft_sqrt((DataT*)y, y, n * k, raft::sqrt_op{}, stream); }
+
+  // Then calculate Hellinger distance
+  ops::hellinger_distance_op<DataT, AccT, IdxT> distance_op{};
+
+  const DataT* x_norm = nullptr;
+  const DataT* y_norm = nullptr;
+
+  distance_matrix_dispatch<decltype(distance_op), DataT, AccT, OutT, FinOpT, IdxT>(
+    distance_op, m, n, k, x, y, x_norm, y_norm, out, fin_op, stream, is_row_major);
+
+  // Finally revert sqrt of x and y
+  raft_sqrt((DataT*)x, x, m * k, raft::sqrt_op{}, stream);
+  if (x != y) { raft_sqrt((DataT*)y, y, n * k, raft::sqrt_op{}, stream); }
+
+  RAFT_CUDA_TRY(cudaGetLastError());
+}
+
+template <typename DataT, typename AccT, typename OutT, typename FinOpT, typename IdxT = int>
+void distance_impl(raft::resources const& handle,
+                   distance_tag<DistanceType::JensenShannon> distance_type,
+                   const DataT* x,
+                   const DataT* y,
+                   OutT* out,
+                   IdxT m,
+                   IdxT n,
+                   IdxT k,
+                   AccT*,   // workspace unused
+                   size_t,  // worksize unused
+                   FinOpT fin_op,
+                   bool is_row_major,
+                   DataT)  // metric_arg unused
+{
+  ops::jensen_shannon_distance_op<DataT, AccT, IdxT> distance_op{};
+
+  const DataT* x_norm = nullptr;
+  const DataT* y_norm = nullptr;
+
+  cudaStream_t stream = raft::resource::get_cuda_stream(handle);
+
+  distance_matrix_dispatch<decltype(distance_op), DataT, AccT, OutT, FinOpT, IdxT>(
+    distance_op, m, n, k, x, y, x_norm, y_norm, out, fin_op, stream, is_row_major);
+}
+
+template <typename DataT, typename AccT, typename OutT, typename FinOpT, typename IdxT = int>
+void distance_impl(raft::resources const& handle,
+                   distance_tag<DistanceType::KLDivergence> distance_type,
+                   const DataT* x,
+                   const DataT* y,
+                   OutT* out,
+                   IdxT m,
+                   IdxT n,
+                   IdxT k,
+                   AccT*,   // workspace unused
+                   size_t,  // worksize unused
+                   FinOpT fin_op,
+                   bool is_row_major,
+                   DataT)  // metric_arg unused
+{
+  cudaStream_t stream = raft::resource::get_cuda_stream(handle);
+
+  auto unaryOp_lambda = [] __device__(DataT input) {
+    const bool x_zero = (input == 0);
+    return (!x_zero) * raft::log(input + x_zero);
+  };
+
+  auto unaryOp_lambda_reverse = [] __device__(DataT input) {
+    // reverse previous log (x) back to x using (e ^ log(x))
+    const bool x_zero = (input == 0);
+    return (!x_zero) * raft::exp(input);
+  };
+
+  // This op takes some shortcuts when x equals y. So its behavior changes based
+  // on this.
+  ops::kl_divergence_op<DataT, AccT, DataT> kl_divergence{is_row_major, x == y};
+
+  if (x != y) {
+    raft::linalg::unaryOp<DataT, decltype(unaryOp_lambda), IdxT>(
+      (DataT*)y, y, n * k, unaryOp_lambda, stream);
   }
-};
 
-template <typename InType,
-          typename AccType,
-          typename OutType,
-          typename FinalLambda,
-          typename Index_>
-struct DistanceImpl<raft::distance::DistanceType::RusselRaoExpanded,
-                    InType,
-                    AccType,
-                    OutType,
-                    FinalLambda,
-                    Index_> {
-  void run(raft::resources const& handle,
-           const InType* x,
-           const InType* y,
-           OutType* dist,
-           Index_ m,
-           Index_ n,
-           Index_ k,
-           void*,
-           size_t,
-           FinalLambda fin_op,
-           bool isRowMajor,
-           InType)
-  {
-    raft::distance::detail::russellRaoImpl<InType, AccType, OutType, FinalLambda, Index_>(
-      m, n, k, x, y, dist, fin_op, raft::resource::get_cuda_stream(handle), isRowMajor);
+  const DataT* x_norm = nullptr;
+  const DataT* y_norm = nullptr;
+
+  distance_matrix_dispatch<decltype(kl_divergence), DataT, AccT, OutT, FinOpT, IdxT>(
+    kl_divergence, m, n, k, x, y, x_norm, y_norm, out, fin_op, stream, is_row_major);
+
+  if (x != y) {
+    // Now reverse previous log (x) back to x using (e ^ log(x))
+    raft::linalg::unaryOp<DataT, decltype(unaryOp_lambda_reverse), IdxT>(
+      (DataT*)y, y, n * k, unaryOp_lambda_reverse, stream);
   }
-};
+}
 
-template <typename InType,
-          typename AccType,
-          typename OutType,
-          typename FinalLambda,
-          typename Index_>
-struct DistanceImpl<raft::distance::DistanceType::KLDivergence,
-                    InType,
-                    AccType,
-                    OutType,
-                    FinalLambda,
-                    Index_> {
-  void run(raft::resources const& handle,
-           const InType* x,
-           const InType* y,
-           OutType* dist,
-           Index_ m,
-           Index_ n,
-           Index_ k,
-           void*,
-           size_t,
-           FinalLambda fin_op,
-           bool isRowMajor,
-           InType)
-  {
-    raft::distance::detail::klDivergenceImpl<InType, AccType, OutType, FinalLambda, Index_>(
-      m, n, k, x, y, dist, fin_op, raft::resource::get_cuda_stream(handle), isRowMajor);
+template <typename DataT, typename AccT, typename OutT, typename FinOpT, typename IdxT = int>
+void distance_impl(raft::resources const& handle,
+                   distance_tag<DistanceType::L1> distance_type,
+                   const DataT* x,
+                   const DataT* y,
+                   OutT* out,
+                   IdxT m,
+                   IdxT n,
+                   IdxT k,
+                   AccT*,   // workspace unused
+                   size_t,  // worksize unused
+                   FinOpT fin_op,
+                   bool is_row_major,
+                   DataT)  // metric_arg unused
+{
+  ops::l1_distance_op<DataT, AccT, IdxT> distance_op{};
+
+  const DataT* x_norm = nullptr;
+  const DataT* y_norm = nullptr;
+
+  cudaStream_t stream = raft::resource::get_cuda_stream(handle);
+  distance_matrix_dispatch<decltype(distance_op), DataT, AccT, OutT, FinOpT, IdxT>(
+    distance_op, m, n, k, x, y, x_norm, y_norm, out, fin_op, stream, is_row_major);
+}
+
+template <typename DataT,
+          typename AccT,
+          typename OutT,
+          typename FinOpT,
+          typename IdxT = int>
+void distance_impl_l2_expanded(  // NOTE: different name
+  bool perform_sqrt,             // dispatch on sqrt
+  const DataT* x,
+  const DataT* y,
+  OutT* out,
+  IdxT m,
+  IdxT n,
+  IdxT k,
+  AccT* workspace,
+  size_t worksize,
+  FinOpT fin_op,
+  cudaStream_t stream,
+  bool is_row_major)
+{
+  // raft distance support inputs as float/double and output as uint8_t/float/double.
+  static_assert(!((sizeof(OutT) > 1) && (sizeof(AccT) != sizeof(OutT))),
+                "OutT can be uint8_t, float, double,"
+                "if sizeof(OutT) > 1 then sizeof(AccT) == sizeof(OutT).");
+
+  ASSERT(!(((x != y) && (worksize < (m + n) * sizeof(AccT))) || (worksize < m * sizeof(AccT))),
+         "workspace size error");
+  ASSERT(workspace != nullptr, "workspace is null");
+
+  DataT* norm_A = workspace;
+  DataT* norm_B = workspace;
+  if (x != y) {
+    norm_B += m;
+    raft::linalg::rowNorm(
+      norm_A, x, k, m, raft::linalg::L2Norm, is_row_major, stream, raft::identity_op{});
+    raft::linalg::rowNorm(
+      norm_B, y, k, n, raft::linalg::L2Norm, is_row_major, stream, raft::identity_op{});
+  } else {
+    raft::linalg::rowNorm(
+      norm_A, x, k, m, raft::linalg::L2Norm, is_row_major, stream, raft::identity_op{});
   }
-};
 
-template <typename InType,
-          typename AccType,
-          typename OutType,
-          typename FinalLambda,
-          typename Index_>
-struct DistanceImpl<raft::distance::DistanceType::CorrelationExpanded,
-                    InType,
-                    AccType,
-                    OutType,
-                    FinalLambda,
-                    Index_> {
-  void run(raft::resources const& handle,
-           const InType* x,
-           const InType* y,
-           OutType* dist,
-           Index_ m,
-           Index_ n,
-           Index_ k,
-           void* workspace,
-           size_t worksize,
-           FinalLambda fin_op,
-           bool isRowMajor,
-           InType)
-  {
-    raft::distance::detail::correlationImpl<InType, AccType, OutType, FinalLambda, Index_>(
-      m,
-      n,
-      k,
-      x,
-      y,
-      dist,
-      (AccType*)workspace,
-      worksize,
-      fin_op,
-      raft::resource::get_cuda_stream(handle),
-      isRowMajor);
+  // On CUDA 12:
+  // - always execute normal kernel
+  //
+  // On CUDA 11 and below:
+  // - execute CUTLASS-based kernel on SM_80 and above
+  // - execute normal kernel otherwise.
+
+  if constexpr (__CUDACC_VER_MAJOR__ == 12) {
+    // Always execute legacy kernels on CUDA 12
+    ops::l2_exp_distance_op<DataT, AccT, IdxT> l2_op(perform_sqrt);
+    distance_matrix_dispatch<decltype(l2_op), DataT, AccT, OutT, FinOpT, IdxT>(
+      l2_op, m, n, k, x, y, norm_A, norm_B, out, fin_op, stream, is_row_major);
+  } else {
+    const auto deviceVersion = getComputeCapability();
+    if (deviceVersion.first >= 8) {
+      // If device is SM_80 or later, use CUTLASS-based kernel.
+      using L2Op = ops::l2_exp_cutlass_op<DataT, AccT>;
+      L2Op l2_op(perform_sqrt);
+
+      distance_matrix_cutlass_dispatch<decltype(l2_op), DataT, AccT, OutT, FinOpT, IdxT>(
+        l2_op, m, n, k, x, y, norm_A, norm_B, out, fin_op, stream, is_row_major);
+    } else {
+      // Else use "legacy" L2
+      ops::l2_exp_distance_op<DataT, AccT, IdxT> l2_op(perform_sqrt);
+      distance_matrix_dispatch<decltype(l2_op), DataT, AccT, OutT, FinOpT, IdxT>(
+        l2_op, m, n, k, x, y, norm_A, norm_B, out, fin_op, stream, is_row_major);
+    }
   }
-};
+}
+
+template <typename DataT, typename AccT, typename OutT, typename FinOpT, typename IdxT = int>
+void distance_impl(raft::resources const& handle,
+                   distance_tag<DistanceType::L2Expanded> distance_type,
+                   const DataT* x,
+                   const DataT* y,
+                   OutT* out,
+                   IdxT m,
+                   IdxT n,
+                   IdxT k,
+                   AccT* workspace,
+                   size_t worksize,
+                   FinOpT fin_op,
+                   bool is_row_major,
+                   DataT)  // metric_arg unused
+{
+  bool perform_sqrt   = false;
+  cudaStream_t stream = raft::resource::get_cuda_stream(handle);
+  distance_impl_l2_expanded(
+    perform_sqrt, x, y, out, m, n, k, workspace, worksize, fin_op, stream, is_row_major);
+}
+
+template <typename DataT, typename AccT, typename OutT, typename FinOpT, typename IdxT = int>
+void distance_impl(raft::resources const& handle,
+                   distance_tag<DistanceType::L2SqrtExpanded> distance_type,
+                   const DataT* x,
+                   const DataT* y,
+                   OutT* out,
+                   IdxT m,
+                   IdxT n,
+                   IdxT k,
+                   AccT* workspace,
+                   size_t worksize,
+                   FinOpT fin_op,
+                   bool is_row_major,
+                   DataT)  // metric_arg unused
+{
+  bool perform_sqrt   = true;
+  cudaStream_t stream = raft::resource::get_cuda_stream(handle);
+  distance_impl_l2_expanded(
+    perform_sqrt, x, y, out, m, n, k, workspace, worksize, fin_op, stream, is_row_major);
+}
+
+template <typename DataT, typename AccT, typename OutT, typename FinOpT, typename IdxT = int>
+void distance_impl(raft::resources const& handle,
+                   distance_tag<DistanceType::L2Unexpanded> distance_type,
+                   const DataT* x,
+                   const DataT* y,
+                   OutT* out,
+                   IdxT m,
+                   IdxT n,
+                   IdxT k,
+                   AccT*,   // workspace unused
+                   size_t,  // worksize unused
+                   FinOpT fin_op,
+                   bool is_row_major,
+                   DataT)  // metric_arg unused
+{
+  bool perform_sqrt = false;
+  ops::l2_unexp_distance_op<DataT, AccT, IdxT> l2_op(perform_sqrt);
+
+  // The unexpanded L2 does not require the norms of a and b to be calculated.
+  const DataT* norm_A = nullptr;
+  const DataT* norm_B = nullptr;
+
+  cudaStream_t stream = raft::resource::get_cuda_stream(handle);
+
+  distance_matrix_dispatch<decltype(l2_op), DataT, AccT, OutT, FinOpT, IdxT>(
+    l2_op, m, n, k, x, y, norm_A, norm_B, out, fin_op, stream, is_row_major);
+}
+
+template <typename DataT, typename AccT, typename OutT, typename FinOpT, typename IdxT = int>
+void distance_impl(raft::resources const& handle,
+                   distance_tag<DistanceType::L2SqrtUnexpanded> distance_type,
+                   const DataT* x,
+                   const DataT* y,
+                   OutT* out,
+                   IdxT m,
+                   IdxT n,
+                   IdxT k,
+                   AccT*,   // workspace unused
+                   size_t,  // worksize unused
+                   FinOpT fin_op,
+                   bool is_row_major,
+                   DataT)  // metric_arg unused
+{
+  bool perform_sqrt = true;
+  ops::l2_unexp_distance_op<DataT, AccT, IdxT> l2_op(perform_sqrt);
+
+  // The unexpanded L2 does not require the norms of a and b to be calculated.
+  const DataT* norm_A = nullptr;
+  const DataT* norm_B = nullptr;
+
+  cudaStream_t stream = raft::resource::get_cuda_stream(handle);
+
+  distance_matrix_dispatch<decltype(l2_op), DataT, AccT, OutT, FinOpT, IdxT>(
+    l2_op, m, n, k, x, y, norm_A, norm_B, out, fin_op, stream, is_row_major);
+}
 
-}  // anonymous namespace
+template <typename DataT, typename AccT, typename OutT, typename FinOpT, typename IdxT = int>
+void distance_impl(raft::resources const& handle,
+                   distance_tag<DistanceType::Linf> distance_type,
+                   const DataT* x,
+                   const DataT* y,
+                   OutT* out,
+                   IdxT m,
+                   IdxT n,
+                   IdxT k,
+                   AccT*,   // workspace unused
+                   size_t,  // worksize unused
+                   FinOpT fin_op,
+                   bool is_row_major,
+                   DataT)  // metric_arg unused
+{
+  ops::l_inf_distance_op<DataT, AccT, IdxT> distance_op{};
+
+  const DataT* x_norm = nullptr;
+  const DataT* y_norm = nullptr;
+
+  cudaStream_t stream = raft::resource::get_cuda_stream(handle);
+
+  distance_matrix_dispatch<decltype(distance_op), DataT, AccT, OutT, FinOpT, IdxT>(
+    distance_op, m, n, k, x, y, x_norm, y_norm, out, fin_op, stream, is_row_major);
+}
+
+template <typename DataT, typename AccT, typename OutT, typename FinOpT, typename IdxT = int>
+void distance_impl(raft::resources const& handle,
+                   distance_tag<DistanceType::LpUnexpanded> distance_type,
+                   const DataT* x,
+                   const DataT* y,
+                   OutT* out,
+                   IdxT m,
+                   IdxT n,
+                   IdxT k,
+                   AccT*,   // workspace unused
+                   size_t,  // worksize unused
+                   FinOpT fin_op,
+                   bool is_row_major,
+                   DataT metric_arg)
+{
+  ops::lp_unexp_distance_op<DataT, AccT, IdxT> distance_op{metric_arg};
+
+  const DataT* x_norm = nullptr;
+  const DataT* y_norm = nullptr;
+
+  cudaStream_t stream = raft::resource::get_cuda_stream(handle);
+
+  distance_matrix_dispatch<decltype(distance_op), DataT, AccT, OutT, FinOpT, IdxT>(
+    distance_op, m, n, k, x, y, x_norm, y_norm, out, fin_op, stream, is_row_major);
+}
+
+template <typename DataT, typename AccT, typename OutT, typename FinOpT, typename IdxT = int>
+void distance_impl(raft::resources const& handle,
+                   distance_tag<DistanceType::RusselRaoExpanded> distance_type,
+                   const DataT* x,
+                   const DataT* y,
+                   OutT* out,
+                   IdxT m,
+                   IdxT n,
+                   IdxT k,
+                   AccT*,   // workspace unused
+                   size_t,  // worksize unused
+                   FinOpT fin_op,
+                   bool is_row_major,
+                   DataT)  // metric_arg unused
+{
+  ops::russel_rao_distance_op<DataT, AccT, IdxT> distance_op{k};
+
+  const DataT* x_norm = nullptr;
+  const DataT* y_norm = nullptr;
+
+  cudaStream_t stream = raft::resource::get_cuda_stream(handle);
+
+  distance_matrix_dispatch<decltype(distance_op), DataT, AccT, OutT, FinOpT, IdxT>(
+    distance_op, m, n, k, x, y, x_norm, y_norm, out, fin_op, stream, is_row_major);
+}
 
 /**
  * @brief Evaluate pairwise distances with the user epilogue lamba allowed
@@ -636,15 +728,17 @@ struct DistanceImpl<raft::distance::DistanceType::CorrelationExpanded,
  * @tparam OutType output type
  * @tparam FinalLambda user-defined epilogue lamba
  * @tparam Index_ Index type
+ *
  * @param x first set of points
  * @param y second set of points
- * @param dist output distance matrix
+ * @param out output distance matrix
  * @param m number of points in x
  * @param n number of points in y
  * @param k dimensionality
  * @param workspace temporary workspace needed for computations
  * @param worksize number of bytes of the workspace
  * @param fin_op the final gemm epilogue lambda
+ * @param stream cuda stream
  * @param isRowMajor whether the matrices are row-major or col-major
  *
  * @note fin_op: This is a device lambda which is supposed to operate upon the
@@ -661,7 +755,7 @@ template <raft::distance::DistanceType distanceType,
 void distance(raft::resources const& handle,
               const InType* x,
               const InType* y,
-              OutType* dist,
+              OutType* out,
               Index_ m,
               Index_ n,
               Index_ k,
@@ -671,8 +765,25 @@ void distance(raft::resources const& handle,
               bool isRowMajor   = true,
               InType metric_arg = 2.0f)
 {
-  DistanceImpl<distanceType, InType, AccType, OutType, FinalLambda, Index_> distImpl;
-  distImpl.run(handle, x, y, dist, m, n, k, workspace, worksize, fin_op, isRowMajor, metric_arg);
+  // raft distance support inputs as float/double and output as uint8_t/float/double.
+  static_assert(!((sizeof(OutType) > 1) && (sizeof(AccType) != sizeof(OutType))),
+                "OutType can be uint8_t, float, double,"
+                "if sizeof(OutType) > 1 then sizeof(AccType) == sizeof(OutType).");
+
+  distance_impl<InType, AccType, OutType, FinalLambda, Index_>(
+    handle,
+    distance_tag<distanceType>{},
+    x,
+    y,
+    out,
+    m,
+    n,
+    k,
+    reinterpret_cast<AccType*>(workspace),
+    worksize,
+    fin_op,
+    isRowMajor,
+    metric_arg);
   RAFT_CUDA_TRY(cudaPeekAtLastError());
 }
 
@@ -691,24 +802,9 @@ void distance(raft::resources const& handle,
  * @param k dimensionality
  * @param workspace temporary workspace needed for computations
  * @param worksize number of bytes of the workspace
+ * @param stream cuda stream
  * @param isRowMajor whether the matrices are row-major or col-major
- *
- * @note if workspace is passed as nullptr, this will return in
- *  worksize, the number of bytes of workspace required
  */
-
-// Default final op functor which facilitates elementwise operation on
-// final distance value if any.
-template <typename AccType, typename OutType, typename Index>
-struct default_fin_op {
-  __host__ __device__ default_fin_op() noexcept {};
-  // functor signature.
-  __host__ __device__ OutType operator()(AccType d_val, Index g_d_idx) const noexcept
-  {
-    return d_val;
-  }
-};
-
 template <raft::distance::DistanceType distanceType,
           typename InType,
           typename AccType,
@@ -717,7 +813,7 @@ template <raft::distance::DistanceType distanceType,
 void distance(raft::resources const& handle,
               const InType* x,
               const InType* y,
-              OutType* dist,
+              OutType* out,
               Index_ m,
               Index_ n,
               Index_ k,
@@ -726,16 +822,10 @@ void distance(raft::resources const& handle,
               bool isRowMajor   = true,
               InType metric_arg = 2.0f)
 {
-  using final_op_type = default_fin_op<AccType, OutType, Index_>;
-  final_op_type fin_op;
+  auto fin_op = raft::identity_op();
 
-  // raft distance support inputs as float/double and output as uint8_t/float/double.
-  static_assert(!((sizeof(OutType) > 1) && (sizeof(AccType) != sizeof(OutType))),
-                "OutType can be uint8_t, float, double,"
-                "if sizeof(OutType) > 1 then sizeof(AccType) == sizeof(OutType).");
-  distance<distanceType, InType, AccType, OutType, final_op_type, Index_>(
-    handle, x, y, dist, m, n, k, workspace, worksize, fin_op, isRowMajor, metric_arg);
-  RAFT_CUDA_TRY(cudaPeekAtLastError());
+  distance<distanceType, InType, AccType, OutType, decltype(fin_op), Index_>(
+    handle, x, y, out, m, n, k, workspace, worksize, fin_op, isRowMajor, metric_arg);
 }
 
 /**
@@ -775,42 +865,6 @@ size_t getWorkspaceSize(const InType* x, const InType* y, Index_ m, Index_ n, In
   return worksize;
 }
 
-/**
- * @defgroup pairwise_distance pairwise distance prims
- * @{
- * @brief Convenience wrapper around 'distance' prim to convert runtime metric
- * into compile time for the purpose of dispatch
- * @tparam Type input/accumulation/output data-type
- * @tparam Index_ indexing type
- * @param x first set of points
- * @param y second set of points
- * @param dist output distance matrix
- * @param m number of points in x
- * @param n number of points in y
- * @param k dimensionality
- * @param workspace temporary workspace buffer which can get resized as per the
- * needed workspace size
- * @param metric distance metric
- * @param isRowMajor whether the matrices are row-major or col-major
- */
-template <typename Type, typename Index_, raft::distance::DistanceType DistType>
-void pairwise_distance_impl(raft::resources const& handle,
-                            const Type* x,
-                            const Type* y,
-                            Type* dist,
-                            Index_ m,
-                            Index_ n,
-                            Index_ k,
-                            rmm::device_uvector<char>& workspace,
-                            bool isRowMajor,
-                            Type metric_arg = 2.0f)
-{
-  auto worksize = getWorkspaceSize<DistType, Type, Type, Type, Index_>(x, y, m, n, k);
-  workspace.resize(worksize, raft::resource::get_cuda_stream(handle));
-  distance<DistType, Type, Type, Type, Index_>(
-    handle, x, y, dist, m, n, k, workspace.data(), worksize, isRowMajor, metric_arg);
-}
-/** @} */
 };  // namespace detail
 };  // namespace distance
 };  // namespace raft
diff --git a/cpp/include/raft/distance/detail/distance_ops/canberra.cuh b/cpp/include/raft/distance/detail/distance_ops/canberra.cuh
new file mode 100644
index 0000000000..45bea08a95
--- /dev/null
+++ b/cpp/include/raft/distance/detail/distance_ops/canberra.cuh
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/util/cuda_utils.cuh>
+
+namespace raft::distance::detail::ops {
+
+/**
+ * @brief The canberra distance matrix calculation
+ *
+ * It computes the following equation:
+ *
+ *  c_ij = sum_k |x_ik - y_kj| / ( |x_ik| + |y_kj| )
+ */
+template <typename DataT, typename AccT, typename IdxT>
+struct canberra_distance_op {
+  // Load norms of input data
+  static constexpr bool use_norms = false;
+  // Whether the core function requires so many instructions that it makes sense
+  // to reduce loop unrolling, etc. We do this to keep compile times in check.
+  static constexpr bool expensive_inner_loop = true;
+
+  // Size of shared memory. This is normally decided by the kernel policy, but
+  // some ops such as correlation_distance_op use more.
+  template <typename Policy>
+  constexpr size_t shared_mem_size()
+  {
+    return Policy::SmemSize;
+  }
+
+  DI void core(AccT& acc, DataT& x, DataT& y) const
+  {
+    const auto diff = raft::abs(x - y);
+    const auto add  = raft::abs(x) + raft::abs(y);
+    // deal with potential for 0 in denominator by
+    // forcing 0/1 instead
+    acc += ((add != 0) * diff / (add + (add == 0)));
+  };
+
+  template <typename Policy>
+  DI void epilog(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh],
+                 DataT* regxn,
+                 DataT* regyn,
+                 IdxT gridStrideX,
+                 IdxT gridStrideY) const
+  {
+    return;
+  }
+};
+
+}  // namespace raft::distance::detail::ops
diff --git a/cpp/include/raft/distance/detail/distance_ops/correlation.cuh b/cpp/include/raft/distance/detail/distance_ops/correlation.cuh
new file mode 100644
index 0000000000..3832104280
--- /dev/null
+++ b/cpp/include/raft/distance/detail/distance_ops/correlation.cuh
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/util/cuda_utils.cuh>
+
+namespace raft::distance::detail::ops {
+
+/** @brief The correlation distance
+ *
+ * It computes the following equation:
+ *
+ * d(x, y) = ((x - mean(x)) ⋅ (y - mean(y)))
+ *           /
+ *           (|| x - mean(x) ||_2 || y - mean(y) ||_2)
+ */
+template <typename DataT, typename AccT, typename IdxT>
+struct correlation_distance_op {
+  const DataT* x2n;
+  const DataT* y2n;
+  IdxT m;
+  IdxT n;
+  IdxT k;
+
+  correlation_distance_op(
+    bool is_row_major, const DataT* x2n_, const DataT* y2n_, IdxT m_, IdxT n_, IdxT k_) noexcept
+    : x2n(x2n_), y2n(y2n_), m(m_), n(n_), k(k_)
+  {
+    // The distance op is typically created before the row-major/col-major
+    // swapping has been done. So we do it here.
+    if (!is_row_major) {
+      std::swap<const DataT*>(x2n, y2n);
+      std::swap(m, n);
+    }
+  }
+
+  // Load norms of input data
+  static constexpr bool use_norms = true;
+  // Whether the core function requires so many instructions that it makes sense
+  // to reduce loop unrolling, etc. We do this to keep compile times in check.
+  static constexpr bool expensive_inner_loop = false;
+
+  // Size of shared memory. This is normally decided by the kernel policy, but
+  // some ops such as correlation_distance_op use more.
+  template <typename Policy>
+  constexpr size_t shared_mem_size()
+  {
+    return Policy::SmemSize + (2 * (Policy::Mblk + Policy::Nblk) * sizeof(DataT));
+  }
+
+  DI void core(AccT& acc, DataT& x, DataT& y) const { acc += x * y; };
+
+  template <typename Policy>
+  DI void epilog(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh],
+                 DataT* regxn,
+                 DataT* regyn,
+                 IdxT gridStrideX,
+                 IdxT gridStrideY) const
+  {
+    // Note how we can sneakily get a pointer to shared memory here, to store
+    // more data. If the implementation of PairwiseDistanceMatKernel ever
+    // changes, this will be where we find the bugs.
+    extern __shared__ char smem[];
+
+    DataT regx2n[Policy::AccRowsPerTh], regy2n[Policy::AccColsPerTh];
+
+    DataT* sx2Norm =
+      (DataT*)(&smem[Policy::SmemSize + (Policy::Mblk + Policy::Nblk) * sizeof(DataT)]);
+    DataT* sy2Norm = (&sx2Norm[Policy::Mblk]);
+
+    // Load x & y norms required by this threadblock in shmem buffer
+    if (gridStrideX == blockIdx.x * Policy::Nblk) {
+      for (int i = threadIdx.x; i < Policy::Mblk; i += Policy::Nthreads) {
+        auto idx   = gridStrideY + i;
+        sx2Norm[i] = idx < m ? x2n[idx] : 0;
+      }
+    }
+
+    for (int i = threadIdx.x; i < Policy::Nblk; i += Policy::Nthreads) {
+      auto idx   = gridStrideX + i;
+      sy2Norm[i] = idx < n ? y2n[idx] : 0;
+    }
+    __syncthreads();
+
+#pragma unroll
+    for (int i = 0; i < Policy::AccRowsPerTh; ++i) {
+      regx2n[i] = sx2Norm[i * Policy::AccThRows + (threadIdx.x / Policy::AccThCols)];
+    }
+#pragma unroll
+    for (int i = 0; i < Policy::AccColsPerTh; ++i) {
+      regy2n[i] = sy2Norm[i * Policy::AccThCols + (threadIdx.x % Policy::AccThCols)];
+    }
+
+#pragma unroll
+    for (int i = 0; i < Policy::AccRowsPerTh; ++i) {
+#pragma unroll
+      for (int j = 0; j < Policy::AccColsPerTh; ++j) {
+        auto numer   = k * acc[i][j] - (regxn[i] * regyn[j]);
+        auto Q_denom = k * regx2n[i] - (regxn[i] * regxn[i]);
+        auto R_denom = k * regy2n[j] - (regyn[j] * regyn[j]);
+
+        acc[i][j] = 1 - (numer / raft::sqrt(Q_denom * R_denom));
+      }
+    }
+  }
+};
+
+}  // namespace raft::distance::detail::ops
diff --git a/cpp/include/raft/distance/detail/distance_ops/cosine.cuh b/cpp/include/raft/distance/detail/distance_ops/cosine.cuh
new file mode 100644
index 0000000000..c3f3b75e62
--- /dev/null
+++ b/cpp/include/raft/distance/detail/distance_ops/cosine.cuh
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/util/cuda_utils.cuh>
+
+namespace raft::distance::detail::ops {
+
+/**
+ * @brief the expanded cosine distance matrix calculation
+ *
+ * It computes the following equation:
+ *
+ * d(x, y) = 1 - (x ⋅ y) / ( ||x||_2 ||y||_2)
+ */
+template <typename DataT, typename AccT, typename IdxT>
+struct cosine_distance_op {
+  // Load norms of input data
+  static constexpr bool use_norms = true;
+  // Whether the core function requires so many instructions that it makes sense
+  // to reduce loop unrolling, etc. We do this to keep compile times in check.
+  static constexpr bool expensive_inner_loop = false;
+
+  // Size of shared memory. This is normally decided by the kernel policy, but
+  // some ops such as correlation_distance_op use more.
+  template <typename Policy>
+  constexpr size_t shared_mem_size()
+  {
+    return Policy::SmemSize + ((Policy::Mblk + Policy::Nblk) * sizeof(DataT));
+  }
+
+  DI void core(AccT& acc, DataT& x, DataT& y) const { acc += x * y; };
+
+  template <typename Policy>
+  DI void epilog(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh],
+                 DataT* regxn,
+                 DataT* regyn,
+                 IdxT gridStrideX,
+                 IdxT gridStrideY) const
+  {
+#pragma unroll
+    for (int i = 0; i < Policy::AccRowsPerTh; ++i) {
+#pragma unroll
+      for (int j = 0; j < Policy::AccColsPerTh; ++j) {
+        acc[i][j] = 1.0 - (acc[i][j] / (regxn[i] * regyn[j]));
+      }
+    }
+  }
+};
+
+template <typename DataT, typename AccT>
+struct cosine_cutlass_op {
+  __device__ cosine_cutlass_op() noexcept {}
+  __device__ AccT operator()(DataT& aNorm, const DataT& bNorm, DataT& accVal) const noexcept
+  {
+    return static_cast<AccT>(1.0) - (AccT)(accVal / (aNorm * bNorm));
+  }
+  __device__ AccT operator()(DataT aData) const noexcept { return aData; }
+};
+
+}  // namespace raft::distance::detail::ops
diff --git a/cpp/include/raft/distance/detail/distance_ops/hamming.cuh b/cpp/include/raft/distance/detail/distance_ops/hamming.cuh
new file mode 100644
index 0000000000..98acf11560
--- /dev/null
+++ b/cpp/include/raft/distance/detail/distance_ops/hamming.cuh
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/util/cuda_utils.cuh>
+
+namespace raft::distance::detail::ops {
+
+/**
+ * @brief the Hamming Unexpanded distance matrix calculation
+ *  It computes the following equation:
+ *
+ *    c_ij = sum_k (x_ik != y_kj) / k
+ */
+template <typename DataT, typename AccT, typename IdxT>
+struct hamming_distance_op {
+  IdxT k;
+
+  hamming_distance_op(IdxT k_) noexcept : k(k_) {}
+
+  // Load norms of input data
+  static constexpr bool use_norms = false;
+  // Whether the core function requires so many instructions that it makes sense
+  // to reduce loop unrolling, etc. We do this to keep compile times in check.
+  static constexpr bool expensive_inner_loop = false;
+
+  // Size of shared memory. This is normally decided by the kernel policy, but
+  // some ops such as correlation_distance_op use more.
+  template <typename Policy>
+  constexpr size_t shared_mem_size()
+  {
+    return Policy::SmemSize;
+  }
+
+  DI void core(AccT& acc, DataT& x, DataT& y) const { acc += (x != y); };
+
+  template <typename Policy>
+  DI void epilog(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh],
+                 DataT* regxn,
+                 DataT* regyn,
+                 IdxT gridStrideX,
+                 IdxT gridStrideY) const
+  {
+    const DataT one_over_k = DataT(1.0) / k;
+#pragma unroll
+    for (int i = 0; i < Policy::AccRowsPerTh; ++i) {
+#pragma unroll
+      for (int j = 0; j < Policy::AccColsPerTh; ++j) {
+        acc[i][j] *= one_over_k;
+      }
+    }
+  }
+};
+
+}  // namespace raft::distance::detail::ops
diff --git a/cpp/include/raft/distance/detail/distance_ops/hellinger.cuh b/cpp/include/raft/distance/detail/distance_ops/hellinger.cuh
new file mode 100644
index 0000000000..c5e2b84ac2
--- /dev/null
+++ b/cpp/include/raft/distance/detail/distance_ops/hellinger.cuh
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+#include <raft/util/cuda_utils.cuh>
+
+namespace raft::distance::detail::ops {
+
+/**
+ * @brief the Hellinger distance matrix calculation
+ *
+ * It computes the following equation:
+ *
+ *  c_ij = sqrt(1 - sum_k sqrt(x_ik * y_kj))
+ *
+ */
+template <typename DataT, typename AccT, typename IdxT>
+struct hellinger_distance_op {
+  // Load norms of input data
+  static constexpr bool use_norms = false;
+  // Whether the core function requires so many instructions that it makes sense
+  // to reduce loop unrolling, etc. We do this to keep compile times in check.
+  static constexpr bool expensive_inner_loop = false;
+
+  // Size of shared memory. This is normally decided by the kernel policy, but
+  // some ops such as correlation_distance_op use more.
+  template <typename Policy>
+  constexpr size_t shared_mem_size()
+  {
+    return Policy::SmemSize;
+  }
+
+  DI void core(AccT& acc, DataT& x, DataT& y) const
+  {
+    // This is sqrt(x) * sqrt(y).
+    const auto product = x * y;
+    acc += product;
+  };
+
+  template <typename Policy>
+  DI void epilog(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh],
+                 DataT* regxn,
+                 DataT* regyn,
+                 IdxT gridStrideX,
+                 IdxT gridStrideY) const
+  {
+#pragma unroll
+    for (int i = 0; i < Policy::AccRowsPerTh; ++i) {
+#pragma unroll
+      for (int j = 0; j < Policy::AccColsPerTh; ++j) {
+        // Adjust to replace NaN in sqrt with 0 if input to sqrt is negative
+        const auto finalVal  = (1 - acc[i][j]);
+        const auto rectifier = (!signbit(finalVal));
+        acc[i][j]            = raft::sqrt(rectifier * finalVal);
+      }
+    }
+  }
+};
+
+}  // namespace raft::distance::detail::ops
diff --git a/cpp/include/raft/distance/detail/distance_ops/jensen_shannon.cuh b/cpp/include/raft/distance/detail/distance_ops/jensen_shannon.cuh
new file mode 100644
index 0000000000..df5aadcf3b
--- /dev/null
+++ b/cpp/include/raft/distance/detail/distance_ops/jensen_shannon.cuh
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+#include <raft/util/cuda_utils.cuh>
+
+namespace raft::distance::detail::ops {
+
+// Describes the computation the jensen_shannon distance
+
+/**
+ * @brief the Jensen Shannon distance matrix calculation
+ *
+ * It computes the following equation:
+ *
+ * c_ij = sqrt(0.5 * sum( -x_i * (log(0.5 * (x_i + y_i)) - log(x_i))
+ *       + (-y_i * (log(0.5 * (x_i + y_i)) - log(y_i)))))
+ */
+template <typename DataT, typename AccT, typename IdxT>
+struct jensen_shannon_distance_op {
+  // Load norms of input data
+  static constexpr bool use_norms = false;
+  // Whether the core function requires so many instructions that it makes sense
+  // to reduce loop unrolling, etc. We do this to keep compile times in check.
+  static constexpr bool expensive_inner_loop = true;
+
+  // Size of shared memory. This is normally decided by the kernel policy, but
+  // some ops such as correlation_distance_op use more.
+  template <typename Policy>
+  constexpr size_t shared_mem_size()
+  {
+    return Policy::SmemSize;
+  }
+
+  DI void core(AccT& acc, DataT& x, DataT& y) const
+  {
+    const DataT m     = 0.5f * (x + y);
+    const bool m_zero = (m == 0);
+    const auto logM   = (!m_zero) * raft::log(m + m_zero);
+
+    const bool x_zero = (x == 0);
+    const bool y_zero = (y == 0);
+    acc += (-x * (logM - raft::log(x + x_zero))) + (-y * (logM - raft::log(y + y_zero)));
+  };
+
+  template <typename Policy>
+  DI void epilog(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh],
+                 DataT* regxn,
+                 DataT* regyn,
+                 IdxT gridStrideX,
+                 IdxT gridStrideY) const
+  {
+#pragma unroll
+    for (int i = 0; i < Policy::AccRowsPerTh; ++i) {
+#pragma unroll
+      for (int j = 0; j < Policy::AccColsPerTh; ++j) {
+        acc[i][j] = raft::sqrt(0.5 * acc[i][j]);
+      }
+    }
+  }
+};
+
+}  // namespace raft::distance::detail::ops
diff --git a/cpp/include/raft/distance/detail/distance_ops/kl_divergence.cuh b/cpp/include/raft/distance/detail/distance_ops/kl_divergence.cuh
new file mode 100644
index 0000000000..526927243f
--- /dev/null
+++ b/cpp/include/raft/distance/detail/distance_ops/kl_divergence.cuh
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+#include <raft/util/cuda_utils.cuh>
+
+namespace raft::distance::detail::ops {
+
+/**
+ * @brief the KL Divergence distance matrix calculation
+ *
+ * It computes the following equation:
+ *
+ *   c_ij = 0.5 * sum(x * log (x / y));
+ */
+template <typename DataT, typename AccT, typename IdxT>
+struct kl_divergence_op {
+  const bool is_row_major;
+  const bool x_equal_y;
+
+  kl_divergence_op(bool row_major_, bool x_equal_y_ = false) noexcept
+    : is_row_major(row_major_), x_equal_y(x_equal_y_)
+  {
+  }
+
+  // Load norms of input data
+  static constexpr bool use_norms = false;
+  // Whether the core function requires so many instructions that it makes sense
+  // to reduce loop unrolling, etc. We do this to keep compile times in check.
+  static constexpr bool expensive_inner_loop = true;
+
+  // Size of shared memory. This is normally decided by the kernel policy, but
+  // some ops such as correlation_distance_op use more.
+  template <typename Policy>
+  constexpr size_t shared_mem_size()
+  {
+    return Policy::SmemSize;
+  }
+
+  DI void core(AccT& acc, DataT& x, DataT& y) const
+  {
+    // TODO: make sure that these branches get hoisted out of main loop.. Could
+    // be quite expensive otherwise.
+    if (x_equal_y) {
+      if (is_row_major) {
+        const bool x_zero = (x == 0);
+        const bool y_zero = (y == 0);
+        acc += x * (raft::log(x + x_zero) - (!y_zero) * raft::log(y + y_zero));
+      } else {
+        const bool y_zero = (y == 0);
+        const bool x_zero = (x == 0);
+        acc += y * (raft::log(y + y_zero) - (!x_zero) * raft::log(x + x_zero));
+      }
+    } else {
+      if (is_row_major) {
+        const bool x_zero = (x == 0);
+        acc += x * (raft::log(x + x_zero) - y);
+      } else {
+        const bool y_zero = (y == 0);
+        acc += y * (raft::log(y + y_zero) - x);
+      }
+    }
+  };
+
+  template <typename Policy>
+  DI void epilog(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh],
+                 DataT* regxn,
+                 DataT* regyn,
+                 IdxT gridStrideX,
+                 IdxT gridStrideY) const
+  {
+#pragma unroll
+    for (int i = 0; i < Policy::AccRowsPerTh; ++i) {
+#pragma unroll
+      for (int j = 0; j < Policy::AccColsPerTh; ++j) {
+        acc[i][j] = (0.5f * acc[i][j]);
+      }
+    }
+  }
+};
+}  // namespace raft::distance::detail::ops
diff --git a/cpp/include/raft/distance/detail/distance_ops/l1.cuh b/cpp/include/raft/distance/detail/distance_ops/l1.cuh
new file mode 100644
index 0000000000..b02971bac7
--- /dev/null
+++ b/cpp/include/raft/distance/detail/distance_ops/l1.cuh
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+#include <raft/util/cuda_utils.cuh>
+
+namespace raft::distance::detail::ops {
+
+/**
+ * @brief the L1 distance matrix calculation
+ *
+ * It computes the following equation:
+ *
+ *   c_ij = sum_k abs(x_ik  - y_kj)
+ */
+template <typename DataT, typename AccT, typename IdxT>
+struct l1_distance_op {
+  // Do not load norms of data, the computation of L1 distance does not use them.
+  static constexpr bool use_norms = false;
+  // Whether the core function requires so many instructions that it makes sense
+  // to reduce loop unrolling, etc. We do this to keep compile times in check.
+  static constexpr bool expensive_inner_loop = false;
+
+  // Size of shared memory. This is normally decided by the kernel policy, but
+  // some ops such as correlation_distance_op use more.
+  template <typename Policy>
+  constexpr size_t shared_mem_size()
+  {
+    return Policy::SmemSize;
+  }
+
+  DI void core(AccT& acc, DataT& x, DataT& y) const { acc += raft::abs(x - y); };
+
+  template <typename Policy>
+  DI void epilog(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh],
+                 DataT* regxn,
+                 DataT* regyn,
+                 IdxT gridStrideX,
+                 IdxT gridStrideY) const
+  {
+    return;
+  };
+};
+
+}  // namespace raft::distance::detail::ops
diff --git a/cpp/include/raft/distance/detail/distance_ops/l2_exp.cuh b/cpp/include/raft/distance/detail/distance_ops/l2_exp.cuh
new file mode 100644
index 0000000000..fb00f8d66a
--- /dev/null
+++ b/cpp/include/raft/distance/detail/distance_ops/l2_exp.cuh
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/util/cuda_utils.cuh>
+
+namespace raft::distance::detail::ops {
+
+/**
+ * @brief the expanded euclidean distance matrix calculation
+ *
+ * It computes the following equation:
+ *
+ * c_ij = - 2 sum_k x_ik * y_kj + ||x_i.||_2 + ||y_.j||_2
+ *
+ */
+template <typename DataT, typename AccT, typename IdxT>
+struct l2_exp_distance_op {
+  bool sqrt;
+
+  l2_exp_distance_op(bool sqrt_) noexcept : sqrt(sqrt_) {}
+
+  // Load norms of input data
+  static constexpr bool use_norms = true;
+  // Whether the core function requires so many instructions that it makes sense
+  // to reduce loop unrolling, etc. We do this to keep compile times in check.
+  static constexpr bool expensive_inner_loop = false;
+
+  // Size of shared memory. This is normally decided by the kernel policy, but
+  // some ops such as correlation_distance_op use more.
+  template <typename Policy>
+  constexpr size_t shared_mem_size()
+  {
+    return Policy::SmemSize + ((Policy::Mblk + Policy::Nblk) * sizeof(DataT));
+  }
+
+  DI void core(AccT& acc, DataT& x, DataT& y) const { acc += x * y; };
+
+  template <typename Policy>
+  DI void epilog(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh],
+                 DataT* regxn,
+                 DataT* regyn,
+                 IdxT gridStrideX,
+                 IdxT gridStrideY) const
+  {
+#pragma unroll
+    for (int i = 0; i < Policy::AccRowsPerTh; ++i) {
+#pragma unroll
+      for (int j = 0; j < Policy::AccColsPerTh; ++j) {
+        DataT val = regxn[i] + regyn[j] - (DataT)2.0 * acc[i][j];
+        acc[i][j] = val * (val > DataT(0.0));
+      }
+    }
+    if (sqrt) {
+#pragma unroll
+      for (int i = 0; i < Policy::AccRowsPerTh; ++i) {
+#pragma unroll
+        for (int j = 0; j < Policy::AccColsPerTh; ++j) {
+          acc[i][j] = raft::sqrt(acc[i][j]);
+        }
+      }
+    }
+  }
+};
+
+// Epilogue operator for CUTLASS based kernel
+template <typename DataT, typename AccT>
+struct l2_exp_cutlass_op {
+  bool sqrt;
+
+  __device__ l2_exp_cutlass_op() noexcept : sqrt(false) {}
+  __device__ l2_exp_cutlass_op(bool isSqrt) noexcept : sqrt(isSqrt) {}
+  __device__ AccT operator()(DataT& aNorm, const DataT& bNorm, DataT& accVal) const noexcept
+  {
+    AccT outVal = aNorm + bNorm - DataT(2.0) * accVal;
+    // outVal could be negative due to numerical instability, especially when
+    // calculating self distance.
+    // clamp to 0 to avoid potential NaN in sqrt
+    outVal = outVal * (outVal > DataT(0.0));
+    return sqrt ? raft::sqrt(outVal) : outVal;
+  }
+
+  __device__ AccT operator()(DataT aData) const noexcept { return aData; }
+};
+
+}  // namespace raft::distance::detail::ops
diff --git a/cpp/include/raft/distance/detail/distance_ops/l2_unexp.cuh b/cpp/include/raft/distance/detail/distance_ops/l2_unexp.cuh
new file mode 100644
index 0000000000..e03eb0a97e
--- /dev/null
+++ b/cpp/include/raft/distance/detail/distance_ops/l2_unexp.cuh
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/util/cuda_utils.cuh>
+
+namespace raft::distance::detail::ops {
+
+/**
+ * @brief the unexpanded euclidean distance matrix calculation
+ *
+ * It computes the following equation:
+ *
+ * c_ij = optional_sqrt ( sum_k (x_ik - y_kj)^2 )
+ */
+template <typename DataT, typename AccT, typename IdxT>
+struct l2_unexp_distance_op {
+  bool sqrt;
+
+  l2_unexp_distance_op(bool sqrt_) noexcept : sqrt(sqrt_) {}
+
+  // Do not load norms of data, the computation of L1 distance does not use them.
+  static constexpr bool use_norms = false;
+  // Whether the core function requires so many instructions that it makes sense
+  // to reduce loop unrolling, etc. We do this to keep compile times in check.
+  static constexpr bool expensive_inner_loop = false;
+
+  // Size of shared memory. This is normally decided by the kernel policy, but
+  // some ops such as correlation_distance_op use more.
+  template <typename Policy>
+  constexpr size_t shared_mem_size()
+  {
+    return Policy::SmemSize;
+  }
+
+  DI void core(AccT& acc, DataT& x, DataT& y) const
+  {
+    const auto diff = x - y;
+    acc += diff * diff;
+  };
+
+  template <typename Policy>
+  DI void epilog(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh],
+                 DataT* regxn,
+                 DataT* regyn,
+                 IdxT gridStrideX,
+                 IdxT gridStrideY) const
+  {
+    if (sqrt) {
+#pragma unroll
+      for (int i = 0; i < Policy::AccRowsPerTh; ++i) {
+#pragma unroll
+        for (int j = 0; j < Policy::AccColsPerTh; ++j) {
+          acc[i][j] = raft::sqrt(acc[i][j]);
+        }
+      }
+    }
+  };
+};
+
+}  // namespace raft::distance::detail::ops
diff --git a/cpp/include/raft/distance/detail/distance_ops/l_inf.cuh b/cpp/include/raft/distance/detail/distance_ops/l_inf.cuh
new file mode 100644
index 0000000000..caa1379133
--- /dev/null
+++ b/cpp/include/raft/distance/detail/distance_ops/l_inf.cuh
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/util/cuda_utils.cuh>
+
+namespace raft::distance::detail::ops {
+
+/**
+ * @brief the L_inf (Chebyshev) distance matrix calculation
+ *
+ * It computes the following equation:
+ *
+ *  c_ij = max_k | x_ik - y_kj |
+ */
+template <typename DataT, typename AccT, typename IdxT>
+struct l_inf_distance_op {
+  // Load norms of input data
+  static constexpr bool use_norms = false;
+  // Whether the core function requires so many instructions that it makes sense
+  // to reduce loop unrolling, etc. We do this to keep compile times in check.
+  static constexpr bool expensive_inner_loop = false;
+
+  // Size of shared memory. This is normally decided by the kernel policy, but
+  // some ops such as correlation_distance_op use more.
+  template <typename Policy>
+  constexpr size_t shared_mem_size()
+  {
+    return Policy::SmemSize;
+  }
+
+  DI void core(AccT& acc, DataT& x, DataT& y) const
+  {
+    const auto diff = raft::abs(x - y);
+    acc             = raft::max(acc, diff);
+  };
+
+  template <typename Policy>
+  DI void epilog(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh],
+                 DataT* regxn,
+                 DataT* regyn,
+                 IdxT gridStrideX,
+                 IdxT gridStrideY) const
+  {
+    return;
+  }
+};
+
+}  // namespace raft::distance::detail::ops
diff --git a/cpp/include/raft/distance/detail/distance_ops/lp_unexp.cuh b/cpp/include/raft/distance/detail/distance_ops/lp_unexp.cuh
new file mode 100644
index 0000000000..a4a090d058
--- /dev/null
+++ b/cpp/include/raft/distance/detail/distance_ops/lp_unexp.cuh
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+#include <raft/util/cuda_utils.cuh>
+
+namespace raft::distance::detail::ops {
+
+/**
+ * @brief the unexpanded Lp (Minkowski) distance matrix calculation
+ *
+ * It computes the following equation:
+ *
+ *   c_ij = (sum_k |x_ik - y_jk|^p)^(1/p)
+ */
+template <typename DataT, typename AccT, typename IdxT>
+struct lp_unexp_distance_op {
+  DataT p;
+
+  lp_unexp_distance_op(DataT p_) noexcept : p(p_) {}
+
+  // Load norms of input data
+  static constexpr bool use_norms = false;
+  // Whether the core function requires so many instructions that it makes sense
+  // to reduce loop unrolling, etc. We do this to keep compile times in check.
+  static constexpr bool expensive_inner_loop = true;
+
+  // Size of shared memory. This is normally decided by the kernel policy, but
+  // some ops such as correlation_distance_op use more.
+  template <typename Policy>
+  constexpr size_t shared_mem_size()
+  {
+    return Policy::SmemSize;
+  }
+
+  DI void core(AccT& acc, DataT& x, DataT& y) const
+  {
+    const auto diff = raft::abs(x - y);
+    acc += raft::pow(diff, p);
+  };
+
+  template <typename Policy>
+  DI void epilog(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh],
+                 DataT* regxn,
+                 DataT* regyn,
+                 IdxT gridStrideX,
+                 IdxT gridStrideY) const
+  {
+    const auto one_over_p = 1.0f / p;
+#pragma unroll
+    for (int i = 0; i < Policy::AccRowsPerTh; ++i) {
+#pragma unroll
+      for (int j = 0; j < Policy::AccColsPerTh; ++j) {
+        acc[i][j] = raft::pow(acc[i][j], one_over_p);
+      }
+    }
+  }
+};
+
+}  // namespace raft::distance::detail::ops
diff --git a/cpp/include/raft/distance/detail/distance_ops/russel_rao.cuh b/cpp/include/raft/distance/detail/distance_ops/russel_rao.cuh
new file mode 100644
index 0000000000..7acd858e49
--- /dev/null
+++ b/cpp/include/raft/distance/detail/distance_ops/russel_rao.cuh
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/util/cuda_utils.cuh>
+
+namespace raft::distance::detail::ops {
+
+/**
+ * @brief the Russell Rao distance matrix calculation
+ *
+ * It computes the following equation:
+ *
+ *  c_ij = (k - (sum_k x_ik * y_kj)) / k
+ */
+template <typename DataT, typename AccT, typename IdxT>
+struct russel_rao_distance_op {
+  IdxT k;
+  const float one_over_k;
+
+  russel_rao_distance_op(IdxT k_) noexcept : k(k_), one_over_k(1.0f / k_) {}
+
+  // Load norms of input data
+  static constexpr bool use_norms = false;
+  // Whether the core function requires so many instructions that it makes sense
+  // to reduce loop unrolling, etc. We do this to keep compile times in check.
+  static constexpr bool expensive_inner_loop = false;
+
+  // Size of shared memory. This is normally decided by the kernel policy, but
+  // some ops such as correlation_distance_op use more.
+  template <typename Policy>
+  constexpr size_t shared_mem_size()
+  {
+    return Policy::SmemSize;
+  }
+
+  DI void core(AccT& acc, DataT& x, DataT& y) const { acc += x * y; };
+
+  template <typename Policy>
+  DI void epilog(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh],
+                 DataT* regxn,
+                 DataT* regyn,
+                 IdxT gridStrideX,
+                 IdxT gridStrideY) const
+  {
+#pragma unroll
+    for (int i = 0; i < Policy::AccRowsPerTh; ++i) {
+#pragma unroll
+      for (int j = 0; j < Policy::AccColsPerTh; ++j) {
+        acc[i][j] = (k - acc[i][j]) * one_over_k;
+      }
+    }
+  }
+};
+
+}  // namespace raft::distance::detail::ops
diff --git a/cpp/include/raft/distance/detail/distance_ops/template.cuh b/cpp/include/raft/distance/detail/distance_ops/template.cuh
new file mode 100644
index 0000000000..b0f40123aa
--- /dev/null
+++ b/cpp/include/raft/distance/detail/distance_ops/template.cuh
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/util/cuda_utils.cuh>
+
+namespace raft::distance::detail::ops {
+
+// Describes the computation the template distance
+//
+// Fill in the TODO items.
+
+template <typename DataT, typename AccT, typename IdxT>
+struct template_distance_op {
+  TODO member;
+
+  template_distance_op(TODO member_) noexcept : member(member_) {}
+
+  // Load norms of input data
+  static constexpr bool use_norms = TODO;
+  // Whether the core function requires so many instructions that it makes sense
+  // to reduce loop unrolling, etc. We do this to keep compile times in check.
+  static constexpr bool expensive_inner_loop = false;
+
+  // Size of shared memory. This is normally decided by the kernel policy, but
+  // some ops such as correlation_distance_op use more.
+  template <typename Policy, typename DataT>
+  constexpr size_t shared_mem_size()
+  {
+    return Policy::SmemSize + TODO;
+  }
+
+  DI void core(AccT& acc, DataT& x, DataT& y) const { TODO; };
+
+  template <typename Policy>
+  DI void epilog(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh],
+                 DataT* regxn,
+                 DataT* regyn,
+                 IdxT gridStrideX,
+                 IdxT gridStrideY) const
+  {
+    TODO;
+  }
+};
+
+}  // namespace raft::distance::detail::ops
diff --git a/cpp/include/raft/distance/detail/euclidean.cuh b/cpp/include/raft/distance/detail/euclidean.cuh
deleted file mode 100644
index 7c5fdb912c..0000000000
--- a/cpp/include/raft/distance/detail/euclidean.cuh
+++ /dev/null
@@ -1,486 +0,0 @@
-/*
- * Copyright (c) 2018-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <raft/distance/detail/pairwise_distance_base.cuh>
-#include <raft/distance/detail/pairwise_distance_cutlass_base.cuh>
-#include <raft/linalg/norm.cuh>
-#include <raft/util/cuda_utils.cuh>
-
-namespace raft {
-namespace distance {
-namespace detail {
-
-template <typename DataT, typename AccT>
-struct L2ExpandedOp {
-  bool sqrt;
-
-  __device__ L2ExpandedOp() noexcept : sqrt(false) {}
-  __device__ L2ExpandedOp(bool isSqrt) noexcept : sqrt(isSqrt) {}
-  __device__ AccT operator()(DataT& aNorm, const DataT& bNorm, DataT& accVal) const noexcept
-  {
-    AccT outVal = aNorm + bNorm - DataT(2.0) * accVal;
-    // outVal could be negative due to numerical instability, especially when
-    // calculating self distance.
-    // clamp to 0 to avoid potential NaN in sqrt
-    outVal = outVal * (outVal > DataT(0.0));
-    return sqrt ? raft::sqrt(outVal) : outVal;
-  }
-
-  __device__ AccT operator()(DataT aData) const noexcept { return aData; }
-};
-
-/**
- * @brief the expanded euclidean distance matrix calculation implementer
- *  It computes the following equation: C = op(A^2 + B^2 - 2AB)
- * @tparam DataT input data-type (for A and B matrices)
- * @tparam AccT   accumulation data-type
- * @tparam OutT   output data-type (for C and D matrices)
- * @tparam IdxT   index data-type
- * @tparam Veclen number of k-elements loaded by each thread for every LDG call
- *                it makes. check contractions.cuh for details.
- * @tparam FinalLambda the final lambda called on final distance value
- * @tparam isRowMajor  true if input/output is row major,
-                       false for column major
- * @param[in]     x input matrix
- * @param[in]     y input matrix
- * @param[in]     xn row norms of input matrix A.
- * @param[in]     yn row norms of input matrix B.
- * @param[in]     m number of rows of A and C/D
- * @param[in]     n number of columns of B and C/D
- * @param[in]     k number of cols of A and rows of B
- * @param[in]     lda leading dimension of A
- * @param[in]     ldb leading dimension of B
- * @param[in]     ldd leading dimension of C/D
- * @param[in]     sqrt if the square root is computed or not
- * @param[output] pD output matrix
- * @param fin_op  the final gemm epilogue lambda
-*  @param stream  cuda stream to launch cuda operations.
- */
-template <typename DataT,
-          typename AccT,
-          typename OutT,
-          typename IdxT,
-          int VecLen,
-          typename FinalLambda,
-          bool isRowMajor>
-void euclideanExpImpl(const DataT* x,
-                      const DataT* y,
-                      const DataT* xn,
-                      const DataT* yn,
-                      IdxT m,
-                      IdxT n,
-                      IdxT k,
-                      IdxT lda,
-                      IdxT ldb,
-                      IdxT ldd,
-                      bool sqrt,
-                      OutT* dOutput,
-                      FinalLambda fin_op,
-                      cudaStream_t stream)
-{
-#if (__CUDACC_VER_MAJOR__ < 12)
-  const auto deviceVersion = getComputeCapability();
-  if (deviceVersion.first >= 8) {
-    using L2Op = L2ExpandedOp<DataT, AccT>;
-    L2Op L2_dist_op(sqrt);
-
-    cutlassDistanceKernel<DataT, AccT, OutT, IdxT, VecLen, FinalLambda, L2Op, isRowMajor>(
-      x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, fin_op, L2_dist_op, stream);
-
-  } else
-#endif
-  {
-
-    typedef typename raft::linalg::Policy4x4<DataT, VecLen>::Policy RowPolicy;
-    typedef typename raft::linalg::Policy4x4<DataT, VecLen>::ColPolicy ColPolicy;
-
-    typedef typename std::conditional<isRowMajor, RowPolicy, ColPolicy>::type KPolicy;
-
-    dim3 blk(KPolicy::Nthreads);
-
-    // Accumulation operation lambda
-    auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { acc += x * y; };
-
-    // epilogue operation lambda for final value calculation
-    auto epilog_lambda = [sqrt] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh],
-                                           DataT * regxn,
-                                           DataT * regyn,
-                                           IdxT gridStrideX,
-                                           IdxT gridStrideY) {
-#pragma unroll
-      for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) {
-#pragma unroll
-        for (int j = 0; j < KPolicy::AccColsPerTh; ++j) {
-          DataT val = regxn[i] + regyn[j] - (DataT)2.0 * acc[i][j];
-          acc[i][j] = val * (val > DataT(0.0));
-        }
-      }
-      if (sqrt) {
-#pragma unroll
-        for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) {
-#pragma unroll
-          for (int j = 0; j < KPolicy::AccColsPerTh; ++j) {
-            acc[i][j] = raft::sqrt(acc[i][j]);
-          }
-        }
-      }
-    };
-
-    constexpr size_t shmemSize =
-      KPolicy::SmemSize + ((KPolicy::Mblk + KPolicy::Nblk) * sizeof(DataT));
-    if (isRowMajor) {
-      auto euclideanExpRowMajor = pairwiseDistanceMatKernelPriorToAmpere<true,
-                                                                         DataT,
-                                                                         AccT,
-                                                                         OutT,
-                                                                         IdxT,
-                                                                         KPolicy,
-                                                                         decltype(core_lambda),
-                                                                         decltype(epilog_lambda),
-                                                                         FinalLambda,
-                                                                         true>;
-      dim3 grid = launchConfigGenerator<KPolicy>(m, n, shmemSize, euclideanExpRowMajor);
-
-      euclideanExpRowMajor<<<grid, blk, shmemSize, stream>>>(
-        x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op);
-    } else {
-      auto euclideanExpColMajor = pairwiseDistanceMatKernelPriorToAmpere<true,
-                                                                         DataT,
-                                                                         AccT,
-                                                                         OutT,
-                                                                         IdxT,
-                                                                         KPolicy,
-                                                                         decltype(core_lambda),
-                                                                         decltype(epilog_lambda),
-                                                                         FinalLambda,
-                                                                         false>;
-      dim3 grid = launchConfigGenerator<KPolicy>(m, n, shmemSize, euclideanExpColMajor);
-      euclideanExpColMajor<<<grid, blk, shmemSize, stream>>>(
-        x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op);
-    }
-  }
-
-  RAFT_CUDA_TRY(cudaGetLastError());
-}
-
-template <typename DataT,
-          typename AccT,
-          typename OutT,
-          typename IdxT,
-          typename FinalLambda,
-          bool isRowMajor>
-void euclideanExp(IdxT m,
-                  IdxT n,
-                  IdxT k,
-                  IdxT lda,
-                  IdxT ldb,
-                  IdxT ldd,
-                  const DataT* x,
-                  const DataT* y,
-                  const DataT* xn,
-                  const DataT* yn,
-                  bool sqrt,
-                  OutT* dOutput,
-                  FinalLambda fin_op,
-                  cudaStream_t stream)
-{
-  size_t bytesA = sizeof(DataT) * lda;
-  size_t bytesB = sizeof(DataT) * ldb;
-
-  if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) {
-    euclideanExpImpl<DataT, AccT, OutT, IdxT, 16 / sizeof(DataT), FinalLambda, isRowMajor>(
-      x, y, xn, yn, m, n, k, lda, ldb, ldd, sqrt, dOutput, fin_op, stream);
-  } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) {
-    euclideanExpImpl<DataT, AccT, OutT, IdxT, 8 / sizeof(DataT), FinalLambda, isRowMajor>(
-      x, y, xn, yn, m, n, k, lda, ldb, ldd, sqrt, dOutput, fin_op, stream);
-  } else {
-    euclideanExpImpl<DataT, AccT, OutT, IdxT, 1, FinalLambda, isRowMajor>(
-      x, y, xn, yn, m, n, k, lda, ldb, ldd, sqrt, dOutput, fin_op, stream);
-  }
-}
-
-/**
- * @brief the expanded euclidean distance matrix calculation
- *  It computes the following equation: C = op(A^2 + B^2 - 2AB)
- * @tparam InType input data-type (for A and B matrices)
- * @tparam AccType accumulation data-type
- * @tparam OutType output data-type (for C and D matrices)
- * @tparam FinalLambda the final lambda called by FragmentMultiplyAdd_
- * @tparam Index_ index type
- * @param m number of rows of A and C/D
- * @param n number of columns of B and C/D
- * @param k number of cols of A and rows of B
- * @param pA input matrix
- * @param pB input matrix
- * @param pD output matrix
- * @param enable_sqrt if the square root is computed or not
- * @param workspace temporary workspace needed for computations
- * @param worksize number of bytes of the workspace
- * @param fin_op the final gemm epilogue lambda
- * @param stream cuda stream where to launch work
- * @param isRowMajor whether the input and output matrices are row major
- */
-template <typename InType,
-          typename AccType,
-          typename OutType,
-          typename FinalLambda,
-          typename Index_ = int>
-void euclideanAlgo1(Index_ m,
-                    Index_ n,
-                    Index_ k,
-                    const InType* pA,
-                    const InType* pB,
-                    OutType* pD,
-                    bool enable_sqrt,
-                    AccType* workspace,
-                    size_t& worksize,
-                    FinalLambda fin_op,
-                    cudaStream_t stream,
-                    bool isRowMajor)
-{
-  // raft distance support inputs as float/double and output as uint8_t/float/double.
-  static_assert(!((sizeof(OutType) > 1) && (sizeof(AccType) != sizeof(OutType))),
-                "OutType can be uint8_t, float, double,"
-                "if sizeof(OutType) > 1 then sizeof(AccType) == sizeof(OutType).");
-  typedef typename std::conditional<sizeof(OutType) == 1, OutType, AccType>::type ExpOutType;
-  ExpOutType* pDcast = reinterpret_cast<ExpOutType*>(pD);
-
-  ASSERT(
-    !(((pA != pB) && (worksize < (m + n) * sizeof(AccType))) || (worksize < m * sizeof(AccType))),
-    "workspace size error");
-  ASSERT(workspace != nullptr, "workspace is null");
-
-  Index_ lda, ldb, ldd;
-  InType* col_vec = workspace;
-  InType* row_vec = workspace;
-  if (pA != pB) {
-    row_vec += m;
-    raft::linalg::rowNorm(
-      col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor, stream, raft::identity_op{});
-    raft::linalg::rowNorm(
-      row_vec, pB, k, n, raft::linalg::L2Norm, isRowMajor, stream, raft::identity_op{});
-  } else {
-    raft::linalg::rowNorm(
-      col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor, stream, raft::identity_op{});
-  }
-
-  if (isRowMajor) {
-    lda = k, ldb = k, ldd = n;
-    euclideanExp<InType, AccType, ExpOutType, Index_, FinalLambda, true>(
-      m, n, k, lda, ldb, ldd, pA, pB, col_vec, row_vec, enable_sqrt, pDcast, fin_op, stream);
-  } else {
-    lda = n, ldb = m, ldd = m;
-    euclideanExp<InType, AccType, ExpOutType, Index_, FinalLambda, false>(
-      n, m, k, lda, ldb, ldd, pB, pA, row_vec, col_vec, enable_sqrt, pDcast, fin_op, stream);
-  }
-}
-
-/**
- * @brief the unexpanded euclidean distance matrix calculation
- *  It computes the following equation: cij = op((ai-bj)^2)
- * @tparam DataT          input data-type (for A and B matrices)
- * @tparam AccT           accumulation data-type
- * @tparam OutT           output data-type (for C and D matrices)
- * @tparam IdxT           index data-type
- * @tparam FinalLambda    final lambda called on final distance value
- *
- * @param[in]       x input matrix
- * @param[in]       y input matrix
- * @param[in]       m number of rows of A and C/D
- * @param[in]       n number of columns of B and C/D
- * @param[in]       k number of cols of A and rows of B
- * @param[in]       lda leading dimension of A
- * @param[in]       ldb leading dimension of B
- * @param[in]       ldd leading dimension of C/D
- * @param[in]       sqrt if the square root is computed or not
- * @param[output]   pD output matrix
- * @param fin_op    the final gemm epilogue lambda
- */
-template <typename DataT,
-          typename AccT,
-          typename OutT,
-          typename IdxT,
-          int VecLen,
-          typename FinalLambda,
-          bool isRowMajor>
-void euclideanUnExpImpl(const DataT* x,
-                        const DataT* y,
-                        IdxT m,
-                        IdxT n,
-                        IdxT k,
-                        IdxT lda,
-                        IdxT ldb,
-                        IdxT ldd,
-                        bool sqrt,
-                        OutT* dOutput,
-                        FinalLambda fin_op,
-                        cudaStream_t stream)
-{
-  typedef typename raft::linalg::Policy4x4<DataT, VecLen>::Policy RowPolicy;
-  typedef typename raft::linalg::Policy4x4<DataT, VecLen>::ColPolicy ColPolicy;
-
-  typedef typename std::conditional<isRowMajor, RowPolicy, ColPolicy>::type KPolicy;
-
-  dim3 blk(KPolicy::Nthreads);
-
-  // Accumulation operation lambda
-  auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) {
-    const auto diff = x - y;
-    acc += diff * diff;
-  };
-
-  // epilogue operation lambda for final value calculation
-  auto epilog_lambda = [sqrt] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh],
-                                         DataT * regxn,
-                                         DataT * regyn,
-                                         IdxT gridStrideX,
-                                         IdxT gridStrideY) {
-    if (sqrt) {
-#pragma unroll
-      for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) {
-#pragma unroll
-        for (int j = 0; j < KPolicy::AccColsPerTh; ++j) {
-          acc[i][j] = raft::sqrt(acc[i][j]);
-        }
-      }
-    }
-  };
-
-  if (isRowMajor) {
-    auto euclideanUnExpRowMajor = pairwiseDistanceMatKernel<false,
-                                                            DataT,
-                                                            AccT,
-                                                            OutT,
-                                                            IdxT,
-                                                            KPolicy,
-                                                            decltype(core_lambda),
-                                                            decltype(epilog_lambda),
-                                                            FinalLambda,
-                                                            true>;
-    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize, euclideanUnExpRowMajor);
-
-    euclideanUnExpRowMajor<<<grid, blk, KPolicy::SmemSize, stream>>>(
-      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op);
-
-  } else {
-    auto euclideanUnExpColMajor = pairwiseDistanceMatKernel<false,
-                                                            DataT,
-                                                            AccT,
-                                                            OutT,
-                                                            IdxT,
-                                                            KPolicy,
-                                                            decltype(core_lambda),
-                                                            decltype(epilog_lambda),
-                                                            FinalLambda,
-                                                            false>;
-    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize, euclideanUnExpColMajor);
-
-    euclideanUnExpColMajor<<<grid, blk, KPolicy::SmemSize, stream>>>(
-      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op);
-  }
-
-  RAFT_CUDA_TRY(cudaGetLastError());
-}
-
-template <typename DataT,
-          typename AccT,
-          typename OutT,
-          typename IdxT,
-          typename FinalLambda,
-          bool isRowMajor>
-void euclideanUnExp(IdxT m,
-                    IdxT n,
-                    IdxT k,
-                    IdxT lda,
-                    IdxT ldb,
-                    IdxT ldd,
-                    const DataT* x,
-                    const DataT* y,
-                    bool sqrt,
-                    OutT* dOutput,
-                    FinalLambda fin_op,
-                    cudaStream_t stream)
-{
-  size_t bytesA = sizeof(DataT) * lda;
-  size_t bytesB = sizeof(DataT) * ldb;
-  if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) {
-    euclideanUnExpImpl<DataT, AccT, OutT, IdxT, 16 / sizeof(DataT), FinalLambda, isRowMajor>(
-      x, y, m, n, k, lda, ldb, ldd, sqrt, dOutput, fin_op, stream);
-  } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) {
-    euclideanUnExpImpl<DataT, AccT, OutT, IdxT, 8 / sizeof(DataT), FinalLambda, isRowMajor>(
-      x, y, m, n, k, lda, ldb, ldd, sqrt, dOutput, fin_op, stream);
-  } else {
-    euclideanUnExpImpl<DataT, AccT, OutT, IdxT, 1, FinalLambda, isRowMajor>(
-      x, y, m, n, k, lda, ldb, ldd, sqrt, dOutput, fin_op, stream);
-  }
-}
-
-/**
- * @brief the unexpanded euclidean distance matrix calculation
- *  It computes the following equation: cij = op((ai-bj)^2)
- * @tparam InType input data-type (for A and B matrices)
- * @tparam AccType accumulation data-type
- * @tparam OutType output data-type (for C and D matrices)
- * @tparam FinalLambda user-defined epilogue lamba
- * @tparam Index_ index type
- * @param m number of rows of A and C/D
- * @param n number of columns of B and C/D
- * @param k number of cols of A and rows of B
- * @param pA input matrix
- * @param pB input matrix
- * @param pD output matrix
- * @param enable_sqrt if the square root is computed or not
- * @param fin_op the final gemm epilogue lambda
- * @param stream cuda stream where to launch work
- * @param isRowMajor whether the input and output matrices are row major
- */
-template <typename InType,
-          typename AccType,
-          typename OutType,
-          typename FinalLambda,
-          typename Index_ = int>
-void euclideanAlgo2(Index_ m,
-                    Index_ n,
-                    Index_ k,
-                    const InType* pA,
-                    const InType* pB,
-                    OutType* pD,
-                    bool enable_sqrt,
-                    FinalLambda fin_op,
-                    cudaStream_t stream,
-                    bool isRowMajor)
-{
-  typedef std::is_same<OutType, bool> is_bool;
-  typedef typename std::conditional<is_bool::value, OutType, AccType>::type UnExpOutType;
-  UnExpOutType* pDcast = reinterpret_cast<UnExpOutType*>(pD);
-  Index_ lda, ldb, ldd;
-
-  if (isRowMajor) {
-    lda = k, ldb = k, ldd = n;
-    euclideanUnExp<InType, AccType, UnExpOutType, Index_, FinalLambda, true>(
-      m, n, k, lda, ldb, ldd, pA, pB, enable_sqrt, pDcast, fin_op, stream);
-  } else {
-    lda = n, ldb = m, ldd = m;
-    euclideanUnExp<InType, AccType, UnExpOutType, Index_, FinalLambda, false>(
-      n, m, k, lda, ldb, ldd, pB, pA, enable_sqrt, pDcast, fin_op, stream);
-  }
-}
-
-};  // end namespace detail
-};  // end namespace distance
-};  // end namespace raft
diff --git a/cpp/include/raft/distance/detail/hamming.cuh b/cpp/include/raft/distance/detail/hamming.cuh
deleted file mode 100644
index bed9d09e3e..0000000000
--- a/cpp/include/raft/distance/detail/hamming.cuh
+++ /dev/null
@@ -1,215 +0,0 @@
-/*
- * Copyright (c) 2021, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-#include <raft/distance/detail/pairwise_distance_base.cuh>
-
-namespace raft {
-namespace distance {
-namespace detail {
-
-/**
- * @brief the Hamming distance matrix using the unexpanded form:
- *  It computes the following equation:
-    Cij = sum(x_i != y_i) / k
- *
- * @tparam DataT          input data-type (for A and B matrices)
- * @tparam AccT           accumulation data-type
- * @tparam OutT           output data-type (for C and D matrices)
- * @tparam IdxT           index data-type
- * @tparam Veclen         number of k-elements loaded by each thread
-                          for every LDG call. details in contractions.cuh
- * @tparam FinalLambda    final lambda called on final distance value
- * @tparam isRowMajor     true if input/output is row major,
-                          false for column major
- * @param[in]       x input matrix
- * @param[in]       y input matrix
- * @param[in]       m number of rows of A and C/D
- * @param[in]       n number of rows of B and C/D
- * @param[in]       k number of cols of A and B
- * @param[in]       lda leading dimension of A
- * @param[in]       ldb leading dimension of B
- * @param[in]       ldd leading dimension of C/D
- * @param[output]   dOutput output matrix
- * @param[in]       fin_op the final gemm epilogue lambda
- * @param[in]       stream cuda stream to launch work
- */
-template <typename DataT,
-          typename AccT,
-          typename OutT,
-          typename IdxT,
-          int VecLen,
-          typename FinalLambda,
-          bool isRowMajor>
-static void hammingUnexpandedImpl(const DataT* x,
-                                  const DataT* y,
-                                  IdxT m,
-                                  IdxT n,
-                                  IdxT k,
-                                  IdxT lda,
-                                  IdxT ldb,
-                                  IdxT ldd,
-                                  OutT* dOutput,
-                                  FinalLambda fin_op,
-                                  cudaStream_t stream)
-{
-  typedef typename raft::linalg::Policy4x4<DataT, VecLen>::Policy RowPolicy;
-  typedef typename raft::linalg::Policy4x4<DataT, VecLen>::ColPolicy ColPolicy;
-
-  typedef typename std::conditional<isRowMajor, RowPolicy, ColPolicy>::type KPolicy;
-
-  dim3 blk(KPolicy::Nthreads);
-
-  // Accumulation operation lambda
-  auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { acc += (x != y); };
-
-  // epilogue operation lambda for final value calculation
-  auto epilog_lambda = [k] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh],
-                                      DataT * regxn,
-                                      DataT * regyn,
-                                      IdxT gridStrideX,
-                                      IdxT gridStrideY) {
-    const DataT one_over_k = DataT(1.0) / k;
-#pragma unroll
-    for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) {
-#pragma unroll
-      for (int j = 0; j < KPolicy::AccColsPerTh; ++j) {
-        acc[i][j] *= one_over_k;
-      }
-    }
-  };
-
-  if (isRowMajor) {
-    auto hammingUnexpandedRowMajor = pairwiseDistanceMatKernel<false,
-                                                               DataT,
-                                                               AccT,
-                                                               OutT,
-                                                               IdxT,
-                                                               KPolicy,
-                                                               decltype(core_lambda),
-                                                               decltype(epilog_lambda),
-                                                               FinalLambda,
-                                                               true>;
-    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize, hammingUnexpandedRowMajor);
-
-    hammingUnexpandedRowMajor<<<grid, blk, KPolicy::SmemSize, stream>>>(
-      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op);
-  } else {
-    auto hammingUnexpandedColMajor = pairwiseDistanceMatKernel<false,
-                                                               DataT,
-                                                               AccT,
-                                                               OutT,
-                                                               IdxT,
-                                                               KPolicy,
-                                                               decltype(core_lambda),
-                                                               decltype(epilog_lambda),
-                                                               FinalLambda,
-                                                               false>;
-    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize, hammingUnexpandedColMajor);
-    hammingUnexpandedColMajor<<<grid, blk, KPolicy::SmemSize, stream>>>(
-      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op);
-  }
-
-  RAFT_CUDA_TRY(cudaGetLastError());
-}
-
-template <typename DataT,
-          typename AccT,
-          typename OutT,
-          typename IdxT,
-          typename FinalLambda,
-          bool isRowMajor>
-void hammingUnexpanded(IdxT m,
-                       IdxT n,
-                       IdxT k,
-                       IdxT lda,
-                       IdxT ldb,
-                       IdxT ldd,
-                       const DataT* x,
-                       const DataT* y,
-                       OutT* dOutput,
-                       FinalLambda fin_op,
-                       cudaStream_t stream)
-{
-  size_t bytesA = sizeof(DataT) * lda;
-  size_t bytesB = sizeof(DataT) * ldb;
-  if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) {
-    hammingUnexpandedImpl<DataT, AccT, OutT, IdxT, 16 / sizeof(DataT), FinalLambda, isRowMajor>(
-      x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
-  } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) {
-    hammingUnexpandedImpl<DataT, AccT, OutT, IdxT, 8 / sizeof(DataT), FinalLambda, isRowMajor>(
-      x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
-  } else {
-    hammingUnexpandedImpl<DataT, AccT, OutT, IdxT, 1, FinalLambda, isRowMajor>(
-      x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
-  }
-}
-
-/**
- * @brief the Hamming Unexpanded distance matrix calculation
- *  It computes the following equation:
-    Cij = sum(x_i != y_i) / k
- *
- * @tparam InType input data-type (for A and B matrices)
- * @tparam AccType accumulation data-type
- * @tparam OutType output data-type (for C and D matrices)
- * @tparam FinalLambda user-defined epilogue lamba
- * @tparam Index_ Index type
- * @param m number of rows of A and C/D
- * @param n number of columns of B and C/D
- * @param k number of cols of A and rows of B
- * @param pA input matrix
- * @param pB input matrix
- * @param pD output matrix
- * @param fin_op the final element-wise epilogue lambda
- * @param stream cuda stream where to launch work
- * @param isRowMajor whether the input and output matrices are row major
- */
-template <typename InType,
-          typename AccType,
-          typename OutType,
-          typename FinalLambda,
-          typename Index_ = int>
-void hammingUnexpandedImpl(int m,
-                           int n,
-                           int k,
-                           const InType* pA,
-                           const InType* pB,
-                           OutType* pD,
-                           FinalLambda fin_op,
-                           cudaStream_t stream,
-                           bool isRowMajor)
-{
-  typedef std::is_same<OutType, bool> is_bool;
-  typedef
-    typename std::conditional<is_bool::value, OutType, AccType>::type hammingUnexpandedOutType;
-  Index_ lda, ldb, ldd;
-  hammingUnexpandedOutType* pDcast = reinterpret_cast<hammingUnexpandedOutType*>(pD);
-  if (isRowMajor) {
-    lda = k, ldb = k, ldd = n;
-    hammingUnexpanded<InType, AccType, hammingUnexpandedOutType, Index_, FinalLambda, true>(
-      m, n, k, lda, ldb, ldd, pA, pB, pDcast, fin_op, stream);
-
-  } else {
-    lda = n, ldb = m, ldd = m;
-    hammingUnexpanded<InType, AccType, hammingUnexpandedOutType, Index_, FinalLambda, false>(
-      n, m, k, lda, ldb, ldd, pB, pA, pDcast, fin_op, stream);
-  }
-}
-
-}  // namespace detail
-}  // namespace distance
-}  // namespace raft
diff --git a/cpp/include/raft/distance/detail/hellinger.cuh b/cpp/include/raft/distance/detail/hellinger.cuh
deleted file mode 100644
index 13507fe84f..0000000000
--- a/cpp/include/raft/distance/detail/hellinger.cuh
+++ /dev/null
@@ -1,241 +0,0 @@
-/*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-#include <raft/distance/detail/pairwise_distance_base.cuh>
-#include <raft/linalg/unary_op.cuh>
-#include <raft/util/cuda_utils.cuh>
-
-namespace raft {
-namespace distance {
-namespace detail {
-
-/**
- * @brief the Hellinger distance matrix using the expanded form:
- *  It computes the following equation:
-    cij = sqrt(1 - sum(sqrt(x_k * y_k)))
- * This distance computation modifies A and B by computing a sqrt
- * and then performing a `pow(x, 2)` to convert it back. Because of this,
- * it is possible that the values in A and B might differ slightly
- * after this is invoked.
- *
- * @tparam DataT          input data-type (for A and B matrices)
- * @tparam AccT           accumulation data-type
- * @tparam OutT           output data-type (for C and D matrices)
- * @tparam IdxT           index data-type
- * @tparam Veclen         number of k-elements loaded by each thread
-                          for every LDG call. details in contractions.cuh
- * @tparam FinalLambda    final lambda called on final distance value
- * @tparam isRowMajor     true if input/output is row major,
-                          false for column major
- * @param[in]       x input matrix
- * @param[in]       y input matrix
- * @param[in]       m number of rows of A and C/D
- * @param[in]       n number of rows of B and C/D
- * @param[in]       k number of cols of A and B
- * @param[in]       lda leading dimension of A
- * @param[in]       ldb leading dimension of B
- * @param[in]       ldd leading dimension of C/D
- * @param[output]   dOutput output matrix
- * @param[in]       fin_op the final gemm epilogue lambda
- * @param[in]       stream cuda stream to launch work
- */
-template <typename DataT,
-          typename AccT,
-          typename OutT,
-          typename IdxT,
-          int VecLen,
-          typename FinalLambda,
-          bool isRowMajor>
-static void hellingerImpl(const DataT* x,
-                          const DataT* y,
-                          IdxT m,
-                          IdxT n,
-                          IdxT k,
-                          IdxT lda,
-                          IdxT ldb,
-                          IdxT ldd,
-                          OutT* dOutput,
-                          FinalLambda fin_op,
-                          cudaStream_t stream)
-{
-  typedef typename raft::linalg::Policy4x4<DataT, VecLen>::Policy RowPolicy;
-  typedef typename raft::linalg::Policy4x4<DataT, VecLen>::ColPolicy ColPolicy;
-
-  typedef typename std::conditional<isRowMajor, RowPolicy, ColPolicy>::type KPolicy;
-
-  dim3 blk(KPolicy::Nthreads);
-
-  // First sqrt x and y
-  raft::linalg::unaryOp<DataT, raft::sqrt_op, IdxT>((DataT*)x, x, m * k, raft::sqrt_op{}, stream);
-  if (x != y) {
-    raft::linalg::unaryOp<DataT, raft::sqrt_op, IdxT>((DataT*)y, y, n * k, raft::sqrt_op{}, stream);
-  }
-
-  // Accumulation operation lambda
-  auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) {
-    // This is sqrt(x) * sqrt(y).
-    const auto product = x * y;
-    acc += product;
-  };
-
-  // epilogue operation lambda for final value calculation
-  auto epilog_lambda = [] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh],
-                                     DataT * regxn,
-                                     DataT * regyn,
-                                     IdxT gridStrideX,
-                                     IdxT gridStrideY) {
-#pragma unroll
-    for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) {
-#pragma unroll
-      for (int j = 0; j < KPolicy::AccColsPerTh; ++j) {
-        // Adjust to replace NaN in sqrt with 0 if input to sqrt is negative
-        const auto finalVal  = (1 - acc[i][j]);
-        const auto rectifier = (!signbit(finalVal));
-        acc[i][j]            = raft::sqrt(rectifier * finalVal);
-      }
-    }
-  };
-
-  if (isRowMajor) {
-    auto hellingerRowMajor = pairwiseDistanceMatKernel<false,
-                                                       DataT,
-                                                       AccT,
-                                                       OutT,
-                                                       IdxT,
-                                                       KPolicy,
-                                                       decltype(core_lambda),
-                                                       decltype(epilog_lambda),
-                                                       FinalLambda,
-                                                       true>;
-    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize, hellingerRowMajor);
-
-    hellingerRowMajor<<<grid, blk, KPolicy::SmemSize, stream>>>(
-      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op);
-  } else {
-    auto hellingerColMajor = pairwiseDistanceMatKernel<false,
-                                                       DataT,
-                                                       AccT,
-                                                       OutT,
-                                                       IdxT,
-                                                       KPolicy,
-                                                       decltype(core_lambda),
-                                                       decltype(epilog_lambda),
-                                                       FinalLambda,
-                                                       false>;
-    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize, hellingerColMajor);
-    hellingerColMajor<<<grid, blk, KPolicy::SmemSize, stream>>>(
-      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op);
-  }
-
-  // Revert sqrt of x and y
-  raft::linalg::unaryOp<DataT, raft::sqrt_op, IdxT>((DataT*)x, x, m * k, raft::sqrt_op{}, stream);
-  if (x != y) {
-    raft::linalg::unaryOp<DataT, raft::sqrt_op, IdxT>((DataT*)y, y, n * k, raft::sqrt_op{}, stream);
-  }
-
-  RAFT_CUDA_TRY(cudaGetLastError());
-}
-
-template <typename DataT,
-          typename AccT,
-          typename OutT,
-          typename IdxT,
-          typename FinalLambda,
-          bool isRowMajor>
-void hellinger(IdxT m,
-               IdxT n,
-               IdxT k,
-               IdxT lda,
-               IdxT ldb,
-               IdxT ldd,
-               const DataT* x,
-               const DataT* y,
-               OutT* dOutput,
-               FinalLambda fin_op,
-               cudaStream_t stream)
-{
-  size_t bytesA = sizeof(DataT) * lda;
-  size_t bytesB = sizeof(DataT) * ldb;
-  if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) {
-    hellingerImpl<DataT, AccT, OutT, IdxT, 16 / sizeof(DataT), FinalLambda, isRowMajor>(
-      x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
-  } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) {
-    hellingerImpl<DataT, AccT, OutT, IdxT, 8 / sizeof(DataT), FinalLambda, isRowMajor>(
-      x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
-  } else {
-    hellingerImpl<DataT, AccT, OutT, IdxT, 1, FinalLambda, isRowMajor>(
-      x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
-  }
-}
-
-/**
- * @brief the Hellinger distance matrix calculation
- *  It computes the following equation:
-    sqrt(1 - sum(sqrt(x_k * y_k))
- * This distance computation modifies A and B by computing a sqrt
- * and then performing a `pow(x, 2)` to convert it back. Because of this,
- * it is possible that the values in A and B might differ slightly
- * after this is invoked.
- *
- * @tparam InType input data-type (for A and B matrices)
- * @tparam AccType accumulation data-type
- * @tparam OutType output data-type (for C and D matrices)
- * @tparam FinalLambda user-defined epilogue lamba
- * @tparam Index_ Index type
- * @param m number of rows of A and C/D
- * @param n number of columns of B and C/D
- * @param k number of cols of A and rows of B
- * @param pA input matrix
- * @param pB input matrix
- * @param pD output matrix
- * @param fin_op the final element-wise epilogue lambda
- * @param stream cuda stream where to launch work
- * @param isRowMajor whether the input and output matrices are row major
- */
-template <typename InType,
-          typename AccType,
-          typename OutType,
-          typename FinalLambda,
-          typename Index_ = int>
-void hellingerImpl(int m,
-                   int n,
-                   int k,
-                   const InType* pA,
-                   const InType* pB,
-                   OutType* pD,
-                   FinalLambda fin_op,
-                   cudaStream_t stream,
-                   bool isRowMajor)
-{
-  typedef std::is_same<OutType, bool> is_bool;
-  typedef typename std::conditional<is_bool::value, OutType, AccType>::type hellingerOutType;
-  Index_ lda, ldb, ldd;
-  hellingerOutType* pDcast = reinterpret_cast<hellingerOutType*>(pD);
-  if (isRowMajor) {
-    lda = k, ldb = k, ldd = n;
-    hellinger<InType, AccType, hellingerOutType, Index_, FinalLambda, true>(
-      m, n, k, lda, ldb, ldd, pA, pB, pDcast, fin_op, stream);
-
-  } else {
-    lda = n, ldb = m, ldd = m;
-    hellinger<InType, AccType, hellingerOutType, Index_, FinalLambda, false>(
-      n, m, k, lda, ldb, ldd, pB, pA, pDcast, fin_op, stream);
-  }
-}
-}  // namespace detail
-}  // namespace distance
-}  // namespace raft
diff --git a/cpp/include/raft/distance/detail/jensen_shannon.cuh b/cpp/include/raft/distance/detail/jensen_shannon.cuh
deleted file mode 100644
index f96da01b87..0000000000
--- a/cpp/include/raft/distance/detail/jensen_shannon.cuh
+++ /dev/null
@@ -1,222 +0,0 @@
-/*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-#include <raft/distance/detail/pairwise_distance_base.cuh>
-
-namespace raft {
-namespace distance {
-namespace detail {
-
-/**
- * @brief the Jensen Shannon distance matrix:
- *  It computes the following equation:
-    Cij = sqrt(0.5 * sum( -x_i * (log(0.5 * (x_i + y_i)) - log(x_i))
-            + (-y_i * (log(0.5 * (x_i + y_i)) - log(y_i)))))
- *
- * @tparam DataT          input data-type (for A and B matrices)
- * @tparam AccT           accumulation data-type
- * @tparam OutT           output data-type (for C and D matrices)
- * @tparam IdxT           index data-type
- * @tparam Veclen         number of k-elements loaded by each thread
-                          for every LDG call. details in contractions.cuh
- * @tparam FinalLambda    final lambda called on final distance value
- * @tparam isRowMajor     true if input/output is row major,
-                          false for column major
- * @param[in]       x input matrix
- * @param[in]       y input matrix
- * @param[in]       m number of rows of A and C/D
- * @param[in]       n number of rows of B and C/D
- * @param[in]       k number of cols of A and B
- * @param[in]       lda leading dimension of A
- * @param[in]       ldb leading dimension of B
- * @param[in]       ldd leading dimension of C/D
- * @param[output]   dOutput output matrix
- * @param[in]       fin_op the final gemm epilogue lambda
- * @param[in]       stream cuda stream to launch work
- */
-template <typename DataT,
-          typename AccT,
-          typename OutT,
-          typename IdxT,
-          int VecLen,
-          typename FinalLambda,
-          bool isRowMajor>
-static void jensenShannonImpl(const DataT* x,
-                              const DataT* y,
-                              IdxT m,
-                              IdxT n,
-                              IdxT k,
-                              IdxT lda,
-                              IdxT ldb,
-                              IdxT ldd,
-                              OutT* dOutput,
-                              FinalLambda fin_op,
-                              cudaStream_t stream)
-{
-  typedef typename raft::linalg::Policy4x4<DataT, VecLen>::Policy RowPolicy;
-  typedef typename raft::linalg::Policy4x4<DataT, VecLen>::ColPolicy ColPolicy;
-
-  typedef typename std::conditional<isRowMajor, RowPolicy, ColPolicy>::type KPolicy;
-
-  dim3 blk(KPolicy::Nthreads);
-
-  // Accumulation operation lambda
-  auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) {
-    const DataT m     = 0.5f * (x + y);
-    const bool m_zero = (m == 0);
-    const auto logM   = (!m_zero) * raft::log(m + m_zero);
-
-    const bool x_zero = (x == 0);
-    const bool y_zero = (y == 0);
-    acc += (-x * (logM - raft::log(x + x_zero))) + (-y * (logM - raft::log(y + y_zero)));
-  };
-
-  // epilogue operation lambda for final value calculation
-  auto epilog_lambda = [] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh],
-                                     DataT * regxn,
-                                     DataT * regyn,
-                                     IdxT gridStrideX,
-                                     IdxT gridStrideY) {
-#pragma unroll
-    for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) {
-#pragma unroll
-      for (int j = 0; j < KPolicy::AccColsPerTh; ++j) {
-        acc[i][j] = raft::sqrt(0.5 * acc[i][j]);
-      }
-    }
-  };
-
-  if (isRowMajor) {
-    auto jensenShannonRowMajor = pairwiseDistanceMatKernel<false,
-                                                           DataT,
-                                                           AccT,
-                                                           OutT,
-                                                           IdxT,
-                                                           KPolicy,
-                                                           decltype(core_lambda),
-                                                           decltype(epilog_lambda),
-                                                           FinalLambda,
-                                                           true>;
-    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize, jensenShannonRowMajor);
-
-    jensenShannonRowMajor<<<grid, blk, KPolicy::SmemSize, stream>>>(
-      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op);
-  } else {
-    auto jensenShannonColMajor = pairwiseDistanceMatKernel<false,
-                                                           DataT,
-                                                           AccT,
-                                                           OutT,
-                                                           IdxT,
-                                                           KPolicy,
-                                                           decltype(core_lambda),
-                                                           decltype(epilog_lambda),
-                                                           FinalLambda,
-                                                           false>;
-    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize, jensenShannonColMajor);
-    jensenShannonColMajor<<<grid, blk, KPolicy::SmemSize, stream>>>(
-      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op);
-  }
-
-  RAFT_CUDA_TRY(cudaGetLastError());
-}
-
-template <typename DataT,
-          typename AccT,
-          typename OutT,
-          typename IdxT,
-          typename FinalLambda,
-          bool isRowMajor>
-void jensenShannon(IdxT m,
-                   IdxT n,
-                   IdxT k,
-                   IdxT lda,
-                   IdxT ldb,
-                   IdxT ldd,
-                   const DataT* x,
-                   const DataT* y,
-                   OutT* dOutput,
-                   FinalLambda fin_op,
-                   cudaStream_t stream)
-{
-  size_t bytesA = sizeof(DataT) * lda;
-  size_t bytesB = sizeof(DataT) * ldb;
-  if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) {
-    jensenShannonImpl<DataT, AccT, OutT, IdxT, 16 / sizeof(DataT), FinalLambda, isRowMajor>(
-      x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
-  } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) {
-    jensenShannonImpl<DataT, AccT, OutT, IdxT, 8 / sizeof(DataT), FinalLambda, isRowMajor>(
-      x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
-  } else {
-    jensenShannonImpl<DataT, AccT, OutT, IdxT, 1, FinalLambda, isRowMajor>(
-      x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
-  }
-}
-
-/**
- * @brief the Jensen Shannon distance matrix calculation
- *  It computes the following equation:
-    Cij = sqrt(0.5 * sum( -x_i * (log(0.5 * (x_i + y_i)) - log(x_i))
-            + (-y_i * (log(0.5 * (x_i + y_i)) - log(y_i)))))
- *
- * @tparam InType input data-type (for A and B matrices)
- * @tparam AccType accumulation data-type
- * @tparam OutType output data-type (for C and D matrices)
- * @tparam FinalLambda user-defined epilogue lamba
- * @tparam Index_ Index type
- * @param m number of rows of A and C/D
- * @param n number of columns of B and C/D
- * @param k number of cols of A and rows of B
- * @param pA input matrix
- * @param pB input matrix
- * @param pD output matrix
- * @param fin_op the final element-wise epilogue lambda
- * @param stream cuda stream where to launch work
- * @param isRowMajor whether the input and output matrices are row major
- */
-template <typename InType,
-          typename AccType,
-          typename OutType,
-          typename FinalLambda,
-          typename Index_ = int>
-void jensenShannonImpl(int m,
-                       int n,
-                       int k,
-                       const InType* pA,
-                       const InType* pB,
-                       OutType* pD,
-                       FinalLambda fin_op,
-                       cudaStream_t stream,
-                       bool isRowMajor)
-{
-  typedef std::is_same<OutType, bool> is_bool;
-  typedef typename std::conditional<is_bool::value, OutType, AccType>::type jensenShannonOutType;
-  Index_ lda, ldb, ldd;
-  jensenShannonOutType* pDcast = reinterpret_cast<jensenShannonOutType*>(pD);
-  if (isRowMajor) {
-    lda = k, ldb = k, ldd = n;
-    jensenShannon<InType, AccType, jensenShannonOutType, Index_, FinalLambda, true>(
-      m, n, k, lda, ldb, ldd, pA, pB, pDcast, fin_op, stream);
-
-  } else {
-    lda = n, ldb = m, ldd = m;
-    jensenShannon<InType, AccType, jensenShannonOutType, Index_, FinalLambda, false>(
-      n, m, k, lda, ldb, ldd, pB, pA, pDcast, fin_op, stream);
-  }
-}
-}  // namespace detail
-}  // namespace distance
-}  // namespace raft
diff --git a/cpp/include/raft/distance/detail/kl_divergence.cuh b/cpp/include/raft/distance/detail/kl_divergence.cuh
deleted file mode 100644
index 7ebeaf4de9..0000000000
--- a/cpp/include/raft/distance/detail/kl_divergence.cuh
+++ /dev/null
@@ -1,343 +0,0 @@
-/*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-#include <raft/distance/detail/pairwise_distance_base.cuh>
-
-namespace raft {
-namespace distance {
-namespace detail {
-
-/**
- * @brief the KL Divergence distance matrix:
- *  It computes the following equation:
-    Cij = 0.5 * sum(x * log (x / y));
- * This distance computation modifies A or B by computing a log(x)
- * and then performing a `pow(e, log(x))` to convert it back. Because of this,
- * it is possible that the values in A or B might differ slightly
- * after this is invoked.
- *
- * @tparam DataT          input data-type (for A and B matrices)
- * @tparam AccT           accumulation data-type
- * @tparam OutT           output data-type (for C and D matrices)
- * @tparam IdxT           index data-type
- * @tparam Veclen         number of k-elements loaded by each thread
-                          for every LDG call. details in contractions.cuh
- * @tparam FinalLambda    final lambda called on final distance value
- * @tparam isRowMajor     true if input/output is row major,
-                          false for column major
- * @param[in]       x input matrix
- * @param[in]       y input matrix
- * @param[in]       m number of rows of A and C/D
- * @param[in]       n number of rows of B and C/D
- * @param[in]       k number of cols of A and B
- * @param[in]       lda leading dimension of A
- * @param[in]       ldb leading dimension of B
- * @param[in]       ldd leading dimension of C/D
- * @param[output]   dOutput output matrix
- * @param[in]       fin_op the final gemm epilogue lambda
- * @param[in]       stream cuda stream to launch work
- */
-template <typename DataT,
-          typename AccT,
-          typename OutT,
-          typename IdxT,
-          int VecLen,
-          typename FinalLambda,
-          bool isRowMajor>
-static void klDivergenceImpl(const DataT* x,
-                             const DataT* y,
-                             IdxT m,
-                             IdxT n,
-                             IdxT k,
-                             IdxT lda,
-                             IdxT ldb,
-                             IdxT ldd,
-                             OutT* dOutput,
-                             FinalLambda fin_op,
-                             cudaStream_t stream)
-{
-  typedef typename raft::linalg::Policy4x4<DataT, VecLen>::Policy RowPolicy;
-  typedef typename raft::linalg::Policy4x4<DataT, VecLen>::ColPolicy ColPolicy;
-
-  typedef typename std::conditional<isRowMajor, RowPolicy, ColPolicy>::type KPolicy;
-
-  dim3 blk(KPolicy::Nthreads);
-
-  // Accumulation operation lambda
-  auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) {
-    if (isRowMajor) {
-      const bool x_zero = (x == 0);
-      acc += x * (raft::log(x + x_zero) - y);
-    } else {
-      const bool y_zero = (y == 0);
-      acc += y * (raft::log(y + y_zero) - x);
-    }
-  };
-
-  auto core_lambda_x_equal_y = [] __device__(AccT & acc, DataT & x, DataT & y) {
-    if (isRowMajor) {
-      const bool x_zero = (x == 0);
-      const bool y_zero = (y == 0);
-      acc += x * (raft::log(x + x_zero) - (!y_zero) * raft::log(y + y_zero));
-    } else {
-      const bool y_zero = (y == 0);
-      const bool x_zero = (x == 0);
-      acc += y * (raft::log(y + y_zero) - (!x_zero) * raft::log(x + x_zero));
-    }
-  };
-
-  auto unaryOp_lambda = [] __device__(DataT input) {
-    const bool x_zero = (input == 0);
-    return (!x_zero) * raft::log(input + x_zero);
-  };
-
-  auto unaryOp_lambda_reverse = [] __device__(DataT input) {
-    // reverse previous log (x) back to x using (e ^ log(x))
-    const bool x_zero = (input == 0);
-    return (!x_zero) * raft::exp(input);
-  };
-
-  // epilogue operation lambda for final value calculation
-  auto epilog_lambda = [] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh],
-                                     DataT * regxn,
-                                     DataT * regyn,
-                                     IdxT gridStrideX,
-                                     IdxT gridStrideY) {
-#pragma unroll
-    for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) {
-#pragma unroll
-      for (int j = 0; j < KPolicy::AccColsPerTh; ++j) {
-        acc[i][j] = (0.5f * acc[i][j]);
-      }
-    }
-  };
-
-  if (isRowMajor) {
-    constexpr auto klDivergenceRowMajor = pairwiseDistanceMatKernel<false,
-                                                                    DataT,
-                                                                    AccT,
-                                                                    OutT,
-                                                                    IdxT,
-                                                                    KPolicy,
-                                                                    decltype(core_lambda),
-                                                                    decltype(epilog_lambda),
-                                                                    FinalLambda,
-                                                                    true>;
-    constexpr auto klDivergenceRowMajorXequalY =
-      pairwiseDistanceMatKernel<false,
-                                DataT,
-                                AccT,
-                                OutT,
-                                IdxT,
-                                KPolicy,
-                                decltype(core_lambda_x_equal_y),
-                                decltype(epilog_lambda),
-                                FinalLambda,
-                                true>;
-    if (x != y) {
-      raft::linalg::unaryOp<DataT, decltype(unaryOp_lambda), IdxT>(
-        (DataT*)y, y, n * k, unaryOp_lambda, stream);
-      dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize, klDivergenceRowMajor);
-      klDivergenceRowMajor<<<grid, blk, KPolicy::SmemSize, stream>>>(x,
-                                                                     y,
-                                                                     nullptr,
-                                                                     nullptr,
-                                                                     m,
-                                                                     n,
-                                                                     k,
-                                                                     lda,
-                                                                     ldb,
-                                                                     ldd,
-                                                                     dOutput,
-                                                                     core_lambda,
-                                                                     epilog_lambda,
-                                                                     fin_op);
-      // Now reverse previous log (x) back to x using (e ^ log(x))
-      raft::linalg::unaryOp<DataT, decltype(unaryOp_lambda_reverse), IdxT>(
-        (DataT*)y, y, n * k, unaryOp_lambda_reverse, stream);
-    } else {
-      dim3 grid =
-        launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize, klDivergenceRowMajorXequalY);
-      klDivergenceRowMajorXequalY<<<grid, blk, KPolicy::SmemSize, stream>>>(x,
-                                                                            y,
-                                                                            nullptr,
-                                                                            nullptr,
-                                                                            m,
-                                                                            n,
-                                                                            k,
-                                                                            lda,
-                                                                            ldb,
-                                                                            ldd,
-                                                                            dOutput,
-                                                                            core_lambda_x_equal_y,
-                                                                            epilog_lambda,
-                                                                            fin_op);
-    }
-  } else {
-    constexpr auto klDivergenceColMajor = pairwiseDistanceMatKernel<false,
-                                                                    DataT,
-                                                                    AccT,
-                                                                    OutT,
-                                                                    IdxT,
-                                                                    KPolicy,
-                                                                    decltype(core_lambda),
-                                                                    decltype(epilog_lambda),
-                                                                    FinalLambda,
-                                                                    false>;
-    constexpr auto klDivergenceColMajorXequalY =
-      pairwiseDistanceMatKernel<false,
-                                DataT,
-                                AccT,
-                                OutT,
-                                IdxT,
-                                KPolicy,
-                                decltype(core_lambda_x_equal_y),
-                                decltype(epilog_lambda),
-                                FinalLambda,
-                                false>;
-    if (x != y) {
-      raft::linalg::unaryOp<DataT, decltype(unaryOp_lambda), IdxT>(
-        (DataT*)x, x, m * k, unaryOp_lambda, stream);
-      dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize, klDivergenceColMajor);
-      klDivergenceColMajor<<<grid, blk, KPolicy::SmemSize, stream>>>(x,
-                                                                     y,
-                                                                     nullptr,
-                                                                     nullptr,
-                                                                     m,
-                                                                     n,
-                                                                     k,
-                                                                     lda,
-                                                                     ldb,
-                                                                     ldd,
-                                                                     dOutput,
-                                                                     core_lambda,
-                                                                     epilog_lambda,
-                                                                     fin_op);
-      // Now reverse previous log (x) back to x using (e ^ log(x))
-      raft::linalg::unaryOp<DataT, decltype(unaryOp_lambda_reverse), IdxT>(
-        (DataT*)x, x, m * k, unaryOp_lambda_reverse, stream);
-    } else {
-      dim3 grid =
-        launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize, klDivergenceColMajorXequalY);
-      klDivergenceColMajorXequalY<<<grid, blk, KPolicy::SmemSize, stream>>>(x,
-                                                                            y,
-                                                                            nullptr,
-                                                                            nullptr,
-                                                                            m,
-                                                                            n,
-                                                                            k,
-                                                                            lda,
-                                                                            ldb,
-                                                                            ldd,
-                                                                            dOutput,
-                                                                            core_lambda_x_equal_y,
-                                                                            epilog_lambda,
-                                                                            fin_op);
-    }
-  }
-
-  RAFT_CUDA_TRY(cudaGetLastError());
-}
-
-template <typename DataT,
-          typename AccT,
-          typename OutT,
-          typename IdxT,
-          typename FinalLambda,
-          bool isRowMajor>
-void klDivergence(IdxT m,
-                  IdxT n,
-                  IdxT k,
-                  IdxT lda,
-                  IdxT ldb,
-                  IdxT ldd,
-                  const DataT* x,
-                  const DataT* y,
-                  OutT* dOutput,
-                  FinalLambda fin_op,
-                  cudaStream_t stream)
-{
-  size_t bytesA = sizeof(DataT) * lda;
-  size_t bytesB = sizeof(DataT) * ldb;
-  if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) {
-    klDivergenceImpl<DataT, AccT, OutT, IdxT, 16 / sizeof(DataT), FinalLambda, isRowMajor>(
-      x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
-  } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) {
-    klDivergenceImpl<DataT, AccT, OutT, IdxT, 8 / sizeof(DataT), FinalLambda, isRowMajor>(
-      x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
-  } else {
-    klDivergenceImpl<DataT, AccT, OutT, IdxT, 1, FinalLambda, isRowMajor>(
-      x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
-  }
-}
-
-/**
- * @brief the KL Divergence distance matrix calculation
- *  It computes the following equation:
-      Cij = 0.5 * sum(x * log (x / y));
- * This distance computation modifies A or B by computing a log(x)
- * and then performing a `pow(e, log(x))` to convert it back. Because of this,
- * it is possible that the values in A or B might differ slightly
- * after this is invoked.
- * @tparam InType input data-type (for A and B matrices)
- * @tparam AccType accumulation data-type
- * @tparam OutType output data-type (for C and D matrices)
- * @tparam FinalLambda user-defined epilogue lamba
- * @tparam Index_ Index type
- * @param m number of rows of A and C/D
- * @param n number of columns of B and C/D
- * @param k number of cols of A and rows of B
- * @param pA input matrix
- * @param pB input matrix
- * @param pD output matrix
- * @param fin_op the final element-wise epilogue lambda
- * @param stream cuda stream where to launch work
- * @param isRowMajor whether the input and output matrices are row major
- */
-template <typename InType,
-          typename AccType,
-          typename OutType,
-          typename FinalLambda,
-          typename Index_ = int>
-void klDivergenceImpl(int m,
-                      int n,
-                      int k,
-                      const InType* pA,
-                      const InType* pB,
-                      OutType* pD,
-                      FinalLambda fin_op,
-                      cudaStream_t stream,
-                      bool isRowMajor)
-{
-  typedef std::is_same<OutType, bool> is_bool;
-  typedef typename std::conditional<is_bool::value, OutType, AccType>::type klDivergenceOutType;
-  Index_ lda, ldb, ldd;
-  klDivergenceOutType* pDcast = reinterpret_cast<klDivergenceOutType*>(pD);
-  if (isRowMajor) {
-    lda = k, ldb = k, ldd = n;
-    klDivergence<InType, AccType, klDivergenceOutType, Index_, FinalLambda, true>(
-      m, n, k, lda, ldb, ldd, pA, pB, pDcast, fin_op, stream);
-
-  } else {
-    lda = n, ldb = m, ldd = m;
-    klDivergence<InType, AccType, klDivergenceOutType, Index_, FinalLambda, false>(
-      n, m, k, lda, ldb, ldd, pB, pA, pDcast, fin_op, stream);
-  }
-}
-}  // namespace detail
-}  // namespace distance
-}  // namespace raft
diff --git a/cpp/include/raft/distance/detail/l1.cuh b/cpp/include/raft/distance/detail/l1.cuh
deleted file mode 100644
index bf10651b60..0000000000
--- a/cpp/include/raft/distance/detail/l1.cuh
+++ /dev/null
@@ -1,197 +0,0 @@
-/*
- * Copyright (c) 2018-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-#include <raft/distance/detail/pairwise_distance_base.cuh>
-
-namespace raft {
-namespace distance {
-namespace detail {
-
-/**
- * @brief the L1 distance matrix calculation implementer
- *  It computes the following equation: cij = op(ai-bj)
- * @tparam DataT          input data-type (for A and B matrices)
- * @tparam AccT           accumulation data-type
- * @tparam OutT           output data-type (for C and D matrices)
- * @tparam IdxT           index data-type
-
- * @tparam FinalLambda    final lambda called on final distance value
- * @tparam isRowMajor     true if input/output is row major,
-                          false for column major
- * @param[in]       x input matrix
- * @param[in]       y input matrix
- * @param[in]       m number of rows of A and C/D
- * @param[in]       n number of columns of B and C/D
- * @param[in]       k number of cols of A and rows of B
- * @param[in]       lda leading dimension of A
- * @param[in]       ldb leading dimension of B
- * @param[in]       ldd leading dimension of C/D
- * @param[output]   pD output matrix
- * @param fin_op    the final gemm epilogue lambda
- */
-template <typename DataT,
-          typename AccT,
-          typename OutT,
-          typename IdxT,
-          int VecLen,
-          typename FinalLambda,
-          bool isRowMajor>
-static void l1Impl(const DataT* x,
-                   const DataT* y,
-                   IdxT m,
-                   IdxT n,
-                   IdxT k,
-                   IdxT lda,
-                   IdxT ldb,
-                   IdxT ldd,
-                   OutT* dOutput,
-                   FinalLambda fin_op,
-                   cudaStream_t stream)
-{
-  typedef typename raft::linalg::Policy4x4<DataT, VecLen>::Policy RowPolicy;
-  typedef typename raft::linalg::Policy4x4<DataT, VecLen>::ColPolicy ColPolicy;
-
-  typedef typename std::conditional<isRowMajor, RowPolicy, ColPolicy>::type KPolicy;
-
-  dim3 blk(KPolicy::Nthreads);
-
-  // Accumulation operation lambda
-  auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) {
-    const auto diff = raft::abs(x - y);
-    acc += diff;
-  };
-
-  // epilogue operation lambda for final value calculation
-  auto epilog_lambda = raft::void_op();
-
-  if (isRowMajor) {
-    auto l1RowMajor = pairwiseDistanceMatKernel<false,
-                                                DataT,
-                                                AccT,
-                                                OutT,
-                                                IdxT,
-                                                KPolicy,
-                                                decltype(core_lambda),
-                                                decltype(epilog_lambda),
-                                                FinalLambda,
-                                                true>;
-    dim3 grid       = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize, l1RowMajor);
-
-    l1RowMajor<<<grid, blk, KPolicy::SmemSize, stream>>>(
-      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op);
-  } else {
-    auto l1ColMajor = pairwiseDistanceMatKernel<false,
-                                                DataT,
-                                                AccT,
-                                                OutT,
-                                                IdxT,
-                                                KPolicy,
-                                                decltype(core_lambda),
-                                                decltype(epilog_lambda),
-                                                FinalLambda,
-                                                false>;
-    dim3 grid       = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize, l1ColMajor);
-    l1ColMajor<<<grid, blk, KPolicy::SmemSize, stream>>>(
-      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op);
-  }
-
-  RAFT_CUDA_TRY(cudaGetLastError());
-}
-
-template <typename DataT,
-          typename AccT,
-          typename OutT,
-          typename IdxT,
-          typename FinalLambda,
-          bool isRowMajor>
-void l1(IdxT m,
-        IdxT n,
-        IdxT k,
-        IdxT lda,
-        IdxT ldb,
-        IdxT ldd,
-        const DataT* x,
-        const DataT* y,
-        OutT* dOutput,
-        FinalLambda fin_op,
-        cudaStream_t stream)
-{
-  size_t bytesA = sizeof(DataT) * lda;
-  size_t bytesB = sizeof(DataT) * ldb;
-  if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) {
-    l1Impl<DataT, AccT, OutT, IdxT, 16 / sizeof(DataT), FinalLambda, isRowMajor>(
-      x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
-  } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) {
-    l1Impl<DataT, AccT, OutT, IdxT, 8 / sizeof(DataT), FinalLambda, isRowMajor>(
-      x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
-  } else {
-    l1Impl<DataT, AccT, OutT, IdxT, 1, FinalLambda, isRowMajor>(
-      x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
-  }
-}
-
-/**
- * @brief the L1 distance matrix calculation
- *  It computes the following equation: cij = op(ai-bj)
- * @tparam InType input data-type (for A and B matrices)
- * @tparam AccType accumulation data-type
- * @tparam OutType output data-type (for C and D matrices)
- * @tparam FinalLambda user-defined epilogue lamba
- * @tparam Index_ Index type
- * @param m number of rows of A and C/D
- * @param n number of columns of B and C/D
- * @param k number of cols of A and rows of B
- * @param pA input matrix
- * @param pB input matrix
- * @param pD output matrix
- * @param fin_op the final element-wise epilogue lambda
- * @param stream cuda stream where to launch work
- * @param isRowMajor whether the input and output matrices are row major
- */
-template <typename InType,
-          typename AccType,
-          typename OutType,
-          typename FinalLambda,
-          typename Index_ = int>
-void l1Impl(int m,
-            int n,
-            int k,
-            const InType* pA,
-            const InType* pB,
-            OutType* pD,
-            FinalLambda fin_op,
-            cudaStream_t stream,
-            bool isRowMajor)
-{
-  typedef std::is_same<OutType, bool> is_bool;
-  typedef typename std::conditional<is_bool::value, OutType, AccType>::type L1OutType;
-  Index_ lda, ldb, ldd;
-  L1OutType* pDcast = reinterpret_cast<L1OutType*>(pD);
-  if (isRowMajor) {
-    lda = k, ldb = k, ldd = n;
-    l1<InType, AccType, L1OutType, Index_, FinalLambda, true>(
-      m, n, k, lda, ldb, ldd, pA, pB, pDcast, fin_op, stream);
-
-  } else {
-    lda = n, ldb = m, ldd = m;
-    l1<InType, AccType, L1OutType, Index_, FinalLambda, false>(
-      n, m, k, lda, ldb, ldd, pB, pA, pDcast, fin_op, stream);
-  }
-}
-}  // namespace detail
-}  // namespace distance
-}  // namespace raft
diff --git a/cpp/include/raft/distance/detail/minkowski.cuh b/cpp/include/raft/distance/detail/minkowski.cuh
deleted file mode 100644
index 5594cbf5f1..0000000000
--- a/cpp/include/raft/distance/detail/minkowski.cuh
+++ /dev/null
@@ -1,215 +0,0 @@
-/*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-#include <raft/distance/detail/pairwise_distance_base.cuh>
-
-namespace raft {
-namespace distance {
-namespace detail {
-
-/**
- * @brief the unexpanded Minkowski distance matrix calculation
- *  It computes the following equation: cij = sum(|x - y|^p)^(1/p)
- * @tparam DataT          input data-type (for A and B matrices)
- * @tparam AccT           accumulation data-type
- * @tparam OutT           output data-type (for C and D matrices)
- * @tparam IdxT           index data-type
- * @tparam Veclen         number of k-elements loaded by each thread
-                          for every LDG call. details in contractions.cuh
- * @tparam FinalLambda    final lambda called on final distance value
- *
- * @param[in]       x input matrix
- * @param[in]       y input matrix
- * @param[in]       m number of rows of A and C/D
- * @param[in]       n number of rows of B and cols of C/D
- * @param[in]       k number of cols of A and B
- * @param[in]       lda leading dimension of A
- * @param[in]       ldb leading dimension of B
- * @param[in]       ldd leading dimension of C/D
- * @param[output]   pD output matrix
- * @param[in]       fin_op the final gemm epilogue lambda
- * @param[in]       stream cuda stream to launch work
- * @param[in]       the value of `p` for Minkowski (l-p) distances.
- */
-template <typename DataT,
-          typename AccT,
-          typename OutT,
-          typename IdxT,
-          int VecLen,
-          typename FinalLambda,
-          bool isRowMajor>
-void minkowskiUnExpImpl(const DataT* x,
-                        const DataT* y,
-                        IdxT m,
-                        IdxT n,
-                        IdxT k,
-                        IdxT lda,
-                        IdxT ldb,
-                        IdxT ldd,
-                        OutT* dOutput,
-                        FinalLambda fin_op,
-                        cudaStream_t stream,
-                        DataT p)
-{
-  typedef typename raft::linalg::Policy4x4<DataT, VecLen>::Policy RowPolicy;
-  typedef typename raft::linalg::Policy4x4<DataT, VecLen>::ColPolicy ColPolicy;
-
-  typedef typename std::conditional<isRowMajor, RowPolicy, ColPolicy>::type KPolicy;
-
-  dim3 blk(KPolicy::Nthreads);
-
-  // Accumulation operation lambda
-  auto core_lambda = [p] __device__(AccT & acc, DataT & x, DataT & y) {
-    const auto diff = raft::abs(x - y);
-    acc += raft::pow(diff, p);
-  };
-
-  // epilogue operation lambda for final value calculation
-  auto epilog_lambda = [p] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh],
-                                      DataT * regxn,
-                                      DataT * regyn,
-                                      IdxT gridStrideX,
-                                      IdxT gridStrideY) {
-    const auto one_over_p = 1.0f / p;
-#pragma unroll
-    for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) {
-#pragma unroll
-      for (int j = 0; j < KPolicy::AccColsPerTh; ++j) {
-        acc[i][j] = raft::pow(acc[i][j], one_over_p);
-      }
-    }
-  };
-
-  if constexpr (isRowMajor) {
-    auto minkowskiUnExpRowMajor = pairwiseDistanceMatKernel<false,
-                                                            DataT,
-                                                            AccT,
-                                                            OutT,
-                                                            IdxT,
-                                                            KPolicy,
-                                                            decltype(core_lambda),
-                                                            decltype(epilog_lambda),
-                                                            FinalLambda,
-                                                            true>;
-    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize, minkowskiUnExpRowMajor);
-
-    minkowskiUnExpRowMajor<<<grid, blk, KPolicy::SmemSize, stream>>>(
-      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op);
-
-  } else {
-    auto minkowskiUnExpColMajor = pairwiseDistanceMatKernel<false,
-                                                            DataT,
-                                                            AccT,
-                                                            OutT,
-                                                            IdxT,
-                                                            KPolicy,
-                                                            decltype(core_lambda),
-                                                            decltype(epilog_lambda),
-                                                            FinalLambda,
-                                                            false>;
-    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize, minkowskiUnExpColMajor);
-
-    minkowskiUnExpColMajor<<<grid, blk, KPolicy::SmemSize, stream>>>(
-      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op);
-  }
-
-  RAFT_CUDA_TRY(cudaGetLastError());
-}
-
-template <typename DataT,
-          typename AccT,
-          typename OutT,
-          typename IdxT,
-          typename FinalLambda,
-          bool isRowMajor>
-void minkowskiUnExp(IdxT m,
-                    IdxT n,
-                    IdxT k,
-                    IdxT lda,
-                    IdxT ldb,
-                    IdxT ldd,
-                    const DataT* x,
-                    const DataT* y,
-                    OutT* dOutput,
-                    FinalLambda fin_op,
-                    cudaStream_t stream,
-                    DataT metric_arg)
-{
-  size_t bytesA = sizeof(DataT) * lda;
-  size_t bytesB = sizeof(DataT) * ldb;
-  if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) {
-    minkowskiUnExpImpl<DataT, AccT, OutT, IdxT, 8 / sizeof(DataT), FinalLambda, isRowMajor>(
-      x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream, metric_arg);
-  } else {
-    minkowskiUnExpImpl<DataT, AccT, OutT, IdxT, 1, FinalLambda, isRowMajor>(
-      x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream, metric_arg);
-  }
-}
-
-/**
- * @brief the unexpanded minkowski distance matrix calculation
- *  It computes the following equation: cij = sum(|x - y|^p)^(1/p)
- * @tparam InType input data-type (for A and B matrices)
- * @tparam AccType accumulation data-type
- * @tparam OutType output data-type (for C and D matrices)
- * @tparam FinalLambda user-defined epilogue lamba
- * @tparam Index_ index type
- * @param[in] m number of rows of A and C/D
- * @param[in] n number of rows of B and cols of C/D
- * @param[in] k number of cols of A and B
- * @param[in] pA input matrix
- * @param[in] pB input matrix
- * @param[out] pD output matrix
- * @param[in] fin_op the final gemm epilogue lambda
- * @param[in] stream cuda stream to launch work
- * @param[in] isRowMajor whether the input and output matrices are row major
- * @param[in] metric_arg the value of `p` for Minkowski (l-p) distances.
- */
-template <typename InType,
-          typename AccType,
-          typename OutType,
-          typename FinalLambda,
-          typename Index_ = int>
-void minkowskiImpl(Index_ m,
-                   Index_ n,
-                   Index_ k,
-                   const InType* pA,
-                   const InType* pB,
-                   OutType* pD,
-                   FinalLambda fin_op,
-                   cudaStream_t stream,
-                   bool isRowMajor,
-                   InType metric_arg)
-{
-  typedef std::is_same<OutType, bool> is_bool;
-  typedef typename std::conditional<is_bool::value, OutType, AccType>::type LpUnexpOutType;
-  LpUnexpOutType* pDcast = reinterpret_cast<LpUnexpOutType*>(pD);
-  Index_ lda, ldb, ldd;
-
-  if (isRowMajor) {
-    lda = k, ldb = k, ldd = n;
-    minkowskiUnExp<InType, AccType, LpUnexpOutType, Index_, FinalLambda, true>(
-      m, n, k, lda, ldb, ldd, pA, pB, pDcast, fin_op, stream, metric_arg);
-  } else {
-    lda = n, ldb = m, ldd = m;
-    minkowskiUnExp<InType, AccType, LpUnexpOutType, Index_, FinalLambda, false>(
-      n, m, k, lda, ldb, ldd, pB, pA, pDcast, fin_op, stream, metric_arg);
-  }
-}
-};  // end namespace detail
-};  // end namespace distance
-};  // end namespace raft
diff --git a/cpp/include/raft/distance/detail/pairwise_distance_base.cuh b/cpp/include/raft/distance/detail/pairwise_distance_base.cuh
index 293600ed21..0293f10c29 100644
--- a/cpp/include/raft/distance/detail/pairwise_distance_base.cuh
+++ b/cpp/include/raft/distance/detail/pairwise_distance_base.cuh
@@ -271,170 +271,6 @@ struct PairwiseDistances : public BaseClass {
   }
 };  // struct PairwiseDistances
 
-/**
- * @brief the distance matrix calculation kernel for L1, L2 and cosine
- * @tparam useNorms       whether norms are needed
- * @tparam DataT          input data-type (for A and B matrices)
- * @tparam AccT           accumulation data-type
- * @tparam OutT           output data-type (for C and D matrices)
- * @tparam IdxT           index data-type
- * @tparam Policy         struct which tunes the Contraction kernel
- * @tparam CoreLambda     lambda which implements accumulation operation
- * @tparam EpilogueLambda lambda which implements operation for calculating
-                          final value.
- * @tparam FinalLambda    final lambda called on final distance value
- * @tparam isRowMajor     true if input/output is row major(default),
-                          false for column major
- *
- * @param[in]       x input matrix
- * @param[in]       y input matrix
- * @param[in]       xn row norms of input matrix A.
- * @param[in]       yn row norms of input matrix B.
- * @param[in]       m number of rows of A and C/D
- * @param[in]       n number of columns of B and C/D
- * @param[in]       k number of cols of A and rows of B
- * @param[in]       lda leading dimension of A
- * @param[in]       ldb leading dimension of B
- * @param[in]       ldd leading dimension of C/D
- * @param[output]   pD output matrix
- * @param core_op   the core lambda
- * @param epilog_op the epilogue lambda
- * @param fin_op    the final gemm epilogue lambda
- */
-
-template <bool useNorms,
-          typename DataT,
-          typename AccT,
-          typename OutT,
-          typename IdxT,
-          typename Policy,
-          typename CoreLambda,
-          typename EpilogueLambda,
-          typename FinalLambda,
-          bool isRowMajor = true,
-          bool writeOut   = true>
-__global__ __launch_bounds__(Policy::Nthreads, 2)
-
-  void pairwiseDistanceMatKernel(const DataT* x,
-                                 const DataT* y,
-                                 const DataT* _xn,
-                                 const DataT* _yn,
-                                 IdxT m,
-                                 IdxT n,
-                                 IdxT k,
-                                 IdxT lda,
-                                 IdxT ldb,
-                                 IdxT ldd,
-                                 OutT* dOutput,
-                                 CoreLambda core_op,
-                                 EpilogueLambda epilog_op,
-                                 FinalLambda fin_op)
-{
-  extern __shared__ char smem[];
-  auto rowEpilog = [] __device__(IdxT starty) { return; };
-
-  PairwiseDistances<useNorms,
-                    DataT,
-                    AccT,
-                    OutT,
-                    IdxT,
-                    Policy,
-                    CoreLambda,
-                    EpilogueLambda,
-                    FinalLambda,
-                    decltype(rowEpilog),
-                    isRowMajor,
-                    writeOut>
-    obj(
-      x, y, m, n, k, lda, ldb, ldd, _xn, _yn, dOutput, smem, core_op, epilog_op, fin_op, rowEpilog);
-  obj.run();
-}
-
-/**
- * @brief the distance matrix calculation kernel for L2 and cosine
- * for GPU arch < SM 8.0, this version is to make sure we don't recompile
- * these kernels for ampere or higher as we use cutlass kernel for it.
- * @tparam useNorms       whether norms are needed
- * @tparam DataT          input data-type (for A and B matrices)
- * @tparam AccT           accumulation data-type
- * @tparam OutT           output data-type (for C and D matrices)
- * @tparam IdxT           index data-type
- * @tparam Policy         struct which tunes the Contraction kernel
- * @tparam CoreLambda     lambda which implements accumulation operation
- * @tparam EpilogueLambda lambda which implements operation for calculating
-                          final value.
- * @tparam FinalLambda    final lambda called on final distance value
- * @tparam isRowMajor     true if input/output is row major(default),
-                          false for column major
- *
- * @param[in]       x input matrix
- * @param[in]       y input matrix
- * @param[in]       xn row norms of input matrix A.
- * @param[in]       yn row norms of input matrix B.
- * @param[in]       m number of rows of A and C/D
- * @param[in]       n number of columns of B and C/D
- * @param[in]       k number of cols of A and rows of B
- * @param[in]       lda leading dimension of A
- * @param[in]       ldb leading dimension of B
- * @param[in]       ldd leading dimension of C/D
- * @param[output]   pD output matrix
- * @param core_op   the core lambda
- * @param epilog_op the epilogue lambda
- * @param fin_op    the final gemm epilogue lambda
- */
-
-template <bool useNorms,
-          typename DataT,
-          typename AccT,
-          typename OutT,
-          typename IdxT,
-          typename Policy,
-          typename CoreLambda,
-          typename EpilogueLambda,
-          typename FinalLambda,
-          bool isRowMajor = true,
-          bool writeOut   = true>
-__global__ __launch_bounds__(Policy::Nthreads, 2)
-
-  void pairwiseDistanceMatKernelPriorToAmpere(const DataT* x,
-                                              const DataT* y,
-                                              const DataT* _xn,
-                                              const DataT* _yn,
-                                              IdxT m,
-                                              IdxT n,
-                                              IdxT k,
-                                              IdxT lda,
-                                              IdxT ldb,
-                                              IdxT ldd,
-                                              OutT* dOutput,
-                                              CoreLambda core_op,
-                                              EpilogueLambda epilog_op,
-                                              FinalLambda fin_op)
-{
-  //#if __CUDA_ARCH__ < 800
-  // TODO: re-enable the CUDA_ARCH guard for below Ampere once cutlass based
-  //  kernels are enabled for CUDA 12.0
-  extern __shared__ char smem[];
-  auto rowEpilog = [] __device__(IdxT starty) { return; };
-
-  PairwiseDistances<useNorms,
-                    DataT,
-                    AccT,
-                    OutT,
-                    IdxT,
-                    Policy,
-                    CoreLambda,
-                    EpilogueLambda,
-                    FinalLambda,
-                    decltype(rowEpilog),
-                    isRowMajor,
-                    writeOut>
-    obj(
-      x, y, m, n, k, lda, ldb, ldd, _xn, _yn, dOutput, smem, core_op, epilog_op, fin_op, rowEpilog);
-  obj.run();
-  //#endif
-}
-
 template <typename P, typename IdxT, typename T>
 dim3 launchConfigGenerator(IdxT m, IdxT n, std::size_t sMemSize, T func)
 {
@@ -462,4 +298,4 @@ dim3 launchConfigGenerator(IdxT m, IdxT n, std::size_t sMemSize, T func)
 
 };  // namespace detail
 };  // namespace distance
-};  // namespace raft
\ No newline at end of file
+};  // namespace raft
diff --git a/cpp/include/raft/distance/detail/pairwise_distance_cutlass_base.cuh b/cpp/include/raft/distance/detail/pairwise_distance_cutlass_base.cuh
index f39d880da4..2ab5c69b0d 100644
--- a/cpp/include/raft/distance/detail/pairwise_distance_cutlass_base.cuh
+++ b/cpp/include/raft/distance/detail/pairwise_distance_cutlass_base.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,8 +19,6 @@
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wstrict-aliasing"
 
-#if (__CUDACC_VER_MAJOR__ < 12)
-
 // We define CUTLASS_NAMESPACE in case
 // RAFT cmake is not used
 #ifndef CUTLASS_NAMESPACE
@@ -171,8 +169,8 @@ void cutlassDistanceKernel(const DataT* x,
   CUTLASS_CHECK(status);
 }
 
-};      // namespace detail
-};      // namespace distance
-};      // namespace raft
-#endif  //  (__CUDACC_VER_MAJOR__ < 12)
+};  // namespace detail
+};  // namespace distance
+};  // namespace raft
+
 #pragma GCC diagnostic pop
diff --git a/cpp/include/raft/distance/detail/pairwise_matrix/dispatch.cuh b/cpp/include/raft/distance/detail/pairwise_matrix/dispatch.cuh
new file mode 100644
index 0000000000..9def354600
--- /dev/null
+++ b/cpp/include/raft/distance/detail/pairwise_matrix/dispatch.cuh
@@ -0,0 +1,217 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "kernel_sm60.cuh"
+#include <algorithm>
+#include <cstdio>
+#include <raft/distance/detail/pairwise_distance_cutlass_base.cuh>
+#include <raft/linalg/contractions.cuh>
+#include <utility>
+
+namespace raft::distance::detail {
+
+/**
+ * @brief: Computes minimal common alignment of the rows in a 2D array in bytes
+ *
+ * The 2D matrix `x` is assumed to be row-major. This function computes the
+ * minimal alignment in bytes of the first elements of each row.
+ * Output can be 16, 8, 4, 2, 1.
+ *
+ * @param x        Base pointer of row-major input matrix
+ * @param stride   Stride in number of element between consecutive rows.
+ */
+template <typename DataT>
+size_t alignment_of_2d_array(const DataT* x, size_t stride)
+{
+  auto base           = reinterpret_cast<uintptr_t>(x);
+  size_t stride_bytes = sizeof(DataT) * stride;
+
+  for (int align = 16; align >= 0; align /= 2) {
+    bool base_aligned   = base % align == 0;
+    bool stride_aligned = stride_bytes % align == 0;
+    if (base_aligned && stride_aligned) { return align; }
+  }
+  return 1;
+}
+
+template <int n>
+using vec_len_constant = std::integral_constant<int, n>;
+
+/**
+ * @brief: Converts run-time arguments to compile-time arguments
+ *
+ * Converts run-time arguments row_major and vec_len to compile-time arguments
+ * and dispatches a lambda f with these compile-time arguments.
+ *
+ * This is equivalent to copying and pasting the lambda function `f` in each of
+ * the switch case statements.
+ *
+ * @tparam F         Type of lambda f.
+ * @param row_major  Boolean indicating whether input arrays have row-major layout.
+ * @param vec_len    Integer value 1, 2, or 4 specifying the Veclen template parameter of
+ *                   the KernelPolicy.
+ * @param f          Lambda that takes two std::integral_constant parameters representing
+ *                   row_major and vec_len.
+ */
+template <typename F>
+void dispatch(bool row_major, int vec_len, F&& f)
+{
+  if (row_major) {
+    switch (vec_len) {
+      case 4: f(std::bool_constant<true>(), vec_len_constant<4>()); break;
+      case 2: f(std::bool_constant<true>(), vec_len_constant<2>()); break;
+      default: f(std::bool_constant<true>(), vec_len_constant<1>()); break;
+    }
+  } else {
+    switch (vec_len) {
+      case 4: f(std::bool_constant<false>(), vec_len_constant<4>()); break;
+      case 2: f(std::bool_constant<false>(), vec_len_constant<2>()); break;
+      default: f(std::bool_constant<false>(), vec_len_constant<1>()); break;
+    }
+  }
+}
+
+template <typename OpT,
+          typename DataT,
+          typename AccT,
+          typename OutT,
+          typename FinOpT,
+          typename IdxT = int>
+void distance_matrix_dispatch(OpT distance_op,
+                              IdxT m,
+                              IdxT n,
+                              IdxT k,
+                              const DataT* x,
+                              const DataT* y,
+                              const DataT* x_norm,
+                              const DataT* y_norm,
+                              OutT* out,
+                              FinOpT fin_op,
+                              cudaStream_t stream,
+                              bool is_row_major)
+{
+  // Determine leading dimensions and, if column-major, flip order of passing x
+  // and y.
+  IdxT ldx, ldy, ld_out;
+  if (is_row_major) {
+    ldx = k, ldy = k, ld_out = n;
+  } else {
+    // Flip x, y, and m, n.
+    std::swap<const DataT*>(x, y);
+    std::swap<const DataT*>(x_norm, y_norm);
+    std::swap(m, n);
+    ldx = m, ldy = n, ld_out = n;
+  }
+
+  size_t align_x        = alignment_of_2d_array(x, ldx);
+  size_t align_y        = alignment_of_2d_array(y, ldy);
+  size_t byte_alignment = min(align_x, align_y);
+
+  // Since alignment is in bytes, it could be smaller than sizeof(DataT).
+  // Handle this (unlikely) case here.
+  RAFT_EXPECTS(sizeof(DataT) <= byte_alignment,
+               "Input matrix must be aligned to size of elements.");
+
+  // Compute number of elements that can be loaded in one instruction
+  // without causing misalignent errors.
+  int vec_len_aligned;
+  if (byte_alignment % sizeof(DataT) == 0) {
+    // In the future, we might support `int8_t` input. In that case,
+    // byte_alignment / sizeof(DataT) might exceed 4. We maximize at 4 here, to
+    // prevent adding more cases in dispatch (which are expensive to compile).
+    vec_len_aligned = std::min(4, int(byte_alignment / sizeof(DataT)));
+  } else {
+    vec_len_aligned = 1;
+  }
+
+  dispatch(is_row_major, vec_len_aligned, [&](auto row_major, auto vec_len_aligned) {
+    // row_major and vec_len are std::integral_constants of type bool and int
+    // respectively.
+
+    // To keep compile times in check, we only specialize on veclen > 1 when
+    // the inner loop is relatively cheap (< 5 flops).
+    constexpr int vec_len_op = distance_op.expensive_inner_loop ? 1 : vec_len_aligned();
+
+    // Prevent double, vec_len=4 combination (this is not supported)
+    constexpr int vec_len = std::min(vec_len_op, static_cast<int>(16 / sizeof(DataT)));
+
+    typedef typename raft::linalg::Policy4x4<DataT, vec_len>::Policy RowPolicy;
+    typedef typename raft::linalg::Policy4x4<DataT, vec_len>::ColPolicy ColPolicy;
+    typedef typename std::conditional<row_major(), RowPolicy, ColPolicy>::type Policy;
+
+    return pairwise_matrix<Policy, row_major(), DataT, AccT, OutT, IdxT, OpT, FinOpT>(
+      distance_op, fin_op, x, y, x_norm, y_norm, m, n, k, ldx, ldy, ld_out, out, stream);
+  });
+}
+
+template <typename opT,
+          typename DataT,
+          typename AccT,
+          typename OutT,
+          typename FinOpT,
+          typename IdxT = int>
+void distance_matrix_cutlass_dispatch(opT cutlass_op,
+                                      IdxT m,
+                                      IdxT n,
+                                      IdxT k,
+                                      const DataT* x,
+                                      const DataT* y,
+                                      const DataT* x_norm,
+                                      const DataT* y_norm,
+                                      OutT* out,
+                                      FinOpT fin_op,
+                                      cudaStream_t stream,
+                                      bool is_row_major)
+{
+  // Determine leading dimensions and possibly flip order of passing x and y if
+  // column_major.
+  IdxT ldx, ldy, ld_out;
+  if (is_row_major) {
+    ldx = k, ldy = k, ld_out = n;
+  } else {
+    std::swap<const DataT*>(x, y);
+    std::swap<const DataT*>(x_norm, y_norm);
+    std::swap(m, n);
+    ldx = m, ldy = n, ld_out = n;
+  }
+
+  size_t align_x        = alignment_of_2d_array(x, ldx);
+  size_t align_y        = alignment_of_2d_array(y, ldy);
+  size_t byte_alignment = min(align_x, align_y);
+
+  // Since alignment is in bytes, it could be smaller than sizeof(DataT).
+  // Handle this (unlikely) case here.
+  RAFT_EXPECTS(sizeof(DataT) <= byte_alignment,
+               "Input matrix must be aligned to size of elements.");
+
+  // Compute number of elements that can be loaded in one instruction
+  // without causing misalignent errors.
+  int vec_len_aligned = (byte_alignment % sizeof(DataT) == 0) ? byte_alignment / sizeof(DataT) : 1;
+
+  dispatch(is_row_major, vec_len_aligned, [&](auto row_major, auto vec_len_aligned) {
+    // row_major and vec_len are std::integral_constants of type bool and int
+    // respectively.
+
+    // Prevent double, vec_len=4 combination (this is not supported)
+    constexpr int vec_len = std::min(vec_len_aligned(), static_cast<int>(16 / sizeof(DataT)));
+
+    cutlassDistanceKernel<DataT, AccT, OutT, IdxT, vec_len, FinOpT, opT, row_major()>(
+      x, y, x_norm, y_norm, m, n, k, ldx, ldy, ld_out, out, fin_op, cutlass_op, stream);
+  });
+}
+
+};  // namespace raft::distance::detail
diff --git a/cpp/include/raft/distance/detail/pairwise_matrix/kernel_sm60.cuh b/cpp/include/raft/distance/detail/pairwise_matrix/kernel_sm60.cuh
new file mode 100644
index 0000000000..7c1052d726
--- /dev/null
+++ b/cpp/include/raft/distance/detail/pairwise_matrix/kernel_sm60.cuh
@@ -0,0 +1,136 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cstddef>
+#include <raft/core/operators.hpp>
+#include <raft/distance/detail/pairwise_distance_base.cuh>
+
+namespace raft::distance::detail {
+
+template <typename Policy,
+          bool row_major,
+          typename DataT,
+          typename AccT,
+          typename OutT,
+          typename IdxT,
+          typename opT,
+          typename FinOpT>
+__global__ __launch_bounds__(Policy::Nthreads, 2) void pairwise_matrix_kernel(const DataT* x,
+                                                                              const DataT* y,
+                                                                              const DataT* _xn,
+                                                                              const DataT* _yn,
+                                                                              IdxT m,
+                                                                              IdxT n,
+                                                                              IdxT k,
+                                                                              IdxT lda,
+                                                                              IdxT ldb,
+                                                                              IdxT ldd,
+                                                                              OutT* dOutput,
+                                                                              opT distance_op,
+                                                                              FinOpT fin_op)
+{
+  extern __shared__ char smem[];
+
+  // Wrap operator back into lambdas. This is temporary and should be removed.
+  // See: https://github.com/rapidsai/raft/issues/1323
+  auto core_op = [distance_op] __device__(AccT & acc, DataT & x, DataT & y) {
+    distance_op.core(acc, x, y);
+  };
+  auto epilog_op = [distance_op] __device__(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh],
+                                            DataT * regxn,
+                                            DataT * regyn,
+                                            IdxT gridStrideX,
+                                            IdxT gridStrideY) {
+    // Use .template to disambiguate (See:
+    // https://en.cppreference.com/w/cpp/language/dependent_name)
+    distance_op.template epilog<Policy>(acc, regxn, regyn, gridStrideX, gridStrideY);
+  };
+
+  // No support for row_epilog_op.
+  auto row_epilog_op = raft::void_op();
+
+  // Always write output
+  constexpr bool write_out = true;
+  constexpr bool use_norms = distance_op.use_norms;
+  PairwiseDistances<use_norms,
+                    DataT,
+                    AccT,
+                    OutT,
+                    IdxT,
+                    Policy,
+                    decltype(core_op),
+                    decltype(epilog_op),
+                    decltype(fin_op),
+                    decltype(row_epilog_op),
+                    row_major,
+                    write_out>
+    obj(x,
+        y,
+        m,
+        n,
+        k,
+        lda,
+        ldb,
+        ldd,
+        _xn,
+        _yn,
+        dOutput,
+        smem,
+        core_op,
+        epilog_op,
+        fin_op,
+        row_epilog_op);
+  obj.run();
+}
+
+template <typename Policy,
+          bool row_major,
+          typename DataT,
+          typename AccT,
+          typename OutT,
+          typename IdxT,
+          typename OpT,
+          typename FinOpT>
+void pairwise_matrix(OpT distance_op,
+                     FinOpT fin_op,
+                     const DataT* x,
+                     const DataT* y,
+                     const DataT* _xn,
+                     const DataT* _yn,
+                     IdxT m,
+                     IdxT n,
+                     IdxT k,
+                     IdxT lda,
+                     IdxT ldb,
+                     IdxT ldd,
+                     OutT* dOutput,
+                     cudaStream_t stream)
+{
+  dim3 blk(Policy::Nthreads);
+  // Use .template to disambiguate (See:
+  // https://en.cppreference.com/w/cpp/language/dependent_name)
+  size_t smem_size = distance_op.template shared_mem_size<Policy>();
+  // Obtain function pointer to kernel
+  auto kernel = pairwise_matrix_kernel<Policy, row_major, DataT, AccT, OutT, IdxT, OpT, FinOpT>;
+  dim3 grid   = launchConfigGenerator<Policy>(m, n, smem_size, kernel);
+
+  kernel<<<grid, blk, smem_size, stream>>>(
+    x, y, _xn, _yn, m, n, k, lda, ldb, ldd, dOutput, distance_op, fin_op);
+  RAFT_CUDA_TRY(cudaGetLastError());
+}
+
+};  // namespace raft::distance::detail
diff --git a/cpp/include/raft/distance/detail/russell_rao.cuh b/cpp/include/raft/distance/detail/russell_rao.cuh
deleted file mode 100644
index 5d516e7830..0000000000
--- a/cpp/include/raft/distance/detail/russell_rao.cuh
+++ /dev/null
@@ -1,214 +0,0 @@
-/*
- * Copyright (c) 2021, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-#include <raft/distance/detail/pairwise_distance_base.cuh>
-
-namespace raft {
-namespace distance {
-namespace detail {
-
-/**
- * @brief the Russell Rao distance matrix:
- *  It computes the following equation:
-    Cij = (k - sum(x_i * y_i)) / k
- *
- * @tparam DataT          input data-type (for A and B matrices)
- * @tparam AccT           accumulation data-type
- * @tparam OutT           output data-type (for C and D matrices)
- * @tparam IdxT           index data-type
- * @tparam Veclen         number of k-elements loaded by each thread
-                          for every LDG call. details in contractions.cuh
- * @tparam FinalLambda    final lambda called on final distance value
- * @tparam isRowMajor     true if input/output is row major,
-                          false for column major
- * @param[in]       x input matrix
- * @param[in]       y input matrix
- * @param[in]       m number of rows of A and C/D
- * @param[in]       n number of rows of B and C/D
- * @param[in]       k number of cols of A and B
- * @param[in]       lda leading dimension of A
- * @param[in]       ldb leading dimension of B
- * @param[in]       ldd leading dimension of C/D
- * @param[output]   dOutput output matrix
- * @param[in]       fin_op the final gemm epilogue lambda
- * @param[in]       stream cuda stream to launch work
- */
-template <typename DataT,
-          typename AccT,
-          typename OutT,
-          typename IdxT,
-          int VecLen,
-          typename FinalLambda,
-          bool isRowMajor>
-static void russellRaoImpl(const DataT* x,
-                           const DataT* y,
-                           IdxT m,
-                           IdxT n,
-                           IdxT k,
-                           IdxT lda,
-                           IdxT ldb,
-                           IdxT ldd,
-                           OutT* dOutput,
-                           FinalLambda fin_op,
-                           cudaStream_t stream)
-{
-  typedef typename raft::linalg::Policy4x4<DataT, VecLen>::Policy RowPolicy;
-  typedef typename raft::linalg::Policy4x4<DataT, VecLen>::ColPolicy ColPolicy;
-
-  typedef typename std::conditional<isRowMajor, RowPolicy, ColPolicy>::type KPolicy;
-
-  dim3 blk(KPolicy::Nthreads);
-
-  // Accumulation operation lambda
-  auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { acc += x * y; };
-
-  const float one_over_k = 1.0 / k;
-  // epilogue operation lambda for final value calculation
-  auto epilog_lambda = [k, one_over_k] __device__(
-                         AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh],
-                         DataT * regxn,
-                         DataT * regyn,
-                         IdxT gridStrideX,
-                         IdxT gridStrideY) {
-#pragma unroll
-    for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) {
-#pragma unroll
-      for (int j = 0; j < KPolicy::AccColsPerTh; ++j) {
-        acc[i][j] = (k - acc[i][j]) * one_over_k;
-      }
-    }
-  };
-
-  if (isRowMajor) {
-    constexpr auto russellRaoRowMajor = pairwiseDistanceMatKernel<false,
-                                                                  DataT,
-                                                                  AccT,
-                                                                  OutT,
-                                                                  IdxT,
-                                                                  KPolicy,
-                                                                  decltype(core_lambda),
-                                                                  decltype(epilog_lambda),
-                                                                  FinalLambda,
-                                                                  true>;
-    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize, russellRaoRowMajor);
-
-    russellRaoRowMajor<<<grid, blk, KPolicy::SmemSize, stream>>>(
-      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op);
-  } else {
-    constexpr auto russellRaoColMajor = pairwiseDistanceMatKernel<false,
-                                                                  DataT,
-                                                                  AccT,
-                                                                  OutT,
-                                                                  IdxT,
-                                                                  KPolicy,
-                                                                  decltype(core_lambda),
-                                                                  decltype(epilog_lambda),
-                                                                  FinalLambda,
-                                                                  false>;
-    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize, russellRaoColMajor);
-    russellRaoColMajor<<<grid, blk, KPolicy::SmemSize, stream>>>(
-      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op);
-  }
-
-  RAFT_CUDA_TRY(cudaGetLastError());
-}
-
-template <typename DataT,
-          typename AccT,
-          typename OutT,
-          typename IdxT,
-          typename FinalLambda,
-          bool isRowMajor>
-void russellRao(IdxT m,
-                IdxT n,
-                IdxT k,
-                IdxT lda,
-                IdxT ldb,
-                IdxT ldd,
-                const DataT* x,
-                const DataT* y,
-                OutT* dOutput,
-                FinalLambda fin_op,
-                cudaStream_t stream)
-{
-  size_t bytesA = sizeof(DataT) * lda;
-  size_t bytesB = sizeof(DataT) * ldb;
-  if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) {
-    russellRaoImpl<DataT, AccT, OutT, IdxT, 16 / sizeof(DataT), FinalLambda, isRowMajor>(
-      x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
-  } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) {
-    russellRaoImpl<DataT, AccT, OutT, IdxT, 8 / sizeof(DataT), FinalLambda, isRowMajor>(
-      x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
-  } else {
-    russellRaoImpl<DataT, AccT, OutT, IdxT, 1, FinalLambda, isRowMajor>(
-      x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
-  }
-}
-
-/**
- * @brief the Russell Rao distance matrix calculation
- *  It computes the following equation:
-    Cij = (k - sum(x_i * y_i)) / k
- *
- * @tparam InType input data-type (for A and B matrices)
- * @tparam AccType accumulation data-type
- * @tparam OutType output data-type (for C and D matrices)
- * @tparam FinalLambda user-defined epilogue lamba
- * @tparam Index_ Index type
- * @param m number of rows of A and C/D
- * @param n number of columns of B and C/D
- * @param k number of cols of A and rows of B
- * @param pA input matrix
- * @param pB input matrix
- * @param pD output matrix
- * @param fin_op the final element-wise epilogue lambda
- * @param stream cuda stream where to launch work
- * @param isRowMajor whether the input and output matrices are row major
- */
-template <typename InType,
-          typename AccType,
-          typename OutType,
-          typename FinalLambda,
-          typename Index_ = int>
-void russellRaoImpl(int m,
-                    int n,
-                    int k,
-                    const InType* pA,
-                    const InType* pB,
-                    OutType* pD,
-                    FinalLambda fin_op,
-                    cudaStream_t stream,
-                    bool isRowMajor)
-{
-  typedef std::is_same<OutType, bool> is_bool;
-  typedef typename std::conditional<is_bool::value, OutType, AccType>::type russellRaoOutType;
-  Index_ lda, ldb, ldd;
-  russellRaoOutType* pDcast = reinterpret_cast<russellRaoOutType*>(pD);
-  if (isRowMajor) {
-    lda = k, ldb = k, ldd = n;
-    russellRao<InType, AccType, russellRaoOutType, Index_, FinalLambda, true>(
-      m, n, k, lda, ldb, ldd, pA, pB, pDcast, fin_op, stream);
-
-  } else {
-    lda = n, ldb = m, ldd = m;
-    russellRao<InType, AccType, russellRaoOutType, Index_, FinalLambda, false>(
-      n, m, k, lda, ldb, ldd, pB, pA, pDcast, fin_op, stream);
-  }
-}
-}  // namespace detail
-}  // namespace distance
-}  // namespace raft
diff --git a/cpp/include/raft/distance/distance.cuh b/cpp/include/raft/distance/distance.cuh
index 843a22c593..5216902635 100644
--- a/cpp/include/raft/distance/distance.cuh
+++ b/cpp/include/raft/distance/distance.cuh
@@ -23,6 +23,7 @@
 #include <raft/distance/detail/distance.cuh>
 #include <raft/distance/distance_types.hpp>
 #include <rmm/device_uvector.hpp>
+#include <type_traits>
 
 #include <raft/core/device_mdspan.hpp>
 
@@ -101,9 +102,6 @@ void distance(raft::resources const& handle,
  * @param worksize number of bytes of the workspace
  * @param isRowMajor whether the matrices are row-major or col-major
  * @param metric_arg metric argument (used for Minkowski distance)
- *
- * @note if workspace is passed as nullptr, this will return in
- *  worksize, the number of bytes of workspace required
  */
 template <raft::distance::DistanceType distanceType,
           typename InType,
@@ -252,71 +250,63 @@ void pairwise_distance(raft::resources const& handle,
                        bool isRowMajor = true,
                        Type metric_arg = 2.0f)
 {
+  cudaStream_t stream = raft::resource::get_cuda_stream(handle);
+
+  auto dispatch = [&](auto distance_type) {
+    auto worksize = getWorkspaceSize<distance_type(), Type, Type, Type, Index_>(x, y, m, n, k);
+    workspace.resize(worksize, stream);
+    detail::distance<distance_type(), Type, Type, Type, Index_>(
+      handle, x, y, dist, m, n, k, workspace.data(), worksize, isRowMajor, metric_arg);
+  };
+
   switch (metric) {
-    case raft::distance::DistanceType::L2Expanded:
-      detail::pairwise_distance_impl<Type, Index_, raft::distance::DistanceType::L2Expanded>(
-        handle, x, y, dist, m, n, k, workspace, isRowMajor);
+    case DistanceType::Canberra:
+      dispatch(std::integral_constant<DistanceType, DistanceType::Canberra>{});
       break;
-    case raft::distance::DistanceType::L2SqrtExpanded:
-      detail::pairwise_distance_impl<Type, Index_, raft::distance::DistanceType::L2SqrtExpanded>(
-        handle, x, y, dist, m, n, k, workspace, isRowMajor);
+    case DistanceType::CorrelationExpanded:
+      dispatch(std::integral_constant<DistanceType, DistanceType::CorrelationExpanded>{});
       break;
-    case raft::distance::DistanceType::CosineExpanded:
-      detail::pairwise_distance_impl<Type, Index_, raft::distance::DistanceType::CosineExpanded>(
-        handle, x, y, dist, m, n, k, workspace, isRowMajor);
+    case DistanceType::CosineExpanded:
+      dispatch(std::integral_constant<DistanceType, DistanceType::CosineExpanded>{});
       break;
-    case raft::distance::DistanceType::L1:
-      detail::pairwise_distance_impl<Type, Index_, raft::distance::DistanceType::L1>(
-        handle, x, y, dist, m, n, k, workspace, isRowMajor);
+    case DistanceType::HammingUnexpanded:
+      dispatch(std::integral_constant<DistanceType, DistanceType::HammingUnexpanded>{});
       break;
-    case raft::distance::DistanceType::L2Unexpanded:
-      detail::pairwise_distance_impl<Type, Index_, raft::distance::DistanceType::L2Unexpanded>(
-        handle, x, y, dist, m, n, k, workspace, isRowMajor);
+    case DistanceType::HellingerExpanded:
+      dispatch(std::integral_constant<DistanceType, DistanceType::HellingerExpanded>{});
       break;
-    case raft::distance::DistanceType::L2SqrtUnexpanded:
-      detail::pairwise_distance_impl<Type, Index_, raft::distance::DistanceType::L2SqrtUnexpanded>(
-        handle, x, y, dist, m, n, k, workspace, isRowMajor);
+    case raft::distance::DistanceType::InnerProduct:
+      dispatch(std::integral_constant<DistanceType, DistanceType::InnerProduct>{});
       break;
-    case raft::distance::DistanceType::Linf:
-      detail::pairwise_distance_impl<Type, Index_, raft::distance::DistanceType::Linf>(
-        handle, x, y, dist, m, n, k, workspace, isRowMajor);
+    case DistanceType::JensenShannon:
+      dispatch(std::integral_constant<DistanceType, DistanceType::JensenShannon>{});
       break;
-    case raft::distance::DistanceType::HellingerExpanded:
-      detail::pairwise_distance_impl<Type, Index_, raft::distance::DistanceType::HellingerExpanded>(
-        handle, x, y, dist, m, n, k, workspace, isRowMajor);
+    case DistanceType::KLDivergence:
+      dispatch(std::integral_constant<DistanceType, DistanceType::KLDivergence>{});
       break;
-    case raft::distance::DistanceType::LpUnexpanded:
-      detail::pairwise_distance_impl<Type, Index_, raft::distance::DistanceType::LpUnexpanded>(
-        handle, x, y, dist, m, n, k, workspace, isRowMajor, metric_arg);
+    case DistanceType::L1:
+      dispatch(std::integral_constant<DistanceType, DistanceType::L1>{});
       break;
-    case raft::distance::DistanceType::Canberra:
-      detail::pairwise_distance_impl<Type, Index_, raft::distance::DistanceType::Canberra>(
-        handle, x, y, dist, m, n, k, workspace, isRowMajor);
+    case DistanceType::L2Expanded:
+      dispatch(std::integral_constant<DistanceType, DistanceType::L2Expanded>{});
       break;
-    case raft::distance::DistanceType::HammingUnexpanded:
-      detail::pairwise_distance_impl<Type, Index_, raft::distance::DistanceType::HammingUnexpanded>(
-        handle, x, y, dist, m, n, k, workspace, isRowMajor);
+    case DistanceType::L2SqrtExpanded:
+      dispatch(std::integral_constant<DistanceType, DistanceType::L2SqrtExpanded>{});
       break;
-    case raft::distance::DistanceType::JensenShannon:
-      detail::pairwise_distance_impl<Type, Index_, raft::distance::DistanceType::JensenShannon>(
-        handle, x, y, dist, m, n, k, workspace, isRowMajor);
+    case DistanceType::L2SqrtUnexpanded:
+      dispatch(std::integral_constant<DistanceType, DistanceType::L2SqrtUnexpanded>{});
       break;
-    case raft::distance::DistanceType::RusselRaoExpanded:
-      detail::pairwise_distance_impl<Type, Index_, raft::distance::DistanceType::RusselRaoExpanded>(
-        handle, x, y, dist, m, n, k, workspace, isRowMajor);
+    case DistanceType::L2Unexpanded:
+      dispatch(std::integral_constant<DistanceType, DistanceType::L2Unexpanded>{});
       break;
-    case raft::distance::DistanceType::KLDivergence:
-      detail::pairwise_distance_impl<Type, Index_, raft::distance::DistanceType::KLDivergence>(
-        handle, x, y, dist, m, n, k, workspace, isRowMajor);
+    case DistanceType::Linf:
+      dispatch(std::integral_constant<DistanceType, DistanceType::Linf>{});
       break;
-    case raft::distance::DistanceType::CorrelationExpanded:
-      detail::
-        pairwise_distance_impl<Type, Index_, raft::distance::DistanceType::CorrelationExpanded>(
-          handle, x, y, dist, m, n, k, workspace, isRowMajor);
+    case DistanceType::LpUnexpanded:
+      dispatch(std::integral_constant<DistanceType, DistanceType::LpUnexpanded>{});
       break;
-    case raft::distance::DistanceType::InnerProduct:
-      detail::pairwise_distance_impl<Type, Index_, raft::distance::DistanceType::InnerProduct>(
-        handle, x, y, dist, m, n, k, workspace, isRowMajor);
+    case DistanceType::RusselRaoExpanded:
+      dispatch(std::integral_constant<DistanceType, DistanceType::RusselRaoExpanded>{});
       break;
     default: THROW("Unknown or unsupported distance metric '%d'!", (int)metric);
   };
diff --git a/cpp/include/raft/distance/distance_type.hpp b/cpp/include/raft/distance/distance_type.hpp
deleted file mode 100644
index f6eb4614f9..0000000000
--- a/cpp/include/raft/distance/distance_type.hpp
+++ /dev/null
@@ -1,27 +0,0 @@
-/*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-/**
- * This file is deprecated and will be removed at some point in a future release.
- * Please use `raft/distance/distance_types.hpp` instead.
- */
-
-#pragma once
-
-#pragma message(__FILE__                                                  \
-                " is deprecated and will be removed in a future release." \
-                " Please use distance_types.hpp instead.")
-
-#include <raft/distance/distance_types.hpp>
\ No newline at end of file
diff --git a/cpp/include/raft/distance/specializations/detail/chebyshev.cuh b/cpp/include/raft/distance/specializations/detail/l_inf.cuh
similarity index 100%
rename from cpp/include/raft/distance/specializations/detail/chebyshev.cuh
rename to cpp/include/raft/distance/specializations/detail/l_inf.cuh
diff --git a/cpp/include/raft/distance/specializations/distance.cuh b/cpp/include/raft/distance/specializations/distance.cuh
index a0c35ca9a8..8daa398b49 100644
--- a/cpp/include/raft/distance/specializations/distance.cuh
+++ b/cpp/include/raft/distance/specializations/distance.cuh
@@ -17,7 +17,6 @@
 #pragma once
 
 #include <raft/distance/specializations/detail/canberra.cuh>
-#include <raft/distance/specializations/detail/chebyshev.cuh>
 #include <raft/distance/specializations/detail/correlation.cuh>
 #include <raft/distance/specializations/detail/cosine.cuh>
 #include <raft/distance/specializations/detail/hamming_unexpanded.cuh>
@@ -31,6 +30,7 @@
 #include <raft/distance/specializations/detail/l2_sqrt_expanded.cuh>
 #include <raft/distance/specializations/detail/l2_sqrt_unexpanded.cuh>
 #include <raft/distance/specializations/detail/l2_unexpanded.cuh>
+#include <raft/distance/specializations/detail/l_inf.cuh>
 #include <raft/distance/specializations/detail/lp_unexpanded.cuh>
 #include <raft/distance/specializations/detail/russel_rao.cuh>
 #include <raft/distance/specializations/fused_l2_nn_min.cuh>
diff --git a/cpp/include/raft/linalg/detail/contractions.cuh b/cpp/include/raft/linalg/detail/contractions.cuh
index 9301580a9e..b15cb222b4 100644
--- a/cpp/include/raft/linalg/detail/contractions.cuh
+++ b/cpp/include/raft/linalg/detail/contractions.cuh
@@ -318,4 +318,4 @@ struct Contractions_NT {
 
 }  // namespace detail
 }  // namespace linalg
-}  // namespace raft
\ No newline at end of file
+}  // namespace raft
diff --git a/cpp/src/distance/distance/specializations/detail/chebyshev_double_double_double_int.cu b/cpp/src/distance/distance/specializations/detail/l_inf_double_double_double_int.cu
similarity index 100%
rename from cpp/src/distance/distance/specializations/detail/chebyshev_double_double_double_int.cu
rename to cpp/src/distance/distance/specializations/detail/l_inf_double_double_double_int.cu
diff --git a/cpp/src/distance/distance/specializations/detail/chebyshev_float_float_float_int.cu b/cpp/src/distance/distance/specializations/detail/l_inf_float_float_float_int.cu
similarity index 100%
rename from cpp/src/distance/distance/specializations/detail/chebyshev_float_float_float_int.cu
rename to cpp/src/distance/distance/specializations/detail/l_inf_float_float_float_int.cu
diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt
index df8fa54116..7cd0feba61 100644
--- a/cpp/test/CMakeLists.txt
+++ b/cpp/test/CMakeLists.txt
@@ -119,19 +119,19 @@ if(BUILD_TESTS)
     PATH
     test/distance/dist_adj.cu
     test/distance/dist_canberra.cu
-    test/distance/dist_chebyshev.cu
     test/distance/dist_correlation.cu
     test/distance/dist_cos.cu
-    test/distance/dist_euc_exp.cu
-    test/distance/dist_euc_unexp.cu
-    test/distance/dist_eucsqrt_exp.cu
     test/distance/dist_hamming.cu
     test/distance/dist_hellinger.cu
     test/distance/dist_inner_product.cu
     test/distance/dist_jensen_shannon.cu
     test/distance/dist_kl_divergence.cu
     test/distance/dist_l1.cu
-    test/distance/dist_minkowski.cu
+    test/distance/dist_l2_exp.cu
+    test/distance/dist_l2_unexp.cu
+    test/distance/dist_l2_sqrt_exp.cu
+    test/distance/dist_l_inf.cu
+    test/distance/dist_lp_unexp.cu
     test/distance/dist_russell_rao.cu
     test/distance/masked_nn.cu
     test/distance/masked_nn_compress_to_bits.cu
diff --git a/cpp/test/distance/dist_euc_exp.cu b/cpp/test/distance/dist_l2_exp.cu
similarity index 98%
rename from cpp/test/distance/dist_euc_exp.cu
rename to cpp/test/distance/dist_l2_exp.cu
index 567e279691..ae67215e51 100644
--- a/cpp/test/distance/dist_euc_exp.cu
+++ b/cpp/test/distance/dist_l2_exp.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/test/distance/dist_eucsqrt_exp.cu b/cpp/test/distance/dist_l2_sqrt_exp.cu
similarity index 98%
rename from cpp/test/distance/dist_eucsqrt_exp.cu
rename to cpp/test/distance/dist_l2_sqrt_exp.cu
index d717158649..94d254f44b 100644
--- a/cpp/test/distance/dist_eucsqrt_exp.cu
+++ b/cpp/test/distance/dist_l2_sqrt_exp.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/test/distance/dist_euc_unexp.cu b/cpp/test/distance/dist_l2_unexp.cu
similarity index 98%
rename from cpp/test/distance/dist_euc_unexp.cu
rename to cpp/test/distance/dist_l2_unexp.cu
index 311ad190e2..d74a41d2a4 100644
--- a/cpp/test/distance/dist_euc_unexp.cu
+++ b/cpp/test/distance/dist_l2_unexp.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/test/distance/dist_chebyshev.cu b/cpp/test/distance/dist_l_inf.cu
similarity index 98%
rename from cpp/test/distance/dist_chebyshev.cu
rename to cpp/test/distance/dist_l_inf.cu
index abad828de7..b9d6413a10 100644
--- a/cpp/test/distance/dist_chebyshev.cu
+++ b/cpp/test/distance/dist_l_inf.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/test/distance/dist_minkowski.cu b/cpp/test/distance/dist_lp_unexp.cu
similarity index 98%
rename from cpp/test/distance/dist_minkowski.cu
rename to cpp/test/distance/dist_lp_unexp.cu
index af2661da3a..9d6f5921a7 100644
--- a/cpp/test/distance/dist_minkowski.cu
+++ b/cpp/test/distance/dist_lp_unexp.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.