diff --git a/cpp/include/raft/distance/correlation.cuh b/cpp/include/raft/distance/correlation.cuh
new file mode 100644
index 0000000000..ed3b7a5464
--- /dev/null
+++ b/cpp/include/raft/distance/correlation.cuh
@@ -0,0 +1,247 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+#include <raft/distance/pairwise_distance_base.cuh>
+#include <raft/linalg/norm.cuh>
+#include <raft/linalg/reduce.cuh>
+
+namespace raft {
+namespace distance {
+
+/**
+ * @brief the Correlation distance matrix:
+ * It computes the following equation:
+   Cij = 1 - (k * sum(x*y) - sum(x) * sum(y)) /
+             (sqrt(k * sum(x^2) - sum(x)^2) * sqrt(k * sum(y^2) - sum(y)^2))
+ *
+ * @tparam DataT input data-type (for A and B matrices)
+ * @tparam AccT accumulation data-type
+ * @tparam OutT output data-type (for C and D matrices)
+ * @tparam IdxT index data-type
+ * @tparam Veclen number of k-elements loaded by each thread
+                  for every LDG call. details in contractions.cuh
+ * @tparam FinalLambda final lambda called on final distance value
+ * @tparam isRowMajor true if input/output is row major,
+                      false for column major
+ * @param[in] x input matrix
+ * @param[in] y input matrix
+ * @param[in] m number of rows of A and C/D
+ * @param[in] n number of rows of B and cols of C/D
+ * @param[in] k number of cols of A and B
+ * @param[in] lda leading dimension of A
+ * @param[in] ldb leading dimension of B
+ * @param[in] ldd leading dimension of C/D
+ * @param[out] dOutput output matrix
+ * @param[in] fin_op the final gemm epilogue lambda
+ * @param[in] stream cuda stream to launch work
+ */
+template <typename DataT, typename AccT, typename OutT, typename IdxT,
+          int Veclen, typename FinalLambda, bool isRowMajor>
+static void correlationImpl(const DataT *x, const DataT *y, const DataT *xn,
+                            const DataT *yn, const DataT *x2n, const DataT *y2n,
+                            IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb,
+                            IdxT ldd, OutT *dOutput, FinalLambda fin_op,
+                            cudaStream_t stream) {
+  typedef typename raft::linalg::Policy4x4<DataT, Veclen>::Policy RowPolicy;
+  typedef typename raft::linalg::Policy4x4<DataT, Veclen>::ColPolicy ColPolicy;
+
+  typedef
+    typename std::conditional<isRowMajor, RowPolicy, ColPolicy>::type KPolicy;
+
+  dim3 blk(KPolicy::Nthreads);
+
+  // Accumulation operation lambda
+  auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) {
+    acc += x * y;
+  };
+
+  // epilogue operation lambda for final value calculation
+  auto epilog_lambda = [x2n, y2n, m, n, k] __device__(
+                         AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh],
+                         DataT * regxn, DataT * regyn, IdxT gridStrideX,
+                         IdxT gridStrideY) {
+    DataT regx2n[KPolicy::AccRowsPerTh], regy2n[KPolicy::AccColsPerTh];
+
+    extern __shared__ char smem[];
+    DataT *sx2Norm =
+      (DataT *)(&smem[KPolicy::SmemSize +
+                      (KPolicy::Mblk + KPolicy::Nblk) * sizeof(DataT)]);
+    DataT *sy2Norm = (&sx2Norm[KPolicy::Mblk]);
+
+    // Load x & y norms required by this threadblock in shmem buffer
+    if (gridStrideX == blockIdx.x * KPolicy::Nblk) {
+      for (int i = threadIdx.x; i < KPolicy::Mblk; i += KPolicy::Nthreads) {
+        auto idx = gridStrideY + i;
+        sx2Norm[i] = idx < m ? x2n[idx] : 0;
+      }
+    }
+
+    for (int i = threadIdx.x; i < KPolicy::Nblk; i += KPolicy::Nthreads) {
+      auto idx = gridStrideX + i;
+      sy2Norm[i] = idx < n ? y2n[idx] : 0;
+    }
+    __syncthreads();
+
+#pragma unroll
+    for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) {
+      regx2n[i] =
+        sx2Norm[i * KPolicy::AccThRows + (threadIdx.x / KPolicy::AccThCols)];
+    }
+#pragma unroll
+    for (int i = 0; i < KPolicy::AccColsPerTh; ++i) {
+      regy2n[i] =
+        sy2Norm[i * KPolicy::AccThCols + (threadIdx.x % KPolicy::AccThCols)];
+    }
+
+#pragma unroll
+    for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) {
+#pragma unroll
+      for (int j = 0; j < KPolicy::AccColsPerTh; ++j) {
+        auto numer = k * acc[i][j] - (regxn[i] * regyn[j]);
+        auto Q_denom = k * regx2n[i] - (regxn[i] * regxn[i]);
+        auto R_denom = k * regy2n[j] - (regyn[j] * regyn[j]);
+
+        acc[i][j] = 1 - (numer / raft::mySqrt(Q_denom * R_denom));
+      }
+    }
+  };
+
+  constexpr size_t shmemSize =
+    KPolicy::SmemSize + (2 * (KPolicy::Mblk + KPolicy::Nblk) * sizeof(DataT));
+  if (isRowMajor) {
+    constexpr auto correlationRowMajor =
+      pairwiseDistanceMatKernel<true, DataT, AccT, OutT, IdxT, KPolicy,
+                                decltype(core_lambda), decltype(epilog_lambda),
+                                FinalLambda, true>;
+    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize,
+                                               correlationRowMajor);
+    correlationRowMajor<<<grid, blk, shmemSize, stream>>>(
+      x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda,
+      fin_op);
+  } else {
+    constexpr auto correlationColMajor =
+      pairwiseDistanceMatKernel<true, DataT, AccT, OutT, IdxT, KPolicy,
+                                decltype(core_lambda), decltype(epilog_lambda),
+                                FinalLambda, false>;
+    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize,
+                                               correlationColMajor);
+    correlationColMajor<<<grid, blk, shmemSize, stream>>>(
+      x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda,
+      fin_op);
+  }
+
+  CUDA_CHECK(cudaGetLastError());
+}
+
+template <typename DataT, typename AccT, typename OutT, typename IdxT,
+          typename FinalLambda, bool isRowMajor>
+void correlation(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd,
+                 const DataT *x, const DataT *y, const DataT *xn,
+                 const DataT *yn, const DataT *x2n, const DataT *y2n,
+                 OutT *dOutput, FinalLambda fin_op, cudaStream_t stream) {
+  size_t bytesA = sizeof(DataT) * lda;
+  size_t bytesB = sizeof(DataT) * ldb;
+  if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) {
+    correlationImpl<DataT, AccT, OutT, IdxT, 16 / sizeof(DataT), FinalLambda,
+                    isRowMajor>(x, y, xn, yn, x2n, y2n, m, n, k, lda, ldb, ldd,
+                                dOutput, fin_op, stream);
+  } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) {
+    correlationImpl<DataT, AccT, OutT, IdxT, 8 / sizeof(DataT), FinalLambda,
+                    isRowMajor>(x, y, xn, yn, x2n, y2n, m, n, k, lda, ldb, ldd,
+                                dOutput, fin_op, stream);
+  } else {
+    correlationImpl<DataT, AccT, OutT, IdxT, 1, FinalLambda, isRowMajor>(
+      x, y, xn, yn, x2n, y2n, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
+  }
+}
+
+/**
+ * @brief the Correlation distance matrix calculation
+ *
+ * @tparam InType input data-type (for A and B matrices)
+ * @tparam AccType accumulation data-type
+ * @tparam OutType output data-type (for C and D matrices)
+ * @tparam FinalLambda user-defined epilogue lambda
+ * @tparam Index_ Index type
+ * @param m number of rows of A and C/D
+ * @param n number of columns of B and C/D
+ * @param k number of cols of A and B
+ * @param pA input matrix
+ * @param pB input matrix
+ * @param pD output matrix
+ * @param workspace temporary buffer holding row sums and squared row norms
+ * @param worksize size of workspace in bytes
+ * @param fin_op the final element-wise epilogue lambda
+ * @param stream cuda stream where to launch work
+ * @param isRowMajor whether the input and output matrices are row major
+ */
+template <typename InType, typename AccType, typename OutType,
+          typename FinalLambda, typename Index_ = int>
+void correlationImpl(int m, int n, int k, const InType *pA, const InType *pB,
+                     OutType *pD, AccType *workspace, size_t &worksize,
+                     FinalLambda fin_op, cudaStream_t stream, bool isRowMajor) {
+  typedef std::is_same<OutType, bool> is_bool;
+  typedef typename std::conditional<is_bool::value, OutType, AccType>::type
+    correlationOutType;
+  Index_ lda, ldb, ldd;
+  correlationOutType *pDcast = reinterpret_cast<correlationOutType *>(pD);
+
+  ASSERT(!(((pA != pB) && (worksize < 2 * (m + n) * sizeof(AccType))) ||
+           (worksize < 2 * m * sizeof(AccType))),
+         "workspace size error");
+  ASSERT(workspace != nullptr, "workspace is null");
+
+  AccType *norm_col_vec = workspace;
+  AccType *norm_row_vec = workspace;
+  AccType *sq_norm_col_vec = workspace;
+  AccType *sq_norm_row_vec = workspace;
+  if (pA != pB) {
+    norm_row_vec += m;
+
+    raft::linalg::reduce(norm_col_vec, pA, k, m, (AccType)0, isRowMajor, true,
+                         stream, false, raft::Nop<AccType>(),
+                         raft::Sum<AccType>());
+    raft::linalg::reduce(norm_row_vec, pB, k, n, (AccType)0, isRowMajor, true,
+                         stream, false, raft::Nop<AccType>(),
+                         raft::Sum<AccType>());
+
+    sq_norm_col_vec += (m + n);
+    sq_norm_row_vec = sq_norm_col_vec + m;
+    raft::linalg::rowNorm(sq_norm_col_vec, pA, k, m, raft::linalg::L2Norm,
+                          isRowMajor, stream);
+    raft::linalg::rowNorm(sq_norm_row_vec, pB, k, n, raft::linalg::L2Norm,
+                          isRowMajor, stream);
+  } else {
+    raft::linalg::reduce(norm_col_vec, pA, k, m, (AccType)0, isRowMajor, true,
+                         stream, false, raft::Nop<AccType>(),
+                         raft::Sum<AccType>());
+    sq_norm_col_vec += m;
+    sq_norm_row_vec = sq_norm_col_vec;
+    raft::linalg::rowNorm(sq_norm_col_vec, pA, k, m, raft::linalg::L2Norm,
+                          isRowMajor, stream);
+  }
+
+  if (isRowMajor) {
+    lda = k, ldb = k, ldd = n;
+    correlation<InType, AccType, correlationOutType, Index_, FinalLambda, true>(
+      m, n, k, lda, ldb, ldd, pA, pB, norm_col_vec, norm_row_vec,
+      sq_norm_col_vec, sq_norm_row_vec, pDcast, fin_op, stream);
+  } else {
+    lda = n, ldb = m, ldd = m;
+    correlation<InType, AccType, correlationOutType, Index_, FinalLambda,
+                false>(n, m, k, lda, ldb, ldd, pB, pA, norm_row_vec,
+                       norm_col_vec, sq_norm_row_vec, sq_norm_col_vec, pDcast,
+                       fin_op, stream);
+  }
+}
+}  // namespace distance
+}  // namespace raft
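For reference, the epilogue above evaluates the expanded form of the Pearson-correlation distance entirely from five running sums, so no per-row means are ever materialized. A minimal host-side sketch of the same identity (illustrative only, not part of the patch; plain C++ with no RAFT dependencies):

#include <cmath>
#include <cstdio>

// Correlation distance between two k-length vectors, using the same
// single-pass accumulators the CUDA epilogue consumes: sum(x), sum(y),
// sum(x^2), sum(y^2) and sum(x*y).
double correlation_distance(const double* x, const double* y, int k) {
  double sx = 0, sy = 0, sxx = 0, syy = 0, sxy = 0;
  for (int i = 0; i < k; ++i) {
    sx += x[i];
    sy += y[i];
    sxx += x[i] * x[i];
    syy += y[i] * y[i];
    sxy += x[i] * y[i];
  }
  double numer = k * sxy - sx * sy;    // k^2 * cov(x, y)
  double Q_denom = k * sxx - sx * sx;  // k^2 * var(x)
  double R_denom = k * syy - sy * sy;  // k^2 * var(y)
  return 1.0 - numer / std::sqrt(Q_denom * R_denom);
}

int main() {
  double x[] = {1, 2, 3, 4};
  double y[] = {2, 4, 6, 8};  // perfectly correlated -> distance 0
  double z[] = {4, 3, 2, 1};  // anti-correlated      -> distance 2
  std::printf("%f %f\n", correlation_distance(x, y, 4),
              correlation_distance(x, z, 4));
  return 0;
}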
diff --git a/cpp/include/raft/distance/distance.cuh b/cpp/include/raft/distance/distance.cuh
index fc0d07773f..02d8fb6d03 100644
--- a/cpp/include/raft/distance/distance.cuh
+++ b/cpp/include/raft/distance/distance.cuh
@@ -21,11 +21,16 @@
 #include <raft/distance/canberra.cuh>
 #include <raft/distance/chebyshev.cuh>
 #include <raft/distance/cosine.cuh>
+#include <raft/distance/correlation.cuh>
 #include <raft/distance/euc_expanded.cuh>
 #include <raft/distance/euc_unexpanded.cuh>
+#include <raft/distance/hamming.cuh>
 #include <raft/distance/hellinger.cuh>
+#include <raft/distance/jensen_shannon.cuh>
+#include <raft/distance/kl_divergence.cuh>
 #include <raft/distance/l1.cuh>
 #include <raft/distance/minkowski.cuh>
+#include <raft/distance/russell_rao.cuh>
 #include <raft/mr/device/buffer.hpp>
 
 namespace raft {
@@ -171,6 +176,72 @@ struct DistanceImpl<raft::distance::DistanceType::HellingerExpanded, InType,
   }
 };
 
+template <typename InType, typename AccType, typename OutType,
+          typename FinalLambda, typename Index_>
+struct DistanceImpl<raft::distance::DistanceType::HammingUnexpanded, InType,
+                    AccType, OutType, FinalLambda, Index_> {
+  void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n,
+           Index_ k, void *, size_t, FinalLambda fin_op, cudaStream_t stream,
+           bool isRowMajor, InType) {
+    raft::distance::hammingUnexpandedImpl<InType, AccType, OutType,
+                                          FinalLambda, Index_>(
+      m, n, k, x, y, dist, fin_op, stream, isRowMajor);
+  }
+};
+
+template <typename InType, typename AccType, typename OutType,
+          typename FinalLambda, typename Index_>
+struct DistanceImpl<raft::distance::DistanceType::JensenShannon, InType,
+                    AccType, OutType, FinalLambda, Index_> {
+  void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n,
+           Index_ k, void *, size_t, FinalLambda fin_op, cudaStream_t stream,
+           bool isRowMajor, InType) {
+    raft::distance::jensenShannonImpl<InType, AccType, OutType, FinalLambda,
+                                      Index_>(m, n, k, x, y, dist, fin_op,
+                                              stream, isRowMajor);
+  }
+};
+
+template <typename InType, typename AccType, typename OutType,
+          typename FinalLambda, typename Index_>
+struct DistanceImpl<raft::distance::DistanceType::RusselRaoExpanded, InType,
+                    AccType, OutType, FinalLambda, Index_> {
+  void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n,
+           Index_ k, void *, size_t, FinalLambda fin_op, cudaStream_t stream,
+           bool isRowMajor, InType) {
+    raft::distance::russellRaoImpl<InType, AccType, OutType, FinalLambda,
+                                   Index_>(m, n, k, x, y, dist, fin_op, stream,
+                                           isRowMajor);
+  }
+};
+
+template <typename InType, typename AccType, typename OutType,
+          typename FinalLambda, typename Index_>
+struct DistanceImpl<raft::distance::DistanceType::KLDivergence, InType,
+                    AccType, OutType, FinalLambda, Index_> {
+  void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n,
+           Index_ k, void *, size_t, FinalLambda fin_op, cudaStream_t stream,
+           bool isRowMajor, InType) {
+    raft::distance::klDivergenceImpl<InType, AccType, OutType, FinalLambda,
+                                     Index_>(m, n, k, x, y, dist, fin_op,
+                                             stream, isRowMajor);
+  }
+};
+
+template <typename InType, typename AccType, typename OutType,
+          typename FinalLambda, typename Index_>
+struct DistanceImpl<raft::distance::DistanceType::CorrelationExpanded, InType,
+                    AccType, OutType, FinalLambda, Index_> {
+  void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n,
+           Index_ k, void *workspace, size_t worksize, FinalLambda fin_op,
+           cudaStream_t stream, bool isRowMajor, InType) {
+    raft::distance::correlationImpl<InType, AccType, OutType, FinalLambda,
+                                    Index_>(m, n, k, x, y, dist,
+                                            (AccType *)workspace, worksize,
+                                            fin_op, stream, isRowMajor);
+  }
+};
+
 }  // anonymous namespace
 
 /**
@@ -195,11 +266,16 @@ size_t getWorkspaceSize(const InType *x, const InType *y, Index_ m, Index_ n,
                         Index_ k) {
   size_t worksize = 0;
   constexpr bool is_allocated =
-    distanceType <= raft::distance::DistanceType::CosineExpanded;
+    (distanceType <= raft::distance::DistanceType::CosineExpanded) ||
+    (distanceType == raft::distance::DistanceType::CorrelationExpanded);
+  constexpr int numOfBuffers =
+    (distanceType == raft::distance::DistanceType::CorrelationExpanded) ? 2
+                                                                        : 1;
+
   if (is_allocated) {
-    worksize += m * sizeof(AccType);
-    if (x != y) worksize += n * sizeof(AccType);
+    worksize += numOfBuffers * m * sizeof(AccType);
+    if (x != y) worksize += numOfBuffers * n * sizeof(AccType);
   }
+
   return worksize;
 }
 
@@ -366,6 +442,31 @@ void pairwise_distance(const Type *x, const Type *y, Type *dist, Index_ m,
                              raft::distance::DistanceType::Canberra>(
         x, y, dist, m, n, k, workspace, stream, isRowMajor);
       break;
+    case raft::distance::DistanceType::HammingUnexpanded:
+      pairwise_distance_impl<Type, Index_,
+                             raft::distance::DistanceType::HammingUnexpanded>(
+        x, y, dist, m, n, k, workspace, stream, isRowMajor);
+      break;
+    case raft::distance::DistanceType::JensenShannon:
+      pairwise_distance_impl<Type, Index_,
+                             raft::distance::DistanceType::JensenShannon>(
+        x, y, dist, m, n, k, workspace, stream, isRowMajor);
+      break;
+    case raft::distance::DistanceType::RusselRaoExpanded:
+      pairwise_distance_impl<Type, Index_,
+                             raft::distance::DistanceType::RusselRaoExpanded>(
+        x, y, dist, m, n, k, workspace, stream, isRowMajor);
+      break;
+    case raft::distance::DistanceType::KLDivergence:
+      pairwise_distance_impl<Type, Index_,
+                             raft::distance::DistanceType::KLDivergence>(
+        x, y, dist, m, n, k, workspace, stream, isRowMajor);
+      break;
+    case raft::distance::DistanceType::CorrelationExpanded:
+      pairwise_distance_impl<Type, Index_,
+                             raft::distance::DistanceType::CorrelationExpanded>(
+        x, y, dist, m, n, k, workspace, stream, isRowMajor);
+      break;
     default:
       THROW("Unknown or unsupported distance metric '%d'!", (int)metric);
   };
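CorrelationExpanded is the only new metric that needs scratch space: per row-vector set it stores both the plain row sums and the squared row norms, hence the factor-of-two numOfBuffers above. A small host-side mirror of the sizing rule (a sketch for illustration; the real entry point is getWorkspaceSize):

#include <cassert>
#include <cstddef>
#include <cstdio>

// Mirrors the workspace rule in getWorkspaceSize: correlation keeps two
// AccType buffers per matrix (row sums + squared row norms), sized m for x
// and additionally n when y is a distinct matrix.
size_t correlation_workspace_bytes(int m, int n, bool x_equals_y,
                                   size_t sizeof_acc) {
  const int num_buffers = 2;  // sums and squared norms
  size_t bytes = num_buffers * m * sizeof_acc;
  if (!x_equals_y) bytes += num_buffers * n * sizeof_acc;
  return bytes;
}

int main() {
  // 1024 x 1024 float problem with distinct x and y:
  size_t ws = correlation_workspace_bytes(1024, 1024, false, sizeof(float));
  // Matches the ASSERT in correlationImpl:
  // worksize >= 2 * (m + n) * sizeof(AccType) when pA != pB.
  assert(ws >= 2 * (1024 + 1024) * sizeof(float));
  std::printf("workspace: %zu bytes\n", ws);
  return 0;
}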
diff --git a/cpp/include/raft/distance/hamming.cuh b/cpp/include/raft/distance/hamming.cuh
new file mode 100644
index 0000000000..08f1020b85
--- /dev/null
+++ b/cpp/include/raft/distance/hamming.cuh
@@ -0,0 +1,175 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+#include <raft/distance/pairwise_distance_base.cuh>
+
+namespace raft {
+namespace distance {
+
+/**
+ * @brief the Hamming distance matrix using the unexpanded form:
+ * It computes the following equation:
+   Cij = sum(x_i != y_i) / k
+ *
+ * @tparam DataT input data-type (for A and B matrices)
+ * @tparam AccT accumulation data-type
+ * @tparam OutT output data-type (for C and D matrices)
+ * @tparam IdxT index data-type
+ * @tparam Veclen number of k-elements loaded by each thread
+                  for every LDG call. details in contractions.cuh
+ * @tparam FinalLambda final lambda called on final distance value
+ * @tparam isRowMajor true if input/output is row major,
+                      false for column major
+ * @param[in] x input matrix
+ * @param[in] y input matrix
+ * @param[in] m number of rows of A and C/D
+ * @param[in] n number of rows of B and cols of C/D
+ * @param[in] k number of cols of A and B
+ * @param[in] lda leading dimension of A
+ * @param[in] ldb leading dimension of B
+ * @param[in] ldd leading dimension of C/D
+ * @param[out] dOutput output matrix
+ * @param[in] fin_op the final gemm epilogue lambda
+ * @param[in] stream cuda stream to launch work
+ */
+template <typename DataT, typename AccT, typename OutT, typename IdxT,
+          int Veclen, typename FinalLambda, bool isRowMajor>
+static void hammingUnexpandedImpl(const DataT *x, const DataT *y, IdxT m,
+                                  IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd,
+                                  OutT *dOutput, FinalLambda fin_op,
+                                  cudaStream_t stream) {
+  typedef typename raft::linalg::Policy4x4<DataT, Veclen>::Policy RowPolicy;
+  typedef typename raft::linalg::Policy4x4<DataT, Veclen>::ColPolicy ColPolicy;
+
+  typedef
+    typename std::conditional<isRowMajor, RowPolicy, ColPolicy>::type KPolicy;
+
+  dim3 blk(KPolicy::Nthreads);
+
+  // Accumulation operation lambda
+  auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) {
+    acc += (x != y);
+  };
+
+  // epilogue operation lambda for final value calculation
+  auto epilog_lambda = [k] __device__(
+                         AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh],
+                         DataT * regxn, DataT * regyn, IdxT gridStrideX,
+                         IdxT gridStrideY) {
+    const DataT one_over_k = DataT(1.0) / k;
+#pragma unroll
+    for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) {
+#pragma unroll
+      for (int j = 0; j < KPolicy::AccColsPerTh; ++j) {
+        acc[i][j] *= one_over_k;
+      }
+    }
+  };
+
+  if (isRowMajor) {
+    auto hammingUnexpandedRowMajor =
+      pairwiseDistanceMatKernel<false, DataT, AccT, OutT, IdxT, KPolicy,
+                                decltype(core_lambda), decltype(epilog_lambda),
+                                FinalLambda, true>;
+    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize,
+                                               hammingUnexpandedRowMajor);
+
+    hammingUnexpandedRowMajor<<<grid, blk, KPolicy::SmemSize, stream>>>(
+      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda,
+      epilog_lambda, fin_op);
+  } else {
+    auto hammingUnexpandedColMajor =
+      pairwiseDistanceMatKernel<false, DataT, AccT, OutT, IdxT, KPolicy,
+                                decltype(core_lambda), decltype(epilog_lambda),
+                                FinalLambda, false>;
+    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize,
+                                               hammingUnexpandedColMajor);
+    hammingUnexpandedColMajor<<<grid, blk, KPolicy::SmemSize, stream>>>(
+      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda,
+      epilog_lambda, fin_op);
+  }
+
+  CUDA_CHECK(cudaGetLastError());
+}
+
+template <typename DataT, typename AccT, typename OutT, typename IdxT,
+          typename FinalLambda, bool isRowMajor>
+void hammingUnexpanded(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd,
+                       const DataT *x, const DataT *y, OutT *dOutput,
+                       FinalLambda fin_op, cudaStream_t stream) {
+  size_t bytesA = sizeof(DataT) * lda;
+  size_t bytesB = sizeof(DataT) * ldb;
+  if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) {
+    hammingUnexpandedImpl<DataT, AccT, OutT, IdxT, 16 / sizeof(DataT),
+                          FinalLambda, isRowMajor>(x, y, m, n, k, lda, ldb, ldd,
+                                                   dOutput, fin_op, stream);
+  } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) {
+    hammingUnexpandedImpl<DataT, AccT, OutT, IdxT, 8 / sizeof(DataT),
+                          FinalLambda, isRowMajor>(x, y, m, n, k, lda, ldb, ldd,
+                                                   dOutput, fin_op, stream);
+  } else {
+    hammingUnexpandedImpl<DataT, AccT, OutT, IdxT, 1, FinalLambda, isRowMajor>(
+      x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
+  }
+}
+
+/**
+ * @brief the Hamming Unexpanded distance matrix calculation
+ * It computes the following equation:
+   Cij = sum(x_i != y_i) / k
+ *
+ * @tparam InType input data-type (for A and B matrices)
+ * @tparam AccType accumulation data-type
+ * @tparam OutType output data-type (for C and D matrices)
+ * @tparam FinalLambda user-defined epilogue lambda
+ * @tparam Index_ Index type
+ * @param m number of rows of A and C/D
+ * @param n number of columns of B and C/D
+ * @param k number of cols of A and B
+ * @param pA input matrix
+ * @param pB input matrix
+ * @param pD output matrix
+ * @param fin_op the final element-wise epilogue lambda
+ * @param stream cuda stream where to launch work
+ * @param isRowMajor whether the input and output matrices are row major
+ */
+template <typename InType, typename AccType, typename OutType,
+          typename FinalLambda, typename Index_ = int>
+void hammingUnexpandedImpl(int m, int n, int k, const InType *pA,
+                           const InType *pB, OutType *pD, FinalLambda fin_op,
+                           cudaStream_t stream, bool isRowMajor) {
+  typedef std::is_same<OutType, bool> is_bool;
+  typedef typename std::conditional<is_bool::value, OutType, AccType>::type
+    hammingUnexpandedOutType;
+  Index_ lda, ldb, ldd;
+  hammingUnexpandedOutType *pDcast =
+    reinterpret_cast<hammingUnexpandedOutType *>(pD);
+  if (isRowMajor) {
+    lda = k, ldb = k, ldd = n;
+    hammingUnexpanded<InType, AccType, hammingUnexpandedOutType, Index_,
+                      FinalLambda, true>(m, n, k, lda, ldb, ldd, pA, pB, pDcast,
+                                         fin_op, stream);
+
+  } else {
+    lda = n, ldb = m, ldd = m;
+    hammingUnexpanded<InType, AccType, hammingUnexpandedOutType, Index_,
+                      FinalLambda, false>(n, m, k, lda, ldb, ldd, pB, pA,
+                                          pDcast, fin_op, stream);
+  }
+}
+}  // namespace distance
+}  // namespace raft
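The 16/8/1 dispatch in the wrapper above picks the widest vectorized load the leading dimensions allow; the same pattern recurs in every metric in this patch. A standalone sketch of that selection rule (illustrative; the real code resolves this at compile time via the Veclen template parameter):

#include <cstdio>

// Returns the number of elements each thread loads per LDG, following the
// alignment checks in hammingUnexpanded: try 16-byte loads, then 8-byte,
// else fall back to scalar loads.
int choose_veclen(size_t sizeof_data, size_t lda, size_t ldb) {
  size_t bytesA = sizeof_data * lda;
  size_t bytesB = sizeof_data * ldb;
  if (16 % sizeof_data == 0 && bytesA % 16 == 0 && bytesB % 16 == 0)
    return 16 / sizeof_data;
  if (8 % sizeof_data == 0 && bytesA % 8 == 0 && bytesB % 8 == 0)
    return 8 / sizeof_data;
  return 1;
}

int main() {
  std::printf("%d\n", choose_veclen(sizeof(float), 1024, 1024));   // 4
  std::printf("%d\n", choose_veclen(sizeof(double), 1024, 1024));  // 2
  std::printf("%d\n", choose_veclen(sizeof(float), 1022, 1022));   // 2
  std::printf("%d\n", choose_veclen(sizeof(float), 1021, 1021));   // 1
  return 0;
}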
diff --git a/cpp/include/raft/distance/jensen_shannon.cuh b/cpp/include/raft/distance/jensen_shannon.cuh
new file mode 100644
index 0000000000..2a94205853
--- /dev/null
+++ b/cpp/include/raft/distance/jensen_shannon.cuh
@@ -0,0 +1,181 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+#include <raft/distance/pairwise_distance_base.cuh>
+
+namespace raft {
+namespace distance {
+
+/**
+ * @brief the Jensen-Shannon distance matrix:
+ * It computes the following equation:
+   Cij = sqrt(0.5 * sum( -x_i * (log(0.5 * (x_i + y_i)) - log(x_i))
+                       + (-y_i * (log(0.5 * (x_i + y_i)) - log(y_i)))))
+ *
+ * @tparam DataT input data-type (for A and B matrices)
+ * @tparam AccT accumulation data-type
+ * @tparam OutT output data-type (for C and D matrices)
+ * @tparam IdxT index data-type
+ * @tparam Veclen number of k-elements loaded by each thread
+                  for every LDG call. details in contractions.cuh
+ * @tparam FinalLambda final lambda called on final distance value
+ * @tparam isRowMajor true if input/output is row major,
+                      false for column major
+ * @param[in] x input matrix
+ * @param[in] y input matrix
+ * @param[in] m number of rows of A and C/D
+ * @param[in] n number of rows of B and cols of C/D
+ * @param[in] k number of cols of A and B
+ * @param[in] lda leading dimension of A
+ * @param[in] ldb leading dimension of B
+ * @param[in] ldd leading dimension of C/D
+ * @param[out] dOutput output matrix
+ * @param[in] fin_op the final gemm epilogue lambda
+ * @param[in] stream cuda stream to launch work
+ */
+template <typename DataT, typename AccT, typename OutT, typename IdxT,
+          int Veclen, typename FinalLambda, bool isRowMajor>
+static void jensenShannonImpl(const DataT *x, const DataT *y, IdxT m, IdxT n,
+                              IdxT k, IdxT lda, IdxT ldb, IdxT ldd,
+                              OutT *dOutput, FinalLambda fin_op,
+                              cudaStream_t stream) {
+  typedef typename raft::linalg::Policy4x4<DataT, Veclen>::Policy RowPolicy;
+  typedef typename raft::linalg::Policy4x4<DataT, Veclen>::ColPolicy ColPolicy;
+
+  typedef
+    typename std::conditional<isRowMajor, RowPolicy, ColPolicy>::type KPolicy;
+
+  dim3 blk(KPolicy::Nthreads);
+
+  // Accumulation operation lambda
+  auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) {
+    const DataT m = 0.5f * (x + y);
+    const bool m_zero = (m == 0);
+    const auto logM = (!m_zero) * raft::myLog(m + m_zero);
+
+    const bool x_zero = (x == 0);
+    const bool y_zero = (y == 0);
+    acc += (-x * (logM - raft::myLog(x + x_zero))) +
+           (-y * (logM - raft::myLog(y + y_zero)));
+  };
+
+  // epilogue operation lambda for final value calculation
+  auto epilog_lambda = [] __device__(
+                         AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh],
+                         DataT * regxn, DataT * regyn, IdxT gridStrideX,
+                         IdxT gridStrideY) {
+#pragma unroll
+    for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) {
+#pragma unroll
+      for (int j = 0; j < KPolicy::AccColsPerTh; ++j) {
+        acc[i][j] = raft::mySqrt(0.5 * acc[i][j]);
+      }
+    }
+  };
+
+  if (isRowMajor) {
+    auto jensenShannonRowMajor =
+      pairwiseDistanceMatKernel<false, DataT, AccT, OutT, IdxT, KPolicy,
+                                decltype(core_lambda), decltype(epilog_lambda),
+                                FinalLambda, true>;
+    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize,
+                                               jensenShannonRowMajor);
+
+    jensenShannonRowMajor<<<grid, blk, KPolicy::SmemSize, stream>>>(
+      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda,
+      epilog_lambda, fin_op);
+  } else {
+    auto jensenShannonColMajor =
+      pairwiseDistanceMatKernel<false, DataT, AccT, OutT, IdxT, KPolicy,
+                                decltype(core_lambda), decltype(epilog_lambda),
+                                FinalLambda, false>;
+    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize,
+                                               jensenShannonColMajor);
+    jensenShannonColMajor<<<grid, blk, KPolicy::SmemSize, stream>>>(
+      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda,
+      epilog_lambda, fin_op);
+  }
+
+  CUDA_CHECK(cudaGetLastError());
+}
+
+template <typename DataT, typename AccT, typename OutT, typename IdxT,
+          typename FinalLambda, bool isRowMajor>
+void jensenShannon(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd,
+                   const DataT *x, const DataT *y, OutT *dOutput,
+                   FinalLambda fin_op, cudaStream_t stream) {
+  size_t bytesA = sizeof(DataT) * lda;
+  size_t bytesB = sizeof(DataT) * ldb;
+  if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) {
+    jensenShannonImpl<DataT, AccT, OutT, IdxT, 16 / sizeof(DataT), FinalLambda,
+                      isRowMajor>(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op,
+                                  stream);
+  } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) {
+    jensenShannonImpl<DataT, AccT, OutT, IdxT, 8 / sizeof(DataT), FinalLambda,
+                      isRowMajor>(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op,
+                                  stream);
+  } else {
+    jensenShannonImpl<DataT, AccT, OutT, IdxT, 1, FinalLambda, isRowMajor>(
+      x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
+  }
+}
+
+/**
+ * @brief the Jensen-Shannon distance matrix calculation
+ * It computes the following equation:
+   Cij = sqrt(0.5 * sum( -x_i * (log(0.5 * (x_i + y_i)) - log(x_i))
+                       + (-y_i * (log(0.5 * (x_i + y_i)) - log(y_i)))))
+ *
+ * @tparam InType input data-type (for A and B matrices)
+ * @tparam AccType accumulation data-type
+ * @tparam OutType output data-type (for C and D matrices)
+ * @tparam FinalLambda user-defined epilogue lambda
+ * @tparam Index_ Index type
+ * @param m number of rows of A and C/D
+ * @param n number of columns of B and C/D
+ * @param k number of cols of A and B
+ * @param pA input matrix
+ * @param pB input matrix
+ * @param pD output matrix
+ * @param fin_op the final element-wise epilogue lambda
+ * @param stream cuda stream where to launch work
+ * @param isRowMajor whether the input and output matrices are row major
+ */
+template <typename InType, typename AccType, typename OutType,
+          typename FinalLambda, typename Index_ = int>
+void jensenShannonImpl(int m, int n, int k, const InType *pA, const InType *pB,
+                       OutType *pD, FinalLambda fin_op, cudaStream_t stream,
+                       bool isRowMajor) {
+  typedef std::is_same<OutType, bool> is_bool;
+  typedef typename std::conditional<is_bool::value, OutType, AccType>::type
+    jensenShannonOutType;
+  Index_ lda, ldb, ldd;
+  jensenShannonOutType *pDcast = reinterpret_cast<jensenShannonOutType *>(pD);
+  if (isRowMajor) {
+    lda = k, ldb = k, ldd = n;
+    jensenShannon<InType, AccType, jensenShannonOutType, Index_, FinalLambda,
+                  true>(m, n, k, lda, ldb, ldd, pA, pB, pDcast, fin_op, stream);
+
+  } else {
+    lda = n, ldb = m, ldd = m;
+    jensenShannon<InType, AccType, jensenShannonOutType, Index_, FinalLambda,
+                  false>(n, m, k, lda, ldb, ldd, pB, pA, pDcast, fin_op,
+                         stream);
+  }
+}
+}  // namespace distance
+}  // namespace raft
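The core lambda above never branches on zero inputs: the term `(!v_zero) * raft::myLog(v + v_zero)` bumps a zero argument to 1 (whose log is 0) so the log stays finite, and the mask makes the "zero probability contributes nothing" convention explicit. A tiny host check of that identity (plain C++, illustrative only):

#include <cmath>
#include <cstdio>

// Branch-free guarded log in the style of the Jensen-Shannon core lambda:
// for v == 0 the argument becomes v + 1 == 1, so log() is finite (and 0),
// and the !v_zero mask zeroes the whole term.
double guarded_log(double v) {
  const bool v_zero = (v == 0);
  return !v_zero * std::log(v + v_zero);
}

int main() {
  std::printf("%f %f %f\n",
              guarded_log(0.0),                  // 0, not -inf
              guarded_log(1.0),                  // 0
              guarded_log(2.718281828459045));   // ~1
  return 0;
}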
diff --git a/cpp/include/raft/distance/kl_divergence.cuh b/cpp/include/raft/distance/kl_divergence.cuh
new file mode 100644
index 0000000000..3197b73d10
--- /dev/null
+++ b/cpp/include/raft/distance/kl_divergence.cuh
@@ -0,0 +1,242 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+#include <raft/distance/pairwise_distance_base.cuh>
+
+namespace raft {
+namespace distance {
+
+/**
+ * @brief the KL Divergence distance matrix:
+ * It computes the following equation:
+   Cij = 0.5 * sum(x * log (x / y))
+ * This distance computation modifies B (row major) or A (column major) in
+ * place by taking log(x), and then undoes it with exp(log(x)). Because of
+ * this round trip, the values in A or B might differ slightly after this
+ * is invoked.
+ *
+ * @tparam DataT input data-type (for A and B matrices)
+ * @tparam AccT accumulation data-type
+ * @tparam OutT output data-type (for C and D matrices)
+ * @tparam IdxT index data-type
+ * @tparam Veclen number of k-elements loaded by each thread
+                  for every LDG call. details in contractions.cuh
+ * @tparam FinalLambda final lambda called on final distance value
+ * @tparam isRowMajor true if input/output is row major,
+                      false for column major
+ * @param[in] x input matrix
+ * @param[in] y input matrix
+ * @param[in] m number of rows of A and C/D
+ * @param[in] n number of rows of B and cols of C/D
+ * @param[in] k number of cols of A and B
+ * @param[in] lda leading dimension of A
+ * @param[in] ldb leading dimension of B
+ * @param[in] ldd leading dimension of C/D
+ * @param[out] dOutput output matrix
+ * @param[in] fin_op the final gemm epilogue lambda
+ * @param[in] stream cuda stream to launch work
+ */
+template <typename DataT, typename AccT, typename OutT, typename IdxT,
+          int Veclen, typename FinalLambda, bool isRowMajor>
+static void klDivergenceImpl(const DataT *x, const DataT *y, IdxT m, IdxT n,
+                             IdxT k, IdxT lda, IdxT ldb, IdxT ldd,
+                             OutT *dOutput, FinalLambda fin_op,
+                             cudaStream_t stream) {
+  typedef typename raft::linalg::Policy4x4<DataT, Veclen>::Policy RowPolicy;
+  typedef typename raft::linalg::Policy4x4<DataT, Veclen>::ColPolicy ColPolicy;
+
+  typedef
+    typename std::conditional<isRowMajor, RowPolicy, ColPolicy>::type KPolicy;
+
+  dim3 blk(KPolicy::Nthreads);
+
+  // Accumulation operation lambda
+  auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) {
+    if (isRowMajor) {
+      const bool x_zero = (x == 0);
+      acc += x * (raft::myLog(x + x_zero) - y);
+    } else {
+      const bool y_zero = (y == 0);
+      acc += y * (raft::myLog(y + y_zero) - x);
+    }
+  };
+
+  auto core_lambda_x_equal_y = [] __device__(AccT & acc, DataT & x, DataT & y) {
+    if (isRowMajor) {
+      const bool x_zero = (x == 0);
+      const bool y_zero = (y == 0);
+      acc +=
+        x * (raft::myLog(x + x_zero) - (!y_zero) * raft::myLog(y + y_zero));
+    } else {
+      const bool y_zero = (y == 0);
+      const bool x_zero = (x == 0);
+      acc +=
+        y * (raft::myLog(y + y_zero) - (!x_zero) * raft::myLog(x + x_zero));
+    }
+  };
+
+  auto unaryOp_lambda = [] __device__(DataT input) {
+    const bool x_zero = (input == 0);
+    return (!x_zero) * raft::myLog(input + x_zero);
+  };
+
+  auto unaryOp_lambda_reverse = [] __device__(DataT input) {
+    // reverse previous log (x) back to x using (e ^ log(x))
+    const bool x_zero = (input == 0);
+    return (!x_zero) * raft::myExp(input);
+  };
+
+  // epilogue operation lambda for final value calculation
+  auto epilog_lambda = [] __device__(
+                         AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh],
+                         DataT * regxn, DataT * regyn, IdxT gridStrideX,
+                         IdxT gridStrideY) {
+#pragma unroll
+    for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) {
+#pragma unroll
+      for (int j = 0; j < KPolicy::AccColsPerTh; ++j) {
+        acc[i][j] = (0.5f * acc[i][j]);
+      }
+    }
+  };
+
+  if (isRowMajor) {
+    constexpr auto klDivergenceRowMajor =
+      pairwiseDistanceMatKernel<false, DataT, AccT, OutT, IdxT, KPolicy,
+                                decltype(core_lambda), decltype(epilog_lambda),
+                                FinalLambda, true>;
+    constexpr auto klDivergenceRowMajorXequalY =
+      pairwiseDistanceMatKernel<false, DataT, AccT, OutT, IdxT, KPolicy,
+                                decltype(core_lambda_x_equal_y),
+                                decltype(epilog_lambda), FinalLambda, true>;
+    if (x != y) {
+      raft::linalg::unaryOp<DataT, decltype(unaryOp_lambda), IdxT>(
+        (DataT *)y, y, n * k, unaryOp_lambda, stream);
+      dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize,
+                                                 klDivergenceRowMajor);
+      klDivergenceRowMajor<<<grid, blk, KPolicy::SmemSize, stream>>>(
+        x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda,
+        epilog_lambda, fin_op);
+      // Now reverse previous log (x) back to x using (e ^ log(x))
+      raft::linalg::unaryOp<DataT, decltype(unaryOp_lambda_reverse), IdxT>(
+        (DataT *)y, y, n * k, unaryOp_lambda_reverse, stream);
+    } else {
+      dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize,
+                                                 klDivergenceRowMajorXequalY);
+      klDivergenceRowMajorXequalY<<<grid, blk, KPolicy::SmemSize, stream>>>(
+        x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput,
+        core_lambda_x_equal_y, epilog_lambda, fin_op);
+    }
+  } else {
+    constexpr auto klDivergenceColMajor =
+      pairwiseDistanceMatKernel<false, DataT, AccT, OutT, IdxT, KPolicy,
+                                decltype(core_lambda), decltype(epilog_lambda),
+                                FinalLambda, false>;
+    constexpr auto klDivergenceColMajorXequalY =
+      pairwiseDistanceMatKernel<false, DataT, AccT, OutT, IdxT, KPolicy,
+                                decltype(core_lambda_x_equal_y),
+                                decltype(epilog_lambda), FinalLambda, false>;
+    if (x != y) {
+      raft::linalg::unaryOp<DataT, decltype(unaryOp_lambda), IdxT>(
+        (DataT *)x, x, m * k, unaryOp_lambda, stream);
+      dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize,
+                                                 klDivergenceColMajor);
+      klDivergenceColMajor<<<grid, blk, KPolicy::SmemSize, stream>>>(
+        x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda,
+        epilog_lambda, fin_op);
+      // Now reverse previous log (x) back to x using (e ^ log(x))
+      raft::linalg::unaryOp<DataT, decltype(unaryOp_lambda_reverse), IdxT>(
+        (DataT *)x, x, m * k, unaryOp_lambda_reverse, stream);
+    } else {
+      dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize,
+                                                 klDivergenceColMajorXequalY);
+      klDivergenceColMajorXequalY<<<grid, blk, KPolicy::SmemSize, stream>>>(
+        x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput,
+        core_lambda_x_equal_y, epilog_lambda, fin_op);
+    }
+  }
+
+  CUDA_CHECK(cudaGetLastError());
+}
+
+template <typename DataT, typename AccT, typename OutT, typename IdxT,
+          typename FinalLambda, bool isRowMajor>
+void klDivergence(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd,
+                  const DataT *x, const DataT *y, OutT *dOutput,
+                  FinalLambda fin_op, cudaStream_t stream) {
+  size_t bytesA = sizeof(DataT) * lda;
+  size_t bytesB = sizeof(DataT) * ldb;
+  if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) {
+    klDivergenceImpl<DataT, AccT, OutT, IdxT, 16 / sizeof(DataT), FinalLambda,
+                     isRowMajor>(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op,
+                                 stream);
+  } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) {
+    klDivergenceImpl<DataT, AccT, OutT, IdxT, 8 / sizeof(DataT), FinalLambda,
+                     isRowMajor>(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op,
+                                 stream);
+  } else {
+    klDivergenceImpl<DataT, AccT, OutT, IdxT, 1, FinalLambda, isRowMajor>(
+      x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
+  }
+}
+
+/**
+ * @brief the KL Divergence distance matrix calculation
+ * It computes the following equation:
+   Cij = 0.5 * sum(x * log (x / y))
+ * This distance computation modifies B (row major) or A (column major) in
+ * place by taking log(x), and then undoes it with exp(log(x)). Because of
+ * this round trip, the values in A or B might differ slightly after this
+ * is invoked.
+ * @tparam InType input data-type (for A and B matrices)
+ * @tparam AccType accumulation data-type
+ * @tparam OutType output data-type (for C and D matrices)
+ * @tparam FinalLambda user-defined epilogue lambda
+ * @tparam Index_ Index type
+ * @param m number of rows of A and C/D
+ * @param n number of columns of B and C/D
+ * @param k number of cols of A and B
+ * @param pA input matrix
+ * @param pB input matrix
+ * @param pD output matrix
+ * @param fin_op the final element-wise epilogue lambda
+ * @param stream cuda stream where to launch work
+ * @param isRowMajor whether the input and output matrices are row major
+ */
+template <typename InType, typename AccType, typename OutType,
+          typename FinalLambda, typename Index_ = int>
+void klDivergenceImpl(int m, int n, int k, const InType *pA, const InType *pB,
+                      OutType *pD, FinalLambda fin_op, cudaStream_t stream,
+                      bool isRowMajor) {
+  typedef std::is_same<OutType, bool> is_bool;
+  typedef typename std::conditional<is_bool::value, OutType, AccType>::type
+    klDivergenceOutType;
+  Index_ lda, ldb, ldd;
+  klDivergenceOutType *pDcast = reinterpret_cast<klDivergenceOutType *>(pD);
+  if (isRowMajor) {
+    lda = k, ldb = k, ldd = n;
+    klDivergence<InType, AccType, klDivergenceOutType, Index_, FinalLambda,
+                 true>(m, n, k, lda, ldb, ldd, pA, pB, pDcast, fin_op, stream);
+
+  } else {
+    lda = n, ldb = m, ldd = m;
+    klDivergence<InType, AccType, klDivergenceOutType, Index_, FinalLambda,
+                 false>(n, m, k, lda, ldb, ldd, pB, pA, pDcast, fin_op, stream);
+  }
+}
+}  // namespace distance
+}  // namespace raft
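The header's warning about inputs changing slightly comes from the in-place log transform: the row-major path rewrites y to log(y) before the kernel, so the core lambda can accumulate x * (log x - log y) without a division or second log per element, and an exp pass afterwards restores y. A host-side sketch of that round trip and its small floating-point drift (illustrative only):

#include <cmath>
#include <cstdio>

int main() {
  // One KL term, computed directly and via the precomputed-log form the
  // kernel uses after y has been transformed in place.
  double x = 0.3, y = 0.2;
  double direct = x * std::log(x / y);
  double log_y = std::log(y);                // in-place transform of y
  double fused = x * (std::log(x) - log_y);  // what the core lambda sums
  double restored = std::exp(log_y);         // reverse transform: e^log(y)
  std::printf("direct=%.17f fused=%.17f\n", direct, fused);
  std::printf("y restored=%.17f (drift=%.3e)\n", restored, restored - y);
  return 0;
}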
diff --git a/cpp/include/raft/distance/russell_rao.cuh b/cpp/include/raft/distance/russell_rao.cuh
new file mode 100644
index 0000000000..417fb73b94
--- /dev/null
+++ b/cpp/include/raft/distance/russell_rao.cuh
@@ -0,0 +1,171 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+#include <raft/distance/pairwise_distance_base.cuh>
+
+namespace raft {
+namespace distance {
+
+/**
+ * @brief the Russell-Rao distance matrix:
+ * It computes the following equation:
+   Cij = (k - sum(x_i * y_i)) / k
+ *
+ * @tparam DataT input data-type (for A and B matrices)
+ * @tparam AccT accumulation data-type
+ * @tparam OutT output data-type (for C and D matrices)
+ * @tparam IdxT index data-type
+ * @tparam Veclen number of k-elements loaded by each thread
+                  for every LDG call. details in contractions.cuh
+ * @tparam FinalLambda final lambda called on final distance value
+ * @tparam isRowMajor true if input/output is row major,
+                      false for column major
+ * @param[in] x input matrix
+ * @param[in] y input matrix
+ * @param[in] m number of rows of A and C/D
+ * @param[in] n number of rows of B and cols of C/D
+ * @param[in] k number of cols of A and B
+ * @param[in] lda leading dimension of A
+ * @param[in] ldb leading dimension of B
+ * @param[in] ldd leading dimension of C/D
+ * @param[out] dOutput output matrix
+ * @param[in] fin_op the final gemm epilogue lambda
+ * @param[in] stream cuda stream to launch work
+ */
+template <typename DataT, typename AccT, typename OutT, typename IdxT,
+          int Veclen, typename FinalLambda, bool isRowMajor>
+static void russellRaoImpl(const DataT *x, const DataT *y, IdxT m, IdxT n,
+                           IdxT k, IdxT lda, IdxT ldb, IdxT ldd, OutT *dOutput,
+                           FinalLambda fin_op, cudaStream_t stream) {
+  typedef typename raft::linalg::Policy4x4<DataT, Veclen>::Policy RowPolicy;
+  typedef typename raft::linalg::Policy4x4<DataT, Veclen>::ColPolicy ColPolicy;
+
+  typedef
+    typename std::conditional<isRowMajor, RowPolicy, ColPolicy>::type KPolicy;
+
+  dim3 blk(KPolicy::Nthreads);
+
+  // Accumulation operation lambda
+  auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) {
+    acc += x * y;
+  };
+
+  const float one_over_k = 1.0 / k;
+  // epilogue operation lambda for final value calculation
+  auto epilog_lambda = [k, one_over_k] __device__(
+                         AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh],
+                         DataT * regxn, DataT * regyn, IdxT gridStrideX,
+                         IdxT gridStrideY) {
+#pragma unroll
+    for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) {
+#pragma unroll
+      for (int j = 0; j < KPolicy::AccColsPerTh; ++j) {
+        acc[i][j] = (k - acc[i][j]) * one_over_k;
+      }
+    }
+  };
+
+  if (isRowMajor) {
+    constexpr auto russellRaoRowMajor =
+      pairwiseDistanceMatKernel<false, DataT, AccT, OutT, IdxT, KPolicy,
+                                decltype(core_lambda), decltype(epilog_lambda),
+                                FinalLambda, true>;
+    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize,
+                                               russellRaoRowMajor);
+
+    russellRaoRowMajor<<<grid, blk, KPolicy::SmemSize, stream>>>(
+      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda,
+      epilog_lambda, fin_op);
+  } else {
+    constexpr auto russellRaoColMajor =
+      pairwiseDistanceMatKernel<false, DataT, AccT, OutT, IdxT, KPolicy,
+                                decltype(core_lambda), decltype(epilog_lambda),
+                                FinalLambda, false>;
+    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize,
+                                               russellRaoColMajor);
+    russellRaoColMajor<<<grid, blk, KPolicy::SmemSize, stream>>>(
+      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda,
+      epilog_lambda, fin_op);
+  }
+
+  CUDA_CHECK(cudaGetLastError());
+}
+
+template <typename DataT, typename AccT, typename OutT, typename IdxT,
+          typename FinalLambda, bool isRowMajor>
+void russellRao(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd,
+                const DataT *x, const DataT *y, OutT *dOutput,
+                FinalLambda fin_op, cudaStream_t stream) {
+  size_t bytesA = sizeof(DataT) * lda;
+  size_t bytesB = sizeof(DataT) * ldb;
+  if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) {
+    russellRaoImpl<DataT, AccT, OutT, IdxT, 16 / sizeof(DataT), FinalLambda,
+                   isRowMajor>(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op,
+                               stream);
+  } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) {
+    russellRaoImpl<DataT, AccT, OutT, IdxT, 8 / sizeof(DataT), FinalLambda,
+                   isRowMajor>(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op,
+                               stream);
+  } else {
+    russellRaoImpl<DataT, AccT, OutT, IdxT, 1, FinalLambda, isRowMajor>(
+      x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
+  }
+}
+
+/**
+ * @brief the Russell-Rao distance matrix calculation
+ * It computes the following equation:
+   Cij = (k - sum(x_i * y_i)) / k
+ *
+ * @tparam InType input data-type (for A and B matrices)
+ * @tparam AccType accumulation data-type
+ * @tparam OutType output data-type (for C and D matrices)
+ * @tparam FinalLambda user-defined epilogue lambda
+ * @tparam Index_ Index type
+ * @param m number of rows of A and C/D
+ * @param n number of columns of B and C/D
+ * @param k number of cols of A and B
+ * @param pA input matrix
+ * @param pB input matrix
+ * @param pD output matrix
+ * @param fin_op the final element-wise epilogue lambda
+ * @param stream cuda stream where to launch work
+ * @param isRowMajor whether the input and output matrices are row major
+ */
+template <typename InType, typename AccType, typename OutType,
+          typename FinalLambda, typename Index_ = int>
+void russellRaoImpl(int m, int n, int k, const InType *pA, const InType *pB,
+                    OutType *pD, FinalLambda fin_op, cudaStream_t stream,
+                    bool isRowMajor) {
+  typedef std::is_same<OutType, bool> is_bool;
+  typedef typename std::conditional<is_bool::value, OutType, AccType>::type
+    russellRaoOutType;
+  Index_ lda, ldb, ldd;
+  russellRaoOutType *pDcast = reinterpret_cast<russellRaoOutType *>(pD);
+  if (isRowMajor) {
+    lda = k, ldb = k, ldd = n;
+    russellRao<InType, AccType, russellRaoOutType, Index_, FinalLambda, true>(
+      m, n, k, lda, ldb, ldd, pA, pB, pDcast, fin_op, stream);
+
+  } else {
+    lda = n, ldb = m, ldd = m;
+    russellRao<InType, AccType, russellRaoOutType, Index_, FinalLambda, false>(
+      n, m, k, lda, ldb, ldd, pB, pA, pDcast, fin_op, stream);
+  }
+}
+}  // namespace distance
+}  // namespace raft
diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt
index f94a8d9525..0428e09142 100644
--- a/cpp/test/CMakeLists.txt
+++ b/cpp/test/CMakeLists.txt
@@ -21,12 +21,17 @@ add_executable(test_raft
   test/distance/dist_adj.cu
   test/distance/dist_canberra.cu
   test/distance/dist_chebyshev.cu
+  test/distance/dist_correlation.cu
   test/distance/dist_cos.cu
   test/distance/dist_euc_exp.cu
   test/distance/dist_euc_unexp.cu
+  test/distance/dist_hamming.cu
   test/distance/dist_hellinger.cu
+  test/distance/dist_jensen_shannon.cu
+  test/distance/dist_kl_divergence.cu
   test/distance/dist_l1.cu
   test/distance/dist_minkowski.cu
+  test/distance/dist_russell_rao.cu
   test/distance/fused_l2_nn.cu
   test/eigen_solvers.cu
   test/handle.cpp
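Each test below instantiates the shared DistanceTest harness over a grid of inputs. Judging by the initializers and how the fields are read back (params.tolerance, params.m, params.n, params.k, params.isRowMajor), the tuples appear to encode the following struct; this is a presumed layout for readability, not a definition from this patch:

// Presumed shape of DistanceInputs<DataType>, inferred from usage such as
// {0.001f, 1024, 1024, 32, true, 1234ULL} and params.isRowMajor below.
template <typename DataType>
struct DistanceInputs {
  DataType tolerance;       // comparison tolerance for devArrMatch
  int m, n, k;              // problem sizes: m x k and n x k inputs
  bool isRowMajor;          // layout under test
  unsigned long long seed;  // RNG seed for the random inputs
};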
diff --git a/cpp/test/distance/dist_correlation.cu b/cpp/test/distance/dist_correlation.cu
new file mode 100644
index 0000000000..5d84f18e52
--- /dev/null
+++ b/cpp/test/distance/dist_correlation.cu
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../test_utils.h"
+#include "distance_base.cuh"
+
+namespace raft {
+namespace distance {
+
+template <typename DataType>
+class DistanceCorrelation
+  : public DistanceTest<raft::distance::DistanceType::CorrelationExpanded,
+                        DataType> {};
+
+const std::vector<DistanceInputs<float>> inputsf = {
+  {0.001f, 1024, 1024, 32, true, 1234ULL},
+  {0.001f, 1024, 32, 1024, true, 1234ULL},
+  {0.001f, 32, 1024, 1024, true, 1234ULL},
+  {0.003f, 1024, 1024, 1024, true, 1234ULL},
+  {0.001f, 1024, 1024, 32, false, 1234ULL},
+  {0.001f, 1024, 32, 1024, false, 1234ULL},
+  {0.001f, 32, 1024, 1024, false, 1234ULL},
+  {0.003f, 1024, 1024, 1024, false, 1234ULL},
+};
+typedef DistanceCorrelation<float> DistanceCorrelationF;
+TEST_P(DistanceCorrelationF, Result) {
+  int m = params.isRowMajor ? params.m : params.n;
+  int n = params.isRowMajor ? params.n : params.m;
+  ASSERT_TRUE(raft::devArrMatch(dist_ref, dist, m, n,
+                                raft::CompareApprox<float>(params.tolerance)));
+}
+INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceCorrelationF,
+                        ::testing::ValuesIn(inputsf));
+
+const std::vector<DistanceInputs<double>> inputsd = {
+  {0.001, 1024, 1024, 32, true, 1234ULL},
+  {0.001, 1024, 32, 1024, true, 1234ULL},
+  {0.001, 32, 1024, 1024, true, 1234ULL},
+  {0.003, 1024, 1024, 1024, true, 1234ULL},
+  {0.001, 1024, 1024, 32, false, 1234ULL},
+  {0.001, 1024, 32, 1024, false, 1234ULL},
+  {0.001, 32, 1024, 1024, false, 1234ULL},
+  {0.003, 1024, 1024, 1024, false, 1234ULL},
+};
+typedef DistanceCorrelation<double> DistanceCorrelationD;
+TEST_P(DistanceCorrelationD, Result) {
+  int m = params.isRowMajor ? params.m : params.n;
+  int n = params.isRowMajor ? params.n : params.m;
+  ASSERT_TRUE(raft::devArrMatch(dist_ref, dist, m, n,
+                                raft::CompareApprox<double>(params.tolerance)));
+}
+INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceCorrelationD,
+                        ::testing::ValuesIn(inputsd));
+
+}  // end namespace distance
+}  // end namespace raft
diff --git a/cpp/test/distance/dist_hamming.cu b/cpp/test/distance/dist_hamming.cu
new file mode 100644
index 0000000000..47febd825b
--- /dev/null
+++ b/cpp/test/distance/dist_hamming.cu
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2018-2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../test_utils.h"
+#include "distance_base.cuh"
+
+namespace raft {
+namespace distance {
+
+template <typename DataType>
+class DistanceHamming
+  : public DistanceTest<raft::distance::DistanceType::HammingUnexpanded,
+                        DataType> {};
+
+const std::vector<DistanceInputs<float>> inputsf = {
+  {0.001f, 1024, 1024, 32, true, 1234ULL},
+  {0.001f, 1024, 32, 1024, true, 1234ULL},
+  {0.001f, 32, 1024, 1024, true, 1234ULL},
+  {0.003f, 1024, 1024, 1024, true, 1234ULL},
+  {0.001f, 1024, 1024, 32, false, 1234ULL},
+  {0.001f, 1024, 32, 1024, false, 1234ULL},
+  {0.001f, 32, 1024, 1024, false, 1234ULL},
+  {0.003f, 1024, 1024, 1024, false, 1234ULL},
+};
+typedef DistanceHamming<float> DistanceHammingF;
+TEST_P(DistanceHammingF, Result) {
+  int m = params.isRowMajor ? params.m : params.n;
+  int n = params.isRowMajor ? params.n : params.m;
+  ASSERT_TRUE(raft::devArrMatch(dist_ref, dist, m, n,
+                                raft::CompareApprox<float>(params.tolerance)));
+}
+INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceHammingF,
+                        ::testing::ValuesIn(inputsf));
+
+const std::vector<DistanceInputs<double>> inputsd = {
+  {0.001, 1024, 1024, 32, true, 1234ULL},
+  {0.001, 1024, 32, 1024, true, 1234ULL},
+  {0.001, 32, 1024, 1024, true, 1234ULL},
+  {0.003, 1024, 1024, 1024, true, 1234ULL},
+  {0.001, 1024, 1024, 32, false, 1234ULL},
+  {0.001, 1024, 32, 1024, false, 1234ULL},
+  {0.001, 32, 1024, 1024, false, 1234ULL},
+  {0.003, 1024, 1024, 1024, false, 1234ULL},
+};
+typedef DistanceHamming<double> DistanceHammingD;
+TEST_P(DistanceHammingD, Result) {
+  int m = params.isRowMajor ? params.m : params.n;
+  int n = params.isRowMajor ? params.n : params.m;
+  ASSERT_TRUE(raft::devArrMatch(dist_ref, dist, m, n,
+                                raft::CompareApprox<double>(params.tolerance)));
+}
+INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceHammingD,
+                        ::testing::ValuesIn(inputsd));
+
+}  // end namespace distance
+}  // end namespace raft
diff --git a/cpp/test/distance/dist_jensen_shannon.cu b/cpp/test/distance/dist_jensen_shannon.cu
new file mode 100644
index 0000000000..bc0b56f506
--- /dev/null
+++ b/cpp/test/distance/dist_jensen_shannon.cu
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../test_utils.h"
+#include "distance_base.cuh"
+
+namespace raft {
+namespace distance {
+
+template <typename DataType>
+class DistanceJensenShannon
+  : public DistanceTest<raft::distance::DistanceType::JensenShannon,
+                        DataType> {};
+
+const std::vector<DistanceInputs<float>> inputsf = {
+  {0.001f, 1024, 1024, 32, true, 1234ULL},
+  {0.001f, 1024, 32, 1024, true, 1234ULL},
+  {0.001f, 32, 1024, 1024, true, 1234ULL},
+  {0.003f, 1024, 1024, 1024, true, 1234ULL},
+  {0.001f, 1024, 1024, 32, false, 1234ULL},
+  {0.001f, 1024, 32, 1024, false, 1234ULL},
+  {0.001f, 32, 1024, 1024, false, 1234ULL},
+  {0.003f, 1024, 1024, 1024, false, 1234ULL},
+};
+typedef DistanceJensenShannon<float> DistanceJensenShannonF;
+TEST_P(DistanceJensenShannonF, Result) {
+  int m = params.isRowMajor ? params.m : params.n;
+  int n = params.isRowMajor ? params.n : params.m;
+  ASSERT_TRUE(raft::devArrMatch(dist_ref, dist, m, n,
+                                raft::CompareApprox<float>(params.tolerance)));
+}
+INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceJensenShannonF,
+                        ::testing::ValuesIn(inputsf));
+
+const std::vector<DistanceInputs<double>> inputsd = {
+  {0.001, 1024, 1024, 32, true, 1234ULL},
+  {0.001, 1024, 32, 1024, true, 1234ULL},
+  {0.001, 32, 1024, 1024, true, 1234ULL},
+  {0.003, 1024, 1024, 1024, true, 1234ULL},
+  {0.001, 1024, 1024, 32, false, 1234ULL},
+  {0.001, 1024, 32, 1024, false, 1234ULL},
+  {0.001, 32, 1024, 1024, false, 1234ULL},
+  {0.003, 1024, 1024, 1024, false, 1234ULL},
+};
+typedef DistanceJensenShannon<double> DistanceJensenShannonD;
+TEST_P(DistanceJensenShannonD, Result) {
+  int m = params.isRowMajor ? params.m : params.n;
+  int n = params.isRowMajor ? params.n : params.m;
+  ASSERT_TRUE(raft::devArrMatch(dist_ref, dist, m, n,
+                                raft::CompareApprox<double>(params.tolerance)));
+}
+INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceJensenShannonD,
+                        ::testing::ValuesIn(inputsd));
+
+}  // end namespace distance
+}  // end namespace raft
diff --git a/cpp/test/distance/dist_kl_divergence.cu b/cpp/test/distance/dist_kl_divergence.cu
new file mode 100644
index 0000000000..884ac4b948
--- /dev/null
+++ b/cpp/test/distance/dist_kl_divergence.cu
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../test_utils.h"
+#include "distance_base.cuh"
+
+namespace raft {
+namespace distance {
+
+template <typename DataType>
+class DistanceKLDivergence
+  : public DistanceTest<raft::distance::DistanceType::KLDivergence,
+                        DataType> {};
+
+const std::vector<DistanceInputs<float>> inputsf = {
+  {0.001f, 1024, 1024, 32, true, 1234ULL},
+  {0.001f, 1024, 32, 1024, true, 1234ULL},
+  {0.001f, 32, 1024, 1024, true, 1234ULL},
+  {0.003f, 1024, 1024, 1024, true, 1234ULL},
+  {0.001f, 1024, 1024, 32, false, 1234ULL},
+  {0.001f, 1024, 32, 1024, false, 1234ULL},
+  {0.001f, 32, 1024, 1024, false, 1234ULL},
+  {0.003f, 1024, 1024, 1024, false, 1234ULL},
+};
+typedef DistanceKLDivergence<float> DistanceKLDivergenceF;
+TEST_P(DistanceKLDivergenceF, Result) {
+  int m = params.isRowMajor ? params.m : params.n;
+  int n = params.isRowMajor ? params.n : params.m;
+  ASSERT_TRUE(raft::devArrMatch(dist_ref, dist, m, n,
+                                raft::CompareApprox<float>(params.tolerance)));
+}
+INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceKLDivergenceF,
+                        ::testing::ValuesIn(inputsf));
+
+const std::vector<DistanceInputs<double>> inputsd = {
+  {0.001, 1024, 1024, 32, true, 1234ULL},
+  {0.001, 1024, 32, 1024, true, 1234ULL},
+  {0.001, 32, 1024, 1024, true, 1234ULL},
+  {0.003, 1024, 1024, 1024, true, 1234ULL},
+  {0.001, 1024, 1024, 32, false, 1234ULL},
+  {0.001, 1024, 32, 1024, false, 1234ULL},
+  {0.001, 32, 1024, 1024, false, 1234ULL},
+  {0.003, 1024, 1024, 1024, false, 1234ULL},
+};
+typedef DistanceKLDivergence<double> DistanceKLDivergenceD;
+TEST_P(DistanceKLDivergenceD, Result) {
+  int m = params.isRowMajor ? params.m : params.n;
+  int n = params.isRowMajor ? params.n : params.m;
+  ASSERT_TRUE(raft::devArrMatch(dist_ref, dist, m, n,
+                                raft::CompareApprox<double>(params.tolerance)));
+}
+INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceKLDivergenceD,
+                        ::testing::ValuesIn(inputsd));
+
+}  // end namespace distance
+}  // end namespace raft
diff --git a/cpp/test/distance/dist_russell_rao.cu b/cpp/test/distance/dist_russell_rao.cu
new file mode 100644
index 0000000000..74ccfb0c2e
--- /dev/null
+++ b/cpp/test/distance/dist_russell_rao.cu
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../test_utils.h"
+#include "distance_base.cuh"
+
+namespace raft {
+namespace distance {
+
+template <typename DataType>
+class DistanceRussellRao
+  : public DistanceTest<raft::distance::DistanceType::RusselRaoExpanded,
+                        DataType> {};
+
+const std::vector<DistanceInputs<float>> inputsf = {
+  {0.001f, 1024, 1024, 32, true, 1234ULL},
+  {0.001f, 1024, 32, 1024, true, 1234ULL},
+  {0.001f, 32, 1024, 1024, true, 1234ULL},
+  {0.003f, 1024, 1024, 1024, true, 1234ULL},
+  {0.001f, 1024, 1024, 32, false, 1234ULL},
+  {0.001f, 1024, 32, 1024, false, 1234ULL},
+  {0.001f, 32, 1024, 1024, false, 1234ULL},
+  {0.003f, 1024, 1024, 1024, false, 1234ULL},
+};
+typedef DistanceRussellRao<float> DistanceRussellRaoF;
+TEST_P(DistanceRussellRaoF, Result) {
+  int m = params.isRowMajor ? params.m : params.n;
+  int n = params.isRowMajor ? params.n : params.m;
+  ASSERT_TRUE(raft::devArrMatch(dist_ref, dist, m, n,
+                                raft::CompareApprox<float>(params.tolerance)));
+}
+INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceRussellRaoF,
+                        ::testing::ValuesIn(inputsf));
+
+const std::vector<DistanceInputs<double>> inputsd = {
+  {0.001, 1024, 1024, 32, true, 1234ULL},
+  {0.001, 1024, 32, 1024, true, 1234ULL},
+  {0.001, 32, 1024, 1024, true, 1234ULL},
+  {0.003, 1024, 1024, 1024, true, 1234ULL},
+  {0.001, 1024, 1024, 32, false, 1234ULL},
+  {0.001, 1024, 32, 1024, false, 1234ULL},
+  {0.001, 32, 1024, 1024, false, 1234ULL},
+  {0.003, 1024, 1024, 1024, false, 1234ULL},
+};
+typedef DistanceRussellRao<double> DistanceRussellRaoD;
+TEST_P(DistanceRussellRaoD, Result) {
+  int m = params.isRowMajor ? params.m : params.n;
+  int n = params.isRowMajor ? params.n : params.m;
+  ASSERT_TRUE(raft::devArrMatch(dist_ref, dist, m, n,
+                                raft::CompareApprox<double>(params.tolerance)));
+}
+INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceRussellRaoD,
+                        ::testing::ValuesIn(inputsd));
+
+}  // end namespace distance
+}  // end namespace raft
diff --git a/cpp/test/distance/distance_base.cuh b/cpp/test/distance/distance_base.cuh
index fc7b064205..9e3290593d 100644
--- a/cpp/test/distance/distance_base.cuh
+++ b/cpp/test/distance/distance_base.cuh
@@ -160,6 +160,138 @@ __global__ void naiveLpUnexpDistanceKernel(DataType *dist, const DataType *x,
   dist[outidx] = acc;
 }
 
+template <typename DataType>
+__global__ void naiveHammingDistanceKernel(DataType *dist, const DataType *x,
+                                           const DataType *y, int m, int n,
+                                           int k, bool isRowMajor) {
+  int midx = threadIdx.x + blockIdx.x * blockDim.x;
+  int nidx = threadIdx.y + blockIdx.y * blockDim.y;
+  if (midx >= m || nidx >= n) return;
+  DataType acc = DataType(0);
+  for (int i = 0; i < k; ++i) {
+    int xidx = isRowMajor ? i + midx * k : i * m + midx;
+    int yidx = isRowMajor ? i + nidx * k : i * n + nidx;
+    auto a = x[xidx];
+    auto b = y[yidx];
+    acc += (a != b);
+  }
+  acc = acc / k;
+  int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx;
+  dist[outidx] = acc;
+}
+
+template <typename DataType>
+__global__ void naiveJensenShannonDistanceKernel(DataType *dist,
+                                                 const DataType *x,
+                                                 const DataType *y, int m,
+                                                 int n, int k,
+                                                 bool isRowMajor) {
+  int midx = threadIdx.x + blockIdx.x * blockDim.x;
+  int nidx = threadIdx.y + blockIdx.y * blockDim.y;
+  if (midx >= m || nidx >= n) return;
+  DataType acc = DataType(0);
+  for (int i = 0; i < k; ++i) {
+    int xidx = isRowMajor ? i + midx * k : i * m + midx;
+    int yidx = isRowMajor ? i + nidx * k : i * n + nidx;
+    auto a = x[xidx];
+    auto b = y[yidx];
+
+    DataType m_ab = 0.5f * (a + b);  // midpoint (distinct from row count m)
+    bool a_zero = a == 0;
+    bool b_zero = b == 0;
+
+    DataType p = (!a_zero * m_ab) / (a_zero + a);
+    DataType q = (!b_zero * m_ab) / (b_zero + b);
+
+    bool p_zero = p == 0;
+    bool q_zero = q == 0;
+
+    acc +=
+      (-a * (!p_zero * log(p + p_zero))) + (-b * (!q_zero * log(q + q_zero)));
+  }
+  acc = raft::mySqrt(0.5f * acc);
+  int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx;
+  dist[outidx] = acc;
+}
+
+template <typename DataType, typename OutType>
+__global__ void naiveRussellRaoDistanceKernel(OutType *dist, const DataType *x,
+                                              const DataType *y, int m, int n,
+                                              int k, bool isRowMajor) {
+  int midx = threadIdx.x + blockIdx.x * blockDim.x;
+  int nidx = threadIdx.y + blockIdx.y * blockDim.y;
+  if (midx >= m || nidx >= n) return;
+  OutType acc = OutType(0);
+  for (int i = 0; i < k; ++i) {
+    int xidx = isRowMajor ? i + midx * k : i * m + midx;
+    int yidx = isRowMajor ? i + nidx * k : i * n + nidx;
+    auto a = x[xidx];
+    auto b = y[yidx];
+    acc += (a * b);
+  }
+  acc = (k - acc) / k;
+  int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx;
+  dist[outidx] = acc;
+}
+
+template <typename DataType, typename OutType>
+__global__ void naiveKLDivergenceDistanceKernel(OutType *dist,
+                                                const DataType *x,
+                                                const DataType *y, int m, int n,
+                                                int k, bool isRowMajor) {
+  int midx = threadIdx.x + blockIdx.x * blockDim.x;
+  int nidx = threadIdx.y + blockIdx.y * blockDim.y;
+  if (midx >= m || nidx >= n) return;
+  OutType acc = OutType(0);
+  for (int i = 0; i < k; ++i) {
+    int xidx = isRowMajor ? i + midx * k : i * m + midx;
+    int yidx = isRowMajor ? i + nidx * k : i * n + nidx;
+    auto a = x[xidx];
+    auto b = y[yidx];
+    bool b_zero = (b == 0);
+    const auto ratio = (!b_zero) * (a / b);
+    const bool ratio_zero = (ratio == 0);
+    acc += (a * (!ratio_zero) * log(ratio + ratio_zero));
+  }
+  acc = 0.5f * acc;
+  int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx;
+  dist[outidx] = acc;
+}
+
+template <typename DataType, typename OutType>
+__global__ void naiveCorrelationDistanceKernel(OutType *dist,
+                                               const DataType *x,
+                                               const DataType *y, int m, int n,
+                                               int k, bool isRowMajor) {
+  int midx = threadIdx.x + blockIdx.x * blockDim.x;
+  int nidx = threadIdx.y + blockIdx.y * blockDim.y;
+  if (midx >= m || nidx >= n) return;
+  OutType acc = OutType(0);
+  auto a_norm = DataType(0);
+  auto b_norm = DataType(0);
+  auto a_sq_norm = DataType(0);
+  auto b_sq_norm = DataType(0);
+  for (int i = 0; i < k; ++i) {
+    int xidx = isRowMajor ? i + midx * k : i * m + midx;
+    int yidx = isRowMajor ? i + nidx * k : i * n + nidx;
+    auto a = x[xidx];
+    auto b = y[yidx];
+    a_norm += a;
+    b_norm += b;
+    a_sq_norm += (a * a);
+    b_sq_norm += (b * b);
+    acc += (a * b);
+  }
+
+  auto numer = k * acc - (a_norm * b_norm);
+  auto Q_denom = k * a_sq_norm - (a_norm * a_norm);
+  auto R_denom = k * b_sq_norm - (b_norm * b_norm);
+
+  acc = 1 - (numer / raft::mySqrt(Q_denom * R_denom));
+
+  int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx;
+  dist[outidx] = acc;
+}
+
 template <typename DataType>
 void naiveDistance(DataType *dist, const DataType *x, const DataType *y, int m,
                    int n, int k, raft::distance::DistanceType type,
@@ -193,6 +325,26 @@ void naiveDistance(DataType *dist, const DataType *x, const DataType *y, int m,
       naiveLpUnexpDistanceKernel<DataType>
        <<<nblks, TPB>>>(dist, x, y, m, n, k, isRowMajor, metric_arg);
      break;
+    case raft::distance::DistanceType::HammingUnexpanded:
+      naiveHammingDistanceKernel<DataType>
+        <<<nblks, TPB>>>(dist, x, y, m, n, k, isRowMajor);
+      break;
+    case raft::distance::DistanceType::JensenShannon:
+      naiveJensenShannonDistanceKernel<DataType>
+        <<<nblks, TPB>>>(dist, x, y, m, n, k, isRowMajor);
+      break;
+    case raft::distance::DistanceType::RusselRaoExpanded:
+      naiveRussellRaoDistanceKernel<DataType>
+        <<<nblks, TPB>>>(dist, x, y, m, n, k, isRowMajor);
+      break;
+    case raft::distance::DistanceType::KLDivergence:
+      naiveKLDivergenceDistanceKernel<DataType>
+        <<<nblks, TPB>>>(dist, x, y, m, n, k, isRowMajor);
+      break;
+    case raft::distance::DistanceType::CorrelationExpanded:
+      naiveCorrelationDistanceKernel<DataType>
+        <<<nblks, TPB>>>(dist, x, y, m, n, k, isRowMajor);
+      break;
     default:
       FAIL() << "should be here\n";
   }
@@ -247,10 +399,17 @@ class DistanceTest : public ::testing::TestWithParam<DistanceInputs<DataType>> {
     raft::allocate(dist_ref, m * n);
     raft::allocate(dist, m * n);
    raft::allocate(dist2, m * n);
-    if (distanceType == raft::distance::DistanceType::HellingerExpanded) {
+    if (distanceType == raft::distance::DistanceType::HellingerExpanded ||
+        distanceType == raft::distance::DistanceType::JensenShannon ||
+        distanceType == raft::distance::DistanceType::KLDivergence) {
       // Hellinger works only on positive numbers
       r.uniform(x, m * k, DataType(0.0), DataType(1.0), stream);
       r.uniform(y, n * k, DataType(0.0), DataType(1.0), stream);
+    } else if (distanceType ==
+               raft::distance::DistanceType::RusselRaoExpanded) {
+      // Russell-Rao works on boolean values.
+      r.bernoulli(x, m * k, 0.5f, stream);
+      r.bernoulli(y, n * k, 0.5f, stream);
     } else {
       r.uniform(x, m * k, DataType(-1.0), DataType(1.0), stream);
       r.uniform(y, n * k, DataType(-1.0), DataType(1.0), stream);
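As a final sanity check on the documented equations, here is a small CPU program (illustrative only, independent of the test harness above) that evaluates each of the five new metrics on one pair of vectors, using exactly the formulas from the headers; terms with a zero probability contribute nothing, matching the kernels' zero-handling convention:

#include <cmath>
#include <cstdio>

int main() {
  const int k = 4;
  double x[] = {0.5, 0.0, 0.25, 1.0};
  double y[] = {0.5, 0.5, 0.75, 1.0};

  double ham = 0, js = 0, kl = 0, dot = 0;
  double sx = 0, sy = 0, sxx = 0, syy = 0, sxy = 0;
  for (int i = 0; i < k; ++i) {
    ham += (x[i] != y[i]);
    double m = 0.5 * (x[i] + y[i]);
    if (x[i] > 0) js += -x[i] * (std::log(m) - std::log(x[i]));
    if (y[i] > 0) js += -y[i] * (std::log(m) - std::log(y[i]));
    if (x[i] > 0 && y[i] > 0) kl += x[i] * std::log(x[i] / y[i]);
    dot += x[i] * y[i];
    sx += x[i]; sy += y[i];
    sxx += x[i] * x[i]; syy += y[i] * y[i]; sxy += x[i] * y[i];
  }
  std::printf("Hamming      : %f\n", ham / k);               // sum(x_i != y_i) / k
  std::printf("JensenShannon: %f\n", std::sqrt(0.5 * js));   // sqrt(0.5 * sum(...))
  std::printf("KLDivergence : %f\n", 0.5 * kl);              // 0.5 * sum(x * log(x / y))
  std::printf("RussellRao   : %f\n", (k - dot) / k);         // (k - sum(x_i * y_i)) / k
  double numer = k * sxy - sx * sy;
  std::printf("Correlation  : %f\n",
              1.0 - numer / std::sqrt((k * sxx - sx * sx) *
                                      (k * syy - sy * sy)));
  return 0;
}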