From b4939b784d545e0b3ed092d0ed4822fa82cc36ef Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Mon, 17 Oct 2022 21:59:02 -0400 Subject: [PATCH] Moving kernel gramm primitives to `raft::distance::kernels` (#920) Authors: - Corey J. Nolet (https://github.com/cjnolet) Approvers: - Divye Gala (https://github.com/divyegala) URL: https://github.com/rapidsai/raft/pull/920 --- cpp/CMakeLists.txt | 10 + cpp/bench/CMakeLists.txt | 1 + cpp/bench/distance/distance_common.cuh | 2 +- cpp/bench/distance/kernels.cu | 123 ++++++ .../distance/detail/kernels/gram_matrix.cuh | 218 ++++++++++ .../detail/kernels/kernel_factory.cuh | 48 +++ .../detail/kernels/kernel_matrices.cuh | 376 ++++++++++++++++ cpp/include/raft/distance/distance_types.hpp | 21 + cpp/include/raft/distance/kernels.cuh | 32 ++ .../specializations/detail/kernels.cuh | 31 ++ .../distance/specializations/distance.cuh | 1 + cpp/include/raft/linalg/init.cuh | 14 + cpp/include/raft/util/cache.cuh | 406 ++++++++++++++++++ cpp/include/raft/util/fast_int_div.cuh | 123 ++++++ .../detail/kernels/gram_matrix_base_double.cu | 19 + .../detail/kernels/gram_matrix_base_float.cu | 19 + .../kernels/polynomial_kernel_double_int.cu | 19 + .../kernels/polynomial_kernel_float_int.cu | 19 + .../detail/kernels/rbf_kernel_double.cu | 19 + .../detail/kernels/rbf_kernel_float.cu | 19 + .../detail/kernels/tanh_kernel_double.cu | 19 + .../detail/kernels/tanh_kernel_float.cu | 19 + cpp/test/CMakeLists.txt | 1 + cpp/test/distance/gram.cu | 189 ++++++++ 24 files changed, 1747 insertions(+), 1 deletion(-) create mode 100644 cpp/bench/distance/kernels.cu create mode 100644 cpp/include/raft/distance/detail/kernels/gram_matrix.cuh create mode 100644 cpp/include/raft/distance/detail/kernels/kernel_factory.cuh create mode 100644 cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh create mode 100644 cpp/include/raft/distance/kernels.cuh create mode 100644 cpp/include/raft/distance/specializations/detail/kernels.cuh create mode 100644 cpp/include/raft/util/cache.cuh create mode 100644 cpp/include/raft/util/fast_int_div.cuh create mode 100644 cpp/src/distance/specializations/detail/kernels/gram_matrix_base_double.cu create mode 100644 cpp/src/distance/specializations/detail/kernels/gram_matrix_base_float.cu create mode 100644 cpp/src/distance/specializations/detail/kernels/polynomial_kernel_double_int.cu create mode 100644 cpp/src/distance/specializations/detail/kernels/polynomial_kernel_float_int.cu create mode 100644 cpp/src/distance/specializations/detail/kernels/rbf_kernel_double.cu create mode 100644 cpp/src/distance/specializations/detail/kernels/rbf_kernel_float.cu create mode 100644 cpp/src/distance/specializations/detail/kernels/tanh_kernel_double.cu create mode 100644 cpp/src/distance/specializations/detail/kernels/tanh_kernel_float.cu create mode 100644 cpp/test/distance/gram.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 12bebfa2a5..21e2103c4b 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -248,11 +248,21 @@ if(RAFT_COMPILE_DIST_LIBRARY) src/distance/specializations/detail/chebyshev.cu src/distance/specializations/detail/correlation.cu src/distance/specializations/detail/cosine.cu + src/distance/specializations/detail/cosine.cu src/distance/specializations/detail/hamming_unexpanded.cu src/distance/specializations/detail/hellinger_expanded.cu src/distance/specializations/detail/jensen_shannon_float_float_float_int.cu src/distance/specializations/detail/jensen_shannon_float_float_float_uint32.cu 
src/distance/specializations/detail/jensen_shannon_double_double_double_int.cu + src/distance/specializations/detail/kernels/gram_matrix_base_double.cu + src/distance/specializations/detail/kernels/gram_matrix_base_float.cu + src/distance/specializations/detail/kernels/polynomial_kernel_double_int.cu + src/distance/specializations/detail/kernels/polynomial_kernel_float_int.cu +# These are somehow missing a kernel definition which is causing a compile error. +# src/distance/specializations/detail/kernels/rbf_kernel_double.cu +# src/distance/specializations/detail/kernels/rbf_kernel_float.cu + src/distance/specializations/detail/kernels/tanh_kernel_double.cu + src/distance/specializations/detail/kernels/tanh_kernel_float.cu src/distance/specializations/detail/kl_divergence_float_float_float_int.cu src/distance/specializations/detail/kl_divergence_float_float_float_uint32.cu src/distance/specializations/detail/kl_divergence_double_double_double_int.cu diff --git a/cpp/bench/CMakeLists.txt b/cpp/bench/CMakeLists.txt index 4af3df9a1a..9c6b60d83b 100644 --- a/cpp/bench/CMakeLists.txt +++ b/cpp/bench/CMakeLists.txt @@ -85,6 +85,7 @@ if(BUILD_BENCH) bench/distance/distance_exp_l2.cu bench/distance/distance_l1.cu bench/distance/distance_unexp_l2.cu + bench/distance/kernels.cu bench/main.cpp OPTIONAL DIST ) diff --git a/cpp/bench/distance/distance_common.cuh b/cpp/bench/distance/distance_common.cuh index 4f1a8ccab1..73faacce37 100644 --- a/cpp/bench/distance/distance_common.cuh +++ b/cpp/bench/distance/distance_common.cuh @@ -15,8 +15,8 @@ */ #include -#include #include +#include #if defined RAFT_DISTANCE_COMPILED #include #endif diff --git a/cpp/bench/distance/kernels.cu b/cpp/bench/distance/kernels.cu new file mode 100644 index 0000000000..5c9c2cc2ed --- /dev/null +++ b/cpp/bench/distance/kernels.cu @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#if defined RAFT_DISTANCE_COMPILED +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace raft::bench::distance::kernels { + +using namespace raft::distance::kernels; +struct GramTestParams { + int m; // m parameter of the GEMM + int k; // k parameter of the GEMM + int n; // n parameter of the GEMM + KernelParams kernel_params; + bool is_row_major; +}; // struct GramTestParams + +template +struct GramMatrix : public fixture { + GramMatrix(const GramTestParams& p) + : params(p), handle(stream), A(0, stream), B(0, stream), C(0, stream) + { + kernel = std::unique_ptr>( + KernelFactory::create(p.kernel_params, handle.get_cublas_handle())); + + A.resize(params.m * params.k, stream); + B.resize(params.k * params.n, stream); + C.resize(params.m * params.n, stream); + raft::random::Rng r(123456ULL); + r.uniform(A.data(), params.m * params.k, T(-1.0), T(1.0), stream); + r.uniform(B.data(), params.k * params.n, T(-1.0), T(1.0), stream); + } + + ~GramMatrix() + { + A.release(); + B.release(); + C.release(); + } + + void run_benchmark(::benchmark::State& state) override + { + if (!this->kernel) { state.SkipWithError("Kernel matrix is not initialized"); } + loop_on_state(state, [this]() { + (*this->kernel)(A.data(), + this->params.m, + this->params.k, + B.data(), + this->params.n, + C.data(), + this->params.is_row_major, + this->stream); + }); + } + + private: + const raft::handle_t handle; + std::unique_ptr> kernel; + GramTestParams params; + + rmm::device_uvector A; // input matrix A, size [m * k] + rmm::device_uvector B; // input matrix B, size [n * k] + rmm::device_uvector C; // output matrix C, size [m*n] +}; + +static std::vector getInputs() +{ + std::vector param_vec; + std::vector kernel_params{KernelParams{LINEAR, 3, 1, 0}, + KernelParams{POLYNOMIAL, 2, 1.3, 1}, + KernelParams{TANH, 2, 0.5, 2.4}, + KernelParams{RBF, 2, 0.5, 0}}; + struct TestSize { + int m; + int k; + int n; + }; + std::vector data_size{{4096, 10, 1024}, + {4096, 100, 1024}, + {4096, 1000, 1024}, + {4096, 10000, 1024}, + {100000, 10, 1024}, + {100000, 100, 1024}, + {100000, 1000, 1024}}; + + param_vec.reserve(kernel_params.size() * data_size.size()); + for (TestSize s : data_size) { + for (auto kernel : kernel_params) { + for (bool row_major : {false, true}) { + param_vec.push_back(GramTestParams{s.m, s.k, s.n, kernel, row_major}); + } + } + } + return param_vec; +} + +RAFT_BENCH_REGISTER(GramMatrix, "", getInputs()); +RAFT_BENCH_REGISTER(GramMatrix, "", getInputs()); + +} // namespace raft::bench::distance::kernels diff --git a/cpp/include/raft/distance/detail/kernels/gram_matrix.cuh b/cpp/include/raft/distance/detail/kernels/gram_matrix.cuh new file mode 100644 index 0000000000..54ac490ca4 --- /dev/null +++ b/cpp/include/raft/distance/detail/kernels/gram_matrix.cuh @@ -0,0 +1,218 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +#include +#include + +namespace raft::distance::kernels::detail { + +/** + * Base class for general Gram matrices + * A Gram matrix is the Hermitian matrix of inner probucts G_ik = + * Here, the inner product is evaluated for all elements from vectors sets X1, + * and X2. + * + * To be more precise, on exit the output buffer will store: + * - if is_row_major == true: out[j+k*n1] = , + * - if is_row_major == false: out[j*n2 + k] = , + * where x1_j is the j-th vector from the x1 set and x2_k is the k-th vector + * from the x2 set. + */ +template +class GramMatrixBase { + cublasHandle_t cublas_handle; + + public: + GramMatrixBase(cublasHandle_t cublas_handle) : cublas_handle(cublas_handle){}; + + virtual ~GramMatrixBase(){}; + + /** Convenience function to evaluate the Gram matrix for two vector sets. + * + * @param [in] x1 device array of vectors, size [n1*n_cols] + * @param [in] n1 number vectors in x1 + * @param [in] n_cols number of columns (features) in x1 and x2 + * @param [in] x2 device array of vectors, size [n2*n_cols] + * @param [in] n2 number vectors in x2 + * @param [out] out device buffer to store the Gram matrix, size [n1*n2] + * @param [in] is_row_major whether the input and output matrices are in row + * major format + * @param [in] stream cuda stream + * @param ld1 leading dimension of x1 + * @param ld2 leading dimension of x2 + * @param ld_out leading dimension of out + */ + virtual void operator()(const math_t* x1, + int n1, + int n_cols, + const math_t* x2, + int n2, + math_t* out, + bool is_row_major, + cudaStream_t stream, + int ld1 = 0, + int ld2 = 0, + int ld_out = 0) + { + if (ld1 <= 0) { ld1 = is_row_major ? n_cols : n1; } + if (ld2 <= 0) { ld2 = is_row_major ? n_cols : n2; } + if (ld_out <= 0) { ld_out = is_row_major ? n2 : n1; } + evaluate(x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out); + } + + /** Evaluate the Gram matrix for two vector sets using simple dot product. + * + * @param [in] x1 device array of vectors, size [n1*n_cols] + * @param [in] n1 number vectors in x1 + * @param [in] n_cols number of columns (features) in x1 and x2 + * @param [in] x2 device array of vectors, size [n2*n_cols] + * @param [in] n2 number vectors in x2 + * @param [out] out device buffer to store the Gram matrix, size [n1*n2] + * @param [in] is_row_major whether the input and output matrices are in row + * major format + * @param [in] stream cuda stream + * @param ld1 leading dimension of x1 (usually it is n1) + * @param ld2 leading dimension of x2 (usually it is n2) + * @param ld_out leading dimension of out (usually it is n1) + */ + virtual void evaluate(const math_t* x1, + int n1, + int n_cols, + const math_t* x2, + int n2, + math_t* out, + bool is_row_major, + cudaStream_t stream, + int ld1, + int ld2, + int ld_out) + { + linear(x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out); + } + + // private: + // The following methods should be private, they are kept public to avoid: + // "error: The enclosing parent function ("distance") for an extended + // __device__ lambda cannot have private or protected access within its class" + + /** Calculates the Gram matrix using simple dot product between vector sets. + * + * out = x1 * x2 + * + * Can be used as a building block for more complex kernel functions. 
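 *
 * A minimal usage sketch of the public operator() (which, for this base
 * class, ends up in this dot-product primitive). Illustrative only: the
 * cuBLAS handle `cublas_handle`, the stream `stream` and the column major
 * device buffers d_x1 [n1*n_cols], d_x2 [n2*n_cols], d_out [n1*n2] are
 * assumed to exist already.
 * @code{.cpp}
 *   GramMatrixBase<float> gram(cublas_handle);
 *   // ld1/ld2/ld_out default to 0, so operator() falls back to n1, n2 and n1.
 *   gram(d_x1, n1, n_cols, d_x2, n2, d_out, false, stream);
 * @endcode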
+ * + * @param [in] x1 device array of vectors, size [n1*n_cols] + * @param [in] n1 number vectors in x1 + * @param [in] n_cols number of colums (features) in x1 and x2 + * @param [in] x2 device array of vectors, size [n2*n_cols] + * @param [in] n2 number vectors in x2 + * @param [out] out device buffer to store the Gram matrix, size [n1*n2] + * @param [in] is_row_major whether the input and output matrices are in row + * major format + * @param [in] stream cuda stream + * @param ld1 leading dimension of x1 + * @param ld2 leading dimension of x2 + * @param ld_out leading dimension of out + */ + void linear(const math_t* x1, + int n1, + int n_cols, + const math_t* x2, + int n2, + math_t* out, + bool is_row_major, + cudaStream_t stream, + int ld1, + int ld2, + int ld_out) + { + math_t alpha = 1.0; + math_t beta = 0.0; + if (is_row_major) { + // #TODO: Call from public API when ready + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(cublas_handle, + CUBLAS_OP_T, + CUBLAS_OP_N, + n2, + n1, + n_cols, + &alpha, + x2, + ld2, + x1, + ld1, + &beta, + out, + ld_out, + stream)); + } else { + // #TODO: Call from public API when ready + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(cublas_handle, + CUBLAS_OP_N, + CUBLAS_OP_T, + n1, + n2, + n_cols, + &alpha, + x1, + ld1, + x2, + ld2, + &beta, + out, + ld_out, + stream)); + } + } + + /** Calculates the Gram matrix using Euclidean distance. + * + * Can be used as a building block for more complex kernel functions. + * + * @param [in] x1 device array of vectors, size [n1*n_cols] + * @param [in] n1 number vectors in x1 + * @param [in] n_cols number of columns (features) in x1 and x2 + * @param [in] x2 device array of vectors, size [n2*n_cols] + * @param [in] n2 number vectors in x2 + * @param [out] out device buffer to store the Gram matrix, size [n1*n2] + * @param [in] is_row_major whether the input and output matrices are in row + * major format + * @param [in] stream cuda stream + * @param ld1 leading dimension of x1 + * @param ld2 leading dimension of x2 + * @param ld_out leading dimension of out + */ + virtual void distance(const math_t* x1, + int n1, + int n_cols, + const math_t* x2, + int n2, + math_t* out, + bool is_row_major, + cudaStream_t stream, + int ld1, + int ld2, + int ld_out) + { + raft::distance::distance( + x1, x2, out, n1, n2, n_cols, stream, is_row_major); + } +}; +}; // end namespace raft::distance::kernels::detail \ No newline at end of file diff --git a/cpp/include/raft/distance/detail/kernels/kernel_factory.cuh b/cpp/include/raft/distance/detail/kernels/kernel_factory.cuh new file mode 100644 index 0000000000..1aa6809bcd --- /dev/null +++ b/cpp/include/raft/distance/detail/kernels/kernel_factory.cuh @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "gram_matrix.cuh" +#include "kernel_matrices.cuh" +#include +#include + +namespace raft::distance::kernels::detail { + +template +class KernelFactory { + public: + static GramMatrixBase* create(KernelParams params, cublasHandle_t cublas_handle) + { + GramMatrixBase* res; + // KernelParams is not templated, we convert the parameters to math_t here: + math_t coef0 = params.coef0; + math_t gamma = params.gamma; + switch (params.kernel) { + case LINEAR: res = new GramMatrixBase(cublas_handle); break; + case POLYNOMIAL: + res = new PolynomialKernel(params.degree, gamma, coef0, cublas_handle); + break; + case TANH: res = new TanhKernel(gamma, coef0, cublas_handle); break; + case RBF: res = new RBFKernel(gamma); break; + default: throw raft::exception("Kernel not implemented"); + } + return res; + } +}; + +}; // end namespace raft::distance::kernels::detail diff --git a/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh b/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh new file mode 100644 index 0000000000..6d59e1c7c5 --- /dev/null +++ b/cpp/include/raft/distance/detail/kernels/kernel_matrices.cuh @@ -0,0 +1,376 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "gram_matrix.cuh" +#include + +#include +#include + +namespace raft::distance::kernels::detail { + +/** Epiloge function for polynomial kernel without padding. + * Calculates output = (gain*in + offset)^exponent + * @param inout device vector in column major format, size [len] + * @param len array length + * @param exponent + * @param gain + * @param offset + */ +template +__global__ void polynomial_kernel_nopad( + math_t* inout, size_t len, exp_t exponent, math_t gain, math_t offset) +{ + for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < len; + tid += blockDim.x * gridDim.x) { + inout[tid] = pow(gain * inout[tid] + offset, exponent); + } +} + +/** Epiloge function for polynomial kernel with padding. + * Calculates output = (gain*input + offset)^exponent + * @param inout device vector in column major format, size [ld * cols] + * @param ld leading dimension of the inout buffer + * @param rows number of rows (rows <= ld) + * @param cols number of colums + * @param exponent + * @param gain + * @param offset + */ +template +__global__ void polynomial_kernel( + math_t* inout, int ld, int rows, int cols, exp_t exponent, math_t gain, math_t offset) +{ + for (size_t tidy = threadIdx.y + blockIdx.y * blockDim.y; tidy < cols; + tidy += blockDim.y * gridDim.y) + for (size_t tidx = threadIdx.x + blockIdx.x * blockDim.x; tidx < rows; + tidx += blockDim.x * gridDim.x) { + inout[tidx + tidy * ld] = pow(gain * inout[tidx + tidy * ld] + offset, exponent); + } +} + +/** Epiloge function for tanh kernel without padding. 
+ * Calculates output = tanh(gain*input + offset) + * @param inout device vector, size [len] + * @param len length of the input vector + * @param gain + * @param offset + */ +template +__global__ void tanh_kernel_nopad(math_t* inout, size_t len, math_t gain, math_t offset) +{ + for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < len; + tid += blockDim.x * gridDim.x) { + inout[tid] = tanh(gain * inout[tid] + offset); + } +} + +/** Epiloge function for tanh kernel without padding. + * Calculates output = tanh(gain*input + offset) + * @param inout device vector in column major format, size [ld * cols] + * @param ld leading dimension of the inout buffer + * @param rows number of rows (rows <= ld) + * @param cols number of colums + * @param gain + * @param offset + */ +template +__global__ void tanh_kernel(math_t* inout, int ld, int rows, int cols, math_t gain, math_t offset) +{ + for (size_t tidy = threadIdx.y + blockIdx.y * blockDim.y; tidy < cols; + tidy += blockDim.y * gridDim.y) + for (size_t tidx = threadIdx.x + blockIdx.x * blockDim.x; tidx < rows; + tidx += blockDim.x * gridDim.x) { + inout[tidx + tidy * ld] = tanh(gain * inout[tidx + tidy * ld] + offset); + } +} + +/** + * Create a kernel matrix using polynomial kernel function. + */ +template +class PolynomialKernel : public GramMatrixBase { + exp_t exponent; + math_t gain; + math_t offset; + + void applyKernel( + math_t* inout, int ld, int rows, int cols, bool is_row_major, cudaStream_t stream) + { + const int n_minor = is_row_major ? cols : rows; + if (ld == n_minor) { + polynomial_kernel_nopad<<((size_t)rows * cols, 128), 128, 0, stream>>>( + inout, rows * cols, exponent, gain, offset); + } else { + int n1 = is_row_major ? cols : rows; + int n2 = is_row_major ? rows : cols; + polynomial_kernel<<>>(inout, ld, n1, n2, exponent, gain, offset); + } + RAFT_CUDA_TRY(cudaPeekAtLastError()); + } + + public: + /** + * Constructs a polynomial kernel object. + * It evaluates the kernel matrix using the following formula: + * K_ij = (gain* + offset)^exponent + * + * @tparam math_t floating point type + * @tparam exp_t type of exponent + * @param exponent + * @param gain + * @param offset + * @param cublas_handle + */ + PolynomialKernel(exp_t exponent, math_t gain, math_t offset, cublasHandle_t cublas_handle) + : GramMatrixBase(cublas_handle), exponent(exponent), gain(gain), offset(offset) + { + } + + /** Evaluate kernel matrix using polynomial kernel. + * + * output[i,k] = (gain* + offset)^exponent, + * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector + * in the x2 set, and < , > denotes dot product. 
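 *
 * For instance, with gain = 0.5, offset = 2.4 and exponent = 2 (the
 * POLYNOMIAL configuration exercised in the tests added below), an entry
 * whose dot product equals 1 evaluates to (0.5 * 1 + 2.4)^2 = 8.41.
 *
 * An illustrative construction sketch through KernelFactory, mirroring the
 * benchmark added in this change; names such as cublas_handle, d_x1, d_x2,
 * d_out, n1, n_cols, n2, is_row_major and stream are placeholders assumed
 * to be set up by the caller:
 * @code{.cpp}
 *   KernelParams params{POLYNOMIAL, 2, 0.5, 2.4};  // kernel, degree, gamma, coef0
 *   std::unique_ptr<GramMatrixBase<float>> kernel(
 *     KernelFactory<float>::create(params, cublas_handle));
 *   (*kernel)(d_x1, n1, n_cols, d_x2, n2, d_out, is_row_major, stream);
 * @endcode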
+ * + * @param [in] x1 device array of vectors, size [n1*n_cols] + * @param [in] n1 number vectors in x1 + * @param [in] n_cols number of features in x1 and x2 + * @param [in] x2 device array of vectors, size [n2*cols] + * @param [in] n2 number vectors in x2 + * @param [out] out device buffer to store the Gram matrix, size [n1*n2] + * @param [in] is_row_major whether the input and output matrices are in row + * major format + * @param [in] stream cuda stream + * @param ld1 leading dimension of x1 + * @param ld2 leading dimension of x2 + * @param ld_out leading dimension of out + */ + void evaluate(const math_t* x1, + int n1, + int n_cols, + const math_t* x2, + int n2, + math_t* out, + bool is_row_major, + cudaStream_t stream, + int ld1, + int ld2, + int ld_out) + { + GramMatrixBase::linear( + x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out); + applyKernel(out, ld_out, n1, n2, is_row_major, stream); + } +}; + +/** + * Create a kernel matrix using tanh kernel function. + */ +template +class TanhKernel : public GramMatrixBase { + math_t gain, offset; + + void applyKernel( + math_t* inout, int ld, int rows, int cols, bool is_row_major, cudaStream_t stream) + { + const int n_minor = is_row_major ? cols : rows; + if (ld == n_minor) { + tanh_kernel_nopad<<((size_t)rows * cols, 128), 128, 0, stream>>>( + inout, rows * cols, gain, offset); + } else { + int n1 = is_row_major ? cols : rows; + int n2 = is_row_major ? rows : cols; + tanh_kernel<<>>(inout, ld, n1, n2, gain, offset); + } + RAFT_CUDA_TRY(cudaPeekAtLastError()); + } + + public: + /** + * Constructs a tanh kernel object. + * It evaluates the kernel matrix using the following formula: + * K_ij = tanh(gain* + offset) + * + * @tparam math_t floating point type + * @param gain + * @param offset + * @param cublas_handle + */ + TanhKernel(math_t gain, math_t offset, cublasHandle_t cublas_handle) + : GramMatrixBase(cublas_handle), gain(gain), offset(offset) + { + } + + /** Evaluate kernel matrix using tanh kernel. + * + * output_[i + k*n1] = (gain* + offset)^exponent, + * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector + * in the x2 set, and < , > denotes dot product. + * + * @param [in] x1 device array of vectors, + * size [n1*n_cols] + * @param [in] n1 number vectors in x1 + * @param [in] n_cols number of features in x1 and x2 + * @param [in] x2 device array of vectors, + * size [n2*n_cols] + * @param [in] n2 number vectors in x2 + * @param [out] out device buffer to store the Gram matrix, size [n1*n2] + * @param [in] is_row_major whether the input and output matrices are in row + * major format + * @param [in] stream cuda stream + * @param ld1 leading dimension of x1 (usually it is n1) + * @param ld2 leading dimension of x2 (usually it is n2) + * @param ld_out leading dimension of out (usually it is n1) + */ + void evaluate(const math_t* x1, + int n1, + int n_cols, + const math_t* x2, + int n2, + math_t* out, + bool is_row_major, + cudaStream_t stream, + int ld1, + int ld2, + int ld_out) + { + GramMatrixBase::linear( + x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out); + applyKernel(out, ld_out, n1, n2, is_row_major, stream); + } +}; + +/** + * Create a kernel matrix using RBF kernel function. + */ +template +class RBFKernel : public GramMatrixBase { + math_t gain; + + void applyKernel( + math_t* inout, int ld, int rows, int cols, bool is_row_major, cudaStream_t stream) + { + const int n_minor = is_row_major ? 
cols : rows; + if (ld == n_minor) { + rbf_kernel_nopad<<((size_t)rows * cols, 128), 128, 0, stream>>>( + inout, rows * cols, gain); + } else { + int n1 = is_row_major ? cols : rows; + int n2 = is_row_major ? rows : cols; + rbf_kernel<<>>(inout, ld, n1, n2, gain); + } + } + + public: + /** + * Constructs a RBF kernel object. + * It evaluates the kernel matrix using the following formula: + * K_ij = exp(-gain*|x1_i- x2_k|^2) + * + * @tparam math_t floating point type + * @param gain + */ + RBFKernel(math_t gain) : GramMatrixBase(NULL), gain(gain) {} + + /** Evaluate kernel matrix using RBF kernel. + * + * output_[i + k*n1] = exp(-gain*|x1_i - x2_k|^2), + * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector + * in the x2 set, and | | euclidean distance. + * + * @param [in] x1 device array of vectors, size [n1*n_cols] + * @param [in] n1 number vectors in x1 + * @param [in] n_cols number of features in x1 and x2 + * @param [in] x2 device array of vectors, size [n2*n_cols] + * @param [in] n2 number vectors in x2 + * @param [out] out device buffer to store the Gram matrix, size [n1*n2] + * @param [in] is_row_major whether the input and output matrices are in row + * major format + * @param [in] stream cuda stream + * @param ld1 leading dimension of x1, currently only ld1 == n1 is supported + * @param ld2 leading dimension of x2, currently only ld2 == n2 is supported + * @param ld_out leading dimension of out, only ld_out == n1 is supported + */ + void evaluate(const math_t* x1, + int n1, + int n_cols, + const math_t* x2, + int n2, + math_t* out, + bool is_row_major, + cudaStream_t stream, + int ld1, + int ld2, + int ld_out) + { + int minor1 = is_row_major ? n_cols : n1; + int minor2 = is_row_major ? n_cols : n2; + int minor_out = is_row_major ? n2 : n1; + ASSERT(ld1 == minor1, "RBF Kernel distance does not support ld1 parameter"); + ASSERT(ld2 == minor2, "RBF Kernel distance does not support ld2 parameter"); + ASSERT(ld_out == minor_out, "RBF Kernel distance does not support ld_out parameter"); + distance(x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out); + } + + /** Customize distance function withe RBF epilogue */ + void distance(const math_t* x1, + int n1, + int n_cols, + const math_t* x2, + int n2, + math_t* out, + bool is_row_major, + cudaStream_t stream, + int ld1, + int ld2, + int ld_out) + { + math_t gain = this->gain; + using index_t = int64_t; + + auto fin_op = [gain] __device__(math_t d_val, index_t idx) { return exp(-gain * d_val); }; + raft::distance::distance(const_cast(x1), + const_cast(x2), + out, + n1, + n2, + n_cols, + NULL, + 0, + fin_op, + stream, + is_row_major); + } +}; + +}; // end namespace raft::distance::kernels::detail diff --git a/cpp/include/raft/distance/distance_types.hpp b/cpp/include/raft/distance/distance_types.hpp index f75263b00d..f5ed68af4a 100644 --- a/cpp/include/raft/distance/distance_types.hpp +++ b/cpp/include/raft/distance/distance_types.hpp @@ -65,5 +65,26 @@ enum DistanceType : unsigned short { /** Precomputed (special value) **/ Precomputed = 100 }; + +namespace kernels { +enum KernelType { LINEAR, POLYNOMIAL, RBF, TANH }; + +/** + * Parameters for kernel matrices. 
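 * For example, KernelParams{POLYNOMIAL, 3, 1.0, 1.0} selects the polynomial
 * kernel (<x_1, x_2> + 1)^3 from the list below.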
+ * The following kernels are implemented: + * - LINEAR \f[ K(x_1,x_2) = , \f] where \f$< , >\f$ is the dot product + * - POLYNOMIAL \f[ K(x_1, x_2) = (\gamma + \mathrm{coef0})^\mathrm{degree} \f] + * - RBF \f[ K(x_1, x_2) = \exp(- \gamma |x_1-x_2|^2) \f] + * - TANH \f[ K(x_1, x_2) = \tanh(\gamma + \mathrm{coef0}) \f] + */ +struct KernelParams { + // Kernel function parameters + KernelType kernel; //!< Type of the kernel function + int degree; //!< Degree of polynomial kernel (ignored by others) + double gamma; //!< multiplier in the + double coef0; //!< additive constant in poly and tanh kernels +}; +} // end namespace kernels + }; // namespace distance }; // end namespace raft diff --git a/cpp/include/raft/distance/kernels.cuh b/cpp/include/raft/distance/kernels.cuh new file mode 100644 index 0000000000..86f9f82406 --- /dev/null +++ b/cpp/include/raft/distance/kernels.cuh @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +#include +#include + +namespace raft::distance::kernels { + +// TODO: Need to expose formal APIs for this that are more consistent w/ other APIs in RAFT +using raft::distance::kernels::detail::GramMatrixBase; +using raft::distance::kernels::detail::KernelFactory; + +}; // end namespace raft::distance::kernels diff --git a/cpp/include/raft/distance/specializations/detail/kernels.cuh b/cpp/include/raft/distance/specializations/detail/kernels.cuh new file mode 100644 index 0000000000..75c9c023e8 --- /dev/null +++ b/cpp/include/raft/distance/specializations/detail/kernels.cuh @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +extern template class raft::distance::kernels::detail::GramMatrixBase; +extern template class raft::distance::kernels::detail::GramMatrixBase; + +extern template class raft::distance::kernels::detail::PolynomialKernel; +extern template class raft::distance::kernels::detail::PolynomialKernel; + +extern template class raft::distance::kernels::detail::TanhKernel; +extern template class raft::distance::kernels::detail::TanhKernel; + +// These are somehow missing a kernel definition which is causing a compile error +// extern template class raft::distance::kernels::detail::RBFKernel; +// extern template class raft::distance::kernels::detail::RBFKernel; \ No newline at end of file diff --git a/cpp/include/raft/distance/specializations/distance.cuh b/cpp/include/raft/distance/specializations/distance.cuh index 73d075f260..053441d68a 100644 --- a/cpp/include/raft/distance/specializations/distance.cuh +++ b/cpp/include/raft/distance/specializations/distance.cuh @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/include/raft/linalg/init.cuh b/cpp/include/raft/linalg/init.cuh index 2fdf9dceb9..5a810bf2ba 100644 --- a/cpp/include/raft/linalg/init.cuh +++ b/cpp/include/raft/linalg/init.cuh @@ -19,6 +19,7 @@ #pragma once #include "detail/init.hpp" +#include namespace raft { namespace linalg { @@ -54,6 +55,19 @@ void range(T* out, int n, cudaStream_t stream) detail::range(out, n, stream); } +/** + * @brief Zeros the output. + * + * \param [out] out device array, size [n] + * \param [in] n length of the array + * \param [in] stream cuda stream + */ +template +void zero(T* out, int n, cudaStream_t stream) +{ + RAFT_CUDA_TRY(cudaMemsetAsync(static_cast(out), 0, n * sizeof(T), stream)); +} + } // namespace linalg } // namespace raft diff --git a/cpp/include/raft/util/cache.cuh b/cpp/include/raft/util/cache.cuh new file mode 100644 index 0000000000..8394ce83b8 --- /dev/null +++ b/cpp/include/raft/util/cache.cuh @@ -0,0 +1,406 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace raft::cache { + +/** + * @brief Associative cache with least recently used replacement policy. + * + * SW managed cache in device memory, for ML algos where we can trade memory + * access for computation. The two main functions of this class are the + * management of cache indices, and methods to retrieve/store data using the + * cache indices. + * + * The index management can be considered as a hash map, where the int + * keys are the original vector indices that we want to store, and the values are + * the cache location of these vectors. The keys are hashed into a bucket + * whose size equals the associativity. These are the cache sets. If a cache + * set is full, then new indices are stored by replacing the oldest entries. 
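 *
 * As a concrete sizing example with the 200 MiB default of the constructor
 * below: for math_t = float and n_vec = 1024, 200 * 1024 * 1024 / (4 * 1024)
 * = 51200 vectors fit in the cache; with an associativity of, say, 32 this
 * gives 1600 cache sets of 32 slots each.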
+ * + * Using this index mapping we implement methods to store and retrive data from + * the cache buffer, where a unit of data that we are storing is math_t[n_vec]. + * For example in SVM we store full columns of the kernel matrix at each cache + * entry. + * + * Note: we should have a look if the index management could be simplified using + * concurrent_unordered_map.cuh from cudf. See Issue #914. + * + * Example usage: + * @code{.cpp} + * + * // An expensive calculation that we want to accelerate with caching: + * // we have n keys, and for each key we generate a vector with m elements. + * // The keys and the output values are stored in GPU memory. + * void calc(int *key, int n, int m, float *out, cudaStream_t stream) { + * for (k=0; k cache(h.get_device_allocator(), stream, m); + * + * // A buffer that we will reuse to store the cache indices. + * rmm::device_uvector cache_idx(h.get_device_allocator(), stream, n); + * + * void cached_calc(int *key, int n, int m, float *out, stream) { + * int n_cached = 0; + * + * cache.GetCacheIdxPartitioned(key, n, cache_idx.data(), &n_cached, + * cudaStream_t stream); + * + * // Note: GetCacheIdxPartitioned has reordered the keys so that + * // key[0..n_cached-1] are the keys already in the cache. + * // We collect the corresponding values + * cache.GetVecs(cache_idx.data(), n_cached, out, stream); + * + * // Calculate the elements not in the cache + * int non_cached = n - n_cached; + * if (non_cached > 0) { + * int *key_new = key + n_cached; + * int *cache_idx_new = cache_idx.data() + n_cached; + * float *out_new = out + n_cached * m; + * // AssignCacheIdx can permute the keys, therefore it has to come before + * // we call calc. + * // Note: a call to AssignCacheIdx should always be preceded with + * // GetCacheIdxPartitioned, because that initializes the cache_idx_new array + * // with the cache set (hash bucket) that correspond to the keys. + * // The cache idx will be assigned from that cache set. + * cache.AssignCacheIdx(key_new, non_cached, cache_idx_new, stream); + * + * calc(key_new, non_cached, m, out_new, stream); + * + * // Store the calculated vectors into the cache. 
+ * cache.StoreVecs(out_new, non_cached, non_cached, cache_idx_new, stream); + * } + * } + * @endcode + */ +template +class Cache { + public: + /** + * @brief Construct a Cache object + * + * @tparam math_t type of elements to be cached + * @tparam associativity number of vectors in a cache set + * + * @param stream cuda stream + * @param n_vec number of elements in a single vector that is stored in a + * cache entry + * @param cache_size in MiB + */ + Cache(cudaStream_t stream, int n_vec, float cache_size = 200) + : n_vec(n_vec), + cache_size(cache_size), + cache(0, stream), + cached_keys(0, stream), + cache_time(0, stream), + is_cached(0, stream), + ws_tmp(0, stream), + idx_tmp(0, stream), + d_num_selected_out(stream), + d_temp_storage(0, stream) + { + ASSERT(n_vec > 0, "Parameter n_vec: shall be larger than zero"); + ASSERT(associativity > 0, "Associativity shall be larger than zero"); + ASSERT(cache_size >= 0, "Cache size should not be negative"); + + // Calculate how many vectors would fit the cache + int n_cache_vecs = (cache_size * 1024 * 1024) / (sizeof(math_t) * n_vec); + + // The available memory shall be enough for at least one cache set + if (n_cache_vecs >= associativity) { + n_cache_sets = n_cache_vecs / associativity; + n_cache_vecs = n_cache_sets * associativity; + cache.resize(n_cache_vecs * n_vec, stream); + cached_keys.resize(n_cache_vecs, stream); + cache_time.resize(n_cache_vecs, stream); + RAFT_CUDA_TRY( + cudaMemsetAsync(cached_keys.data(), 0, cached_keys.size() * sizeof(int), stream)); + RAFT_CUDA_TRY(cudaMemsetAsync(cache_time.data(), 0, cache_time.size() * sizeof(int), stream)); + } else { + if (cache_size > 0) { + RAFT_LOG_WARN( + "Warning: not enough memory to cache a single set of " + "rows, not using cache"); + } + n_cache_sets = 0; + cache_size = 0; + } + RAFT_LOG_DEBUG( + "Creating cache with size=%f MiB, to store %d vectors, in " + "%d sets with associativity=%d", + cache_size, + n_cache_vecs, + n_cache_sets, + associativity); + } + + Cache(const Cache& other) = delete; + + Cache& operator=(const Cache& other) = delete; + + /** @brief Collect cached data into contiguous memory space. + * + * On exit, the tile array is filled the following way: + * out[i + n_vec*k] = cache[i + n_vec * idx[k]]), where i=0..n_vec-1, + * k = 0..n-1 + * + * Idx values less than 0 are ignored. + * + * @param [in] idx cache indices, size [n] + * @param [in] n the number of vectors that need to be collected + * @param [out] out vectors collected from cache, size [n_vec*n] + * @param [in] stream cuda stream + */ + void GetVecs(const int* idx, int n, math_t* out, cudaStream_t stream) + { + if (n > 0) { + get_vecs<<>>(cache.data(), n_vec, idx, n, out); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + } + } + + /** @brief Store vectors of data into the cache. + * + * Roughly the opposite of GetVecs, but the input vectors can be scattered + * in memory. The cache is updated using the following formula: + * + * cache[i + cache_idx[k]*n_vec] = tile[i + tile_idx[k]*n_vec], + * for i=0..n_vec-1, k=0..n-1 + * + * If tile_idx==nullptr, then we assume tile_idx[k] = k. + * + * Elements within a vector should be contiguous in memory (i.e. column vectors + * for column major data storage, or row vectors of row major data). 
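 *
 * For example, with n_vec = 3, n = 1, cache_idx = {5} and tile_idx == nullptr,
 * the call copies tile[0..2] into cache[15..17].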
+ * + * @param [in] tile stores the data to be cashed cached, size [n_vec x n_tile] + * @param [in] n_tile number of vectors in tile (at least n) + * @param [in] n number of vectors that need to be stored in the cache (a subset + * of all the vectors in the tile) + * @param [in] cache_idx cache indices for storing the vectors (negative values + * are ignored), size [n] + * @param [in] stream cuda stream + * @param [in] tile_idx indices of vectors that need to be stored + */ + void StoreVecs(const math_t* tile, + int n_tile, + int n, + int* cache_idx, + cudaStream_t stream, + const int* tile_idx = nullptr) + { + if (n > 0) { + store_vecs<<>>( + tile, n_tile, n_vec, tile_idx, n, cache_idx, cache.data(), cache.size() / n_vec); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + } + } + + /** @brief Map a set of keys to cache indices. + * + * For each k in 0..n-1, if keys[k] is found in the cache, then cache_idx[k] + * will tell the corresponding cache idx, and is_cached[k] is set to true. + * + * If keys[k] is not found in the cache, then is_cached[k] is set to false. + * In this case we assign the cache set for keys[k], and cache_idx[k] will + * store the cache set. + * + * @note in order to retrieve the cached vector j=cache_idx[k] from the cache, + * we have to access cache[i + j*n_vec], where i=0..n_vec-1. + * + * @note: do not use simultaneous GetCacheIdx and AssignCacheIdx + * + * @param [in] keys device array of keys, size [n] + * @param [in] n number of keys + * @param [out] cache_idx device array of cache indices corresponding to the + * input keys, size [n] + * @param [out] is_cached whether the element is already available in the + * cache, size [n] + * @param [in] stream + */ + void GetCacheIdx(int* keys, int n, int* cache_idx, bool* is_cached, cudaStream_t stream) + { + n_iter++; // we increase the iteration counter, that is used to time stamp + // accessing entries from the cache + get_cache_idx<<>>(keys, + n, + cached_keys.data(), + n_cache_sets, + associativity, + cache_time.data(), + cache_idx, + is_cached, + n_iter); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + } + + /** @brief Map a set of keys to cache indices. + * + * Same as GetCacheIdx, but partitions the keys, and cache_idx arrays in a way + * that keys[0..n_cached-1] and cache_idx[0..n_cached-1] store the indices of + * vectors that are found in the cache, while keys[n_cached..n-1] are the + * indices of vectors that are not found in the cache. For the vectors not + * found in the cache, cache_idx[n_cached..n-1] stores the cache set, and this + * can be used to call AssignCacheIdx. 
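 *
 * For example, if keys = {3, 8, 5, 1} and only vectors 8 and 1 are currently
 * cached, then on exit n_cached = 2, keys is reordered so that 8 and 1 come
 * first, cache_idx[0..1] hold their cache locations, and cache_idx[2..3] hold
 * the cache sets assigned to the remaining keys 3 and 5.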
+ * + * @param [inout] keys device array of keys, size [n] + * @param [in] n number of indices + * @param [out] cache_idx device array of cache indices corresponding to + * the input keys, size [n] + * @param [out] n_cached number of elements that are cached + * @param [in] stream cuda stream + */ + void GetCacheIdxPartitioned(int* keys, int n, int* cache_idx, int* n_cached, cudaStream_t stream) + { + ResizeTmpBuffers(n, stream); + + GetCacheIdx(keys, n, ws_tmp.data(), is_cached.data(), stream); + + // Group cache indices as [already cached, non_cached] + cub::DevicePartition::Flagged(d_temp_storage.data(), + d_temp_storage_size, + ws_tmp.data(), + is_cached.data(), + cache_idx, + d_num_selected_out.data(), + n, + stream); + + raft::update_host(n_cached, d_num_selected_out.data(), 1, stream); + + // Similarily re-group the input indices + raft::copy(ws_tmp.data(), keys, n, stream); + cub::DevicePartition::Flagged(d_temp_storage.data(), + d_temp_storage_size, + ws_tmp.data(), + is_cached.data(), + keys, + d_num_selected_out.data(), + n, + stream); + + raft::interruptible::synchronize(stream); + } + + /** + * @brief Assign cache location to a set of keys. + * + * Note: call GetCacheIdx first, to get the cache_set assigned to the keys. + * Keys that cannot be cached are assigned to -1. + * + * @param [inout] keys device array of keys, size [n] + * @param [in] n number of elements that we want to cache + * @param [inout] cidx on entry: cache_set, on exit: assigned cache_idx or -1, + * size[n] + * @param [in] stream cuda stream + */ + void AssignCacheIdx(int* keys, int n, int* cidx, cudaStream_t stream) + { + if (n <= 0) return; + cub::DeviceRadixSort::SortPairs(d_temp_storage.data(), + d_temp_storage_size, + cidx, + ws_tmp.data(), + keys, + idx_tmp.data(), + n, + 0, + sizeof(int) * 8, + stream); + + raft::copy(keys, idx_tmp.data(), n, stream); + + // set it to -1 + RAFT_CUDA_TRY(cudaMemsetAsync(cidx, 255, n * sizeof(int), stream)); + const int nthreads = associativity <= 32 ? associativity : 32; + + assign_cache_idx<<>>( + keys, n, ws_tmp.data(), cached_keys.data(), n_cache_sets, cache_time.data(), n_iter, cidx); + + RAFT_CUDA_TRY(cudaPeekAtLastError()); + if (debug_mode) RAFT_CUDA_TRY(cudaDeviceSynchronize()); + } + + /** Return approximate cache size in MiB. */ + float GetSizeInMiB() const { return cache_size; } + + /** + * Returns the number of vectors that can be cached. 
+ */ + int GetSize() const { return cached_keys.size(); } + + private: + int n_vec; //!< Number of elements in a cached vector + float cache_size; //!< in MiB + int n_cache_sets; //!< number of cache sets + + const int TPB = 256; //!< threads per block for kernel launch + int n_iter = 0; //!< Counter for time stamping cache operation + + bool debug_mode = false; + + rmm::device_uvector cache; //!< The value of cached vectors + rmm::device_uvector cached_keys; //!< Keys stored at each cache loc + rmm::device_uvector cache_time; //!< Time stamp for LRU cache + + // Helper arrays for GetCacheIdx + rmm::device_uvector is_cached; + rmm::device_uvector ws_tmp; + rmm::device_uvector idx_tmp; + + // Helper arrays for cub + rmm::device_scalar d_num_selected_out; + rmm::device_uvector d_temp_storage; + size_t d_temp_storage_size = 0; + + void ResizeTmpBuffers(int n, cudaStream_t stream) + { + if (ws_tmp.size() < static_cast(n)) { + ws_tmp.resize(n, stream); + is_cached.resize(n, stream); + idx_tmp.resize(n, stream); + cub::DevicePartition::Flagged(NULL, + d_temp_storage_size, + cached_keys.data(), + is_cached.data(), + cached_keys.data(), + d_num_selected_out.data(), + n, + stream); + d_temp_storage.resize(d_temp_storage_size, stream); + } + } +}; +}; // namespace raft::cache diff --git a/cpp/include/raft/util/fast_int_div.cuh b/cpp/include/raft/util/fast_int_div.cuh new file mode 100644 index 0000000000..a0cb8f0f53 --- /dev/null +++ b/cpp/include/raft/util/fast_int_div.cuh @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include + +namespace raft::util { + +/** + * @brief Perform fast integer division and modulo using a known divisor + * From Hacker's Delight, Second Edition, Chapter 10 + * + * @note This currently only supports 32b signed integers + * @todo Extend support for signed divisors + */ +struct FastIntDiv { + /** + * @defgroup HostMethods Ctor's that are accessible only from host + * @{ + * @brief Host-only ctor's + * @param _d the divisor + */ + FastIntDiv(int _d) : d(_d) { computeScalars(); } + FastIntDiv& operator=(int _d) + { + d = _d; + computeScalars(); + return *this; + } + /** @} */ + + /** + * @defgroup DeviceMethods Ctor's which even the device-side can access + * @{ + * @brief host and device ctor's + * @param other source object to be copied from + */ + HDI FastIntDiv(const FastIntDiv& other) : d(other.d), m(other.m), p(other.p) {} + HDI FastIntDiv& operator=(const FastIntDiv& other) + { + d = other.d; + m = other.m; + p = other.p; + return *this; + } + /** @} */ + + /** divisor */ + int d; + /** the term 'm' as found in the reference chapter */ + unsigned m; + /** the term 'p' as found in the reference chapter */ + int p; + + private: + void computeScalars() + { + if (d == 1) { + m = 0; + p = 1; + return; + } else if (d < 0) { + ASSERT(false, "FastIntDiv: division by negative numbers not supported!"); + } else if (d == 0) { + ASSERT(false, "FastIntDiv: got division by zero!"); + } + int64_t nc = ((1LL << 31) / d) * d - 1; + p = 31; + int64_t twoP, rhs; + do { + ++p; + twoP = 1LL << p; + rhs = nc * (d - twoP % d); + } while (twoP <= rhs); + m = (twoP + d - twoP % d) / d; + } +}; // struct FastIntDiv + +/** + * @brief Division overload, so that FastIntDiv can be transparently switched + * to even on device + * @param n numerator + * @param divisor the denominator + * @return the quotient + */ +HDI int operator/(int n, const FastIntDiv& divisor) +{ + if (divisor.d == 1) return n; + int ret = (int64_t(divisor.m) * int64_t(n)) >> divisor.p; + if (n < 0) ++ret; + return ret; +} + +/** + * @brief Modulo overload, so that FastIntDiv can be transparently switched + * to even on device + * @param n numerator + * @param divisor the denominator + * @return the remainder + */ +HDI int operator%(int n, const FastIntDiv& divisor) +{ + int quotient = n / divisor; + int remainder = n - quotient * divisor.d; + return remainder; +} + +}; // namespace raft::util diff --git a/cpp/src/distance/specializations/detail/kernels/gram_matrix_base_double.cu b/cpp/src/distance/specializations/detail/kernels/gram_matrix_base_double.cu new file mode 100644 index 0000000000..c893e9a358 --- /dev/null +++ b/cpp/src/distance/specializations/detail/kernels/gram_matrix_base_double.cu @@ -0,0 +1,19 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +template class raft::distance::kernels::detail::GramMatrixBase; \ No newline at end of file diff --git a/cpp/src/distance/specializations/detail/kernels/gram_matrix_base_float.cu b/cpp/src/distance/specializations/detail/kernels/gram_matrix_base_float.cu new file mode 100644 index 0000000000..3265f828e6 --- /dev/null +++ b/cpp/src/distance/specializations/detail/kernels/gram_matrix_base_float.cu @@ -0,0 +1,19 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +template class raft::distance::kernels::detail::GramMatrixBase; \ No newline at end of file diff --git a/cpp/src/distance/specializations/detail/kernels/polynomial_kernel_double_int.cu b/cpp/src/distance/specializations/detail/kernels/polynomial_kernel_double_int.cu new file mode 100644 index 0000000000..0edf45a6f1 --- /dev/null +++ b/cpp/src/distance/specializations/detail/kernels/polynomial_kernel_double_int.cu @@ -0,0 +1,19 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +template class raft::distance::kernels::detail::PolynomialKernel; \ No newline at end of file diff --git a/cpp/src/distance/specializations/detail/kernels/polynomial_kernel_float_int.cu b/cpp/src/distance/specializations/detail/kernels/polynomial_kernel_float_int.cu new file mode 100644 index 0000000000..a719175e6b --- /dev/null +++ b/cpp/src/distance/specializations/detail/kernels/polynomial_kernel_float_int.cu @@ -0,0 +1,19 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +template class raft::distance::kernels::detail::PolynomialKernel; \ No newline at end of file diff --git a/cpp/src/distance/specializations/detail/kernels/rbf_kernel_double.cu b/cpp/src/distance/specializations/detail/kernels/rbf_kernel_double.cu new file mode 100644 index 0000000000..6577e1b6c7 --- /dev/null +++ b/cpp/src/distance/specializations/detail/kernels/rbf_kernel_double.cu @@ -0,0 +1,19 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +template class raft::distance::kernels::detail::RBFKernel; \ No newline at end of file diff --git a/cpp/src/distance/specializations/detail/kernels/rbf_kernel_float.cu b/cpp/src/distance/specializations/detail/kernels/rbf_kernel_float.cu new file mode 100644 index 0000000000..1d2582cf81 --- /dev/null +++ b/cpp/src/distance/specializations/detail/kernels/rbf_kernel_float.cu @@ -0,0 +1,19 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +template class raft::distance::kernels::detail::RBFKernel; \ No newline at end of file diff --git a/cpp/src/distance/specializations/detail/kernels/tanh_kernel_double.cu b/cpp/src/distance/specializations/detail/kernels/tanh_kernel_double.cu new file mode 100644 index 0000000000..13d5159504 --- /dev/null +++ b/cpp/src/distance/specializations/detail/kernels/tanh_kernel_double.cu @@ -0,0 +1,19 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include <raft/distance/detail/kernels/kernel_matrices.cuh>
+
+template class raft::distance::kernels::detail::TanhKernel<double>;
\ No newline at end of file
diff --git a/cpp/src/distance/specializations/detail/kernels/tanh_kernel_float.cu b/cpp/src/distance/specializations/detail/kernels/tanh_kernel_float.cu
new file mode 100644
index 0000000000..ee62de7d34
--- /dev/null
+++ b/cpp/src/distance/specializations/detail/kernels/tanh_kernel_float.cu
@@ -0,0 +1,19 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/distance/detail/kernels/kernel_matrices.cuh>
+
+template class raft::distance::kernels::detail::TanhKernel<float>;
\ No newline at end of file
diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt
index b10de9d1cc..07ec85bf1e 100644
--- a/cpp/test/CMakeLists.txt
+++ b/cpp/test/CMakeLists.txt
@@ -116,6 +116,7 @@ if(BUILD_TESTS)
     test/distance/dist_minkowski.cu
     test/distance/dist_russell_rao.cu
     test/distance/fused_l2_nn.cu
+    test/distance/gram.cu
     OPTIONAL
     DIST
   )
diff --git a/cpp/test/distance/gram.cu b/cpp/test/distance/gram.cu
new file mode 100644
index 0000000000..cf7215bddb
--- /dev/null
+++ b/cpp/test/distance/gram.cu
@@ -0,0 +1,189 @@
+/*
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#if defined RAFT_DISTANCE_COMPILED
+#include <raft/distance/specializations.cuh>
+#endif
+
+#include "../test_utils.h"
+#include <gtest/gtest.h>
+#include <iostream>
+#include <memory>
+#include <raft/core/handle.hpp>
+#include <raft/distance/kernels.cuh>
+#include <raft/random/rng.cuh>
+#include <raft/util/cuda_utils.cuh>
+#include <raft/util/cudart_utils.hpp>
+#include <rmm/device_uvector.hpp>
+
+namespace raft::distance::kernels {
+
+// Get the offset of element [i,k].
+HDI int get_offset(int i, int k, int ld, bool is_row_major)
+{
+  return is_row_major ? i * ld + k : i + k * ld;
+}
+
+struct GramMatrixInputs {
+  int n1;      // feature vectors in matrix 1
+  int n2;      // feature vectors in matrix 2
+  int n_cols;  // number of elements in a feature vector
+  bool is_row_major;
+  KernelParams kernel;
+  int ld1;
+  int ld2;
+  int ld_out;
+  // We will generate random input using the dimensions given here.
+  // The reference output is calculated by a custom kernel.
+};
+
+std::ostream& operator<<(std::ostream& os, const GramMatrixInputs& p)
+{
+  std::vector<std::string> kernel_names{"linear", "poly", "rbf", "tanh"};
+  os << "/" << p.n1 << "x" << p.n2 << "x" << p.n_cols << "/"
+     << (p.is_row_major ? "RowMajor/" : "ColMajor/") << kernel_names[p.kernel.kernel] << "/ld_"
+     << p.ld1 << "x" << p.ld2 << "x" << p.ld_out;
+  return os;
+}
+
+const std::vector<GramMatrixInputs> inputs = {
+  {42, 137, 2, false, {KernelType::LINEAR}},
+  {42, 137, 2, true, {KernelType::LINEAR}},
+  {42, 137, 2, false, {KernelType::LINEAR}, 64, 179, 181},
+  {42, 137, 2, true, {KernelType::LINEAR}, 64, 179, 181},
+  {137, 42, 2, false, {KernelType::POLYNOMIAL, 2, 0.5, 2.4}},
+  {137, 42, 2, true, {KernelType::POLYNOMIAL, 2, 0.5, 2.4}},
+  {137, 42, 2, false, {KernelType::POLYNOMIAL, 2, 0.5, 2.4}, 159, 73, 144},
+  {137, 42, 2, true, {KernelType::POLYNOMIAL, 2, 0.5, 2.4}, 159, 73, 144},
+  {42, 137, 2, false, {KernelType::TANH, 0, 0.5, 2.4}},
+  {42, 137, 2, true, {KernelType::TANH, 0, 0.5, 2.4}},
+  {42, 137, 2, false, {KernelType::TANH, 0, 0.5, 2.4}, 64, 155, 49},
+  {42, 137, 2, true, {KernelType::TANH, 0, 0.5, 2.4}, 64, 155, 143},
+  {3, 4, 2, false, {KernelType::RBF, 0, 0.5}},
+  {42, 137, 2, false, {KernelType::RBF, 0, 0.5}},
+  {42, 137, 2, true, {KernelType::RBF, 0, 0.5}},
+  // Distance kernel does not support LD parameter yet.
+  //{42, 137, 2, false, {KernelType::RBF, 0, 0.5}, 64, 155, 49},
+  // {42, 137, 2, true, {KernelType::RBF, 0, 0.5}, 64, 155, 143},
+};
+
+template <typename math_t>
+class GramMatrixTest : public ::testing::TestWithParam<GramMatrixInputs> {
+ protected:
+  GramMatrixTest()
+    : params(GetParam()), stream(0), x1(0, stream), x2(0, stream), gram(0, stream), gram_host(0)
+  {
+    RAFT_CUDA_TRY(cudaStreamCreate(&stream));
+
+    if (params.ld1 == 0) { params.ld1 = params.is_row_major ? params.n_cols : params.n1; }
+    if (params.ld2 == 0) { params.ld2 = params.is_row_major ? params.n_cols : params.n2; }
+    if (params.ld_out == 0) { params.ld_out = params.is_row_major ? params.n2 : params.n1; }
+    // Derive the size of the output from the offset of the last element.
+    size_t size = get_offset(params.n1 - 1, params.n_cols - 1, params.ld1, params.is_row_major) + 1;
+    x1.resize(size, stream);
+    size = get_offset(params.n2 - 1, params.n_cols - 1, params.ld2, params.is_row_major) + 1;
+    x2.resize(size, stream);
+    size = get_offset(params.n1 - 1, params.n2 - 1, params.ld_out, params.is_row_major) + 1;
+
+    gram.resize(size, stream);
+    RAFT_CUDA_TRY(cudaMemsetAsync(gram.data(), 0, gram.size() * sizeof(math_t), stream));
+    gram_host.resize(gram.size());
+    std::fill(gram_host.begin(), gram_host.end(), 0);
+
+    raft::random::Rng r(42137ULL);
+    r.uniform(x1.data(), x1.size(), math_t(0), math_t(1), stream);
+    r.uniform(x2.data(), x2.size(), math_t(0), math_t(1), stream);
+  }
+
+  ~GramMatrixTest() override { RAFT_CUDA_TRY_NO_THROW(cudaStreamDestroy(stream)); }
+
+  // Calculate the Gram matrix on the host.
+  void naiveKernel()
+  {
+    std::vector<math_t> x1_host(x1.size());
+    raft::update_host(x1_host.data(), x1.data(), x1.size(), stream);
+    std::vector<math_t> x2_host(x2.size());
+    raft::update_host(x2_host.data(), x2.data(), x2.size(), stream);
+    handle.sync_stream(stream);
+
+    for (int i = 0; i < params.n1; i++) {
+      for (int j = 0; j < params.n2; j++) {
+        float d = 0;
+        for (int k = 0; k < params.n_cols; k++) {
+          if (params.kernel.kernel == KernelType::RBF) {
+            math_t diff = x1_host[get_offset(i, k, params.ld1, params.is_row_major)] -
+                          x2_host[get_offset(j, k, params.ld2, params.is_row_major)];
+            d += diff * diff;
+          } else {
+            d += x1_host[get_offset(i, k, params.ld1, params.is_row_major)] *
+                 x2_host[get_offset(j, k, params.ld2, params.is_row_major)];
+          }
+        }
+        int idx  = get_offset(i, j, params.ld_out, params.is_row_major);
+        math_t v = 0;
+        switch (params.kernel.kernel) {
+          case (KernelType::LINEAR): gram_host[idx] = d; break;
+          case (KernelType::POLYNOMIAL):
+            v              = params.kernel.gamma * d + params.kernel.coef0;
+            gram_host[idx] = std::pow(v, params.kernel.degree);
+            break;
+          case (KernelType::TANH):
+            gram_host[idx] = std::tanh(params.kernel.gamma * d + params.kernel.coef0);
+            break;
+          case (KernelType::RBF): gram_host[idx] = exp(-params.kernel.gamma * d); break;
+        }
+      }
+    }
+  }
+
+  void runTest()
+  {
+    std::unique_ptr<GramMatrixBase<math_t>> kernel = std::unique_ptr<GramMatrixBase<math_t>>(
+      KernelFactory<math_t>::create(params.kernel, handle.get_cublas_handle()));
+
+    kernel->evaluate(x1.data(),
+                     params.n1,
+                     params.n_cols,
+                     x2.data(),
+                     params.n2,
+                     gram.data(),
+                     params.is_row_major,
+                     stream,
+                     params.ld1,
+                     params.ld2,
+                     params.ld_out);
+    naiveKernel();
+    ASSERT_TRUE(raft::devArrMatchHost(
+      gram_host.data(), gram.data(), gram.size(), raft::CompareApprox<math_t>(1e-6f)));
+  }
+
+  raft::handle_t handle;
+  cudaStream_t stream = 0;
+  GramMatrixInputs params;
+
+  rmm::device_uvector<math_t> x1;
+  rmm::device_uvector<math_t> x2;
+  rmm::device_uvector<math_t> gram;
+  std::vector<math_t> gram_host;
+};
+
+typedef GramMatrixTest<float> GramMatrixTestFloat;
+typedef GramMatrixTest<double> GramMatrixTestDouble;
+
+TEST_P(GramMatrixTestFloat, Gram) { runTest(); }
+
+INSTANTIATE_TEST_SUITE_P(GramMatrixTests, GramMatrixTestFloat, ::testing::ValuesIn(inputs));
+};  // end namespace raft::distance::kernels
\ No newline at end of file
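
For reference, the sketch below (not part of the patch) shows how the relocated API exercised by runTest() above might be called from user code. It assumes the new public header raft/distance/kernels.cuh exposes KernelParams, KernelType, KernelFactory, and GramMatrixBase under raft::distance::kernels, as the PR title and the test namespace suggest; the matrix sizes, seeds, and random-fill step are purely illustrative.

// Sketch: computing a polynomial Gram matrix K(x, y) = (gamma * <x, y> + coef0)^degree
// with the relocated raft::distance::kernels API. Namespace qualification of the
// kernel types is an assumption based on this PR; the evaluate() argument order
// mirrors the call in runTest() above.
#include <raft/core/handle.hpp>
#include <raft/distance/kernels.cuh>
#include <raft/random/rng.cuh>
#include <rmm/device_uvector.hpp>

#include <memory>

int main()
{
  raft::handle_t handle;
  cudaStream_t stream = handle.get_stream();

  // Illustrative sizes: 128 and 64 row-major feature vectors of length 16.
  const int n1 = 128, n2 = 64, n_cols = 16;
  rmm::device_uvector<float> x1(n1 * n_cols, stream);
  rmm::device_uvector<float> x2(n2 * n_cols, stream);
  rmm::device_uvector<float> gram(n1 * n2, stream);

  raft::random::Rng rng(1234ULL);
  rng.uniform(x1.data(), x1.size(), 0.0f, 1.0f, stream);
  rng.uniform(x2.data(), x2.size(), 0.0f, 1.0f, stream);

  // degree = 2, gamma = 0.5, coef0 = 2.4, mirroring the polynomial test inputs above.
  raft::distance::kernels::KernelParams params{
    raft::distance::kernels::KernelType::POLYNOMIAL, 2, 0.5, 2.4};

  std::unique_ptr<raft::distance::kernels::GramMatrixBase<float>> kernel(
    raft::distance::kernels::KernelFactory<float>::create(params, handle.get_cublas_handle()));

  // Inputs, sizes, output, row-major flag, stream, then the three leading dimensions
  // (compact row-major layout: ld1 = ld2 = n_cols, ld_out = n2).
  kernel->evaluate(
    x1.data(), n1, n_cols, x2.data(), n2, gram.data(), true, stream, n_cols, n_cols, n2);

  handle.sync_stream(stream);
  return 0;
}

As in the test inputs above, the RBF kernel currently routes through the distance primitives and does not yet accept custom leading dimensions, so compact layouts should be used for that kernel type.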