From 8d3e8a0ce255710c4154693a859b66fa722a4d36 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Fri, 2 Sep 2022 22:21:22 +0200 Subject: [PATCH 01/60] contractions: Concentrate tile index calculations The calculation of the tile indices are now performed in ldgXY(). This will make it possible to remove all state related to the tile index out of the class in the next commit. Note that the calculation of the tile index can depend on which overloaded constructor is called(!) --- .../detail/pairwise_distance_base.cuh | 27 ++---- .../raft/linalg/detail/contractions.cuh | 84 +++++++++++++------ .../knn/detail/epsilon_neighborhood.cuh | 10 +-- 3 files changed, 72 insertions(+), 49 deletions(-) diff --git a/cpp/include/raft/distance/detail/pairwise_distance_base.cuh b/cpp/include/raft/distance/detail/pairwise_distance_base.cuh index 445b4bac52..c401b90601 100644 --- a/cpp/include/raft/distance/detail/pairwise_distance_base.cuh +++ b/cpp/include/raft/distance/detail/pairwise_distance_base.cuh @@ -138,27 +138,14 @@ struct PairwiseDistances : public BaseClass { DI void updateIndicesY() { const auto stride = P::Nblk * gridDim.x; - if (isRowMajor) { - this->y += stride * this->ldb; - } else { - this->y += stride; - } - this->yrowid += stride; + this->increment_grid_idx_n(stride); } DI void updateIndicesXY() { const auto stride = P::Mblk * gridDim.y; - if (isRowMajor) { - this->x += stride * this->lda; - this->yrowid = IdxT(blockIdx.x) * P::Nblk + this->srowid; - this->y = yBase + this->yrowid * this->ldb; - } else { - this->x += stride; - this->yrowid = IdxT(blockIdx.x) * P::Nblk; - this->y = yBase + this->yrowid + this->srowid * this->ldb; - } - this->xrowid += stride; + this->increment_grid_idx_m(stride); + this->reset_grid_idx_n(); } DI void ldgNextGridStride(IdxT gridStrideX, IdxT gridStrideY) @@ -187,7 +174,7 @@ struct PairwiseDistances : public BaseClass { this->stsXY(); __syncthreads(); - this->pageWr ^= 1; + this->switch_write_buffer(); } DI void loop() @@ -197,15 
+184,15 @@ struct PairwiseDistances : public BaseClass { accumulate(); // on the previous k-block this->stsXY(); __syncthreads(); - this->pageWr ^= 1; - this->pageRd ^= 1; + this->switch_write_buffer(); + this->switch_read_buffer(); } accumulate(); // last iteration // This is needed for making sure next grid stride of // non-norm based metrics uses previously accumulated buffer so // it doesn't make shmem dirty until previous iteration // is complete. - this->pageRd ^= 1; + this->switch_read_buffer(); } DI void accumulate() diff --git a/cpp/include/raft/linalg/detail/contractions.cuh b/cpp/include/raft/linalg/detail/contractions.cuh index e247f39bc7..346ec34771 100644 --- a/cpp/include/raft/linalg/detail/contractions.cuh +++ b/cpp/include/raft/linalg/detail/contractions.cuh @@ -40,14 +40,15 @@ struct Contractions_NT { /** leading dimension in Output D */ IdxT ldd; - /** current thread's global mem row id for X data */ - IdxT xrowid; - /** current thread's global mem row id for Y data */ - IdxT yrowid; /** global memory pointer to X matrix */ - const DataT* x; + const DataT* x_base; /** global memory pointer to Y matrix */ - const DataT* y; + const DataT* y_base; + + /** Support variables to provide backward compatibility **/ + IdxT grid_idx_m = 0; + IdxT grid_idx_n = 0; + bool first_constructor_called; /** current thread's smem row id */ int srowid; @@ -94,10 +95,8 @@ struct Contractions_NT { k(_k), lda(_k), ldb(_k), - xrowid(IdxT(blockIdx.x) * P::Mblk + threadIdx.x / P::LdgThRow), - yrowid(IdxT(blockIdx.y) * P::Nblk + threadIdx.x / P::LdgThRow), - x(_x + xrowid * lda), - y(_y + yrowid * ldb), + x_base(_x), + y_base(_y), srowid(threadIdx.x / P::LdgThRow), scolid((threadIdx.x % P::LdgThRow) * P::Veclen), accrowid(threadIdx.x / P::AccThCols), @@ -105,7 +104,8 @@ struct Contractions_NT { sx((DataT*)_smem), sy(&(sx[P::SmemPageX])), pageWr(0), - pageRd(0) + pageRd(0), + first_constructor_called(true) { } @@ -133,6 +133,8 @@ struct Contractions_NT { lda(_lda), 
ldb(_ldb), ldd(_ldd), + x_base(_x), + y_base(_y), srowid(threadIdx.x / P::LdgThRow), scolid((threadIdx.x % P::LdgThRow) * P::Veclen), accrowid(threadIdx.x / P::AccThCols), @@ -140,19 +142,9 @@ struct Contractions_NT { sx((DataT*)_smem), sy(&(sx[P::SmemPageX])), pageWr(0), - pageRd(0) + pageRd(0), + first_constructor_called(false) { - if (isRowMajor) { - xrowid = IdxT(blockIdx.y) * P::Mblk + srowid; - yrowid = IdxT(blockIdx.x) * P::Nblk + srowid; - x = _x + xrowid * lda; - y = _y + yrowid * ldb; - } else { - xrowid = IdxT(blockIdx.y) * P::Mblk; - yrowid = IdxT(blockIdx.x) * P::Nblk; - x = _x + xrowid + srowid * lda; - y = _y + yrowid + srowid * ldb; - } } protected: @@ -166,6 +158,12 @@ struct Contractions_NT { ldgY(kidx); } + DI void ldgXY(IdxT tile_idx_m, IdxT tile_idx_n, IdxT kidx) + { + ldgX(tile_idx_m, kidx); + ldgY(tile_idx_n, kidx); + } + /** * @brief Store current block of X/Y from registers to smem * @param[in] kidx current start index of k to be loaded @@ -186,9 +184,35 @@ struct Contractions_NT { ldsY(kidx, sy + pageRd * P::SmemPage); } + DI void increment_grid_idx_m(IdxT by) { grid_idx_m += by; } + + DI void increment_grid_idx_n(IdxT by) { grid_idx_n += by; } + + DI void reset_grid_idx_n() { grid_idx_n = 0; } + + DI void switch_read_buffer() { this->pageRd ^= 1; } + + DI void switch_write_buffer() { this->pageWr ^= 1; } + private: DI void ldgX(IdxT kidx) { + // Backward compatible way to determine the tile index. This depends on + // whether the first or the second constructor was called. The first + // constructor is called in epsilon_neighborhood.cuh and the second + // constructor is called in pairwise_distance_base.cuh. + if (first_constructor_called) { + ldgX(IdxT(blockIdx.x) * P::Mblk, kidx); + } else { + ldgX(grid_idx_m + IdxT(blockIdx.y) * P::Mblk, kidx); + } + } + + DI void ldgX(IdxT tile_idx_m, IdxT kidx) + { + IdxT xrowid = isRowMajor ? tile_idx_m + srowid : tile_idx_m; + auto x = isRowMajor ? 
x_base + xrowid * lda : x_base + xrowid + srowid * lda; + if (isRowMajor) { auto numRows = m; auto koffset = kidx + scolid; @@ -222,6 +246,18 @@ struct Contractions_NT { DI void ldgY(IdxT kidx) { + if (first_constructor_called) { + ldgY(IdxT(blockIdx.y) * P::Nblk, kidx); + } else { + ldgY(grid_idx_n + IdxT(blockIdx.x) * P::Nblk, kidx); + } + } + + DI void ldgY(IdxT tile_idx_n, IdxT kidx) + { + IdxT yrowid = isRowMajor ? tile_idx_n + srowid : tile_idx_n; + auto y = isRowMajor ? y_base + yrowid * ldb : y_base + yrowid + srowid * ldb; + if (isRowMajor) { auto numRows = n; auto koffset = kidx + scolid; @@ -315,4 +351,4 @@ struct Contractions_NT { } // namespace detail } // namespace linalg -} // namespace raft \ No newline at end of file +} // namespace raft diff --git a/cpp/include/raft/spatial/knn/detail/epsilon_neighborhood.cuh b/cpp/include/raft/spatial/knn/detail/epsilon_neighborhood.cuh index e4843acee9..7616083796 100644 --- a/cpp/include/raft/spatial/knn/detail/epsilon_neighborhood.cuh +++ b/cpp/include/raft/spatial/knn/detail/epsilon_neighborhood.cuh @@ -64,7 +64,7 @@ struct EpsUnexpL2SqNeighborhood : public BaseClass { private: DI void prolog() { - this->ldgXY(0); + this->ldgXY(IdxT(blockIdx.x) * P::Mblk, IdxT(blockIdx.y) * P::Nblk, 0); #pragma unroll for (int i = 0; i < P::AccRowsPerTh; ++i) { #pragma unroll @@ -74,18 +74,18 @@ struct EpsUnexpL2SqNeighborhood : public BaseClass { } this->stsXY(); __syncthreads(); - this->pageWr ^= 1; + this->switch_write_buffer(); } DI void loop() { for (int kidx = P::Kblk; kidx < this->k; kidx += P::Kblk) { - this->ldgXY(kidx); + this->ldgXY(IdxT(blockIdx.x) * P::Mblk, IdxT(blockIdx.y) * P::Nblk, kidx); accumulate(); // on the previous k-block this->stsXY(); __syncthreads(); - this->pageWr ^= 1; - this->pageRd ^= 1; + this->switch_write_buffer(); + this->switch_read_buffer(); } accumulate(); // last iteration } From cb7baab5e501ae16841c9a82fc2c6af8d99521db Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Fri, 2 Sep 
2022 22:51:49 +0200 Subject: [PATCH 02/60] pairwise_distance_base: Remove all ldgXY(0) calls This commit moves all grid and tile indexing logic into the caller. Contractions_NT is now only responsible for *intra*-tile indexing. Due to the complexity of the epilog function, the ldgNextGridStride function is not yet called from within the main loop. That is the next goal so that we have all the grid and tile indexing localized in the loop. --- .../detail/pairwise_distance_base.cuh | 121 ++++++++++-------- .../raft/linalg/detail/contractions.cuh | 45 +------ 2 files changed, 67 insertions(+), 99 deletions(-) diff --git a/cpp/include/raft/distance/detail/pairwise_distance_base.cuh b/cpp/include/raft/distance/detail/pairwise_distance_base.cuh index c401b90601..15bf334ffb 100644 --- a/cpp/include/raft/distance/detail/pairwise_distance_base.cuh +++ b/cpp/include/raft/distance/detail/pairwise_distance_base.cuh @@ -87,6 +87,12 @@ struct PairwiseDistances : public BaseClass { FinalLambda fin_op; rowEpilogueLambda rowEpilog_op; + + const IdxT grid_stride_m; + const IdxT grid_stride_n; + const IdxT grid_offset_m; + const IdxT grid_offset_n; + AccT acc[P::AccRowsPerTh][P::AccColsPerTh]; public: @@ -116,53 +122,63 @@ struct PairwiseDistances : public BaseClass { core_op(_core_op), epilog_op(_epilog_op), fin_op(_fin_op), - rowEpilog_op(_rowEpilog_op) + rowEpilog_op(_rowEpilog_op), + grid_stride_m(P::Nblk * gridDim.y), + grid_stride_n(P::Mblk * gridDim.x), + grid_offset_m(P::Mblk * blockIdx.y), + grid_offset_n(P::Nblk * blockIdx.x) { } DI void run() { - for (auto gridStrideY = blockIdx.y * P::Mblk; gridStrideY < this->m; - gridStrideY += P::Mblk * gridDim.y) { - for (auto gridStrideX = blockIdx.x * P::Nblk; gridStrideX < this->n; - gridStrideX += P::Nblk * gridDim.x) { - prolog(gridStrideX, gridStrideY); - loop(); - epilog(gridStrideX, gridStrideY); + for (auto tile_idx_m = grid_offset_m; tile_idx_m < this->m; tile_idx_m += grid_stride_m) { + this->ldgXY(tile_idx_m, grid_offset_n, 
0); + for (auto tile_idx_n = grid_offset_n; tile_idx_n < this->n; tile_idx_n += grid_stride_n) { + reset_accumulator(); + this->stsXY(); + __syncthreads(); + this->switch_write_buffer(); + + for (int kidx = P::Kblk; kidx < this->k; kidx += P::Kblk) { + this->ldgXY(tile_idx_m, tile_idx_n, kidx); + // Process all data in shared memory (previous k-block) and + // accumulate in registers. + accumulate(); + this->stsXY(); + __syncthreads(); + this->switch_write_buffer(); + this->switch_read_buffer(); + } + accumulate(); // last iteration + // This is needed for making sure next grid stride of + // non-norm based metrics uses previously accumulated buffer so + // it doesn't make shmem dirty until previous iteration + // is complete. + this->switch_read_buffer(); + + epilog(tile_idx_n, tile_idx_m); } - rowEpilog_op(gridStrideY); + rowEpilog_op(tile_idx_m); } } private: - DI void updateIndicesY() - { - const auto stride = P::Nblk * gridDim.x; - this->increment_grid_idx_n(stride); - } - - DI void updateIndicesXY() - { - const auto stride = P::Mblk * gridDim.y; - this->increment_grid_idx_m(stride); - this->reset_grid_idx_n(); - } - - DI void ldgNextGridStride(IdxT gridStrideX, IdxT gridStrideY) + DI void ldgNextGridStride(IdxT tile_idx_n, IdxT tile_idx_m) { // Fetch next grid stride ldg if within range - if ((gridStrideX + gridDim.x * P::Nblk) < this->n) { - updateIndicesY(); - this->ldgXY(0); - } else if ((gridStrideY + gridDim.y * P::Mblk) < this->m) { - updateIndicesXY(); - this->ldgXY(0); + const auto next_tile_tile_idx_n = tile_idx_n + grid_stride_n; + const auto next_tile_tile_idx_m = tile_idx_m + grid_stride_m; + if ((next_tile_tile_idx_n) < this->n) { + this->ldgXY(tile_idx_m, next_tile_tile_idx_n, 0); + } else if ((next_tile_tile_idx_m) < this->m) { + this->ldgXY(next_tile_tile_idx_m, grid_offset_n, 0); } } - DI void prolog(IdxT gridStrideX, IdxT gridStrideY) + DI void prolog(IdxT tile_idx_n, IdxT tile_idx_m) { - if (gridStrideX == blockIdx.x * P::Nblk) { 
this->ldgXY(0); } + if (tile_idx_n == blockIdx.x * P::Nblk) { this->ldgXY(0); } #pragma unroll for (int i = 0; i < P::AccRowsPerTh; ++i) { @@ -177,22 +193,15 @@ struct PairwiseDistances : public BaseClass { this->switch_write_buffer(); } - DI void loop() - { - for (int kidx = P::Kblk; kidx < this->k; kidx += P::Kblk) { - this->ldgXY(kidx); - accumulate(); // on the previous k-block - this->stsXY(); - __syncthreads(); - this->switch_write_buffer(); - this->switch_read_buffer(); + DI void reset_accumulator() { + // Reset accumulator registers to zero. +#pragma unroll + for (int i = 0; i < P::AccRowsPerTh; ++i) { +#pragma unroll + for (int j = 0; j < P::AccColsPerTh; ++j) { + acc[i][j] = BaseClass::Zero; + } } - accumulate(); // last iteration - // This is needed for making sure next grid stride of - // non-norm based metrics uses previously accumulated buffer so - // it doesn't make shmem dirty until previous iteration - // is complete. - this->switch_read_buffer(); } DI void accumulate() @@ -213,22 +222,22 @@ struct PairwiseDistances : public BaseClass { } } - DI void epilog(IdxT gridStrideX, IdxT gridStrideY) + DI void epilog(IdxT tile_idx_n, IdxT tile_idx_m) { if (useNorms) { DataT* sxNorm = (DataT*)(&smem[P::SmemSize]); DataT* syNorm = (&sxNorm[P::Mblk]); // Load x & y norms required by this threadblock in shmem buffer - if (gridStrideX == blockIdx.x * P::Nblk) { + if (tile_idx_n == blockIdx.x * P::Nblk) { for (int i = threadIdx.x; i < P::Mblk; i += P::Nthreads) { - auto idx = gridStrideY + i; + auto idx = tile_idx_m + i; sxNorm[i] = idx < this->m ? xn[idx] : 0; } } for (int i = threadIdx.x; i < P::Nblk; i += P::Nthreads) { - auto idx = gridStrideX + i; + auto idx = tile_idx_n + i; syNorm[i] = idx < this->n ? 
yn[idx] : 0; } @@ -245,17 +254,17 @@ struct PairwiseDistances : public BaseClass { } // Overlap ldg with epilog computation - ldgNextGridStride(gridStrideX, gridStrideY); - epilog_op(acc, regxn, regyn, gridStrideX, gridStrideY); + ldgNextGridStride(tile_idx_n, tile_idx_m); + epilog_op(acc, regxn, regyn, tile_idx_n, tile_idx_m); } else { // Overlap ldg with epilog computation - ldgNextGridStride(gridStrideX, gridStrideY); - epilog_op(acc, nullptr, nullptr, gridStrideX, gridStrideY); + ldgNextGridStride(tile_idx_n, tile_idx_m); + epilog_op(acc, nullptr, nullptr, tile_idx_n, tile_idx_m); } if (writeOut) { - IdxT starty = gridStrideY + this->accrowid; - IdxT startx = gridStrideX + this->acccolid; + IdxT starty = tile_idx_m + this->accrowid; + IdxT startx = tile_idx_n + this->acccolid; #pragma unroll for (int i = 0; i < P::AccRowsPerTh; ++i) { diff --git a/cpp/include/raft/linalg/detail/contractions.cuh b/cpp/include/raft/linalg/detail/contractions.cuh index 346ec34771..f2d71117f7 100644 --- a/cpp/include/raft/linalg/detail/contractions.cuh +++ b/cpp/include/raft/linalg/detail/contractions.cuh @@ -45,11 +45,6 @@ struct Contractions_NT { /** global memory pointer to Y matrix */ const DataT* y_base; - /** Support variables to provide backward compatibility **/ - IdxT grid_idx_m = 0; - IdxT grid_idx_n = 0; - bool first_constructor_called; - /** current thread's smem row id */ int srowid; /** current thread's smem column id */ @@ -104,8 +99,7 @@ struct Contractions_NT { sx((DataT*)_smem), sy(&(sx[P::SmemPageX])), pageWr(0), - pageRd(0), - first_constructor_called(true) + pageRd(0) { } @@ -142,8 +136,7 @@ struct Contractions_NT { sx((DataT*)_smem), sy(&(sx[P::SmemPageX])), pageWr(0), - pageRd(0), - first_constructor_called(false) + pageRd(0) { } @@ -152,12 +145,6 @@ struct Contractions_NT { * @brief Load current block of X/Y from global memory to registers * @param[in] kidx current start index of k to be loaded */ - DI void ldgXY(IdxT kidx) - { - ldgX(kidx); - ldgY(kidx); - 
} - DI void ldgXY(IdxT tile_idx_m, IdxT tile_idx_n, IdxT kidx) { ldgX(tile_idx_m, kidx); @@ -184,30 +171,11 @@ struct Contractions_NT { ldsY(kidx, sy + pageRd * P::SmemPage); } - DI void increment_grid_idx_m(IdxT by) { grid_idx_m += by; } - - DI void increment_grid_idx_n(IdxT by) { grid_idx_n += by; } - - DI void reset_grid_idx_n() { grid_idx_n = 0; } - DI void switch_read_buffer() { this->pageRd ^= 1; } DI void switch_write_buffer() { this->pageWr ^= 1; } private: - DI void ldgX(IdxT kidx) - { - // Backward compatible way to determine the tile index. This depends on - // whether the first or the second constructor was called. The first - // constructor is called in epsilon_neighborhood.cuh and the second - // constructor is called in pairwise_distance_base.cuh. - if (first_constructor_called) { - ldgX(IdxT(blockIdx.x) * P::Mblk, kidx); - } else { - ldgX(grid_idx_m + IdxT(blockIdx.y) * P::Mblk, kidx); - } - } - DI void ldgX(IdxT tile_idx_m, IdxT kidx) { IdxT xrowid = isRowMajor ? tile_idx_m + srowid : tile_idx_m; @@ -244,15 +212,6 @@ struct Contractions_NT { } } - DI void ldgY(IdxT kidx) - { - if (first_constructor_called) { - ldgY(IdxT(blockIdx.y) * P::Nblk, kidx); - } else { - ldgY(grid_idx_n + IdxT(blockIdx.x) * P::Nblk, kidx); - } - } - DI void ldgY(IdxT tile_idx_n, IdxT kidx) { IdxT yrowid = isRowMajor ? tile_idx_n + srowid : tile_idx_n; From 066bf3b22d90412e280e77492e42160e0fa011ce Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Fri, 2 Sep 2022 23:40:32 +0200 Subject: [PATCH 03/60] pairwise_distance_base: Move all logic into run loop This commit removes the epilog function and moves its functionality into the run loop. The next step might be to see if the ldgNextGridStride() method has to be called the current location, or if performance is the same if its called at the start of the loop. 
--- .../detail/pairwise_distance_base.cuh | 128 ++++++++---------- 1 file changed, 57 insertions(+), 71 deletions(-) diff --git a/cpp/include/raft/distance/detail/pairwise_distance_base.cuh b/cpp/include/raft/distance/detail/pairwise_distance_base.cuh index 15bf334ffb..78effeca6d 100644 --- a/cpp/include/raft/distance/detail/pairwise_distance_base.cuh +++ b/cpp/include/raft/distance/detail/pairwise_distance_base.cuh @@ -87,7 +87,6 @@ struct PairwiseDistances : public BaseClass { FinalLambda fin_op; rowEpilogueLambda rowEpilog_op; - const IdxT grid_stride_m; const IdxT grid_stride_n; const IdxT grid_offset_m; @@ -141,14 +140,14 @@ struct PairwiseDistances : public BaseClass { this->switch_write_buffer(); for (int kidx = P::Kblk; kidx < this->k; kidx += P::Kblk) { - this->ldgXY(tile_idx_m, tile_idx_n, kidx); - // Process all data in shared memory (previous k-block) and - // accumulate in registers. - accumulate(); - this->stsXY(); - __syncthreads(); - this->switch_write_buffer(); - this->switch_read_buffer(); + this->ldgXY(tile_idx_m, tile_idx_n, kidx); + // Process all data in shared memory (previous k-block) and + // accumulate in registers. + accumulate(); + this->stsXY(); + __syncthreads(); + this->switch_write_buffer(); + this->switch_read_buffer(); } accumulate(); // last iteration // This is needed for making sure next grid stride of @@ -157,14 +156,25 @@ struct PairwiseDistances : public BaseClass { // is complete. 
this->switch_read_buffer(); - epilog(tile_idx_n, tile_idx_m); + if (useNorms) { + DataT regxn[P::AccRowsPerTh], regyn[P::AccColsPerTh]; + load_norms(tile_idx_m, tile_idx_n, regxn, regyn); + // Overlap ldg with epilog computation + ldgNextGridStride(tile_idx_m, tile_idx_n); + epilog_op(acc, regxn, regyn, tile_idx_n, tile_idx_m); + } else { + // Overlap ldg with epilog computation + ldgNextGridStride(tile_idx_m, tile_idx_n); + epilog_op(acc, nullptr, nullptr, tile_idx_n, tile_idx_m); + } + if (writeOut) { store_output(tile_idx_m, tile_idx_n); } } rowEpilog_op(tile_idx_m); } } private: - DI void ldgNextGridStride(IdxT tile_idx_n, IdxT tile_idx_m) + DI void ldgNextGridStride(IdxT tile_idx_m, IdxT tile_idx_n) { // Fetch next grid stride ldg if within range const auto next_tile_tile_idx_n = tile_idx_n + grid_stride_n; @@ -176,24 +186,8 @@ struct PairwiseDistances : public BaseClass { } } - DI void prolog(IdxT tile_idx_n, IdxT tile_idx_m) + DI void reset_accumulator() { - if (tile_idx_n == blockIdx.x * P::Nblk) { this->ldgXY(0); } - -#pragma unroll - for (int i = 0; i < P::AccRowsPerTh; ++i) { -#pragma unroll - for (int j = 0; j < P::AccColsPerTh; ++j) { - acc[i][j] = BaseClass::Zero; - } - } - - this->stsXY(); - __syncthreads(); - this->switch_write_buffer(); - } - - DI void reset_accumulator() { // Reset accumulator registers to zero. #pragma unroll for (int i = 0; i < P::AccRowsPerTh; ++i) { @@ -222,60 +216,52 @@ struct PairwiseDistances : public BaseClass { } } - DI void epilog(IdxT tile_idx_n, IdxT tile_idx_m) + DI void load_norms(IdxT tile_idx_m, + IdxT tile_idx_n, + DataT (®xn)[P::AccRowsPerTh], + DataT (®yn)[P::AccColsPerTh]) { - if (useNorms) { - DataT* sxNorm = (DataT*)(&smem[P::SmemSize]); - DataT* syNorm = (&sxNorm[P::Mblk]); - - // Load x & y norms required by this threadblock in shmem buffer - if (tile_idx_n == blockIdx.x * P::Nblk) { - for (int i = threadIdx.x; i < P::Mblk; i += P::Nthreads) { - auto idx = tile_idx_m + i; - sxNorm[i] = idx < this->m ? 
xn[idx] : 0; - } - } - - for (int i = threadIdx.x; i < P::Nblk; i += P::Nthreads) { - auto idx = tile_idx_n + i; - syNorm[i] = idx < this->n ? yn[idx] : 0; + DataT* sxNorm = (DataT*)(&smem[P::SmemSize]); + DataT* syNorm = (&sxNorm[P::Mblk]); + + // Load x & y norms required by this threadblock in shmem buffer + if (tile_idx_n == blockIdx.x * P::Nblk) { + for (int i = threadIdx.x; i < P::Mblk; i += P::Nthreads) { + auto idx = tile_idx_m + i; + sxNorm[i] = idx < this->m ? xn[idx] : 0; } + } - __syncthreads(); + for (int i = threadIdx.x; i < P::Nblk; i += P::Nthreads) { + auto idx = tile_idx_n + i; + syNorm[i] = idx < this->n ? yn[idx] : 0; + } + __syncthreads(); - DataT regxn[P::AccRowsPerTh], regyn[P::AccColsPerTh]; #pragma unroll - for (int i = 0; i < P::AccRowsPerTh; ++i) { - regxn[i] = sxNorm[i * P::AccThRows + (threadIdx.x / P::AccThCols)]; - } + for (int i = 0; i < P::AccRowsPerTh; ++i) { + regxn[i] = sxNorm[i * P::AccThRows + (threadIdx.x / P::AccThCols)]; + } #pragma unroll - for (int i = 0; i < P::AccColsPerTh; ++i) { - regyn[i] = syNorm[i * P::AccThCols + (threadIdx.x % P::AccThCols)]; - } - - // Overlap ldg with epilog computation - ldgNextGridStride(tile_idx_n, tile_idx_m); - epilog_op(acc, regxn, regyn, tile_idx_n, tile_idx_m); - } else { - // Overlap ldg with epilog computation - ldgNextGridStride(tile_idx_n, tile_idx_m); - epilog_op(acc, nullptr, nullptr, tile_idx_n, tile_idx_m); + for (int i = 0; i < P::AccColsPerTh; ++i) { + regyn[i] = syNorm[i * P::AccThCols + (threadIdx.x % P::AccThCols)]; } + } - if (writeOut) { - IdxT starty = tile_idx_m + this->accrowid; - IdxT startx = tile_idx_n + this->acccolid; + DI void store_output(IdxT tile_idx_m, IdxT tile_idx_n) + { + IdxT starty = tile_idx_m + this->accrowid; + IdxT startx = tile_idx_n + this->acccolid; #pragma unroll - for (int i = 0; i < P::AccRowsPerTh; ++i) { - auto rowId = starty + i * P::AccThRows; + for (int i = 0; i < P::AccRowsPerTh; ++i) { + auto rowId = starty + i * P::AccThRows; #pragma 
unroll - for (int j = 0; j < P::AccColsPerTh; ++j) { - auto colId = startx + j * P::AccThCols; - if (rowId < this->m && colId < this->n) { - // Promote to 64 bit index for final write, as output array can be > 2^31 - dOutput[std::size_t(rowId) * this->n + colId] = fin_op(acc[i][j], 0); - } + for (int j = 0; j < P::AccColsPerTh; ++j) { + auto colId = startx + j * P::AccThCols; + if (rowId < this->m && colId < this->n) { + // Promote to 64 bit index for final write, as output array can be > 2^31 + dOutput[std::size_t(rowId) * this->n + colId] = fin_op(acc[i][j], 0); } } } From a15d5fc1ecad74f04190ee351436b79db3127b8b Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Wed, 5 Oct 2022 16:17:56 +0200 Subject: [PATCH 04/60] pairwise_distance_base: Fix typo This results in subtle issues with non-square KernelPolicy, as found in fusedL2KNN. --- cpp/include/raft/distance/detail/pairwise_distance_base.cuh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/include/raft/distance/detail/pairwise_distance_base.cuh b/cpp/include/raft/distance/detail/pairwise_distance_base.cuh index 78effeca6d..5da3b6f8c1 100644 --- a/cpp/include/raft/distance/detail/pairwise_distance_base.cuh +++ b/cpp/include/raft/distance/detail/pairwise_distance_base.cuh @@ -122,8 +122,8 @@ struct PairwiseDistances : public BaseClass { epilog_op(_epilog_op), fin_op(_fin_op), rowEpilog_op(_rowEpilog_op), - grid_stride_m(P::Nblk * gridDim.y), - grid_stride_n(P::Mblk * gridDim.x), + grid_stride_m(P::Mblk * gridDim.y), + grid_stride_n(P::Nblk * gridDim.x), grid_offset_m(P::Mblk * blockIdx.y), grid_offset_n(P::Nblk * blockIdx.x) { From 71c6da65ca544fe29dc0ab7f10e90b0cc72dca09 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Wed, 11 Jan 2023 16:01:18 +0100 Subject: [PATCH 05/60] Remove deprecated header --- cpp/include/raft/distance/distance.hpp | 23 -------------- cpp/include/raft/distance/distance_type.hpp | 27 ---------------- cpp/include/raft/distance/fused_l2_nn.hpp | 31 
------------------- cpp/include/raft/distance/specializations.hpp | 31 ------------------- 4 files changed, 112 deletions(-) delete mode 100644 cpp/include/raft/distance/distance.hpp delete mode 100644 cpp/include/raft/distance/distance_type.hpp delete mode 100644 cpp/include/raft/distance/fused_l2_nn.hpp delete mode 100644 cpp/include/raft/distance/specializations.hpp diff --git a/cpp/include/raft/distance/distance.hpp b/cpp/include/raft/distance/distance.hpp deleted file mode 100644 index e5d39be86b..0000000000 --- a/cpp/include/raft/distance/distance.hpp +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright (c) 2018-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/** - * This file is deprecated and will be removed in release 22.06. - * Please use the cuh version instead. - */ - -#pragma once - -#include \ No newline at end of file diff --git a/cpp/include/raft/distance/distance_type.hpp b/cpp/include/raft/distance/distance_type.hpp deleted file mode 100644 index f6eb4614f9..0000000000 --- a/cpp/include/raft/distance/distance_type.hpp +++ /dev/null @@ -1,27 +0,0 @@ -/* - * Copyright (c) 2018-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/** - * This file is deprecated and will be removed at some point in a future release. - * Please use `raft/distance/distance_types.hpp` instead. - */ - -#pragma once - -#pragma message(__FILE__ \ - " is deprecated and will be removed in a future release." \ - " Please use distance_types.hpp instead.") - -#include \ No newline at end of file diff --git a/cpp/include/raft/distance/fused_l2_nn.hpp b/cpp/include/raft/distance/fused_l2_nn.hpp deleted file mode 100644 index 74ad0974f4..0000000000 --- a/cpp/include/raft/distance/fused_l2_nn.hpp +++ /dev/null @@ -1,31 +0,0 @@ -/* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/** - * This file is deprecated and will be removed in release 22.06. - * Please use the cuh version instead. - */ - -/** - * DISCLAIMER: this file is deprecated: use fused_l2_nn.cuh instead - */ - -#pragma once - -#pragma message(__FILE__ \ - " is deprecated and will be removed in a future release." 
\ - " Please use the cuh version instead.") - -#include "fused_l2_nn.cuh" diff --git a/cpp/include/raft/distance/specializations.hpp b/cpp/include/raft/distance/specializations.hpp deleted file mode 100644 index 04afb73036..0000000000 --- a/cpp/include/raft/distance/specializations.hpp +++ /dev/null @@ -1,31 +0,0 @@ -/* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/** - * This file is deprecated and will be removed in release 22.06. - * Please use the cuh version instead. - */ - -/** - * DISCLAIMER: this file is deprecated: use specializations.cuh instead - */ - -#pragma once - -#pragma message(__FILE__ \ - " is deprecated and will be removed in a future release." 
\ - " Please use the cuh version instead.") - -#include "specializations.cuh" From 4bbedf660d8e1f2c01ab2bd8066a96cc7bd40307 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Thu, 12 Jan 2023 13:50:51 +0100 Subject: [PATCH 06/60] Replace lambdas by raft::void_op --- cpp/include/raft/distance/detail/pairwise_distance_base.cuh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/include/raft/distance/detail/pairwise_distance_base.cuh b/cpp/include/raft/distance/detail/pairwise_distance_base.cuh index 5da3b6f8c1..140664f394 100644 --- a/cpp/include/raft/distance/detail/pairwise_distance_base.cuh +++ b/cpp/include/raft/distance/detail/pairwise_distance_base.cuh @@ -328,7 +328,7 @@ __global__ __launch_bounds__(Policy::Nthreads, 2) FinalLambda fin_op) { extern __shared__ char smem[]; - auto rowEpilog = [] __device__(IdxT starty) { return; }; + auto rowEpilog = raft::void_op(); PairwiseDistances Date: Thu, 12 Jan 2023 17:14:05 +0100 Subject: [PATCH 07/60] Use an operator for L1 distance --- .../distance/detail/distance_operators.cuh | 51 ++++++++ cpp/include/raft/distance/detail/l1.cuh | 48 ++----- .../distance/detail/pairwise_distance_op.cuh | 118 ++++++++++++++++++ 3 files changed, 180 insertions(+), 37 deletions(-) create mode 100644 cpp/include/raft/distance/detail/distance_operators.cuh create mode 100644 cpp/include/raft/distance/detail/pairwise_distance_op.cuh diff --git a/cpp/include/raft/distance/detail/distance_operators.cuh b/cpp/include/raft/distance/detail/distance_operators.cuh new file mode 100644 index 0000000000..4abaeaaf8b --- /dev/null +++ b/cpp/include/raft/distance/detail/distance_operators.cuh @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2018-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#include + +namespace raft::distance::detail { + + +// Describes the computation the l1 distance +struct l1_distance_op { + // Whether norms of data should be loaded. + static constexpr bool use_norms = false; + + // Size of shared memory. This is normally decided by the kernel policy, but + // some ops such as correlation_distance_op use more. + template + constexpr size_t shared_mem_size() { + return Policy::SmemSize; + } + + template + DI void core(AccT & acc, DataT & x, DataT & y) const { + acc += raft::abs(x - y); + }; + + template + DI void epilog(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh], + DataT * regxn, + DataT * regyn, + IdxT gridStrideX, + IdxT gridStrideY) const { + return; + }; + +}; + +} // namespace raft::distance::detail diff --git a/cpp/include/raft/distance/detail/l1.cuh b/cpp/include/raft/distance/detail/l1.cuh index bf10651b60..8eee0ae220 100644 --- a/cpp/include/raft/distance/detail/l1.cuh +++ b/cpp/include/raft/distance/detail/l1.cuh @@ -16,11 +16,15 @@ #pragma once #include +#include +#include namespace raft { namespace distance { namespace detail { + + /** * @brief the L1 distance matrix calculation implementer * It computes the following equation: cij = op(ai-bj) @@ -69,45 +73,15 @@ static void l1Impl(const DataT* x, dim3 blk(KPolicy::Nthreads); - // Accumulation operation lambda - auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { - const auto diff = raft::abs(x - y); - acc += diff; - }; + l1_distance_op distance_op{}; - // epilogue operation lambda for final value 
calculation - auto epilog_lambda = raft::void_op(); + using PCT = params_CT; - if (isRowMajor) { - auto l1RowMajor = pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, l1RowMajor); - - l1RowMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); - } else { - auto l1ColMajor = pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, l1ColMajor); - l1ColMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); - } + auto kernel = pairwiseDistanceOpKernel; + dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, kernel); + + kernel<<>>( + x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, distance_op, fin_op); RAFT_CUDA_TRY(cudaGetLastError()); } diff --git a/cpp/include/raft/distance/detail/pairwise_distance_op.cuh b/cpp/include/raft/distance/detail/pairwise_distance_op.cuh new file mode 100644 index 0000000000..91c66a2217 --- /dev/null +++ b/cpp/include/raft/distance/detail/pairwise_distance_op.cuh @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once +#include +#include +#include +#include +#include +#include + +#include + +namespace raft::distance::detail { + + + +template +struct params_CT { + using DataT = data_type; + using AccT = accumulate_type; + using OutT = out_type; + using IdxT = index_type; + + using PolicyT = policy; + + using opT = op_type; + using FinOpT = final_op_type; + static constexpr bool is_row_major = row_major; +}; + +template +__global__ __launch_bounds__(PCT::PolicyT::Nthreads, 2) + + void pairwiseDistanceOpKernel( + const typename PCT::DataT* x, + const typename PCT::DataT* y, + const typename PCT::DataT* _xn, + const typename PCT::DataT* _yn, + typename PCT::IdxT m, + typename PCT::IdxT n, + typename PCT::IdxT k, + typename PCT::IdxT lda, + typename PCT::IdxT ldb, + typename PCT::IdxT ldd, + typename PCT::OutT* dOutput, + typename PCT::opT distance_op, + typename PCT::FinOpT fin_op) +{ + using AccT = typename PCT::AccT; + using DataT = typename PCT::DataT; + using OutT = typename PCT::OutT; + using IdxT = typename PCT::IdxT; + + using Policy = typename PCT::PolicyT; + + // Instantiate PCT to access constexpr members. + PCT compile_time_params{}; + + extern __shared__ char smem[]; + + // Wrap operator back into lambdas. This is temporary and should be removed. (TODO) + auto core_op = [distance_op] __device__(AccT & acc, DataT & x, DataT & y) { + // use .template to disambiguate (See: https://en.cppreference.com/w/cpp/language/dependent_name) + distance_op.template core(acc, x, y); + }; + auto epilog_op = [distance_op] __device__(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh], + DataT * regxn, + DataT * regyn, + IdxT gridStrideX, + IdxT gridStrideY) { + distance_op.template epilog(acc, regxn, regyn, gridStrideX, gridStrideY); + }; + + // No support for row_epilog_op. 
+ auto row_epilog_op = raft::void_op(); + // Always write output + constexpr bool write_out = true; + constexpr bool use_norms = distance_op.use_norms; + PairwiseDistances + obj( + x, y, m, n, k, lda, ldb, ldd, _xn, _yn, dOutput, smem, core_op, epilog_op, fin_op, row_epilog_op); + obj.run(); + +} +}; // namespace detail From 3e3478b05c2afb145a4dc5f22318b31d9c868848 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Thu, 12 Jan 2023 18:25:56 +0100 Subject: [PATCH 08/60] Add launch function This is more general than just for L1. Making use of it more is work in progress. --- cpp/include/raft/distance/detail/l1.cuh | 45 +++++++++++++++++++------ 1 file changed, 35 insertions(+), 10 deletions(-) diff --git a/cpp/include/raft/distance/detail/l1.cuh b/cpp/include/raft/distance/detail/l1.cuh index 8eee0ae220..2ad6895b27 100644 --- a/cpp/include/raft/distance/detail/l1.cuh +++ b/cpp/include/raft/distance/detail/l1.cuh @@ -24,6 +24,35 @@ namespace distance { namespace detail { +template +static void distance_matrix_launch( + typename PCT::opT distance_op, + typename PCT::FinOpT fin_op, + const typename PCT::DataT* x, + const typename PCT::DataT* y, + const typename PCT::DataT* _xn, + const typename PCT::DataT* _yn, + typename PCT::IdxT m, + typename PCT::IdxT n, + typename PCT::IdxT k, + typename PCT::IdxT lda, + typename PCT::IdxT ldb, + typename PCT::IdxT ldd, + typename PCT::OutT* dOutput, + cudaStream_t stream) +{ + using Policy = typename PCT::PolicyT; + + dim3 blk(Policy::Nthreads); + size_t smem_size = distance_op.template shared_mem_size(); + dim3 grid = launchConfigGenerator(m, n, smem_size, pairwiseDistanceOpKernel); + + pairwiseDistanceOpKernel<<>>( + x, y, _xn, _yn, m, n, k, lda, ldb, ldd, dOutput, distance_op, fin_op); + + RAFT_CUDA_TRY(cudaGetLastError()); + +} /** * @brief the L1 distance matrix calculation implementer @@ -68,22 +97,18 @@ static void l1Impl(const DataT* x, { typedef typename raft::linalg::Policy4x4::Policy RowPolicy; typedef typename 
raft::linalg::Policy4x4::ColPolicy ColPolicy; - typedef typename std::conditional::type KPolicy; - dim3 blk(KPolicy::Nthreads); - l1_distance_op distance_op{}; using PCT = params_CT; - auto kernel = pairwiseDistanceOpKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, kernel); - - kernel<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, distance_op, fin_op); - - RAFT_CUDA_TRY(cudaGetLastError()); + distance_matrix_launch( + distance_op, fin_op, // Operations + x, y, nullptr, nullptr, // Input data + m, n, k, lda, ldb, ldd, // Dimensions + dOutput, // Output data + stream); // CUDA stream } template Date: Fri, 13 Jan 2023 11:58:43 +0100 Subject: [PATCH 09/60] l1: Replace run-time -> compile-time dispatch --- cpp/include/raft/distance/detail/l1.cuh | 246 ++++++++---------- .../distance/detail/pairwise_distance_op.cuh | 163 +++++++++--- 2 files changed, 235 insertions(+), 174 deletions(-) diff --git a/cpp/include/raft/distance/detail/l1.cuh b/cpp/include/raft/distance/detail/l1.cuh index 2ad6895b27..7645421fbf 100644 --- a/cpp/include/raft/distance/detail/l1.cuh +++ b/cpp/include/raft/distance/detail/l1.cuh @@ -15,182 +15,156 @@ */ #pragma once +#include #include #include -#include namespace raft { namespace distance { namespace detail { - template -static void distance_matrix_launch( - typename PCT::opT distance_op, - typename PCT::FinOpT fin_op, - const typename PCT::DataT* x, - const typename PCT::DataT* y, - const typename PCT::DataT* _xn, - const typename PCT::DataT* _yn, - typename PCT::IdxT m, - typename PCT::IdxT n, - typename PCT::IdxT k, - typename PCT::IdxT lda, - typename PCT::IdxT ldb, - typename PCT::IdxT ldd, - typename PCT::OutT* dOutput, - cudaStream_t stream) +static void distance_matrix_launch(typename PCT::opT distance_op, + typename PCT::FinOpT fin_op, + const typename PCT::DataT* x, + const typename PCT::DataT* y, + const typename PCT::DataT* _xn, + const typename PCT::DataT* _yn, + typename PCT::IdxT m, + typename 
PCT::IdxT n, + typename PCT::IdxT k, + typename PCT::IdxT lda, + typename PCT::IdxT ldb, + typename PCT::IdxT ldd, + typename PCT::OutT* dOutput, + cudaStream_t stream) { using Policy = typename PCT::PolicyT; dim3 blk(Policy::Nthreads); size_t smem_size = distance_op.template shared_mem_size(); - dim3 grid = launchConfigGenerator(m, n, smem_size, pairwiseDistanceOpKernel); + dim3 grid = launchConfigGenerator(m, n, smem_size, pairwiseDistanceOpKernel); pairwiseDistanceOpKernel<<>>( x, y, _xn, _yn, m, n, k, lda, ldb, ldd, dOutput, distance_op, fin_op); RAFT_CUDA_TRY(cudaGetLastError()); - } -/** - * @brief the L1 distance matrix calculation implementer - * It computes the following equation: cij = op(ai-bj) - * @tparam DataT input data-type (for A and B matrices) - * @tparam AccT accumulation data-type - * @tparam OutT output data-type (for C and D matrices) - * @tparam IdxT index data-type - - * @tparam FinalLambda final lambda called on final distance value - * @tparam isRowMajor true if input/output is row major, - false for column major - * @param[in] x input matrix - * @param[in] y input matrix - * @param[in] m number of rows of A and C/D - * @param[in] n number of columns of B and C/D - * @param[in] k number of cols of A and rows of B - * @param[in] lda leading dimension of A - * @param[in] ldb leading dimension of B - * @param[in] ldd leading dimension of C/D - * @param[output] pD output matrix - * @param fin_op the final gemm epilogue lambda - */ -template -static void l1Impl(const DataT* x, - const DataT* y, - IdxT m, - IdxT n, - IdxT k, - IdxT lda, - IdxT ldb, - IdxT ldd, - OutT* dOutput, - FinalLambda fin_op, - cudaStream_t stream) +// Determine the largest number of elements that can be loaded in one +// instruction without causing misalignment errors. 
+template +int max_aligned_load(const DataT* x, const DataT* y, int ldx, int ldy) { - typedef typename raft::linalg::Policy4x4::Policy RowPolicy; - typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; - typedef typename std::conditional::type KPolicy; + auto base_x = reinterpret_cast(x); + auto base_y = reinterpret_cast(y); + size_t stride_X = sizeof(DataT) * ldx; // stride in bytes + size_t stride_Y = sizeof(DataT) * ldy; // stride in bytes - l1_distance_op distance_op{}; + bool base_16B_aligned = base_x % 16 == 0 && base_y % 16 == 0; + bool base_8B_aligned = base_x % 8 == 0 && base_y % 8 == 0; - using PCT = params_CT; + bool stride_16B_aligned = stride_X % 16 == 0 && stride_Y % 16 == 0; + bool stride_8B_aligned = stride_X % 8 == 0 && stride_Y % 8 == 0; - distance_matrix_launch( - distance_op, fin_op, // Operations - x, y, nullptr, nullptr, // Input data - m, n, k, lda, ldb, ldd, // Dimensions - dOutput, // Output data - stream); // CUDA stream + if (16 % sizeof(DataT) == 0 && base_16B_aligned && stride_16B_aligned) { + return 16 / sizeof(DataT); + } else if (8 % sizeof(DataT) == 0 && base_8B_aligned && stride_8B_aligned) { + return 8 / sizeof(DataT); + } else { + return 1; + } } -template -void l1(IdxT m, - IdxT n, - IdxT k, - IdxT lda, - IdxT ldb, - IdxT ldd, - const DataT* x, - const DataT* y, - OutT* dOutput, - FinalLambda fin_op, - cudaStream_t stream) + typename FinOpT, + typename IdxT = int> +void distance_matrix_dispatch(opT distance_op, + int m_, + int n_, + int k_, + const DataT* x_, + const DataT* y_, + OutT* out, + FinOpT fin_op, + cudaStream_t stream, + bool is_row_major) { - size_t bytesA = sizeof(DataT) * lda; - size_t bytesB = sizeof(DataT) * ldb; - if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { - l1Impl( - x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); - } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { - l1Impl( - x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); + // 
Determine leading dimensions and possibly flip order of passing x and y if + // column_major. + // + // ldx, ldy, and ld_out are the leading dimensions of x, y, and out + const DataT* x; + const DataT* y; + int ldx, ldy, ld_out; + int m, n, k; + if (is_row_major) { + // Pass x, y, m, n, k in order + x = x_, y = y_; + m = m_, n = n_, k = k_; + ldx = k_, ldy = k_, ld_out = n_; } else { - l1Impl( - x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); + // Flip x, y, and m, n, k. + x = y_, y = x_; + m = n_, n = m_, k = k_; + ldx = n_, ldy = m_, ld_out = m_; } + + int vectorized_load_num_elem = max_aligned_load(x, y, ldx, ldy); + + // We dispatch based on + // - vectorized_load_num_elem + // - is_row_major + + // Create run-time parameter struct that does the dispatching + using PRT = params_RT; + PRT run_time_params{vectorized_load_num_elem, is_row_major}; + + // Turn run-time parameters into compile-time parameters. + bool dispatch_success = run_time_params.dispatch_with_compile_time_params( + // We pass a lambda that receives the compile-time parameters and can use these + // to call the correct kernel. + [&](auto compile_time_params) { + // compile_time_params is an empty struct that we can convert back to a type + // using decltype. 
+ return distance_matrix_launch( + distance_op, + fin_op, + x, + y, + nullptr, + nullptr, // TODO: use _xn, _yn for non-l1 distances + m, + n, + k, + ldx, + ldy, + ld_out, + out, + stream); + }); } -/** - * @brief the L1 distance matrix calculation - * It computes the following equation: cij = op(ai-bj) - * @tparam InType input data-type (for A and B matrices) - * @tparam AccType accumulation data-type - * @tparam OutType output data-type (for C and D matrices) - * @tparam FinalLambda user-defined epilogue lamba - * @tparam Index_ Index type - * @param m number of rows of A and C/D - * @param n number of columns of B and C/D - * @param k number of cols of A and rows of B - * @param pA input matrix - * @param pB input matrix - * @param pD output matrix - * @param fin_op the final element-wise epilogue lambda - * @param stream cuda stream where to launch work - * @param isRowMajor whether the input and output matrices are row major - */ -template +template void l1Impl(int m, int n, int k, - const InType* pA, - const InType* pB, - OutType* pD, - FinalLambda fin_op, + const DataT* x, + const DataT* y, + OutT* out, + FinOpT fin_op, cudaStream_t stream, - bool isRowMajor) + bool is_row_major) { - typedef std::is_same is_bool; - typedef typename std::conditional::type L1OutType; - Index_ lda, ldb, ldd; - L1OutType* pDcast = reinterpret_cast(pD); - if (isRowMajor) { - lda = k, ldb = k, ldd = n; - l1( - m, n, k, lda, ldb, ldd, pA, pB, pDcast, fin_op, stream); + l1_distance_op distance_op{}; - } else { - lda = n, ldb = m, ldd = m; - l1( - n, m, k, lda, ldb, ldd, pB, pA, pDcast, fin_op, stream); - } + distance_matrix_dispatch( + distance_op, m, n, k, x, y, out, fin_op, stream, is_row_major); } + } // namespace detail } // namespace distance } // namespace raft diff --git a/cpp/include/raft/distance/detail/pairwise_distance_op.cuh b/cpp/include/raft/distance/detail/pairwise_distance_op.cuh index 91c66a2217..2b776a378f 100644 --- 
a/cpp/include/raft/distance/detail/pairwise_distance_op.cuh +++ b/cpp/include/raft/distance/detail/pairwise_distance_op.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -25,8 +25,6 @@ namespace raft::distance::detail { - - template struct params_CT { using DataT = data_type; - using AccT = accumulate_type; - using OutT = out_type; - using IdxT = index_type; - + using AccT = accumulate_type; + using OutT = out_type; + using IdxT = index_type; using PolicyT = policy; - - using opT = op_type; - using FinOpT = final_op_type; + using opT = op_type; + using FinOpT = final_op_type; static constexpr bool is_row_major = row_major; }; +template +struct params_RT { + int vectorized_load_num_elem = 1; + bool row_major = true; + + // Turn run-time parameters into compile-time parameters. + // Call the provided function f with these compile-time parameters. + // Returns false if dispatch fails, i.e., if there is no implementation + // for the given runtime parameters. + template + bool dispatch_with_compile_time_params(F&& f) const + { + return convert_vectorized_load_num_elem(f); + } + + // Step 1: convert alignment into a compile time constant + template + bool convert_vectorized_load_num_elem(F&& f) const + { + bool fail = false; + switch (vectorized_load_num_elem) { + case 1: return layout<1>(f); + case 2: return layout<2>(f); + case 4: + // We need "if constexpr" here, to prevent the if else to be delegated + // to run time (in which case a kernel that loads 4 doubles is + // generated). This is especially important, because that leads to + // compilation errors (which we want to avoid). + if constexpr (sizeof(DataT) < 8) { + return layout<4>(f); + } else { + // For doubles, load at most 2 elements in one instruction. 
+ return layout<2>(f); + } + default: return fail; + }; + } + + // Step 2: convert layout into a compile time constant + template + bool layout(F&& f) const + { + if (row_major) { + return to_compile_time_params(f); + } else { + return to_compile_time_params(f); + } + } + + // Step 3: convert compile-time constant into compile-time parameter struct and invoke + // function f with these compile time parameters. + template + bool to_compile_time_params(F&& f) const + { + // Determine kernel policy using vec_len and layout + typedef typename raft::linalg::Policy4x4::Policy RowPolicy; + typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; + typedef typename std::conditional::type Policy; + + // Create compile-time parameter type and instantiate a struct; + using PCT = params_CT; + PCT compile_time_params{}; + + // Dispatch to f + f(compile_time_params); + + bool dispatch_success = true; + return dispatch_success; + } +}; + template __global__ __launch_bounds__(PCT::PolicyT::Nthreads, 2) - void pairwiseDistanceOpKernel( - const typename PCT::DataT* x, - const typename PCT::DataT* y, - const typename PCT::DataT* _xn, - const typename PCT::DataT* _yn, - typename PCT::IdxT m, - typename PCT::IdxT n, - typename PCT::IdxT k, - typename PCT::IdxT lda, - typename PCT::IdxT ldb, - typename PCT::IdxT ldd, - typename PCT::OutT* dOutput, - typename PCT::opT distance_op, - typename PCT::FinOpT fin_op) + void pairwiseDistanceOpKernel(const typename PCT::DataT* x, + const typename PCT::DataT* y, + const typename PCT::DataT* _xn, + const typename PCT::DataT* _yn, + typename PCT::IdxT m, + typename PCT::IdxT n, + typename PCT::IdxT k, + typename PCT::IdxT lda, + typename PCT::IdxT ldb, + typename PCT::IdxT ldd, + typename PCT::OutT* dOutput, + typename PCT::opT distance_op, + typename PCT::FinOpT fin_op) { - using AccT = typename PCT::AccT; + using AccT = typename PCT::AccT; using DataT = typename PCT::DataT; - using OutT = typename PCT::OutT; - using IdxT = typename 
PCT::IdxT; + using OutT = typename PCT::OutT; + using IdxT = typename PCT::IdxT; using Policy = typename PCT::PolicyT; - // Instantiate PCT to access constexpr members. + // Instantiate compile time parameters to access constexpr members. PCT compile_time_params{}; extern __shared__ char smem[]; // Wrap operator back into lambdas. This is temporary and should be removed. (TODO) auto core_op = [distance_op] __device__(AccT & acc, DataT & x, DataT & y) { - // use .template to disambiguate (See: https://en.cppreference.com/w/cpp/language/dependent_name) + // use .template to disambiguate (See: + // https://en.cppreference.com/w/cpp/language/dependent_name) distance_op.template core(acc, x, y); }; auto epilog_op = [distance_op] __device__(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh], - DataT * regxn, - DataT * regyn, - IdxT gridStrideX, - IdxT gridStrideY) { - distance_op.template epilog(acc, regxn, regyn, gridStrideX, gridStrideY); + DataT * regxn, + DataT * regyn, + IdxT gridStrideX, + IdxT gridStrideY) { + distance_op.template epilog( + acc, regxn, regyn, gridStrideX, gridStrideY); }; // No support for row_epilog_op. 
@@ -110,9 +183,23 @@ __global__ __launch_bounds__(PCT::PolicyT::Nthreads, 2) decltype(row_epilog_op), compile_time_params.is_row_major, write_out> - obj( - x, y, m, n, k, lda, ldb, ldd, _xn, _yn, dOutput, smem, core_op, epilog_op, fin_op, row_epilog_op); + obj(x, + y, + m, + n, + k, + lda, + ldb, + ldd, + _xn, + _yn, + dOutput, + smem, + core_op, + epilog_op, + fin_op, + row_epilog_op); obj.run(); - } -}; // namespace detail + +}; // namespace raft::distance::detail From b23205707497252512d8007d6c434c096977b89e Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Fri, 13 Jan 2023 13:18:34 +0100 Subject: [PATCH 10/60] pairwise matrix: move files into subdirectories --- .../l1.cuh} | 24 +-- cpp/include/raft/distance/detail/l1.cuh | 134 +------------ .../dispatch.cuh} | 186 ++++++++++-------- .../detail/pairwise_matrix/kernel_sm60.cuh | 134 +++++++++++++ 4 files changed, 251 insertions(+), 227 deletions(-) rename cpp/include/raft/distance/detail/{distance_operators.cuh => distance_ops/l1.cuh} (73%) rename cpp/include/raft/distance/detail/{pairwise_distance_op.cuh => pairwise_matrix/dispatch.cuh} (52%) create mode 100644 cpp/include/raft/distance/detail/pairwise_matrix/kernel_sm60.cuh diff --git a/cpp/include/raft/distance/detail/distance_operators.cuh b/cpp/include/raft/distance/detail/distance_ops/l1.cuh similarity index 73% rename from cpp/include/raft/distance/detail/distance_operators.cuh rename to cpp/include/raft/distance/detail/distance_ops/l1.cuh index 4abaeaaf8b..08ca313fe2 100644 --- a/cpp/include/raft/distance/detail/distance_operators.cuh +++ b/cpp/include/raft/distance/detail/distance_ops/l1.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2023, NVIDIA CORPORATION. + * Copyright (c) 2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -15,37 +15,37 @@ */ #pragma once -#include - -namespace raft::distance::detail { +namespace raft::distance::detail::ops { // Describes the computation the l1 distance struct l1_distance_op { - // Whether norms of data should be loaded. + // Do not load norms of data, the computation of L1 distance does not use them. static constexpr bool use_norms = false; // Size of shared memory. This is normally decided by the kernel policy, but // some ops such as correlation_distance_op use more. template - constexpr size_t shared_mem_size() { + constexpr size_t shared_mem_size() + { return Policy::SmemSize; } template - DI void core(AccT & acc, DataT & x, DataT & y) const { + DI void core(AccT& acc, DataT& x, DataT& y) const + { acc += raft::abs(x - y); }; template DI void epilog(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh], - DataT * regxn, - DataT * regyn, + DataT* regxn, + DataT* regyn, IdxT gridStrideX, - IdxT gridStrideY) const { + IdxT gridStrideY) const + { return; }; - }; -} // namespace raft::distance::detail +} // namespace raft::distance::detail::ops diff --git a/cpp/include/raft/distance/detail/l1.cuh b/cpp/include/raft/distance/detail/l1.cuh index 7645421fbf..a5f279d9a4 100644 --- a/cpp/include/raft/distance/detail/l1.cuh +++ b/cpp/include/raft/distance/detail/l1.cuh @@ -15,139 +15,13 @@ */ #pragma once -#include -#include -#include +#include "distance_ops/l1.cuh" +#include "pairwise_matrix/dispatch.cuh" namespace raft { namespace distance { namespace detail { -template -static void distance_matrix_launch(typename PCT::opT distance_op, - typename PCT::FinOpT fin_op, - const typename PCT::DataT* x, - const typename PCT::DataT* y, - const typename PCT::DataT* _xn, - const typename PCT::DataT* _yn, - typename PCT::IdxT m, - typename PCT::IdxT n, - typename PCT::IdxT k, - typename PCT::IdxT lda, - typename PCT::IdxT ldb, - typename PCT::IdxT ldd, - typename PCT::OutT* dOutput, - cudaStream_t stream) -{ - using Policy = typename PCT::PolicyT; - - dim3 
blk(Policy::Nthreads); - size_t smem_size = distance_op.template shared_mem_size(); - dim3 grid = launchConfigGenerator(m, n, smem_size, pairwiseDistanceOpKernel); - - pairwiseDistanceOpKernel<<>>( - x, y, _xn, _yn, m, n, k, lda, ldb, ldd, dOutput, distance_op, fin_op); - - RAFT_CUDA_TRY(cudaGetLastError()); -} - -// Determine the largest number of elements that can be loaded in one -// instruction without causing misalignment errors. -template -int max_aligned_load(const DataT* x, const DataT* y, int ldx, int ldy) -{ - auto base_x = reinterpret_cast(x); - auto base_y = reinterpret_cast(y); - size_t stride_X = sizeof(DataT) * ldx; // stride in bytes - size_t stride_Y = sizeof(DataT) * ldy; // stride in bytes - - bool base_16B_aligned = base_x % 16 == 0 && base_y % 16 == 0; - bool base_8B_aligned = base_x % 8 == 0 && base_y % 8 == 0; - - bool stride_16B_aligned = stride_X % 16 == 0 && stride_Y % 16 == 0; - bool stride_8B_aligned = stride_X % 8 == 0 && stride_Y % 8 == 0; - - if (16 % sizeof(DataT) == 0 && base_16B_aligned && stride_16B_aligned) { - return 16 / sizeof(DataT); - } else if (8 % sizeof(DataT) == 0 && base_8B_aligned && stride_8B_aligned) { - return 8 / sizeof(DataT); - } else { - return 1; - } -} - -template -void distance_matrix_dispatch(opT distance_op, - int m_, - int n_, - int k_, - const DataT* x_, - const DataT* y_, - OutT* out, - FinOpT fin_op, - cudaStream_t stream, - bool is_row_major) -{ - // Determine leading dimensions and possibly flip order of passing x and y if - // column_major. - // - // ldx, ldy, and ld_out are the leading dimensions of x, y, and out - const DataT* x; - const DataT* y; - int ldx, ldy, ld_out; - int m, n, k; - if (is_row_major) { - // Pass x, y, m, n, k in order - x = x_, y = y_; - m = m_, n = n_, k = k_; - ldx = k_, ldy = k_, ld_out = n_; - } else { - // Flip x, y, and m, n, k. 
- x = y_, y = x_; - m = n_, n = m_, k = k_; - ldx = n_, ldy = m_, ld_out = m_; - } - - int vectorized_load_num_elem = max_aligned_load(x, y, ldx, ldy); - - // We dispatch based on - // - vectorized_load_num_elem - // - is_row_major - - // Create run-time parameter struct that does the dispatching - using PRT = params_RT; - PRT run_time_params{vectorized_load_num_elem, is_row_major}; - - // Turn run-time parameters into compile-time parameters. - bool dispatch_success = run_time_params.dispatch_with_compile_time_params( - // We pass a lambda that receives the compile-time parameters and can use these - // to call the correct kernel. - [&](auto compile_time_params) { - // compile_time_params is an empty struct that we can convert back to a type - // using decltype. - return distance_matrix_launch( - distance_op, - fin_op, - x, - y, - nullptr, - nullptr, // TODO: use _xn, _yn for non-l1 distances - m, - n, - k, - ldx, - ldy, - ld_out, - out, - stream); - }); -} - template void l1Impl(int m, int n, @@ -159,9 +33,9 @@ void l1Impl(int m, cudaStream_t stream, bool is_row_major) { - l1_distance_op distance_op{}; + ops::l1_distance_op distance_op{}; - distance_matrix_dispatch( + distance_matrix_dispatch( distance_op, m, n, k, x, y, out, fin_op, stream, is_row_major); } diff --git a/cpp/include/raft/distance/detail/pairwise_distance_op.cuh b/cpp/include/raft/distance/detail/pairwise_matrix/dispatch.cuh similarity index 52% rename from cpp/include/raft/distance/detail/pairwise_distance_op.cuh rename to cpp/include/raft/distance/detail/pairwise_matrix/dispatch.cuh index 2b776a378f..d2c8dfe660 100644 --- a/cpp/include/raft/distance/detail/pairwise_distance_op.cuh +++ b/cpp/include/raft/distance/detail/pairwise_matrix/dispatch.cuh @@ -14,14 +14,9 @@ * limitations under the License. 
*/ #pragma once -#include -#include -#include -#include -#include -#include -#include +#include +#include "kernel_sm60.cuh" namespace raft::distance::detail { @@ -36,11 +31,11 @@ template struct params_CT { - using DataT = data_type; - using AccT = accumulate_type; - using OutT = out_type; - using IdxT = index_type; - using PolicyT = policy; + using DataT = data_type; + using AccT = accumulate_type; + using OutT = out_type; + using IdxT = index_type; + using PolicyT = policy; using opT = op_type; using FinOpT = final_op_type; static constexpr bool is_row_major = row_major; @@ -122,84 +117,105 @@ struct params_RT { } }; -template -__global__ __launch_bounds__(PCT::PolicyT::Nthreads, 2) - - void pairwiseDistanceOpKernel(const typename PCT::DataT* x, - const typename PCT::DataT* y, - const typename PCT::DataT* _xn, - const typename PCT::DataT* _yn, - typename PCT::IdxT m, - typename PCT::IdxT n, - typename PCT::IdxT k, - typename PCT::IdxT lda, - typename PCT::IdxT ldb, - typename PCT::IdxT ldd, - typename PCT::OutT* dOutput, - typename PCT::opT distance_op, - typename PCT::FinOpT fin_op) +// Determine the largest number of elements that can be loaded in one +// instruction without causing misalignment errors. +template +int max_aligned_load(const DataT* x, const DataT* y, int ldx, int ldy) { - using AccT = typename PCT::AccT; - using DataT = typename PCT::DataT; - using OutT = typename PCT::OutT; - using IdxT = typename PCT::IdxT; - - using Policy = typename PCT::PolicyT; - - // Instantiate compile time parameters to access constexpr members. - PCT compile_time_params{}; - - extern __shared__ char smem[]; - - // Wrap operator back into lambdas. This is temporary and should be removed. 
(TODO) - auto core_op = [distance_op] __device__(AccT & acc, DataT & x, DataT & y) { - // use .template to disambiguate (See: - // https://en.cppreference.com/w/cpp/language/dependent_name) - distance_op.template core(acc, x, y); - }; - auto epilog_op = [distance_op] __device__(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh], - DataT * regxn, - DataT * regyn, - IdxT gridStrideX, - IdxT gridStrideY) { - distance_op.template epilog( - acc, regxn, regyn, gridStrideX, gridStrideY); - }; - - // No support for row_epilog_op. - auto row_epilog_op = raft::void_op(); - // Always write output - constexpr bool write_out = true; - constexpr bool use_norms = distance_op.use_norms; - PairwiseDistances - obj(x, + auto base_x = reinterpret_cast(x); + auto base_y = reinterpret_cast(y); + size_t stride_X = sizeof(DataT) * ldx; // stride in bytes + size_t stride_Y = sizeof(DataT) * ldy; // stride in bytes + + bool base_16B_aligned = base_x % 16 == 0 && base_y % 16 == 0; + bool base_8B_aligned = base_x % 8 == 0 && base_y % 8 == 0; + + bool stride_16B_aligned = stride_X % 16 == 0 && stride_Y % 16 == 0; + bool stride_8B_aligned = stride_X % 8 == 0 && stride_Y % 8 == 0; + + if (16 % sizeof(DataT) == 0 && base_16B_aligned && stride_16B_aligned) { + return 16 / sizeof(DataT); + } else if (8 % sizeof(DataT) == 0 && base_8B_aligned && stride_8B_aligned) { + return 8 / sizeof(DataT); + } else { + return 1; + } +} + +template +void distance_matrix_dispatch(opT distance_op, + int m_, + int n_, + int k_, + const DataT* x_, + const DataT* y_, + OutT* out, + FinOpT fin_op, + cudaStream_t stream, + bool is_row_major) +{ + // Determine leading dimensions and possibly flip order of passing x and y if + // column_major. 
+ // + // ldx, ldy, and ld_out are the leading dimensions of x, y, and out + const DataT* x; + const DataT* y; + int ldx, ldy, ld_out; + int m, n, k; + if (is_row_major) { + // Pass x, y, m, n, k in order + x = x_, y = y_; + m = m_, n = n_, k = k_; + ldx = k_, ldy = k_, ld_out = n_; + } else { + // Flip x, y, and m, n, k. + x = y_, y = x_; + m = n_, n = m_, k = k_; + ldx = n_, ldy = m_, ld_out = m_; + } + + int vectorized_load_num_elem = max_aligned_load(x, y, ldx, ldy); + + // We dispatch based on + // - vectorized_load_num_elem + // - is_row_major + + // Create run-time parameter struct that does the dispatching + using PRT = params_RT; + PRT run_time_params{vectorized_load_num_elem, is_row_major}; + + // Turn run-time parameters into compile-time parameters. + bool dispatch_success = run_time_params.dispatch_with_compile_time_params( + // We pass a lambda that receives the compile-time parameters and can use these + // to call the correct kernel. + [&](auto compile_time_params) { + // compile_time_params is an empty struct that we can convert back to a type + // using decltype. + return pairwise_matrix( + distance_op, + fin_op, + x, y, + nullptr, + nullptr, // TODO: use _xn, _yn for non-l1 distances m, n, k, - lda, - ldb, - ldd, - _xn, - _yn, - dOutput, - smem, - core_op, - epilog_op, - fin_op, - row_epilog_op); - obj.run(); + ldx, + ldy, + ld_out, + out, + stream); + }); + + if (!dispatch_success) { + // TODO + } } }; // namespace raft::distance::detail diff --git a/cpp/include/raft/distance/detail/pairwise_matrix/kernel_sm60.cuh b/cpp/include/raft/distance/detail/pairwise_matrix/kernel_sm60.cuh new file mode 100644 index 0000000000..ec50f6cbbf --- /dev/null +++ b/cpp/include/raft/distance/detail/pairwise_matrix/kernel_sm60.cuh @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include + +#include + +namespace raft::distance::detail { + +template +__global__ __launch_bounds__(PCT::PolicyT::Nthreads, 2) + + void pairwise_matrix_kernel(const typename PCT::DataT* x, + const typename PCT::DataT* y, + const typename PCT::DataT* _xn, + const typename PCT::DataT* _yn, + typename PCT::IdxT m, + typename PCT::IdxT n, + typename PCT::IdxT k, + typename PCT::IdxT lda, + typename PCT::IdxT ldb, + typename PCT::IdxT ldd, + typename PCT::OutT* dOutput, + typename PCT::opT distance_op, + typename PCT::FinOpT fin_op) +{ + using AccT = typename PCT::AccT; + using DataT = typename PCT::DataT; + using OutT = typename PCT::OutT; + using IdxT = typename PCT::IdxT; + + using Policy = typename PCT::PolicyT; + + // Instantiate compile time parameters to access constexpr members. + PCT compile_time_params{}; + + extern __shared__ char smem[]; + + // Wrap operator back into lambdas. This is temporary and should be removed. (TODO) + auto core_op = [distance_op] __device__(AccT & acc, DataT & x, DataT & y) { + // use .template to disambiguate (See: + // https://en.cppreference.com/w/cpp/language/dependent_name) + distance_op.template core(acc, x, y); + }; + auto epilog_op = [distance_op] __device__(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh], + DataT * regxn, + DataT * regyn, + IdxT gridStrideX, + IdxT gridStrideY) { + distance_op.template epilog( + acc, regxn, regyn, gridStrideX, gridStrideY); + }; + + // No support for row_epilog_op. 
+ auto row_epilog_op = raft::void_op(); + // Always write output + constexpr bool write_out = true; + constexpr bool use_norms = distance_op.use_norms; + PairwiseDistances + obj(x, + y, + m, + n, + k, + lda, + ldb, + ldd, + _xn, + _yn, + dOutput, + smem, + core_op, + epilog_op, + fin_op, + row_epilog_op); + obj.run(); +} + +template +static void pairwise_matrix(typename PCT::opT distance_op, + typename PCT::FinOpT fin_op, + const typename PCT::DataT* x, + const typename PCT::DataT* y, + const typename PCT::DataT* _xn, + const typename PCT::DataT* _yn, + typename PCT::IdxT m, + typename PCT::IdxT n, + typename PCT::IdxT k, + typename PCT::IdxT lda, + typename PCT::IdxT ldb, + typename PCT::IdxT ldd, + typename PCT::OutT* dOutput, + cudaStream_t stream) +{ + using Policy = typename PCT::PolicyT; + + dim3 blk(Policy::Nthreads); + size_t smem_size = distance_op.template shared_mem_size(); + dim3 grid = launchConfigGenerator(m, n, smem_size, pairwise_matrix_kernel); + + pairwise_matrix_kernel<<>>( + x, y, _xn, _yn, m, n, k, lda, ldb, ldd, dOutput, distance_op, fin_op); + + RAFT_CUDA_TRY(cudaGetLastError()); +} + +}; // namespace raft::distance::detail From 06f6ffa26613e492ac996be7a176eaec35c16fd0 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Fri, 13 Jan 2023 13:56:02 +0100 Subject: [PATCH 11/60] pairwise matrix: Untangle dispatching and kernel template parameters By adding yet another struct ^^ --- .../detail/pairwise_matrix/dispatch.cuh | 95 +++++++--------- .../detail/pairwise_matrix/kernel_sm60.cuh | 101 +++++++++++------- 2 files changed, 99 insertions(+), 97 deletions(-) diff --git a/cpp/include/raft/distance/detail/pairwise_matrix/dispatch.cuh b/cpp/include/raft/distance/detail/pairwise_matrix/dispatch.cuh index d2c8dfe660..0e056405a1 100644 --- a/cpp/include/raft/distance/detail/pairwise_matrix/dispatch.cuh +++ b/cpp/include/raft/distance/detail/pairwise_matrix/dispatch.cuh @@ -20,37 +20,17 @@ namespace raft::distance::detail { -template -struct 
params_CT { - using DataT = data_type; - using AccT = accumulate_type; - using OutT = out_type; - using IdxT = index_type; - using PolicyT = policy; - using opT = op_type; - using FinOpT = final_op_type; - static constexpr bool is_row_major = row_major; -}; - -template -struct params_RT { +template +struct params_dispatch { int vectorized_load_num_elem = 1; bool row_major = true; + template + struct params_constexpr { + static constexpr int vec_len = vl; + static constexpr bool is_row_major = rm; + }; + // Turn run-time parameters into compile-time parameters. // Call the provided function f with these compile-time parameters. // Returns false if dispatch fails, i.e., if there is no implementation @@ -69,17 +49,7 @@ struct params_RT { switch (vectorized_load_num_elem) { case 1: return layout<1>(f); case 2: return layout<2>(f); - case 4: - // We need "if constexpr" here, to prevent the if else to be delegated - // to run time (in which case a kernel that loads 4 doubles is - // generated). This is especially important, because that leads to - // compilation errors (which we want to avoid). - if constexpr (sizeof(DataT) < 8) { - return layout<4>(f); - } else { - // For doubles, load at most 2 elements in one instruction. 
- return layout<2>(f); - } + case 4: return layout<4>(f); default: return fail; }; } @@ -100,14 +70,9 @@ struct params_RT { template bool to_compile_time_params(F&& f) const { - // Determine kernel policy using vec_len and layout - typedef typename raft::linalg::Policy4x4::Policy RowPolicy; - typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; - typedef typename std::conditional::type Policy; - // Create compile-time parameter type and instantiate a struct; - using PCT = params_CT; - PCT compile_time_params{}; + using ct_params_T = params_constexpr; + ct_params_T compile_time_params{}; // Dispatch to f f(compile_time_params); @@ -181,22 +146,38 @@ void distance_matrix_dispatch(opT distance_op, int vectorized_load_num_elem = max_aligned_load(x, y, ldx, ldy); - // We dispatch based on - // - vectorized_load_num_elem - // - is_row_major - - // Create run-time parameter struct that does the dispatching - using PRT = params_RT; - PRT run_time_params{vectorized_load_num_elem, is_row_major}; + // Create run-time parameter struct that does the dispatching. + // + // In addition to the template parameters of this function (IdxT, DataT, + // etc..), we explicitly dispatch based on: + params_dispatch run_time_params{ + vectorized_load_num_elem, // 1. num array elements per load instruction + is_row_major // 2. the layout x, y, and out + }; // Turn run-time parameters into compile-time parameters. bool dispatch_success = run_time_params.dispatch_with_compile_time_params( // We pass a lambda that receives the compile-time parameters and can use these // to call the correct kernel. - [&](auto compile_time_params) { - // compile_time_params is an empty struct that we can convert back to a type - // using decltype. - return pairwise_matrix( + [&](auto p) { + // p has two constexpr members: + // - vec_len + // - is_row_major + + // There is no instruction to load 4 doubles, so we catch this situation + // and load 2 doubles. 
+ constexpr bool load_4_doubles = sizeof(DataT) > 4 && p.vec_len == 4; + constexpr int vec_len = (load_4_doubles) ? 2 : p.vec_len; + + // Determine kernel policy using vec_len and layout + typedef typename raft::linalg::Policy4x4::Policy RowPolicy; + typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; + typedef typename std::conditional::type Policy; + + // Create compile-time template parameter + using KP_T = kernel_params_T; + + return pairwise_matrix( distance_op, fin_op, x, diff --git a/cpp/include/raft/distance/detail/pairwise_matrix/kernel_sm60.cuh b/cpp/include/raft/distance/detail/pairwise_matrix/kernel_sm60.cuh index ec50f6cbbf..fa30ff2c3e 100644 --- a/cpp/include/raft/distance/detail/pairwise_matrix/kernel_sm60.cuh +++ b/cpp/include/raft/distance/detail/pairwise_matrix/kernel_sm60.cuh @@ -17,38 +17,59 @@ #include #include -#include +#include // TODO: remove #include namespace raft::distance::detail { -template -__global__ __launch_bounds__(PCT::PolicyT::Nthreads, 2) - - void pairwise_matrix_kernel(const typename PCT::DataT* x, - const typename PCT::DataT* y, - const typename PCT::DataT* _xn, - const typename PCT::DataT* _yn, - typename PCT::IdxT m, - typename PCT::IdxT n, - typename PCT::IdxT k, - typename PCT::IdxT lda, - typename PCT::IdxT ldb, - typename PCT::IdxT ldd, - typename PCT::OutT* dOutput, - typename PCT::opT distance_op, - typename PCT::FinOpT fin_op) +template +struct kernel_params_T { + using DataT = data_type; + using AccT = accumulate_type; + using OutT = out_type; + using IdxT = index_type; + using PolicyT = policy; + using opT = op_type; + using FinOpT = final_op_type; + static constexpr bool is_row_major = row_major; +}; + +template +__global__ __launch_bounds__(KP_T::PolicyT::Nthreads, 2) + + void pairwise_matrix_kernel(const typename KP_T::DataT* x, + const typename KP_T::DataT* y, + const typename KP_T::DataT* _xn, + const typename KP_T::DataT* _yn, + typename KP_T::IdxT m, + typename KP_T::IdxT n, + typename KP_T::IdxT 
k, + typename KP_T::IdxT lda, + typename KP_T::IdxT ldb, + typename KP_T::IdxT ldd, + typename KP_T::OutT* dOutput, + typename KP_T::opT distance_op, + typename KP_T::FinOpT fin_op) { - using AccT = typename PCT::AccT; - using DataT = typename PCT::DataT; - using OutT = typename PCT::OutT; - using IdxT = typename PCT::IdxT; + using AccT = typename KP_T::AccT; + using DataT = typename KP_T::DataT; + using OutT = typename KP_T::OutT; + using IdxT = typename KP_T::IdxT; - using Policy = typename PCT::PolicyT; + using Policy = typename KP_T::PolicyT; // Instantiate compile time parameters to access constexpr members. - PCT compile_time_params{}; + KP_T compile_time_params{}; extern __shared__ char smem[]; @@ -103,29 +124,29 @@ __global__ __launch_bounds__(PCT::PolicyT::Nthreads, 2) obj.run(); } -template -static void pairwise_matrix(typename PCT::opT distance_op, - typename PCT::FinOpT fin_op, - const typename PCT::DataT* x, - const typename PCT::DataT* y, - const typename PCT::DataT* _xn, - const typename PCT::DataT* _yn, - typename PCT::IdxT m, - typename PCT::IdxT n, - typename PCT::IdxT k, - typename PCT::IdxT lda, - typename PCT::IdxT ldb, - typename PCT::IdxT ldd, - typename PCT::OutT* dOutput, +template +static void pairwise_matrix(typename KP_T::opT distance_op, + typename KP_T::FinOpT fin_op, + const typename KP_T::DataT* x, + const typename KP_T::DataT* y, + const typename KP_T::DataT* _xn, + const typename KP_T::DataT* _yn, + typename KP_T::IdxT m, + typename KP_T::IdxT n, + typename KP_T::IdxT k, + typename KP_T::IdxT lda, + typename KP_T::IdxT ldb, + typename KP_T::IdxT ldd, + typename KP_T::OutT* dOutput, cudaStream_t stream) { - using Policy = typename PCT::PolicyT; + using Policy = typename KP_T::PolicyT; dim3 blk(Policy::Nthreads); size_t smem_size = distance_op.template shared_mem_size(); - dim3 grid = launchConfigGenerator(m, n, smem_size, pairwise_matrix_kernel); + dim3 grid = launchConfigGenerator(m, n, smem_size, pairwise_matrix_kernel); - 
pairwise_matrix_kernel<<>>( + pairwise_matrix_kernel<<>>( x, y, _xn, _yn, m, n, k, lda, ldb, ldd, dOutput, distance_op, fin_op); RAFT_CUDA_TRY(cudaGetLastError()); From 2f41faa419e4bf08bb0ff68587bcaf3bca385c20 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Fri, 13 Jan 2023 15:00:06 +0100 Subject: [PATCH 12/60] l2 unexp: Use pairwise matrix dispatch --- .../distance/detail/distance_ops/l2_unexp.cuh | 68 +++++++ .../raft/distance/detail/euclidean.cuh | 186 +++--------------- 2 files changed, 91 insertions(+), 163 deletions(-) create mode 100644 cpp/include/raft/distance/detail/distance_ops/l2_unexp.cuh diff --git a/cpp/include/raft/distance/detail/distance_ops/l2_unexp.cuh b/cpp/include/raft/distance/detail/distance_ops/l2_unexp.cuh new file mode 100644 index 0000000000..99fda59f03 --- /dev/null +++ b/cpp/include/raft/distance/detail/distance_ops/l2_unexp.cuh @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +namespace raft::distance::detail::ops { + +// Describes the computation the l2 unexpanded distance + +template +struct l2_unexp_generic_distance_op { + // Do not load norms of data, the computation of L1 distance does not use them. + static constexpr bool use_norms = false; + + // Size of shared memory. This is normally decided by the kernel policy, but + // some ops such as correlation_distance_op use more. 
+ template + constexpr size_t shared_mem_size() + { + return Policy::SmemSize; + } + + template + DI void core(AccT& acc, DataT& x, DataT& y) const + { + const auto diff = x - y; + acc += diff * diff; + }; + + template + DI void epilog(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh], + DataT* regxn, + DataT* regyn, + IdxT gridStrideX, + IdxT gridStrideY) const + { + if constexpr (sqrt) { +#pragma unroll + for (int i = 0; i < Policy::AccRowsPerTh; ++i) { +#pragma unroll + for (int j = 0; j < Policy::AccColsPerTh; ++j) { + acc[i][j] = raft::sqrt(acc[i][j]); + } + } + } + }; +}; + + +// Define distance ops with and without square root computation. +using l2_unexp_distance_op = l2_unexp_generic_distance_op; +using l2_unexp_sqrt_distance_op = l2_unexp_generic_distance_op; + + +} // namespace raft::distance::detail::ops diff --git a/cpp/include/raft/distance/detail/euclidean.cuh b/cpp/include/raft/distance/detail/euclidean.cuh index 1a2db63f5c..8ed1e9d615 100644 --- a/cpp/include/raft/distance/detail/euclidean.cuh +++ b/cpp/include/raft/distance/detail/euclidean.cuh @@ -21,6 +21,10 @@ #include #include +#include "distance_ops/l2_unexp.cuh" +#include "pairwise_matrix/dispatch.cuh" + + namespace raft { namespace distance { namespace detail { @@ -285,145 +289,6 @@ void euclideanAlgo1(Index_ m, } } -/** - * @brief the unexpanded euclidean distance matrix calculation - * It computes the following equation: cij = op((ai-bj)^2) - * @tparam DataT input data-type (for A and B matrices) - * @tparam AccT accumulation data-type - * @tparam OutT output data-type (for C and D matrices) - * @tparam IdxT index data-type - * @tparam FinalLambda final lambda called on final distance value - * - * @param[in] x input matrix - * @param[in] y input matrix - * @param[in] m number of rows of A and C/D - * @param[in] n number of columns of B and C/D - * @param[in] k number of cols of A and rows of B - * @param[in] lda leading dimension of A - * @param[in] ldb leading dimension of B - * 
@param[in] ldd leading dimension of C/D - * @param[in] sqrt if the square root is computed or not - * @param[output] pD output matrix - * @param fin_op the final gemm epilogue lambda - */ -template -void euclideanUnExpImpl(const DataT* x, - const DataT* y, - IdxT m, - IdxT n, - IdxT k, - IdxT lda, - IdxT ldb, - IdxT ldd, - bool sqrt, - OutT* dOutput, - FinalLambda fin_op, - cudaStream_t stream) -{ - typedef typename raft::linalg::Policy4x4::Policy RowPolicy; - typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; - - typedef typename std::conditional::type KPolicy; - - dim3 blk(KPolicy::Nthreads); - - // Accumulation operation lambda - auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { - const auto diff = x - y; - acc += diff * diff; - }; - - // epilogue operation lambda for final value calculation - auto epilog_lambda = [sqrt] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], - DataT * regxn, - DataT * regyn, - IdxT gridStrideX, - IdxT gridStrideY) { - if (sqrt) { -#pragma unroll - for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { -#pragma unroll - for (int j = 0; j < KPolicy::AccColsPerTh; ++j) { - acc[i][j] = raft::sqrt(acc[i][j]); - } - } - } - }; - - if (isRowMajor) { - auto euclideanUnExpRowMajor = pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, euclideanUnExpRowMajor); - - euclideanUnExpRowMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); - - } else { - auto euclideanUnExpColMajor = pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, euclideanUnExpColMajor); - - euclideanUnExpColMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); - } - - RAFT_CUDA_TRY(cudaGetLastError()); -} - -template -void euclideanUnExp(IdxT m, - IdxT n, - IdxT k, - IdxT lda, - IdxT ldb, - IdxT ldd, - const DataT* x, - const DataT* y, - bool sqrt, - OutT* 
dOutput, - FinalLambda fin_op, - cudaStream_t stream) -{ - size_t bytesA = sizeof(DataT) * lda; - size_t bytesB = sizeof(DataT) * ldb; - if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { - euclideanUnExpImpl( - x, y, m, n, k, lda, ldb, ldd, sqrt, dOutput, fin_op, stream); - } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { - euclideanUnExpImpl( - x, y, m, n, k, lda, ldb, ldd, sqrt, dOutput, fin_op, stream); - } else { - euclideanUnExpImpl( - x, y, m, n, k, lda, ldb, ldd, sqrt, dOutput, fin_op, stream); - } -} /** * @brief the unexpanded euclidean distance matrix calculation @@ -444,35 +309,30 @@ void euclideanUnExp(IdxT m, * @param stream cuda stream where to launch work * @param isRowMajor whether the input and output matrices are row major */ -template -void euclideanAlgo2(Index_ m, - Index_ n, - Index_ k, - const InType* pA, - const InType* pB, - OutType* pD, +template +void euclideanAlgo2(IdxT m, + IdxT n, + IdxT k, + const DataT* pA, + const DataT* pB, + OutT* pD, bool enable_sqrt, - FinalLambda fin_op, + FinOpT fin_op, cudaStream_t stream, bool isRowMajor) { - typedef std::is_same is_bool; - typedef typename std::conditional::type UnExpOutType; - UnExpOutType* pDcast = reinterpret_cast(pD); - Index_ lda, ldb, ldd; - - if (isRowMajor) { - lda = k, ldb = k, ldd = n; - euclideanUnExp( - m, n, k, lda, ldb, ldd, pA, pB, enable_sqrt, pDcast, fin_op, stream); + if (enable_sqrt) { + ops::l2_unexp_sqrt_distance_op l2_sqrt_op{}; + distance_matrix_dispatch( + l2_sqrt_op, m, n, k, pA, pB, pD, fin_op, stream, isRowMajor); } else { - lda = n, ldb = m, ldd = m; - euclideanUnExp( - n, m, k, lda, ldb, ldd, pB, pA, enable_sqrt, pDcast, fin_op, stream); + ops::l2_unexp_distance_op l2_op{}; + distance_matrix_dispatch( + l2_op, m, n, k, pA, pB, pD, fin_op, stream, isRowMajor); } } From 7938614f0c74c7405c66b8074150c9e642952130 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Fri, 13 Jan 2023 16:34:41 +0100 Subject: [PATCH 13/60] 
l2 exp: Use pairwise matrix dispatch This did remove support for the CUTLASS kernels. Has to be put back. --- .../raft/distance/detail/distance_ops/l1.cuh | 2 +- .../distance/detail/distance_ops/l2_exp.cuh | 72 +++++ .../distance/detail/distance_ops/l2_unexp.cuh | 16 +- .../raft/distance/detail/euclidean.cuh | 304 ++++-------------- cpp/include/raft/distance/detail/l1.cuh | 5 +- .../detail/pairwise_matrix/dispatch.cuh | 13 +- .../detail/pairwise_matrix/kernel_sm60.cuh | 3 +- 7 files changed, 166 insertions(+), 249 deletions(-) create mode 100644 cpp/include/raft/distance/detail/distance_ops/l2_exp.cuh diff --git a/cpp/include/raft/distance/detail/distance_ops/l1.cuh b/cpp/include/raft/distance/detail/distance_ops/l1.cuh index 08ca313fe2..9d31b24851 100644 --- a/cpp/include/raft/distance/detail/distance_ops/l1.cuh +++ b/cpp/include/raft/distance/detail/distance_ops/l1.cuh @@ -25,7 +25,7 @@ struct l1_distance_op { // Size of shared memory. This is normally decided by the kernel policy, but // some ops such as correlation_distance_op use more. - template + template constexpr size_t shared_mem_size() { return Policy::SmemSize; diff --git a/cpp/include/raft/distance/detail/distance_ops/l2_exp.cuh b/cpp/include/raft/distance/detail/distance_ops/l2_exp.cuh new file mode 100644 index 0000000000..c15b43a74e --- /dev/null +++ b/cpp/include/raft/distance/detail/distance_ops/l2_exp.cuh @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +namespace raft::distance::detail::ops { + +// Describes the computation the l2 expanded distance +// +// TODO: more explanation. +struct l2_exp_distance_op { + bool sqrt; + + l2_exp_distance_op(bool sqrt_) noexcept : sqrt(sqrt_) {} + + // Load norms of input data + static constexpr bool use_norms = true; + + // Size of shared memory. This is normally decided by the kernel policy, but + // some ops such as correlation_distance_op use more. + template + constexpr size_t shared_mem_size() + { + return Policy::SmemSize + ((Policy::Mblk + Policy::Nblk) * sizeof(DataT)); + } + + template + DI void core(AccT& acc, DataT& x, DataT& y) const + { + acc += x * y; + }; + + template + DI void epilog(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh], + DataT* regxn, + DataT* regyn, + IdxT gridStrideX, + IdxT gridStrideY) const + { +#pragma unroll + for (int i = 0; i < Policy::AccRowsPerTh; ++i) { +#pragma unroll + for (int j = 0; j < Policy::AccColsPerTh; ++j) { + acc[i][j] = regxn[i] + regyn[j] - (DataT)2.0 * acc[i][j]; + } + } + if (sqrt) { +#pragma unroll + for (int i = 0; i < Policy::AccRowsPerTh; ++i) { +#pragma unroll + for (int j = 0; j < Policy::AccColsPerTh; ++j) { + acc[i][j] = raft::sqrt(acc[i][j]); + } + } + } + } +}; + +} // namespace raft::distance::detail::ops diff --git a/cpp/include/raft/distance/detail/distance_ops/l2_unexp.cuh b/cpp/include/raft/distance/detail/distance_ops/l2_unexp.cuh index 99fda59f03..03bbd936c6 100644 --- a/cpp/include/raft/distance/detail/distance_ops/l2_unexp.cuh +++ b/cpp/include/raft/distance/detail/distance_ops/l2_unexp.cuh @@ -19,15 +19,17 @@ namespace raft::distance::detail::ops { // Describes the computation the l2 unexpanded distance +struct l2_unexp_distance_op { + bool sqrt; + + l2_unexp_distance_op(bool sqrt_) noexcept : sqrt(sqrt_) {} -template -struct l2_unexp_generic_distance_op { // Do not 
load norms of data, the computation of L1 distance does not use them. static constexpr bool use_norms = false; // Size of shared memory. This is normally decided by the kernel policy, but // some ops such as correlation_distance_op use more. - template + template constexpr size_t shared_mem_size() { return Policy::SmemSize; @@ -47,7 +49,7 @@ struct l2_unexp_generic_distance_op { IdxT gridStrideX, IdxT gridStrideY) const { - if constexpr (sqrt) { + if (sqrt) { #pragma unroll for (int i = 0; i < Policy::AccRowsPerTh; ++i) { #pragma unroll @@ -59,10 +61,4 @@ struct l2_unexp_generic_distance_op { }; }; - -// Define distance ops with and without square root computation. -using l2_unexp_distance_op = l2_unexp_generic_distance_op; -using l2_unexp_sqrt_distance_op = l2_unexp_generic_distance_op; - - } // namespace raft::distance::detail::ops diff --git a/cpp/include/raft/distance/detail/euclidean.cuh b/cpp/include/raft/distance/detail/euclidean.cuh index 8ed1e9d615..29088257e2 100644 --- a/cpp/include/raft/distance/detail/euclidean.cuh +++ b/cpp/include/raft/distance/detail/euclidean.cuh @@ -16,13 +16,11 @@ #pragma once -#include -#include #include -#include -#include "distance_ops/l2_unexp.cuh" #include "pairwise_matrix/dispatch.cuh" +#include "distance_ops/l2_exp.cuh" +#include "distance_ops/l2_unexp.cuh" namespace raft { @@ -44,249 +42,88 @@ struct L2ExpandedOp { __device__ AccT operator()(DataT aData) const noexcept { return aData; } }; -/** - * @brief the expanded euclidean distance matrix calculation implementer - * It computes the following equation: C = op(A^2 + B^2 - 2AB) - * @tparam DataT input data-type (for A and B matrices) - * @tparam AccT accumulation data-type - * @tparam OutT output data-type (for C and D matrices) - * @tparam IdxT index data-type - * @tparam Veclen number of k-elements loaded by each thread for every LDG call - * it makes. check contractions.cuh for details. 
- * @tparam FinalLambda the final lambda called on final distance value - * @tparam isRowMajor true if input/output is row major, - false for column major - * @param[in] x input matrix - * @param[in] y input matrix - * @param[in] xn row norms of input matrix A. - * @param[in] yn row norms of input matrix B. - * @param[in] m number of rows of A and C/D - * @param[in] n number of columns of B and C/D - * @param[in] k number of cols of A and rows of B - * @param[in] lda leading dimension of A - * @param[in] ldb leading dimension of B - * @param[in] ldd leading dimension of C/D - * @param[in] sqrt if the square root is computed or not - * @param[output] pD output matrix - * @param fin_op the final gemm epilogue lambda -* @param stream cuda stream to launch cuda operations. - */ -template -void euclideanExpImpl(const DataT* x, - const DataT* y, - const DataT* xn, - const DataT* yn, - IdxT m, - IdxT n, - IdxT k, - IdxT lda, - IdxT ldb, - IdxT ldd, - bool sqrt, - OutT* dOutput, - FinalLambda fin_op, - cudaStream_t stream) -{ -#if (__CUDACC_VER_MAJOR__ < 12) - const auto deviceVersion = getComputeCapability(); - if (deviceVersion.first >= 8) { - using L2Op = L2ExpandedOp; - L2Op L2_dist_op(sqrt); - - cutlassDistanceKernel( - x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, fin_op, L2_dist_op, stream); - - } else -#endif - { - - typedef typename raft::linalg::Policy4x4::Policy RowPolicy; - typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; - - typedef typename std::conditional::type KPolicy; - - dim3 blk(KPolicy::Nthreads); - - // Accumulation operation lambda - auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { acc += x * y; }; - - // epilogue operation lambda for final value calculation - auto epilog_lambda = [sqrt] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], - DataT * regxn, - DataT * regyn, - IdxT gridStrideX, - IdxT gridStrideY) { -#pragma unroll - for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { -#pragma unroll - for 
(int j = 0; j < KPolicy::AccColsPerTh; ++j) { - acc[i][j] = regxn[i] + regyn[j] - (DataT)2.0 * acc[i][j]; - } - } - if (sqrt) { -#pragma unroll - for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { -#pragma unroll - for (int j = 0; j < KPolicy::AccColsPerTh; ++j) { - acc[i][j] = raft::sqrt(acc[i][j]); - } - } - } - }; - - constexpr size_t shmemSize = - KPolicy::SmemSize + ((KPolicy::Mblk + KPolicy::Nblk) * sizeof(DataT)); - if (isRowMajor) { - auto euclideanExpRowMajor = pairwiseDistanceMatKernelPriorToAmpere; - dim3 grid = launchConfigGenerator(m, n, shmemSize, euclideanExpRowMajor); - - euclideanExpRowMajor<<>>( - x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); - } else { - auto euclideanExpColMajor = pairwiseDistanceMatKernelPriorToAmpere; - dim3 grid = launchConfigGenerator(m, n, shmemSize, euclideanExpColMajor); - euclideanExpColMajor<<>>( - x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); - } - } - - RAFT_CUDA_TRY(cudaGetLastError()); -} +// /** +// * @brief the expanded euclidean distance matrix calculation +// * It computes the following equation: C = op(A^2 + B^2 - 2AB) +// * @tparam InType input data-type (for A and B matrices) +// * @tparam AccType accumulation data-type +// * @tparam OutType output data-type (for C and D matrices) +// * @tparam FinalLambda the final lambda called by FragmentMultiplyAdd_ +// * @tparam Index_ index type +// * @param m number of rows of A and C/D +// * @param n number of columns of B and C/D +// * @param k number of cols of A and rows of B +// * @param pA input matrix +// * @param pB input matrix +// * @param pD output matrix +// * @param enable_sqrt if the square root is computed or not +// * @param workspace temporary workspace needed for computations +// * @param worksize number of bytes of the workspace +// * @param fin_op the final gemm epilogue lambda +// * @param stream cuda stream where to launch work +// * @param isRowMajor whether the input and 
output matrices are row major +// */ template -void euclideanExp(IdxT m, - IdxT n, - IdxT k, - IdxT lda, - IdxT ldb, - IdxT ldd, - const DataT* x, - const DataT* y, - const DataT* xn, - const DataT* yn, - bool sqrt, - OutT* dOutput, - FinalLambda fin_op, - cudaStream_t stream) -{ - size_t bytesA = sizeof(DataT) * lda; - size_t bytesB = sizeof(DataT) * ldb; - - if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { - euclideanExpImpl( - x, y, xn, yn, m, n, k, lda, ldb, ldd, sqrt, dOutput, fin_op, stream); - } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { - euclideanExpImpl( - x, y, xn, yn, m, n, k, lda, ldb, ldd, sqrt, dOutput, fin_op, stream); - } else { - euclideanExpImpl( - x, y, xn, yn, m, n, k, lda, ldb, ldd, sqrt, dOutput, fin_op, stream); - } -} - -/** - * @brief the expanded euclidean distance matrix calculation - * It computes the following equation: C = op(A^2 + B^2 - 2AB) - * @tparam InType input data-type (for A and B matrices) - * @tparam AccType accumulation data-type - * @tparam OutType output data-type (for C and D matrices) - * @tparam FinalLambda the final lambda called by FragmentMultiplyAdd_ - * @tparam Index_ index type - * @param m number of rows of A and C/D - * @param n number of columns of B and C/D - * @param k number of cols of A and rows of B - * @param pA input matrix - * @param pB input matrix - * @param pD output matrix - * @param enable_sqrt if the square root is computed or not - * @param workspace temporary workspace needed for computations - * @param worksize number of bytes of the workspace - * @param fin_op the final gemm epilogue lambda - * @param stream cuda stream where to launch work - * @param isRowMajor whether the input and output matrices are row major - */ -template -void euclideanAlgo1(Index_ m, - Index_ n, - Index_ k, - const InType* pA, - const InType* pB, - OutType* pD, + typename FinOpT, + typename IdxT = int> +void euclideanAlgo1(IdxT m, + IdxT n, + IdxT k, + const DataT* 
pA, + const DataT* pB, + OutT* pD, bool enable_sqrt, - AccType* workspace, + AccT* workspace, size_t& worksize, - FinalLambda fin_op, + FinOpT fin_op, cudaStream_t stream, bool isRowMajor) { + // TODO: handle cutlass kernels + // constexpr bool CUDA_11_or_below = __CUDACC_VER_MAJOR__ < 12; + + // if constexpr(CUDA_11_or_below) { + // const auto deviceVersion = getComputeCapability(); + // if (deviceVersion.first >= 8) { + // using L2Op = L2ExpandedOp; + // L2Op L2_dist_op(sqrt); + + // cutlassDistanceKernel( + // x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, fin_op, L2_dist_op, stream); + // } + // } + + // raft distance support inputs as float/double and output as uint8_t/float/double. - static_assert(!((sizeof(OutType) > 1) && (sizeof(AccType) != sizeof(OutType))), - "OutType can be uint8_t, float, double," - "if sizeof(OutType) > 1 then sizeof(AccType) == sizeof(OutType)."); - typedef typename std::conditional::type ExpOutType; - ExpOutType* pDcast = reinterpret_cast(pD); + static_assert(!((sizeof(OutT) > 1) && (sizeof(AccT) != sizeof(OutT))), + "OutT can be uint8_t, float, double," + "if sizeof(OutT) > 1 then sizeof(AccT) == sizeof(OutT)."); ASSERT( - !(((pA != pB) && (worksize < (m + n) * sizeof(AccType))) || (worksize < m * sizeof(AccType))), + !(((pA != pB) && (worksize < (m + n) * sizeof(AccT))) || (worksize < m * sizeof(AccT))), "workspace size error"); ASSERT(workspace != nullptr, "workspace is null"); - Index_ lda, ldb, ldd; - InType* col_vec = workspace; - InType* row_vec = workspace; + DataT* norm_A = workspace; + DataT* norm_B = workspace; if (pA != pB) { - row_vec += m; + norm_B += m; raft::linalg::rowNorm( - col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor, stream, raft::identity_op{}); + norm_A, pA, k, m, raft::linalg::L2Norm, isRowMajor, stream, raft::identity_op{}); raft::linalg::rowNorm( - row_vec, pB, k, n, raft::linalg::L2Norm, isRowMajor, stream, raft::identity_op{}); + norm_B, pB, k, n, raft::linalg::L2Norm, isRowMajor, stream, 
raft::identity_op{}); } else { raft::linalg::rowNorm( - col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor, stream, raft::identity_op{}); + norm_A, pA, k, m, raft::linalg::L2Norm, isRowMajor, stream, raft::identity_op{}); } - if (isRowMajor) { - lda = k, ldb = k, ldd = n; - euclideanExp( - m, n, k, lda, ldb, ldd, pA, pB, col_vec, row_vec, enable_sqrt, pDcast, fin_op, stream); - } else { - lda = n, ldb = m, ldd = m; - euclideanExp( - n, m, k, lda, ldb, ldd, pB, pA, row_vec, col_vec, enable_sqrt, pDcast, fin_op, stream); - } + ops::l2_exp_distance_op l2_op(enable_sqrt); + + distance_matrix_dispatch( + l2_op, m, n, k, pA, pB, norm_A, norm_B, pD, fin_op, stream, isRowMajor); } @@ -325,15 +162,14 @@ void euclideanAlgo2(IdxT m, cudaStream_t stream, bool isRowMajor) { - if (enable_sqrt) { - ops::l2_unexp_sqrt_distance_op l2_sqrt_op{}; - distance_matrix_dispatch( - l2_sqrt_op, m, n, k, pA, pB, pD, fin_op, stream, isRowMajor); - } else { - ops::l2_unexp_distance_op l2_op{}; - distance_matrix_dispatch( - l2_op, m, n, k, pA, pB, pD, fin_op, stream, isRowMajor); - } + ops::l2_unexp_distance_op l2_op(enable_sqrt); + + // The unexpanded L2 does not require the norms of a and b to be calculated. 
+ const DataT* norm_A = nullptr; + const DataT* norm_B = nullptr; + + distance_matrix_dispatch( + l2_op, m, n, k, pA, pB, norm_A, norm_B, pD, fin_op, stream, isRowMajor); } }; // end namespace detail diff --git a/cpp/include/raft/distance/detail/l1.cuh b/cpp/include/raft/distance/detail/l1.cuh index a5f279d9a4..49402a9101 100644 --- a/cpp/include/raft/distance/detail/l1.cuh +++ b/cpp/include/raft/distance/detail/l1.cuh @@ -35,8 +35,11 @@ void l1Impl(int m, { ops::l1_distance_op distance_op{}; + const DataT* x_norm = nullptr; + const DataT* y_norm = nullptr; + distance_matrix_dispatch( - distance_op, m, n, k, x, y, out, fin_op, stream, is_row_major); + distance_op, m, n, k, x, y, x_norm, y_norm, out, fin_op, stream, is_row_major); } } // namespace detail diff --git a/cpp/include/raft/distance/detail/pairwise_matrix/dispatch.cuh b/cpp/include/raft/distance/detail/pairwise_matrix/dispatch.cuh index 0e056405a1..4a8fb82861 100644 --- a/cpp/include/raft/distance/detail/pairwise_matrix/dispatch.cuh +++ b/cpp/include/raft/distance/detail/pairwise_matrix/dispatch.cuh @@ -15,6 +15,7 @@ */ #pragma once +#include #include #include "kernel_sm60.cuh" @@ -119,6 +120,8 @@ void distance_matrix_dispatch(opT distance_op, int k_, const DataT* x_, const DataT* y_, + const DataT* x_norm_, + const DataT* y_norm_, OutT* out, FinOpT fin_op, cudaStream_t stream, @@ -129,17 +132,22 @@ void distance_matrix_dispatch(opT distance_op, // // ldx, ldy, and ld_out are the leading dimensions of x, y, and out const DataT* x; + const DataT* x_norm; const DataT* y; + const DataT* y_norm; + int ldx, ldy, ld_out; int m, n, k; if (is_row_major) { // Pass x, y, m, n, k in order x = x_, y = y_; + x_norm = x_norm_, y_norm = y_norm_; m = m_, n = n_, k = k_; ldx = k_, ldy = k_, ld_out = n_; } else { // Flip x, y, and m, n, k. 
x = y_, y = x_; + x_norm = y_norm_, y_norm = x_norm_; m = n_, n = m_, k = k_; ldx = n_, ldy = m_, ld_out = m_; } @@ -182,8 +190,8 @@ void distance_matrix_dispatch(opT distance_op, fin_op, x, y, - nullptr, - nullptr, // TODO: use _xn, _yn for non-l1 distances + x_norm, + y_norm, m, n, k, @@ -195,6 +203,7 @@ void distance_matrix_dispatch(opT distance_op, }); if (!dispatch_success) { + std::printf("Dispatch error(!)\n"); // TODO } } diff --git a/cpp/include/raft/distance/detail/pairwise_matrix/kernel_sm60.cuh b/cpp/include/raft/distance/detail/pairwise_matrix/kernel_sm60.cuh index fa30ff2c3e..68026414c0 100644 --- a/cpp/include/raft/distance/detail/pairwise_matrix/kernel_sm60.cuh +++ b/cpp/include/raft/distance/detail/pairwise_matrix/kernel_sm60.cuh @@ -141,9 +141,10 @@ static void pairwise_matrix(typename KP_T::opT distance_op, cudaStream_t stream) { using Policy = typename KP_T::PolicyT; + using DataT = typename KP_T::DataT; dim3 blk(Policy::Nthreads); - size_t smem_size = distance_op.template shared_mem_size(); + size_t smem_size = distance_op.template shared_mem_size(); dim3 grid = launchConfigGenerator(m, n, smem_size, pairwise_matrix_kernel); pairwise_matrix_kernel<<>>( From 7afe6cc8219c4225442dbcbce4e8a28f22dbcb2f Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Fri, 13 Jan 2023 16:35:29 +0100 Subject: [PATCH 14/60] Add template for distance operator I wasted a lot of time because I had not replaced the op::core() method of the l2_exp_distance_op after I copied it from l2_unexp_distance_op... If I copy something from the template and forget to fill it in, I get a compile error. 
--- .../distance/detail/distance_ops/template.cuh | 53 +++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 cpp/include/raft/distance/detail/distance_ops/template.cuh diff --git a/cpp/include/raft/distance/detail/distance_ops/template.cuh b/cpp/include/raft/distance/detail/distance_ops/template.cuh new file mode 100644 index 0000000000..cfd12b8bc1 --- /dev/null +++ b/cpp/include/raft/distance/detail/distance_ops/template.cuh @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +namespace raft::distance::detail::ops { + +// Describes the computation the template distance +// +// Fill in the TODO items. + +struct template_op { + // Load norms of input data + static constexpr bool use_norms = TODO; + + // Size of shared memory. This is normally decided by the kernel policy, but + // some ops such as correlation_distance_op use more. 
+ template + constexpr size_t shared_mem_size() + { + return Policy::SmemSize + TODO; + } + + template + DI void core(AccT& acc, DataT& x, DataT& y) const + { + TODO; + }; + + template + DI void epilog(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh], + DataT* regxn, + DataT* regyn, + IdxT gridStrideX, + IdxT gridStrideY) const + { + TODO; + }; + +} // namespace raft::distance::detail::ops From 5fe3292eba31fff96587c6e7c01757939166cb76 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Fri, 13 Jan 2023 17:32:34 +0100 Subject: [PATCH 15/60] Reenable cutlass-based kernels for CUDA 12.0 --- .../raft/distance/detail/pairwise_distance_cutlass_base.cuh | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/cpp/include/raft/distance/detail/pairwise_distance_cutlass_base.cuh b/cpp/include/raft/distance/detail/pairwise_distance_cutlass_base.cuh index f39d880da4..efd44ea4dc 100644 --- a/cpp/include/raft/distance/detail/pairwise_distance_cutlass_base.cuh +++ b/cpp/include/raft/distance/detail/pairwise_distance_cutlass_base.cuh @@ -19,8 +19,6 @@ #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wstrict-aliasing" -#if (__CUDACC_VER_MAJOR__ < 12) - // We define CUTLASS_NAMESPACE in case // RAFT cmake is not used #ifndef CUTLASS_NAMESPACE @@ -174,5 +172,5 @@ void cutlassDistanceKernel(const DataT* x, }; // namespace detail }; // namespace distance }; // namespace raft -#endif // (__CUDACC_VER_MAJOR__ < 12) + #pragma GCC diagnostic pop From c623332ac89ab28a4a9c5ab2d23aa2193b07d256 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Fri, 13 Jan 2023 18:26:02 +0100 Subject: [PATCH 16/60] pairwise matrix l2: Add support for CUTLASS kernels I am testing on CUDA 12, where it does not seem to work. Prior to my commits, the CUTLASS kernels were also not working. So not sure what's up. In any case: consider this untested. 
--- .../distance/detail/distance_ops/l2_exp.cuh | 16 +++ .../raft/distance/detail/euclidean.cuh | 63 +++++------ .../detail/pairwise_matrix/dispatch.cuh | 104 ++++++++++++------ 3 files changed, 116 insertions(+), 67 deletions(-) diff --git a/cpp/include/raft/distance/detail/distance_ops/l2_exp.cuh b/cpp/include/raft/distance/detail/distance_ops/l2_exp.cuh index c15b43a74e..4dfb26a826 100644 --- a/cpp/include/raft/distance/detail/distance_ops/l2_exp.cuh +++ b/cpp/include/raft/distance/detail/distance_ops/l2_exp.cuh @@ -69,4 +69,20 @@ struct l2_exp_distance_op { } }; +// Epilogue operator for CUTLASS based kernel +template +struct l2_exp_cutlass_op { + bool sqrt; + + __device__ l2_exp_cutlass_op() noexcept : sqrt(false) {} + __device__ l2_exp_cutlass_op(bool isSqrt) noexcept : sqrt(isSqrt) {} + __device__ AccT operator()(DataT& aNorm, const DataT& bNorm, DataT& accVal) const noexcept + { + AccT outVal = aNorm + bNorm - DataT(2.0) * accVal; + return sqrt ? raft::sqrt(outVal) : outVal; + } + + __device__ AccT operator()(DataT aData) const noexcept { return aData; } +}; + } // namespace raft::distance::detail::ops diff --git a/cpp/include/raft/distance/detail/euclidean.cuh b/cpp/include/raft/distance/detail/euclidean.cuh index 29088257e2..51e2ff224f 100644 --- a/cpp/include/raft/distance/detail/euclidean.cuh +++ b/cpp/include/raft/distance/detail/euclidean.cuh @@ -27,22 +27,6 @@ namespace raft { namespace distance { namespace detail { -template -struct L2ExpandedOp { - bool sqrt; - - __device__ L2ExpandedOp() noexcept : sqrt(false) {} - __device__ L2ExpandedOp(bool isSqrt) noexcept : sqrt(isSqrt) {} - __device__ AccT operator()(DataT& aNorm, const DataT& bNorm, DataT& accVal) const noexcept - { - AccT outVal = aNorm + bNorm - DataT(2.0) * accVal; - return sqrt ? 
raft::sqrt(outVal) : outVal; - } - - __device__ AccT operator()(DataT aData) const noexcept { return aData; } -}; - - // /** // * @brief the expanded euclidean distance matrix calculation // * It computes the following equation: C = op(A^2 + B^2 - 2AB) @@ -82,21 +66,6 @@ void euclideanAlgo1(IdxT m, cudaStream_t stream, bool isRowMajor) { - // TODO: handle cutlass kernels - // constexpr bool CUDA_11_or_below = __CUDACC_VER_MAJOR__ < 12; - - // if constexpr(CUDA_11_or_below) { - // const auto deviceVersion = getComputeCapability(); - // if (deviceVersion.first >= 8) { - // using L2Op = L2ExpandedOp; - // L2Op L2_dist_op(sqrt); - - // cutlassDistanceKernel( - // x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, fin_op, L2_dist_op, stream); - // } - // } - - // raft distance support inputs as float/double and output as uint8_t/float/double. static_assert(!((sizeof(OutT) > 1) && (sizeof(AccT) != sizeof(OutT))), "OutT can be uint8_t, float, double," @@ -120,10 +89,34 @@ void euclideanAlgo1(IdxT m, norm_A, pA, k, m, raft::linalg::L2Norm, isRowMajor, stream, raft::identity_op{}); } - ops::l2_exp_distance_op l2_op(enable_sqrt); - - distance_matrix_dispatch( - l2_op, m, n, k, pA, pB, norm_A, norm_B, pD, fin_op, stream, isRowMajor); + // On CUDA 12: + // - always execute normal kernel + // + // On CUDA 11 and below: + // - execute CUTLASS-based kernel on SM_80 and above + // - execute normal kernel otherwise. + + if constexpr (__CUDACC_VER_MAJOR__ == 12) { + // Always execute legacy kernels on CUDA 12 + ops::l2_exp_distance_op l2_op(enable_sqrt); + distance_matrix_dispatch( + l2_op, m, n, k, pA, pB, norm_A, norm_B, pD, fin_op, stream, isRowMajor); + } else { + const auto deviceVersion = getComputeCapability(); + if (deviceVersion.first >= 8) { + // If device is SM_80 or later, use CUTLASS-based kernel. 
+ using L2Op = ops::l2_exp_cutlass_op; + L2Op l2_op(enable_sqrt); + + distance_matrix_cutlass_dispatch( + l2_op, m, n, k, pA, pB, norm_A, norm_B, pD, fin_op, stream, isRowMajor); + } else { + // Else use "legacy" L2 + ops::l2_exp_distance_op l2_op(enable_sqrt); + distance_matrix_dispatch( + l2_op, m, n, k, pA, pB, norm_A, norm_B, pD, fin_op, stream, isRowMajor); + } + } } diff --git a/cpp/include/raft/distance/detail/pairwise_matrix/dispatch.cuh b/cpp/include/raft/distance/detail/pairwise_matrix/dispatch.cuh index 4a8fb82861..650c8fa805 100644 --- a/cpp/include/raft/distance/detail/pairwise_matrix/dispatch.cuh +++ b/cpp/include/raft/distance/detail/pairwise_matrix/dispatch.cuh @@ -16,7 +16,9 @@ #pragma once #include +#include #include +#include #include "kernel_sm60.cuh" namespace raft::distance::detail { @@ -85,8 +87,8 @@ struct params_dispatch { // Determine the largest number of elements that can be loaded in one // instruction without causing misalignment errors. -template -int max_aligned_load(const DataT* x, const DataT* y, int ldx, int ldy) +template +int vectorized_load_num_elem(const DataT* x, const DataT* y, IdxT ldx, IdxT ldy) { auto base_x = reinterpret_cast(x); auto base_y = reinterpret_cast(y); @@ -115,13 +117,13 @@ template void distance_matrix_dispatch(opT distance_op, - int m_, - int n_, - int k_, - const DataT* x_, - const DataT* y_, - const DataT* x_norm_, - const DataT* y_norm_, + IdxT m, + IdxT n, + IdxT k, + const DataT* x, + const DataT* y, + const DataT* x_norm, + const DataT* y_norm, OutT* out, FinOpT fin_op, cudaStream_t stream, @@ -129,38 +131,24 @@ void distance_matrix_dispatch(opT distance_op, { // Determine leading dimensions and possibly flip order of passing x and y if // column_major. 
- // - // ldx, ldy, and ld_out are the leading dimensions of x, y, and out - const DataT* x; - const DataT* x_norm; - const DataT* y; - const DataT* y_norm; - - int ldx, ldy, ld_out; - int m, n, k; + IdxT ldx, ldy, ld_out; if (is_row_major) { - // Pass x, y, m, n, k in order - x = x_, y = y_; - x_norm = x_norm_, y_norm = y_norm_; - m = m_, n = n_, k = k_; - ldx = k_, ldy = k_, ld_out = n_; + ldx = k, ldy = k, ld_out = n; } else { - // Flip x, y, and m, n, k. - x = y_, y = x_; - x_norm = y_norm_, y_norm = x_norm_; - m = n_, n = m_, k = k_; - ldx = n_, ldy = m_, ld_out = m_; + // Flip x, y, and m, n. + std::swap(x, y); + std::swap(x_norm, y_norm); + std::swap(m, n); + ldx = m, ldy = n, ld_out = n; } - int vectorized_load_num_elem = max_aligned_load(x, y, ldx, ldy); - // Create run-time parameter struct that does the dispatching. // // In addition to the template parameters of this function (IdxT, DataT, // etc..), we explicitly dispatch based on: params_dispatch run_time_params{ - vectorized_load_num_elem, // 1. num array elements per load instruction - is_row_major // 2. the layout x, y, and out + vectorized_load_num_elem(x, y, ldx, ldy), // 1. num array elements per load instruction + is_row_major // 2. the layout of x, y, and out }; // Turn run-time parameters into compile-time parameters. @@ -208,4 +196,56 @@ void distance_matrix_dispatch(opT distance_op, } } +template +void distance_matrix_cutlass_dispatch(opT cutlass_op, + IdxT m, + IdxT n, + IdxT k, + const DataT* x, + const DataT* y, + const DataT* x_norm, + const DataT* y_norm, + OutT* out, + FinOpT fin_op, + cudaStream_t stream, + bool is_row_major) +{ + // Determine leading dimensions and possibly flip order of passing x and y if + // column_major. 
+ IdxT ldx, ldy, ld_out; + if (is_row_major) { + ldx = k, ldy = k, ld_out = n; + } else { + std::swap(x, y); + std::swap(x_norm, y_norm); + std::swap(m, n); + ldx = m, ldy = n, ld_out = n; + } + + params_dispatch run_time_params{ + vectorized_load_num_elem(x, y, ldx, ldy), + is_row_major + }; + + bool dispatch_success = run_time_params.dispatch_with_compile_time_params( + [&](auto p) { + // Prevent loading 4 doubles in one instruction. + constexpr bool load_4_doubles = sizeof(DataT) > 4 && p.vec_len == 4; + constexpr int vec_len = (load_4_doubles) ? 2 : p.vec_len; + + cutlassDistanceKernel( + x, y, x_norm, y_norm, m, n, k, ldx, ldy, ld_out, out, fin_op, cutlass_op, stream); + }); + + if (!dispatch_success) { + std::printf("Dispatch error(!)\n"); + // TODO + } +} + }; // namespace raft::distance::detail From 27511fc65f9c39a371a0efd02ffeb7bfcbbcd736 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Fri, 13 Jan 2023 18:42:44 +0100 Subject: [PATCH 17/60] Canberra: use dispatching mechanism --- cpp/include/raft/distance/detail/README.org | 13 ++ cpp/include/raft/distance/detail/canberra.cuh | 181 +++--------------- .../distance/detail/distance_ops/canberra.cuh | 58 ++++++ .../distance/detail/distance_ops/template.cuh | 3 +- cpp/include/raft/distance/detail/l1.cuh | 6 +- 5 files changed, 103 insertions(+), 158 deletions(-) create mode 100644 cpp/include/raft/distance/detail/README.org create mode 100644 cpp/include/raft/distance/detail/distance_ops/canberra.cuh diff --git a/cpp/include/raft/distance/detail/README.org b/cpp/include/raft/distance/detail/README.org new file mode 100644 index 0000000000..dcb9b1d1e2 --- /dev/null +++ b/cpp/include/raft/distance/detail/README.org @@ -0,0 +1,13 @@ +#+title: Readme + + +- [X] canberra.cuh +- [ ] chebyshev.cuh +- [ ] correlation.cuh +- [ ] cosine.cuh +- [ ] hamming.cuh +- [ ] hellinger.cuh +- [ ] jensen_shannon.cuh +- [ ] kl_divergence.cuh +- [ ] minkowski.cuh +- [ ] russell_rao.cuh diff --git 
a/cpp/include/raft/distance/detail/canberra.cuh b/cpp/include/raft/distance/detail/canberra.cuh index f17a26dc4b..3f0c2fa268 100644 --- a/cpp/include/raft/distance/detail/canberra.cuh +++ b/cpp/include/raft/distance/detail/canberra.cuh @@ -15,148 +15,23 @@ */ #pragma once -#include + +#include "distance_ops/canberra.cuh" +#include "pairwise_matrix/dispatch.cuh" namespace raft { namespace distance { namespace detail { -/** - * @brief the canberra distance matrix calculation implementer - * It computes the following equation: cij = max(cij, op(ai-bj)) - * @tparam DataT input data-type (for A and B matrices) - * @tparam AccT accumulation data-type - * @tparam OutT output data-type (for C and D matrices) - * @tparam IdxT index data-type - * @tparam Veclen number of k-elements loaded by each thread - for every LDG call. details in contractions.cuh - * @tparam FinalLambda final lambda called on final distance value - * @tparam isRowMajor true if input/output is row major, - false for column major - * @param[in] x input matrix - * @param[in] y input matrix - * @param[in] m number of rows of A and C/D - * @param[in] n number of rows of B and cols of C/D - * @param[in] k number of cols of A and B - * @param[in] lda leading dimension of A - * @param[in] ldb leading dimension of B - * @param[in] ldd leading dimension of C/D - * @param[output] dOutput output matrix - * @param fin_op the final gemm epilogue lambda - * @param stream cuda stream to launch work - */ -template -static void canberraImpl(const DataT* x, - const DataT* y, - IdxT m, - IdxT n, - IdxT k, - IdxT lda, - IdxT ldb, - IdxT ldd, - OutT* dOutput, - FinalLambda fin_op, - cudaStream_t stream) -{ - typedef typename raft::linalg::Policy4x4::Policy RowPolicy; - typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; - - typedef typename std::conditional::type KPolicy; - - dim3 blk(KPolicy::Nthreads); - - // Accumulation operation lambda - auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { - 
const auto diff = raft::abs(x - y); - const auto add = raft::abs(x) + raft::abs(y); - // deal with potential for 0 in denominator by - // forcing 1/0 instead - acc += ((add != 0) * diff / (add + (add == 0))); - }; - - // epilogue operation lambda for final value calculation - auto epilog_lambda = raft::void_op(); - - if (isRowMajor) { - auto canberraRowMajor = pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, canberraRowMajor); - - canberraRowMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); - } else { - auto canberraColMajor = pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, canberraColMajor); - canberraColMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); - } - - RAFT_CUDA_TRY(cudaGetLastError()); -} - -template -void canberra(IdxT m, - IdxT n, - IdxT k, - IdxT lda, - IdxT ldb, - IdxT ldd, - const DataT* x, - const DataT* y, - OutT* dOutput, - FinalLambda fin_op, - cudaStream_t stream) -{ - size_t bytesA = sizeof(DataT) * lda; - size_t bytesB = sizeof(DataT) * ldb; - if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { - canberraImpl( - x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); - } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { - canberraImpl( - x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); - } else { - canberraImpl( - x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); - } -} /** * @brief the canberra distance matrix calculation * It computes the following equation: cij = max(cij, op(ai-bj)) - * @tparam InType input data-type (for A and B matrices) - * @tparam AccType accumulation data-type - * @tparam OutType output data-type (for C and D matrices) - * @tparam FinalLambda user-defined epilogue lamba - * @tparam Index_ Index type + * @tparam DataT input data-type (for A and B matrices) + * @tparam AccT 
accumulation data-type + * @tparam OutT output data-type (for C and D matrices) + * @tparam FinOpT user-defined epilogue lamba + * @tparam IdxT Index type * @param[in] m number of rows of A and C/D * @param[in] n number of rows of B and cols of C/D * @param[in] k number of cols of A and B @@ -167,34 +42,28 @@ void canberra(IdxT m, * @param[in] stream cuda stream to launch work * @param[in] isRowMajor whether the input and output matrices are row major */ -template +template void canberraImpl(int m, int n, int k, - const InType* pA, - const InType* pB, - OutType* pD, - FinalLambda fin_op, + const DataT* x, + const DataT* y, + OutT* out, + FinOpT fin_op, cudaStream_t stream, - bool isRowMajor) + bool is_row_major) { - typedef std::is_same is_bool; - typedef typename std::conditional::type canberraOutType; - Index_ lda, ldb, ldd; - canberraOutType* pDcast = reinterpret_cast(pD); - if (isRowMajor) { - lda = k, ldb = k, ldd = n; - canberra( - m, n, k, lda, ldb, ldd, pA, pB, pDcast, fin_op, stream); - } else { - lda = n, ldb = m, ldd = m; - canberra( - n, m, k, lda, ldb, ldd, pB, pA, pDcast, fin_op, stream); - } + ops::canberra_distance_op distance_op{}; + + const DataT* x_norm = nullptr; + const DataT* y_norm = nullptr; + + distance_matrix_dispatch( + distance_op, m, n, k, x, y, x_norm, y_norm, out, fin_op, stream, is_row_major); } } // namespace detail diff --git a/cpp/include/raft/distance/detail/distance_ops/canberra.cuh b/cpp/include/raft/distance/detail/distance_ops/canberra.cuh new file mode 100644 index 0000000000..4fda825286 --- /dev/null +++ b/cpp/include/raft/distance/detail/distance_ops/canberra.cuh @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace raft::distance::detail::ops { + +// Describes the computation the canberra distance + +struct canberra_distance_op { + // Load norms of input data + static constexpr bool use_norms = false; + + // Size of shared memory. This is normally decided by the kernel policy, but + // some ops such as correlation_distance_op use more. + template + constexpr size_t shared_mem_size() + { + return Policy::SmemSize; + } + + template + DI void core(AccT& acc, DataT& x, DataT& y) const + { + const auto diff = raft::abs(x - y); + const auto add = raft::abs(x) + raft::abs(y); + // deal with potential for 0 in denominator by + // forcing 1/0 instead + acc += ((add != 0) * diff / (add + (add == 0))); + }; + + template + DI void epilog(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh], + DataT* regxn, + DataT* regyn, + IdxT gridStrideX, + IdxT gridStrideY) const + { + return; + } +}; + +} // namespace raft::distance::detail::ops diff --git a/cpp/include/raft/distance/detail/distance_ops/template.cuh b/cpp/include/raft/distance/detail/distance_ops/template.cuh index cfd12b8bc1..4c624c5593 100644 --- a/cpp/include/raft/distance/detail/distance_ops/template.cuh +++ b/cpp/include/raft/distance/detail/distance_ops/template.cuh @@ -48,6 +48,7 @@ struct template_op { IdxT gridStrideY) const { TODO; - }; + } +}; } // namespace raft::distance::detail::ops diff --git a/cpp/include/raft/distance/detail/l1.cuh b/cpp/include/raft/distance/detail/l1.cuh index 49402a9101..cceb432c7d 100644 --- a/cpp/include/raft/distance/detail/l1.cuh 
+++ b/cpp/include/raft/distance/detail/l1.cuh @@ -22,7 +22,11 @@ namespace raft { namespace distance { namespace detail { -template +template void l1Impl(int m, int n, int k, From 58ce6f8fb3da0d6916a0421fcdb82669ea6e28ee Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Fri, 13 Jan 2023 18:50:02 +0100 Subject: [PATCH 18/60] Chebyshev: use pairwise matrix dispatch --- cpp/include/raft/distance/detail/README.org | 2 +- .../raft/distance/detail/chebyshev.cuh | 174 +++--------------- .../detail/distance_ops/chebyshev.cuh | 55 ++++++ 3 files changed, 78 insertions(+), 153 deletions(-) create mode 100644 cpp/include/raft/distance/detail/distance_ops/chebyshev.cuh diff --git a/cpp/include/raft/distance/detail/README.org b/cpp/include/raft/distance/detail/README.org index dcb9b1d1e2..f84c2a0f2c 100644 --- a/cpp/include/raft/distance/detail/README.org +++ b/cpp/include/raft/distance/detail/README.org @@ -2,7 +2,7 @@ - [X] canberra.cuh -- [ ] chebyshev.cuh +- [X] chebyshev.cuh - [ ] correlation.cuh - [ ] cosine.cuh - [ ] hamming.cuh diff --git a/cpp/include/raft/distance/detail/chebyshev.cuh b/cpp/include/raft/distance/detail/chebyshev.cuh index 43b36e7921..9f49660301 100644 --- a/cpp/include/raft/distance/detail/chebyshev.cuh +++ b/cpp/include/raft/distance/detail/chebyshev.cuh @@ -15,136 +15,12 @@ */ #pragma once -#include -#include +#include "distance_ops/chebyshev.cuh" +#include "pairwise_matrix/dispatch.cuh" namespace raft { namespace distance { namespace detail { -/** - * @brief the Chebyshev distance matrix calculation implementer - * It computes the following equation: cij = max(cij, op(ai-bj)) - * @tparam DataT input data-type (for A and B matrices) - * @tparam AccT accumulation data-type - * @tparam OutT output data-type (for C and D matrices) - * @tparam IdxT index data-type - * @tparam Veclen number of k-elements loaded by each thread - for every LDG call. 
details in contractions.cuh - * @tparam FinalLambda final lambda called on final distance value - * @tparam isRowMajor true if input/output is row major, - false for column major - * @param[in] x input matrix - * @param[in] y input matrix - * @param[in] m number of rows of A and C/D - * @param[in] n number of rows of B and cols of C/D - * @param[in] k number of cols of A and B - * @param[in] lda leading dimension of A - * @param[in] ldb leading dimension of B - * @param[in] ldd leading dimension of C/D - * @param[out] dOutput output matrix - * @param[in] fin_op the final gemm epilogue lambda - * @param[in] stream cuda stream to launch work - */ -template -static void chebyshevImpl(const DataT* x, - const DataT* y, - IdxT m, - IdxT n, - IdxT k, - IdxT lda, - IdxT ldb, - IdxT ldd, - OutT* dOutput, - FinalLambda fin_op, - cudaStream_t stream) -{ - typedef typename raft::linalg::Policy4x4::Policy RowPolicy; - typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; - - typedef typename std::conditional::type KPolicy; - - dim3 blk(KPolicy::Nthreads); - - // Accumulation operation lambda - auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { - const auto diff = raft::abs(x - y); - acc = raft::max(acc, diff); - }; - - // epilogue operation lambda for final value calculation - auto epilog_lambda = raft::void_op(); - - if (isRowMajor) { - auto chebyshevRowMajor = pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, chebyshevRowMajor); - - chebyshevRowMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); - } else { - auto chebyshevColMajor = pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, chebyshevColMajor); - chebyshevColMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); - } - - RAFT_CUDA_TRY(cudaGetLastError()); -} - -template -void chebyshev(IdxT m, - IdxT n, - IdxT k, 
- IdxT lda, - IdxT ldb, - IdxT ldd, - const DataT* x, - const DataT* y, - OutT* dOutput, - FinalLambda fin_op, - cudaStream_t stream) -{ - size_t bytesA = sizeof(DataT) * lda; - size_t bytesB = sizeof(DataT) * ldb; - if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { - chebyshevImpl( - x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); - } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { - chebyshevImpl( - x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); - } else { - chebyshevImpl( - x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); - } -} /** * @brief the chebyshev distance matrix calculation @@ -164,34 +40,28 @@ void chebyshev(IdxT m, * @param[in] stream cuda stream to launch work * @param[in] isRowMajor whether the input and output matrices are row major */ -template +template void chebyshevImpl(int m, - int n, - int k, - const InType* pA, - const InType* pB, - OutType* pD, - FinalLambda fin_op, - cudaStream_t stream, - bool isRowMajor) + int n, + int k, + const DataT* x, + const DataT* y, + OutT* out, + FinOpT fin_op, + cudaStream_t stream, + bool is_row_major) { - typedef std::is_same is_bool; - typedef typename std::conditional::type chebyshevOutType; - Index_ lda, ldb, ldd; - chebyshevOutType* pDcast = reinterpret_cast(pD); - if (isRowMajor) { - lda = k, ldb = k, ldd = n; - chebyshev( - m, n, k, lda, ldb, ldd, pA, pB, pDcast, fin_op, stream); - } else { - lda = n, ldb = m, ldd = m; - chebyshev( - n, m, k, lda, ldb, ldd, pB, pA, pDcast, fin_op, stream); - } + ops::chebyshev_distance_op distance_op{}; + + const DataT* x_norm = nullptr; + const DataT* y_norm = nullptr; + + distance_matrix_dispatch( + distance_op, m, n, k, x, y, x_norm, y_norm, out, fin_op, stream, is_row_major); } } // namespace detail } // namespace distance diff --git a/cpp/include/raft/distance/detail/distance_ops/chebyshev.cuh b/cpp/include/raft/distance/detail/distance_ops/chebyshev.cuh new file mode 100644 index 
0000000000..ced9fcf6f7 --- /dev/null +++ b/cpp/include/raft/distance/detail/distance_ops/chebyshev.cuh @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace raft::distance::detail::ops { + +// Describes the computation the chebyshev distance + +struct chebyshev_distance_op { + // Load norms of input data + static constexpr bool use_norms = false; + + // Size of shared memory. This is normally decided by the kernel policy, but + // some ops such as correlation_distance_op use more. 
+ template + constexpr size_t shared_mem_size() + { + return Policy::SmemSize; + } + + template + DI void core(AccT& acc, DataT& x, DataT& y) const + { + const auto diff = raft::abs(x - y); + acc = raft::max(acc, diff); + }; + + template + DI void epilog(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh], + DataT* regxn, + DataT* regyn, + IdxT gridStrideX, + IdxT gridStrideY) const + { + return; + } +}; + +} // namespace raft::distance::detail::ops From d397c170e62ec5ebf02d61510830a37381b8c429 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Fri, 13 Jan 2023 19:27:50 +0100 Subject: [PATCH 19/60] Correlation: use pairwise matrix dispatch --- cpp/include/raft/distance/detail/README.org | 2 +- .../raft/distance/detail/correlation.cuh | 228 +----------------- .../detail/distance_ops/correlation.cuh | 127 ++++++++++ .../distance/detail/distance_ops/template.cuh | 2 +- 4 files changed, 137 insertions(+), 222 deletions(-) create mode 100644 cpp/include/raft/distance/detail/distance_ops/correlation.cuh diff --git a/cpp/include/raft/distance/detail/README.org b/cpp/include/raft/distance/detail/README.org index f84c2a0f2c..dc66a55f60 100644 --- a/cpp/include/raft/distance/detail/README.org +++ b/cpp/include/raft/distance/detail/README.org @@ -3,7 +3,7 @@ - [X] canberra.cuh - [X] chebyshev.cuh -- [ ] correlation.cuh +- [X] correlation.cuh - [ ] cosine.cuh - [ ] hamming.cuh - [ ] hellinger.cuh diff --git a/cpp/include/raft/distance/detail/correlation.cuh b/cpp/include/raft/distance/detail/correlation.cuh index f7fe3678e6..89828c9ba2 100644 --- a/cpp/include/raft/distance/detail/correlation.cuh +++ b/cpp/include/raft/distance/detail/correlation.cuh @@ -15,192 +15,16 @@ */ #pragma once -#include + #include -#include + +#include "pairwise_matrix/dispatch.cuh" +#include "distance_ops/correlation.cuh" namespace raft { namespace distance { namespace detail { -/** - * @brief the Correlation distance matrix: - * - * @tparam DataT input data-type (for A and B matrices) - * 
@tparam AccT accumulation data-type - * @tparam OutT output data-type (for C and D matrices) - * @tparam IdxT index data-type - * @tparam Veclen number of k-elements loaded by each thread - for every LDG call. details in contractions.cuh - * @tparam FinalLambda final lambda called on final distance value - * @tparam isRowMajor true if input/output is row major, - false for column major - * @param[in] x input matrix - * @param[in] y input matrix - * @param[in] m number of rows of A and C/D - * @param[in] n number of rows of B and C/D - * @param[in] k number of cols of A and B - * @param[in] lda leading dimension of A - * @param[in] ldb leading dimension of B - * @param[in] ldd leading dimension of C/D - * @param[output] dOutput output matrix - * @param[in] fin_op the final gemm epilogue lambda - * @param[in] stream cuda stream to launch work - */ -template -static void correlationImpl(const DataT* x, - const DataT* y, - const DataT* xn, - const DataT* yn, - const DataT* x2n, - const DataT* y2n, - IdxT m, - IdxT n, - IdxT k, - IdxT lda, - IdxT ldb, - IdxT ldd, - OutT* dOutput, - FinalLambda fin_op, - cudaStream_t stream) -{ - typedef typename raft::linalg::Policy4x4::Policy RowPolicy; - typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; - - typedef typename std::conditional::type KPolicy; - - dim3 blk(KPolicy::Nthreads); - - // Accumulation operation lambda - auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { acc += x * y; }; - - // epilogue operation lambda for final value calculation - auto epilog_lambda = [x2n, y2n, m, n, k] __device__( - AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], - DataT * regxn, - DataT * regyn, - IdxT gridStrideX, - IdxT gridStrideY) { - DataT regx2n[KPolicy::AccRowsPerTh], regy2n[KPolicy::AccColsPerTh]; - - extern __shared__ char smem[]; - DataT* sx2Norm = - (DataT*)(&smem[KPolicy::SmemSize + (KPolicy::Mblk + KPolicy::Nblk) * sizeof(DataT)]); - DataT* sy2Norm = (&sx2Norm[KPolicy::Mblk]); - - // Load x 
& y norms required by this threadblock in shmem buffer - if (gridStrideX == blockIdx.x * KPolicy::Nblk) { - for (int i = threadIdx.x; i < KPolicy::Mblk; i += KPolicy::Nthreads) { - auto idx = gridStrideY + i; - sx2Norm[i] = idx < m ? x2n[idx] : 0; - } - } - - for (int i = threadIdx.x; i < KPolicy::Nblk; i += KPolicy::Nthreads) { - auto idx = gridStrideX + i; - sy2Norm[i] = idx < n ? y2n[idx] : 0; - } - __syncthreads(); - -#pragma unroll - for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { - regx2n[i] = sx2Norm[i * KPolicy::AccThRows + (threadIdx.x / KPolicy::AccThCols)]; - } -#pragma unroll - for (int i = 0; i < KPolicy::AccColsPerTh; ++i) { - regy2n[i] = sy2Norm[i * KPolicy::AccThCols + (threadIdx.x % KPolicy::AccThCols)]; - } - -#pragma unroll - for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { -#pragma unroll - for (int j = 0; j < KPolicy::AccColsPerTh; ++j) { - auto numer = k * acc[i][j] - (regxn[i] * regyn[j]); - auto Q_denom = k * regx2n[i] - (regxn[i] * regxn[i]); - auto R_denom = k * regy2n[j] - (regyn[j] * regyn[j]); - - acc[i][j] = 1 - (numer / raft::sqrt(Q_denom * R_denom)); - } - } - }; - - constexpr size_t shmemSize = - KPolicy::SmemSize + (2 * (KPolicy::Mblk + KPolicy::Nblk) * sizeof(DataT)); - if (isRowMajor) { - constexpr auto correlationRowMajor = pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, correlationRowMajor); - correlationRowMajor<<>>( - x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); - } else { - constexpr auto correlationColMajor = pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, correlationColMajor); - correlationColMajor<<>>( - x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); - } - - RAFT_CUDA_TRY(cudaGetLastError()); -} - -template -void correlation(IdxT m, - IdxT n, - IdxT k, - IdxT lda, - IdxT ldb, - IdxT ldd, - const DataT* x, - const DataT* y, - const DataT* xn, - const DataT* yn, - 
const DataT* x2n, - const DataT* y2n, - OutT* dOutput, - FinalLambda fin_op, - cudaStream_t stream) -{ - size_t bytesA = sizeof(DataT) * lda; - size_t bytesB = sizeof(DataT) * ldb; - if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { - correlationImpl( - x, y, xn, yn, x2n, y2n, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); - } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { - correlationImpl( - x, y, xn, yn, x2n, y2n, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); - } else { - correlationImpl( - x, y, xn, yn, x2n, y2n, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); - } -} - /** * @brief the Correlation distance matrix calculation * @@ -236,11 +60,6 @@ void correlationImpl(int m, cudaStream_t stream, bool isRowMajor) { - typedef std::is_same is_bool; - typedef typename std::conditional::type correlationOutType; - Index_ lda, ldb, ldd; - correlationOutType* pDcast = reinterpret_cast(pD); - ASSERT(!(((pA != pB) && (worksize < 2 * (m + n) * sizeof(AccType))) || (worksize < 2 * m * sizeof(AccType))), "workspace size error"); @@ -297,41 +116,10 @@ void correlationImpl(int m, raft::linalg::rowNorm(sq_norm_col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor, stream); } - if (isRowMajor) { - lda = k, ldb = k, ldd = n; - correlation(m, - n, - k, - lda, - ldb, - ldd, - pA, - pB, - norm_col_vec, - norm_row_vec, - sq_norm_col_vec, - sq_norm_row_vec, - pDcast, - fin_op, - stream); - } else { - lda = n, ldb = m, ldd = m; - correlation(n, - m, - k, - lda, - ldb, - ldd, - pB, - pA, - norm_row_vec, - norm_col_vec, - sq_norm_row_vec, - sq_norm_col_vec, - pDcast, - fin_op, - stream); - } + using CorrOp = ops::correlation_distance_op; + CorrOp corr_op(isRowMajor, sq_norm_col_vec, sq_norm_row_vec, m, n, k); + distance_matrix_dispatch( + corr_op, m, n, k, pA, pB, norm_col_vec, norm_row_vec, pD, fin_op, stream, isRowMajor); } } // namespace detail diff --git a/cpp/include/raft/distance/detail/distance_ops/correlation.cuh 
b/cpp/include/raft/distance/detail/distance_ops/correlation.cuh new file mode 100644 index 0000000000..98d90ea0a5 --- /dev/null +++ b/cpp/include/raft/distance/detail/distance_ops/correlation.cuh @@ -0,0 +1,127 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +namespace raft::distance::detail::ops { + +// Describes the computation the correlation distance + + +template +struct correlation_distance_op { + const DataT_struct* x2n; + const DataT_struct* y2n; + IdxT_struct m; + IdxT_struct n; + IdxT_struct k; + + correlation_distance_op( + bool is_row_major, + const DataT_struct* x2n_, + const DataT_struct* y2n_, + IdxT_struct m_, + IdxT_struct n_, + IdxT_struct k_ + ) noexcept + : x2n(x2n_), + y2n(y2n_), + m(m_), + n(n_), + k(k_) + { + // The distance op is typically created before the row-major/col-major + // swapping has been done. So we do it here. + if (!is_row_major) { + std::swap(x2n, y2n); + std::swap(m, n); + } + } + + + // Load norms of input data + static constexpr bool use_norms = true; + + // Size of shared memory. This is normally decided by the kernel policy, but + // some ops such as correlation_distance_op use more. 
+ template + constexpr size_t shared_mem_size() + { + return Policy::SmemSize + (2 * (Policy::Mblk + Policy::Nblk) * sizeof(DataT)); + } + + template + DI void core(AccT& acc, DataT& x, DataT& y) const + { + acc += x * y; + }; + + template + DI void epilog(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh], + DataT* regxn, + DataT* regyn, + IdxT gridStrideX, + IdxT gridStrideY) const + { + // Note how we can sneakily get a pointer to shared memory here, to store + // more data. If the implementation of PairwiseDistanceMatKernel ever + // changes, this will be where we find the bugs. + extern __shared__ char smem[]; + + DataT regx2n[Policy::AccRowsPerTh], regy2n[Policy::AccColsPerTh]; + + DataT* sx2Norm = + (DataT*)(&smem[Policy::SmemSize + (Policy::Mblk + Policy::Nblk) * sizeof(DataT)]); + DataT* sy2Norm = (&sx2Norm[Policy::Mblk]); + + // Load x & y norms required by this threadblock in shmem buffer + if (gridStrideX == blockIdx.x * Policy::Nblk) { + for (int i = threadIdx.x; i < Policy::Mblk; i += Policy::Nthreads) { + auto idx = gridStrideY + i; + sx2Norm[i] = idx < m ? x2n[idx] : 0; + } + } + + for (int i = threadIdx.x; i < Policy::Nblk; i += Policy::Nthreads) { + auto idx = gridStrideX + i; + sy2Norm[i] = idx < n ? 
y2n[idx] : 0; + } + __syncthreads(); + +#pragma unroll + for (int i = 0; i < Policy::AccRowsPerTh; ++i) { + regx2n[i] = sx2Norm[i * Policy::AccThRows + (threadIdx.x / Policy::AccThCols)]; + } +#pragma unroll + for (int i = 0; i < Policy::AccColsPerTh; ++i) { + regy2n[i] = sy2Norm[i * Policy::AccThCols + (threadIdx.x % Policy::AccThCols)]; + } + +#pragma unroll + for (int i = 0; i < Policy::AccRowsPerTh; ++i) { +#pragma unroll + for (int j = 0; j < Policy::AccColsPerTh; ++j) { + auto numer = k * acc[i][j] - (regxn[i] * regyn[j]); + auto Q_denom = k * regx2n[i] - (regxn[i] * regxn[i]); + auto R_denom = k * regy2n[j] - (regyn[j] * regyn[j]); + + acc[i][j] = 1 - (numer / raft::sqrt(Q_denom * R_denom)); + } + } + } +}; + +} // namespace raft::distance::detail::ops diff --git a/cpp/include/raft/distance/detail/distance_ops/template.cuh b/cpp/include/raft/distance/detail/distance_ops/template.cuh index 4c624c5593..98c35c6295 100644 --- a/cpp/include/raft/distance/detail/distance_ops/template.cuh +++ b/cpp/include/raft/distance/detail/distance_ops/template.cuh @@ -22,7 +22,7 @@ namespace raft::distance::detail::ops { // // Fill in the TODO items. 
-struct template_op { +struct template_distance_op { // Load norms of input data static constexpr bool use_norms = TODO; From 7005a4f2361d77ad7e5001d98f10fd4477f8c669 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Fri, 13 Jan 2023 19:40:50 +0100 Subject: [PATCH 20/60] Hamming: use pairwise matrix dispatch --- cpp/include/raft/distance/detail/README.org | 2 +- .../distance/detail/distance_ops/hamming.cuh | 64 +++++++++++++++++++ cpp/include/raft/distance/detail/hamming.cuh | 43 ++++++------- 3 files changed, 83 insertions(+), 26 deletions(-) create mode 100644 cpp/include/raft/distance/detail/distance_ops/hamming.cuh diff --git a/cpp/include/raft/distance/detail/README.org b/cpp/include/raft/distance/detail/README.org index dc66a55f60..223a50bee1 100644 --- a/cpp/include/raft/distance/detail/README.org +++ b/cpp/include/raft/distance/detail/README.org @@ -5,7 +5,7 @@ - [X] chebyshev.cuh - [X] correlation.cuh - [ ] cosine.cuh -- [ ] hamming.cuh +- [X] hamming.cuh - [ ] hellinger.cuh - [ ] jensen_shannon.cuh - [ ] kl_divergence.cuh diff --git a/cpp/include/raft/distance/detail/distance_ops/hamming.cuh b/cpp/include/raft/distance/detail/distance_ops/hamming.cuh new file mode 100644 index 0000000000..1f88424d70 --- /dev/null +++ b/cpp/include/raft/distance/detail/distance_ops/hamming.cuh @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +namespace raft::distance::detail::ops { + +// Describes the computation the hamming distance + +template +struct hamming_distance_op { + IdxT_struct k; + + hamming_distance_op(IdxT_struct k_) noexcept : k(k_) { } + + // Load norms of input data + static constexpr bool use_norms = false; + + // Size of shared memory. This is normally decided by the kernel policy, but + // some ops such as correlation_distance_op use more. + template + constexpr size_t shared_mem_size() + { + return Policy::SmemSize; + } + + template + DI void core(AccT& acc, DataT& x, DataT& y) const + { + acc += (x != y); + }; + + template + DI void epilog(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh], + DataT* regxn, + DataT* regyn, + IdxT gridStrideX, + IdxT gridStrideY) const + { + const DataT one_over_k = DataT(1.0) / k; +#pragma unroll + for (int i = 0; i < Policy::AccRowsPerTh; ++i) { +#pragma unroll + for (int j = 0; j < Policy::AccColsPerTh; ++j) { + acc[i][j] *= one_over_k; + } + } + } +}; + +} // namespace raft::distance::detail::ops diff --git a/cpp/include/raft/distance/detail/hamming.cuh b/cpp/include/raft/distance/detail/hamming.cuh index bed9d09e3e..7d283def21 100644 --- a/cpp/include/raft/distance/detail/hamming.cuh +++ b/cpp/include/raft/distance/detail/hamming.cuh @@ -15,7 +15,8 @@ */ #pragma once -#include +#include "distance_ops/hamming.cuh" +#include "pairwise_matrix/dispatch.cuh" namespace raft { namespace distance { @@ -178,36 +179,28 @@ void hammingUnexpanded(IdxT m, * @param stream cuda stream where to launch work * @param isRowMajor whether the input and output matrices are row major */ -template +template void hammingUnexpandedImpl(int m, int n, int k, - const InType* pA, - const InType* pB, - OutType* pD, - FinalLambda fin_op, + const DataT* x, + const DataT* y, + OutT* out, + FinOpT fin_op, cudaStream_t stream, - bool isRowMajor) + bool is_row_major) { - typedef std::is_same is_bool; - typedef - typename std::conditional::type 
hammingUnexpandedOutType; - Index_ lda, ldb, ldd; - hammingUnexpandedOutType* pDcast = reinterpret_cast(pD); - if (isRowMajor) { - lda = k, ldb = k, ldd = n; - hammingUnexpanded( - m, n, k, lda, ldb, ldd, pA, pB, pDcast, fin_op, stream); + ops::hamming_distance_op distance_op{k}; - } else { - lda = n, ldb = m, ldd = m; - hammingUnexpanded( - n, m, k, lda, ldb, ldd, pB, pA, pDcast, fin_op, stream); - } + const DataT* x_norm = nullptr; + const DataT* y_norm = nullptr; + + distance_matrix_dispatch( + distance_op, m, n, k, x, y, x_norm, y_norm, out, fin_op, stream, is_row_major); } } // namespace detail From 7831debb75eea89406ade619aab81890917be9a0 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Fri, 13 Jan 2023 20:07:46 +0100 Subject: [PATCH 21/60] Hellinger: use pairwise matrix dispatch --- cpp/include/raft/distance/detail/README.org | 2 +- .../detail/distance_ops/hellinger.cuh | 66 ++++++ .../distance/detail/distance_ops/template.cuh | 4 + .../raft/distance/detail/hellinger.cuh | 219 +++--------------- 4 files changed, 107 insertions(+), 184 deletions(-) create mode 100644 cpp/include/raft/distance/detail/distance_ops/hellinger.cuh diff --git a/cpp/include/raft/distance/detail/README.org b/cpp/include/raft/distance/detail/README.org index 223a50bee1..47239d3f69 100644 --- a/cpp/include/raft/distance/detail/README.org +++ b/cpp/include/raft/distance/detail/README.org @@ -6,7 +6,7 @@ - [X] correlation.cuh - [ ] cosine.cuh - [X] hamming.cuh -- [ ] hellinger.cuh +- [X] hellinger.cuh - [ ] jensen_shannon.cuh - [ ] kl_divergence.cuh - [ ] minkowski.cuh diff --git a/cpp/include/raft/distance/detail/distance_ops/hellinger.cuh b/cpp/include/raft/distance/detail/distance_ops/hellinger.cuh new file mode 100644 index 0000000000..b01f118923 --- /dev/null +++ b/cpp/include/raft/distance/detail/distance_ops/hellinger.cuh @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#include + +namespace raft::distance::detail::ops { + +// Describes the computation the hellinger distance +// +// Fill in the TODO items. + +struct hellinger_distance_op { + // Load norms of input data + static constexpr bool use_norms = false; + + // Size of shared memory. This is normally decided by the kernel policy, but + // some ops such as correlation_distance_op use more. + template + constexpr size_t shared_mem_size() + { + return Policy::SmemSize; + } + + template + DI void core(AccT& acc, DataT& x, DataT& y) const + { + // This is sqrt(x) * sqrt(y). 
+ const auto product = x * y; + acc += product; + }; + + template + DI void epilog(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh], + DataT* regxn, + DataT* regyn, + IdxT gridStrideX, + IdxT gridStrideY) const + { +#pragma unroll + for (int i = 0; i < Policy::AccRowsPerTh; ++i) { +#pragma unroll + for (int j = 0; j < Policy::AccColsPerTh; ++j) { + // Adjust to replace NaN in sqrt with 0 if input to sqrt is negative + const auto finalVal = (1 - acc[i][j]); + const auto rectifier = (!signbit(finalVal)); + acc[i][j] = raft::sqrt(rectifier * finalVal); + } + } + } +}; + +} // namespace raft::distance::detail::ops diff --git a/cpp/include/raft/distance/detail/distance_ops/template.cuh b/cpp/include/raft/distance/detail/distance_ops/template.cuh index 98c35c6295..c770a575a0 100644 --- a/cpp/include/raft/distance/detail/distance_ops/template.cuh +++ b/cpp/include/raft/distance/detail/distance_ops/template.cuh @@ -23,6 +23,10 @@ namespace raft::distance::detail::ops { // Fill in the TODO items. struct template_distance_op { + TODO member; + + template_distance_op(TODO member_) noexcept : member(member_) { } + // Load norms of input data static constexpr bool use_norms = TODO; diff --git a/cpp/include/raft/distance/detail/hellinger.cuh b/cpp/include/raft/distance/detail/hellinger.cuh index 13507fe84f..306977f266 100644 --- a/cpp/include/raft/distance/detail/hellinger.cuh +++ b/cpp/include/raft/distance/detail/hellinger.cuh @@ -15,173 +15,16 @@ */ #pragma once -#include +#include #include -#include + +#include "pairwise_matrix/dispatch.cuh" +#include "distance_ops/hellinger.cuh" namespace raft { namespace distance { namespace detail { -/** - * @brief the Hellinger distance matrix using the expanded form: - * It computes the following equation: - cij = sqrt(1 - sum(sqrt(x_k * y_k))) - * This distance computation modifies A and B by computing a sqrt - * and then performing a `pow(x, 2)` to convert it back. 
Because of this, - * it is possible that the values in A and B might differ slightly - * after this is invoked. - * - * @tparam DataT input data-type (for A and B matrices) - * @tparam AccT accumulation data-type - * @tparam OutT output data-type (for C and D matrices) - * @tparam IdxT index data-type - * @tparam Veclen number of k-elements loaded by each thread - for every LDG call. details in contractions.cuh - * @tparam FinalLambda final lambda called on final distance value - * @tparam isRowMajor true if input/output is row major, - false for column major - * @param[in] x input matrix - * @param[in] y input matrix - * @param[in] m number of rows of A and C/D - * @param[in] n number of rows of B and C/D - * @param[in] k number of cols of A and B - * @param[in] lda leading dimension of A - * @param[in] ldb leading dimension of B - * @param[in] ldd leading dimension of C/D - * @param[output] dOutput output matrix - * @param[in] fin_op the final gemm epilogue lambda - * @param[in] stream cuda stream to launch work - */ -template -static void hellingerImpl(const DataT* x, - const DataT* y, - IdxT m, - IdxT n, - IdxT k, - IdxT lda, - IdxT ldb, - IdxT ldd, - OutT* dOutput, - FinalLambda fin_op, - cudaStream_t stream) -{ - typedef typename raft::linalg::Policy4x4::Policy RowPolicy; - typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; - - typedef typename std::conditional::type KPolicy; - - dim3 blk(KPolicy::Nthreads); - - // First sqrt x and y - raft::linalg::unaryOp((DataT*)x, x, m * k, raft::sqrt_op{}, stream); - if (x != y) { - raft::linalg::unaryOp((DataT*)y, y, n * k, raft::sqrt_op{}, stream); - } - - // Accumulation operation lambda - auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { - // This is sqrt(x) * sqrt(y). 
- const auto product = x * y; - acc += product; - }; - - // epilogue operation lambda for final value calculation - auto epilog_lambda = [] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], - DataT * regxn, - DataT * regyn, - IdxT gridStrideX, - IdxT gridStrideY) { -#pragma unroll - for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { -#pragma unroll - for (int j = 0; j < KPolicy::AccColsPerTh; ++j) { - // Adjust to replace NaN in sqrt with 0 if input to sqrt is negative - const auto finalVal = (1 - acc[i][j]); - const auto rectifier = (!signbit(finalVal)); - acc[i][j] = raft::sqrt(rectifier * finalVal); - } - } - }; - - if (isRowMajor) { - auto hellingerRowMajor = pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, hellingerRowMajor); - - hellingerRowMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); - } else { - auto hellingerColMajor = pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, hellingerColMajor); - hellingerColMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); - } - - // Revert sqrt of x and y - raft::linalg::unaryOp((DataT*)x, x, m * k, raft::sqrt_op{}, stream); - if (x != y) { - raft::linalg::unaryOp((DataT*)y, y, n * k, raft::sqrt_op{}, stream); - } - - RAFT_CUDA_TRY(cudaGetLastError()); -} - -template -void hellinger(IdxT m, - IdxT n, - IdxT k, - IdxT lda, - IdxT ldb, - IdxT ldd, - const DataT* x, - const DataT* y, - OutT* dOutput, - FinalLambda fin_op, - cudaStream_t stream) -{ - size_t bytesA = sizeof(DataT) * lda; - size_t bytesB = sizeof(DataT) * ldb; - if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { - hellingerImpl( - x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); - } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { - hellingerImpl( - x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); - } 
else { - hellingerImpl( - x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); - } -} - /** * @brief the Hellinger distance matrix calculation * It computes the following equation: @@ -206,35 +49,45 @@ void hellinger(IdxT m, * @param stream cuda stream where to launch work * @param isRowMajor whether the input and output matrices are row major */ -template +template void hellingerImpl(int m, int n, int k, - const InType* pA, - const InType* pB, - OutType* pD, - FinalLambda fin_op, + const DataT* x, + const DataT* y, + OutT* out, + FinOpT fin_op, cudaStream_t stream, - bool isRowMajor) + bool is_row_major) { - typedef std::is_same is_bool; - typedef typename std::conditional::type hellingerOutType; - Index_ lda, ldb, ldd; - hellingerOutType* pDcast = reinterpret_cast(pD); - if (isRowMajor) { - lda = k, ldb = k, ldd = n; - hellinger( - m, n, k, lda, ldb, ldd, pA, pB, pDcast, fin_op, stream); + // First sqrt x and y + const auto raft_sqrt = raft::linalg::unaryOp; + + raft_sqrt((DataT*)x, x, m * k, raft::sqrt_op{}, stream); + if (x != y) { + raft_sqrt((DataT*)y, y, n * k, raft::sqrt_op{}, stream); + } - } else { - lda = n, ldb = m, ldd = m; - hellinger( - n, m, k, lda, ldb, ldd, pB, pA, pDcast, fin_op, stream); + // Then calculate Hellinger distance + ops::hellinger_distance_op distance_op{}; + + const DataT* x_norm = nullptr; + const DataT* y_norm = nullptr; + + distance_matrix_dispatch( + distance_op, m, n, k, x, y, x_norm, y_norm, out, fin_op, stream, is_row_major); + + // Finally revert sqrt of x and y + raft_sqrt((DataT*)x, x, m * k, raft::sqrt_op{}, stream); + if (x != y) { + raft_sqrt((DataT*)y, y, n * k, raft::sqrt_op{}, stream); } + + RAFT_CUDA_TRY(cudaGetLastError()); } } // namespace detail } // namespace distance From 4dc72ce00b25e077e0654ae4f05f8b8c3b3e7789 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Fri, 13 Jan 2023 20:14:28 +0100 Subject: [PATCH 22/60] Jensen-Shannon: use pairwise matrix dispatch --- 
cpp/include/raft/distance/detail/README.org | 2 +- .../detail/distance_ops/jensen_shannon.cuh | 65 ++++++ .../raft/distance/detail/jensen_shannon.cuh | 188 ++---------------- 3 files changed, 85 insertions(+), 170 deletions(-) create mode 100644 cpp/include/raft/distance/detail/distance_ops/jensen_shannon.cuh diff --git a/cpp/include/raft/distance/detail/README.org b/cpp/include/raft/distance/detail/README.org index 47239d3f69..4f18391fce 100644 --- a/cpp/include/raft/distance/detail/README.org +++ b/cpp/include/raft/distance/detail/README.org @@ -7,7 +7,7 @@ - [ ] cosine.cuh - [X] hamming.cuh - [X] hellinger.cuh -- [ ] jensen_shannon.cuh +- [X] jensen_shannon.cuh - [ ] kl_divergence.cuh - [ ] minkowski.cuh - [ ] russell_rao.cuh diff --git a/cpp/include/raft/distance/detail/distance_ops/jensen_shannon.cuh b/cpp/include/raft/distance/detail/distance_ops/jensen_shannon.cuh new file mode 100644 index 0000000000..116af61964 --- /dev/null +++ b/cpp/include/raft/distance/detail/distance_ops/jensen_shannon.cuh @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#include + +namespace raft::distance::detail::ops { + +// Describes the computation the jensen_shannon distance + +struct jensen_shannon_distance_op { + // Load norms of input data + static constexpr bool use_norms = false; + + // Size of shared memory. 
This is normally decided by the kernel policy, but + // some ops such as correlation_distance_op use more. + template + constexpr size_t shared_mem_size() + { + return Policy::SmemSize; + } + + template + DI void core(AccT& acc, DataT& x, DataT& y) const + { + const DataT m = 0.5f * (x + y); + const bool m_zero = (m == 0); + const auto logM = (!m_zero) * raft::log(m + m_zero); + + const bool x_zero = (x == 0); + const bool y_zero = (y == 0); + acc += (-x * (logM - raft::log(x + x_zero))) + (-y * (logM - raft::log(y + y_zero))); + }; + + template + DI void epilog(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh], + DataT* regxn, + DataT* regyn, + IdxT gridStrideX, + IdxT gridStrideY) const + { +#pragma unroll + for (int i = 0; i < Policy::AccRowsPerTh; ++i) { +#pragma unroll + for (int j = 0; j < Policy::AccColsPerTh; ++j) { + acc[i][j] = raft::sqrt(0.5 * acc[i][j]); + } + } + } +}; + +} // namespace raft::distance::detail::ops diff --git a/cpp/include/raft/distance/detail/jensen_shannon.cuh b/cpp/include/raft/distance/detail/jensen_shannon.cuh index f96da01b87..71339e0c1a 100644 --- a/cpp/include/raft/distance/detail/jensen_shannon.cuh +++ b/cpp/include/raft/distance/detail/jensen_shannon.cuh @@ -15,157 +15,14 @@ */ #pragma once -#include +#include "distance_ops/jensen_shannon.cuh" +#include "pairwise_matrix/dispatch.cuh" + namespace raft { namespace distance { namespace detail { -/** - * @brief the Jensen Shannon distance matrix: - * It computes the following equation: - Cij = sqrt(0.5 * sum( -x_i * (log(0.5 * (x_i + y_i)) - log(x_i)) - + (-y_i * (log(0.5 * (x_i + y_i)) - log(y_i))))) - * - * @tparam DataT input data-type (for A and B matrices) - * @tparam AccT accumulation data-type - * @tparam OutT output data-type (for C and D matrices) - * @tparam IdxT index data-type - * @tparam Veclen number of k-elements loaded by each thread - for every LDG call. 
details in contractions.cuh - * @tparam FinalLambda final lambda called on final distance value - * @tparam isRowMajor true if input/output is row major, - false for column major - * @param[in] x input matrix - * @param[in] y input matrix - * @param[in] m number of rows of A and C/D - * @param[in] n number of rows of B and C/D - * @param[in] k number of cols of A and B - * @param[in] lda leading dimension of A - * @param[in] ldb leading dimension of B - * @param[in] ldd leading dimension of C/D - * @param[output] dOutput output matrix - * @param[in] fin_op the final gemm epilogue lambda - * @param[in] stream cuda stream to launch work - */ -template -static void jensenShannonImpl(const DataT* x, - const DataT* y, - IdxT m, - IdxT n, - IdxT k, - IdxT lda, - IdxT ldb, - IdxT ldd, - OutT* dOutput, - FinalLambda fin_op, - cudaStream_t stream) -{ - typedef typename raft::linalg::Policy4x4::Policy RowPolicy; - typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; - - typedef typename std::conditional::type KPolicy; - - dim3 blk(KPolicy::Nthreads); - - // Accumulation operation lambda - auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { - const DataT m = 0.5f * (x + y); - const bool m_zero = (m == 0); - const auto logM = (!m_zero) * raft::log(m + m_zero); - - const bool x_zero = (x == 0); - const bool y_zero = (y == 0); - acc += (-x * (logM - raft::log(x + x_zero))) + (-y * (logM - raft::log(y + y_zero))); - }; - - // epilogue operation lambda for final value calculation - auto epilog_lambda = [] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], - DataT * regxn, - DataT * regyn, - IdxT gridStrideX, - IdxT gridStrideY) { -#pragma unroll - for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { -#pragma unroll - for (int j = 0; j < KPolicy::AccColsPerTh; ++j) { - acc[i][j] = raft::sqrt(0.5 * acc[i][j]); - } - } - }; - - if (isRowMajor) { - auto jensenShannonRowMajor = pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, 
KPolicy::SmemSize, jensenShannonRowMajor); - - jensenShannonRowMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); - } else { - auto jensenShannonColMajor = pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, jensenShannonColMajor); - jensenShannonColMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); - } - - RAFT_CUDA_TRY(cudaGetLastError()); -} - -template -void jensenShannon(IdxT m, - IdxT n, - IdxT k, - IdxT lda, - IdxT ldb, - IdxT ldd, - const DataT* x, - const DataT* y, - OutT* dOutput, - FinalLambda fin_op, - cudaStream_t stream) -{ - size_t bytesA = sizeof(DataT) * lda; - size_t bytesB = sizeof(DataT) * ldb; - if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { - jensenShannonImpl( - x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); - } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { - jensenShannonImpl( - x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); - } else { - jensenShannonImpl( - x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); - } -} - /** * @brief the Jensen Shannon distance matrix calculation * It computes the following equation: @@ -187,35 +44,28 @@ void jensenShannon(IdxT m, * @param stream cuda stream where to launch work * @param isRowMajor whether the input and output matrices are row major */ -template +template void jensenShannonImpl(int m, int n, int k, - const InType* pA, - const InType* pB, - OutType* pD, - FinalLambda fin_op, + const DataT* x, + const DataT* y, + OutT* out, + FinOpT fin_op, cudaStream_t stream, - bool isRowMajor) + bool is_row_major) { - typedef std::is_same is_bool; - typedef typename std::conditional::type jensenShannonOutType; - Index_ lda, ldb, ldd; - jensenShannonOutType* pDcast = reinterpret_cast(pD); - if (isRowMajor) { - lda = k, ldb = k, ldd = n; - jensenShannon( - m, n, k, lda, ldb, ldd, pA, pB, pDcast, 
fin_op, stream); + ops::jensen_shannon_distance_op distance_op{}; + + const DataT* x_norm = nullptr; + const DataT* y_norm = nullptr; - } else { - lda = n, ldb = m, ldd = m; - jensenShannon( - n, m, k, lda, ldb, ldd, pB, pA, pDcast, fin_op, stream); - } + distance_matrix_dispatch( + distance_op, m, n, k, x, y, x_norm, y_norm, out, fin_op, stream, is_row_major); } } // namespace detail } // namespace distance From b0d36c1cc4d2f34bdc8ff3a6281ad0b7145b2b32 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Fri, 13 Jan 2023 20:53:29 +0100 Subject: [PATCH 23/60] remove old hamming code --- cpp/include/raft/distance/detail/hamming.cuh | 137 ------------------- 1 file changed, 137 deletions(-) diff --git a/cpp/include/raft/distance/detail/hamming.cuh b/cpp/include/raft/distance/detail/hamming.cuh index 7d283def21..9935c96a40 100644 --- a/cpp/include/raft/distance/detail/hamming.cuh +++ b/cpp/include/raft/distance/detail/hamming.cuh @@ -22,143 +22,6 @@ namespace raft { namespace distance { namespace detail { -/** - * @brief the Hamming distance matrix using the unexpanded form: - * It computes the following equation: - Cij = sum(x_i != y_i) / k - * - * @tparam DataT input data-type (for A and B matrices) - * @tparam AccT accumulation data-type - * @tparam OutT output data-type (for C and D matrices) - * @tparam IdxT index data-type - * @tparam Veclen number of k-elements loaded by each thread - for every LDG call. 
details in contractions.cuh - * @tparam FinalLambda final lambda called on final distance value - * @tparam isRowMajor true if input/output is row major, - false for column major - * @param[in] x input matrix - * @param[in] y input matrix - * @param[in] m number of rows of A and C/D - * @param[in] n number of rows of B and C/D - * @param[in] k number of cols of A and B - * @param[in] lda leading dimension of A - * @param[in] ldb leading dimension of B - * @param[in] ldd leading dimension of C/D - * @param[output] dOutput output matrix - * @param[in] fin_op the final gemm epilogue lambda - * @param[in] stream cuda stream to launch work - */ -template -static void hammingUnexpandedImpl(const DataT* x, - const DataT* y, - IdxT m, - IdxT n, - IdxT k, - IdxT lda, - IdxT ldb, - IdxT ldd, - OutT* dOutput, - FinalLambda fin_op, - cudaStream_t stream) -{ - typedef typename raft::linalg::Policy4x4::Policy RowPolicy; - typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; - - typedef typename std::conditional::type KPolicy; - - dim3 blk(KPolicy::Nthreads); - - // Accumulation operation lambda - auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { acc += (x != y); }; - - // epilogue operation lambda for final value calculation - auto epilog_lambda = [k] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], - DataT * regxn, - DataT * regyn, - IdxT gridStrideX, - IdxT gridStrideY) { - const DataT one_over_k = DataT(1.0) / k; -#pragma unroll - for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { -#pragma unroll - for (int j = 0; j < KPolicy::AccColsPerTh; ++j) { - acc[i][j] *= one_over_k; - } - } - }; - - if (isRowMajor) { - auto hammingUnexpandedRowMajor = pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, hammingUnexpandedRowMajor); - - hammingUnexpandedRowMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); - } else { - auto hammingUnexpandedColMajor = 
pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, hammingUnexpandedColMajor); - hammingUnexpandedColMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); - } - - RAFT_CUDA_TRY(cudaGetLastError()); -} - -template -void hammingUnexpanded(IdxT m, - IdxT n, - IdxT k, - IdxT lda, - IdxT ldb, - IdxT ldd, - const DataT* x, - const DataT* y, - OutT* dOutput, - FinalLambda fin_op, - cudaStream_t stream) -{ - size_t bytesA = sizeof(DataT) * lda; - size_t bytesB = sizeof(DataT) * ldb; - if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { - hammingUnexpandedImpl( - x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); - } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { - hammingUnexpandedImpl( - x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); - } else { - hammingUnexpandedImpl( - x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); - } -} - /** * @brief the Hamming Unexpanded distance matrix calculation * It computes the following equation: From e95a65bbee829ede8cfc3645f41bc512d38afdab Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Fri, 13 Jan 2023 20:55:35 +0100 Subject: [PATCH 24/60] KL divergence: use pairwise matrix dispatch --- cpp/include/raft/distance/detail/README.org | 6 +- .../detail/distance_ops/kl_divergence.cuh | 88 +++++ .../raft/distance/detail/kl_divergence.cuh | 329 +++--------------- 3 files changed, 135 insertions(+), 288 deletions(-) create mode 100644 cpp/include/raft/distance/detail/distance_ops/kl_divergence.cuh diff --git a/cpp/include/raft/distance/detail/README.org b/cpp/include/raft/distance/detail/README.org index 4f18391fce..f5d3b6b0a6 100644 --- a/cpp/include/raft/distance/detail/README.org +++ b/cpp/include/raft/distance/detail/README.org @@ -8,6 +8,10 @@ - [X] hamming.cuh - [X] hellinger.cuh - [X] jensen_shannon.cuh -- [ ] kl_divergence.cuh +- [X] kl_divergence.cuh + - *Notes*: the isRowMajor 
and x_equal_y boolean parameters where previously + template / constexpr parameters. Now they are passed by value. This greatly + reduces the number of kernels, but may have negative consequences for run + time. - [ ] minkowski.cuh - [ ] russell_rao.cuh diff --git a/cpp/include/raft/distance/detail/distance_ops/kl_divergence.cuh b/cpp/include/raft/distance/detail/distance_ops/kl_divergence.cuh new file mode 100644 index 0000000000..a1f438e0d4 --- /dev/null +++ b/cpp/include/raft/distance/detail/distance_ops/kl_divergence.cuh @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#include + +namespace raft::distance::detail::ops { + +// Describes the computation of the kl_divergence +struct kl_divergence_op { + const bool is_row_major; + const bool x_equal_y; + + kl_divergence_op( + bool row_major_, + bool x_equal_y_=false + ) noexcept + : is_row_major(row_major_), + x_equal_y(x_equal_y_) + { } + + // Load norms of input data + static constexpr bool use_norms = false; + + // Size of shared memory. This is normally decided by the kernel policy, but + // some ops such as correlation_distance_op use more. + template + constexpr size_t shared_mem_size() + { + return Policy::SmemSize; + } + + template + DI void core(AccT& acc, DataT& x, DataT& y) const + { + // TODO: make sure that these branches get hoisted out of main loop.. Could + // be quite expensive otherwise. 
+ if (x_equal_y) { + if (is_row_major) { + const bool x_zero = (x == 0); + const bool y_zero = (y == 0); + acc += x * (raft::log(x + x_zero) - (!y_zero) * raft::log(y + y_zero)); + } else { + const bool y_zero = (y == 0); + const bool x_zero = (x == 0); + acc += y * (raft::log(y + y_zero) - (!x_zero) * raft::log(x + x_zero)); + } + } else { + if (is_row_major) { + const bool x_zero = (x == 0); + acc += x * (raft::log(x + x_zero) - y); + } else { + const bool y_zero = (y == 0); + acc += y * (raft::log(y + y_zero) - x); + } + } + }; + + template + DI void epilog(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh], + DataT* regxn, + DataT* regyn, + IdxT gridStrideX, + IdxT gridStrideY) const + { +#pragma unroll + for (int i = 0; i < Policy::AccRowsPerTh; ++i) { +#pragma unroll + for (int j = 0; j < Policy::AccColsPerTh; ++j) { + acc[i][j] = (0.5f * acc[i][j]); + } + } + } +}; +} // namespace raft::distance::detail::ops diff --git a/cpp/include/raft/distance/detail/kl_divergence.cuh b/cpp/include/raft/distance/detail/kl_divergence.cuh index 7ebeaf4de9..e76cd5a3b9 100644 --- a/cpp/include/raft/distance/detail/kl_divergence.cuh +++ b/cpp/include/raft/distance/detail/kl_divergence.cuh @@ -15,276 +15,16 @@ */ #pragma once -#include +#include +#include + +#include "distance_ops/kl_divergence.cuh" +#include "pairwise_matrix/dispatch.cuh" namespace raft { namespace distance { namespace detail { -/** - * @brief the KL Divergence distance matrix: - * It computes the following equation: - Cij = 0.5 * sum(x * log (x / y)); - * This distance computation modifies A or B by computing a log(x) - * and then performing a `pow(e, log(x))` to convert it back. Because of this, - * it is possible that the values in A or B might differ slightly - * after this is invoked. 
- * - * @tparam DataT input data-type (for A and B matrices) - * @tparam AccT accumulation data-type - * @tparam OutT output data-type (for C and D matrices) - * @tparam IdxT index data-type - * @tparam Veclen number of k-elements loaded by each thread - for every LDG call. details in contractions.cuh - * @tparam FinalLambda final lambda called on final distance value - * @tparam isRowMajor true if input/output is row major, - false for column major - * @param[in] x input matrix - * @param[in] y input matrix - * @param[in] m number of rows of A and C/D - * @param[in] n number of rows of B and C/D - * @param[in] k number of cols of A and B - * @param[in] lda leading dimension of A - * @param[in] ldb leading dimension of B - * @param[in] ldd leading dimension of C/D - * @param[output] dOutput output matrix - * @param[in] fin_op the final gemm epilogue lambda - * @param[in] stream cuda stream to launch work - */ -template -static void klDivergenceImpl(const DataT* x, - const DataT* y, - IdxT m, - IdxT n, - IdxT k, - IdxT lda, - IdxT ldb, - IdxT ldd, - OutT* dOutput, - FinalLambda fin_op, - cudaStream_t stream) -{ - typedef typename raft::linalg::Policy4x4::Policy RowPolicy; - typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; - - typedef typename std::conditional::type KPolicy; - - dim3 blk(KPolicy::Nthreads); - - // Accumulation operation lambda - auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { - if (isRowMajor) { - const bool x_zero = (x == 0); - acc += x * (raft::log(x + x_zero) - y); - } else { - const bool y_zero = (y == 0); - acc += y * (raft::log(y + y_zero) - x); - } - }; - - auto core_lambda_x_equal_y = [] __device__(AccT & acc, DataT & x, DataT & y) { - if (isRowMajor) { - const bool x_zero = (x == 0); - const bool y_zero = (y == 0); - acc += x * (raft::log(x + x_zero) - (!y_zero) * raft::log(y + y_zero)); - } else { - const bool y_zero = (y == 0); - const bool x_zero = (x == 0); - acc += y * (raft::log(y + y_zero) - 
(!x_zero) * raft::log(x + x_zero)); - } - }; - - auto unaryOp_lambda = [] __device__(DataT input) { - const bool x_zero = (input == 0); - return (!x_zero) * raft::log(input + x_zero); - }; - - auto unaryOp_lambda_reverse = [] __device__(DataT input) { - // reverse previous log (x) back to x using (e ^ log(x)) - const bool x_zero = (input == 0); - return (!x_zero) * raft::exp(input); - }; - - // epilogue operation lambda for final value calculation - auto epilog_lambda = [] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], - DataT * regxn, - DataT * regyn, - IdxT gridStrideX, - IdxT gridStrideY) { -#pragma unroll - for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { -#pragma unroll - for (int j = 0; j < KPolicy::AccColsPerTh; ++j) { - acc[i][j] = (0.5f * acc[i][j]); - } - } - }; - - if (isRowMajor) { - constexpr auto klDivergenceRowMajor = pairwiseDistanceMatKernel; - constexpr auto klDivergenceRowMajorXequalY = - pairwiseDistanceMatKernel; - if (x != y) { - raft::linalg::unaryOp( - (DataT*)y, y, n * k, unaryOp_lambda, stream); - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, klDivergenceRowMajor); - klDivergenceRowMajor<<>>(x, - y, - nullptr, - nullptr, - m, - n, - k, - lda, - ldb, - ldd, - dOutput, - core_lambda, - epilog_lambda, - fin_op); - // Now reverse previous log (x) back to x using (e ^ log(x)) - raft::linalg::unaryOp( - (DataT*)y, y, n * k, unaryOp_lambda_reverse, stream); - } else { - dim3 grid = - launchConfigGenerator(m, n, KPolicy::SmemSize, klDivergenceRowMajorXequalY); - klDivergenceRowMajorXequalY<<>>(x, - y, - nullptr, - nullptr, - m, - n, - k, - lda, - ldb, - ldd, - dOutput, - core_lambda_x_equal_y, - epilog_lambda, - fin_op); - } - } else { - constexpr auto klDivergenceColMajor = pairwiseDistanceMatKernel; - constexpr auto klDivergenceColMajorXequalY = - pairwiseDistanceMatKernel; - if (x != y) { - raft::linalg::unaryOp( - (DataT*)x, x, m * k, unaryOp_lambda, stream); - dim3 grid = launchConfigGenerator(m, n, 
KPolicy::SmemSize, klDivergenceColMajor); - klDivergenceColMajor<<>>(x, - y, - nullptr, - nullptr, - m, - n, - k, - lda, - ldb, - ldd, - dOutput, - core_lambda, - epilog_lambda, - fin_op); - // Now reverse previous log (x) back to x using (e ^ log(x)) - raft::linalg::unaryOp( - (DataT*)x, x, m * k, unaryOp_lambda_reverse, stream); - } else { - dim3 grid = - launchConfigGenerator(m, n, KPolicy::SmemSize, klDivergenceColMajorXequalY); - klDivergenceColMajorXequalY<<>>(x, - y, - nullptr, - nullptr, - m, - n, - k, - lda, - ldb, - ldd, - dOutput, - core_lambda_x_equal_y, - epilog_lambda, - fin_op); - } - } - - RAFT_CUDA_TRY(cudaGetLastError()); -} - -template -void klDivergence(IdxT m, - IdxT n, - IdxT k, - IdxT lda, - IdxT ldb, - IdxT ldd, - const DataT* x, - const DataT* y, - OutT* dOutput, - FinalLambda fin_op, - cudaStream_t stream) -{ - size_t bytesA = sizeof(DataT) * lda; - size_t bytesB = sizeof(DataT) * ldb; - if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { - klDivergenceImpl( - x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); - } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { - klDivergenceImpl( - x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); - } else { - klDivergenceImpl( - x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); - } -} - /** * @brief the KL Divergence distance matrix calculation * It computes the following equation: @@ -308,34 +48,49 @@ void klDivergence(IdxT m, * @param stream cuda stream where to launch work * @param isRowMajor whether the input and output matrices are row major */ -template +template void klDivergenceImpl(int m, int n, int k, - const InType* pA, - const InType* pB, - OutType* pD, - FinalLambda fin_op, + const DataT* x, + const DataT* y, + OutT* out, + FinOpT fin_op, cudaStream_t stream, - bool isRowMajor) + bool is_row_major) { - typedef std::is_same is_bool; - typedef typename std::conditional::type klDivergenceOutType; - Index_ lda, ldb, ldd; - 
klDivergenceOutType* pDcast = reinterpret_cast(pD); - if (isRowMajor) { - lda = k, ldb = k, ldd = n; - klDivergence( - m, n, k, lda, ldb, ldd, pA, pB, pDcast, fin_op, stream); + auto unaryOp_lambda = [] __device__(DataT input) { + const bool x_zero = (input == 0); + return (!x_zero) * raft::myLog(input + x_zero); }; + + auto unaryOp_lambda_reverse = [] __device__(DataT input) { + // reverse previous log (x) back to x using (e ^ log(x)) + const bool x_zero = (input == 0); + return (!x_zero) * raft::myExp(input); }; + + // This op takes some shortcuts when x equals y. So it behavior changes based + // on this. + ops::kl_divergence_op kl_divergence{is_row_major, x == y}; + + if (x != y) { + raft::linalg::unaryOp( + (DataT*)y, y, n * k, unaryOp_lambda, stream); + } + + const DataT* x_norm = nullptr; + const DataT* y_norm = nullptr; + + distance_matrix_dispatch( + kl_divergence, m, n, k, x, y, x_norm, y_norm, out, fin_op, stream, is_row_major); - } else { - lda = n, ldb = m, ldd = m; - klDivergence( - n, m, k, lda, ldb, ldd, pB, pA, pDcast, fin_op, stream); + if (x != y) { + // Now reverse previous log (x) back to x using (e ^ log(x)) + raft::linalg::unaryOp( + (DataT*)y, y, n * k, unaryOp_lambda_reverse, stream); } } } // namespace detail From f1c105bd070483d032edbdc8e0b356879f9a89b4 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Fri, 13 Jan 2023 21:09:39 +0100 Subject: [PATCH 25/60] Minkowski: use pairwise matrix dispatch --- cpp/include/raft/distance/detail/README.org | 6 +- .../detail/distance_ops/minkowski.cuh | 66 ++++++ .../raft/distance/detail/minkowski.cuh | 192 ++---------------- 3 files changed, 92 insertions(+), 172 deletions(-) create mode 100644 cpp/include/raft/distance/detail/distance_ops/minkowski.cuh diff --git a/cpp/include/raft/distance/detail/README.org b/cpp/include/raft/distance/detail/README.org index f5d3b6b0a6..a82cc9a0e3 100644 --- a/cpp/include/raft/distance/detail/README.org +++ b/cpp/include/raft/distance/detail/README.org @@ -1,6 
+1,8 @@ #+title: Readme - +- [X] Euclidean + - *Notes*: isRowMajor is now a runtime parameter. Was it a compile time + parameter before? - [X] canberra.cuh - [X] chebyshev.cuh - [X] correlation.cuh @@ -13,5 +15,5 @@ template / constexpr parameters. Now they are passed by value. This greatly reduces the number of kernels, but may have negative consequences for run time. -- [ ] minkowski.cuh +- [X] minkowski.cuh - [ ] russell_rao.cuh diff --git a/cpp/include/raft/distance/detail/distance_ops/minkowski.cuh b/cpp/include/raft/distance/detail/distance_ops/minkowski.cuh new file mode 100644 index 0000000000..11be4e6ae0 --- /dev/null +++ b/cpp/include/raft/distance/detail/distance_ops/minkowski.cuh @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#include + +namespace raft::distance::detail::ops { + +// Describes the computation the minkowski distance + +template +struct minkowski_distance_op { + DataT_struct p; + + minkowski_distance_op(DataT_struct p_) noexcept : p(p_) { } + + // Load norms of input data + static constexpr bool use_norms = false; + + // Size of shared memory. This is normally decided by the kernel policy, but + // some ops such as correlation_distance_op use more. 
+ template + constexpr size_t shared_mem_size() + { + return Policy::SmemSize; + } + + template + DI void core(AccT& acc, DataT& x, DataT& y) const + { + const auto diff = raft::abs(x - y); + acc += raft::pow(diff, p); + }; + + template + DI void epilog(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh], + DataT* regxn, + DataT* regyn, + IdxT gridStrideX, + IdxT gridStrideY) const + { + const auto one_over_p = 1.0f / p; +#pragma unroll + for (int i = 0; i < Policy::AccRowsPerTh; ++i) { +#pragma unroll + for (int j = 0; j < Policy::AccColsPerTh; ++j) { + acc[i][j] = raft::pow(acc[i][j], one_over_p); + } + } + } +}; + +} // namespace raft::distance::detail::ops diff --git a/cpp/include/raft/distance/detail/minkowski.cuh b/cpp/include/raft/distance/detail/minkowski.cuh index 42af8cd281..778ceb45cf 100644 --- a/cpp/include/raft/distance/detail/minkowski.cuh +++ b/cpp/include/raft/distance/detail/minkowski.cuh @@ -15,154 +15,13 @@ */ #pragma once -#include +#include "pairwise_matrix/dispatch.cuh" +#include "distance_ops/minkowski.cuh" namespace raft { namespace distance { namespace detail { -/** - * @brief the unexpanded Minkowski distance matrix calculation - * It computes the following equation: cij = sum(|x - y|^p)^(1/p) - * @tparam DataT input data-type (for A and B matrices) - * @tparam AccT accumulation data-type - * @tparam OutT output data-type (for C and D matrices) - * @tparam IdxT index data-type - * @tparam Veclen number of k-elements loaded by each thread - for every LDG call. 
details in contractions.cuh - * @tparam FinalLambda final lambda called on final distance value - * - * @param[in] x input matrix - * @param[in] y input matrix - * @param[in] m number of rows of A and C/D - * @param[in] n number of rows of B and cols of C/D - * @param[in] k number of cols of A and B - * @param[in] lda leading dimension of A - * @param[in] ldb leading dimension of B - * @param[in] ldd leading dimension of C/D - * @param[output] pD output matrix - * @param[in] fin_op the final gemm epilogue lambda - * @param[in] stream cuda stream to launch work - * @param[in] the value of `p` for Minkowski (l-p) distances. - */ -template -void minkowskiUnExpImpl(const DataT* x, - const DataT* y, - IdxT m, - IdxT n, - IdxT k, - IdxT lda, - IdxT ldb, - IdxT ldd, - OutT* dOutput, - FinalLambda fin_op, - cudaStream_t stream, - DataT p) -{ - typedef typename raft::linalg::Policy4x4::Policy RowPolicy; - typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; - - typedef typename std::conditional::type KPolicy; - - dim3 blk(KPolicy::Nthreads); - - // Accumulation operation lambda - auto core_lambda = [p] __device__(AccT & acc, DataT & x, DataT & y) { - const auto diff = raft::abs(x - y); - acc += raft::pow(diff, p); - }; - - // epilogue operation lambda for final value calculation - auto epilog_lambda = [p] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], - DataT * regxn, - DataT * regyn, - IdxT gridStrideX, - IdxT gridStrideY) { - const auto one_over_p = 1.0f / p; -#pragma unroll - for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { -#pragma unroll - for (int j = 0; j < KPolicy::AccColsPerTh; ++j) { - acc[i][j] = raft::pow(acc[i][j], one_over_p); - } - } - }; - - if (isRowMajor) { - auto minkowskiUnExpRowMajor = pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, minkowskiUnExpRowMajor); - - minkowskiUnExpRowMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); 
- - } else { - auto minkowskiUnExpColMajor = pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, minkowskiUnExpColMajor); - - minkowskiUnExpColMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); - } - - RAFT_CUDA_TRY(cudaGetLastError()); -} - -template -void minkowskiUnExp(IdxT m, - IdxT n, - IdxT k, - IdxT lda, - IdxT ldb, - IdxT ldd, - const DataT* x, - const DataT* y, - OutT* dOutput, - FinalLambda fin_op, - cudaStream_t stream, - DataT metric_arg) -{ - size_t bytesA = sizeof(DataT) * lda; - size_t bytesB = sizeof(DataT) * ldb; - if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { - minkowskiUnExpImpl( - x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream, metric_arg); - } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { - minkowskiUnExpImpl( - x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream, metric_arg); - } else { - minkowskiUnExpImpl( - x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream, metric_arg); - } -} - /** * @brief the unexpanded minkowski distance matrix calculation * It computes the following equation: cij = sum(|x - y|^p)^(1/p) @@ -182,36 +41,29 @@ void minkowskiUnExp(IdxT m, * @param[in] isRowMajor whether the input and output matrices are row major * @param[in] metric_arg the value of `p` for Minkowski (l-p) distances. 
*/ -template -void minkowskiImpl(Index_ m, - Index_ n, - Index_ k, - const InType* pA, - const InType* pB, - OutType* pD, - FinalLambda fin_op, +template +void minkowskiImpl(IdxT m, + IdxT n, + IdxT k, + const DataT* x, + const DataT* y, + OutT* out, + FinOpT fin_op, cudaStream_t stream, - bool isRowMajor, - InType metric_arg) + bool is_row_major, + DataT metric_arg) { - typedef std::is_same is_bool; - typedef typename std::conditional::type LpUnexpOutType; - LpUnexpOutType* pDcast = reinterpret_cast(pD); - Index_ lda, ldb, ldd; + ops::minkowski_distance_op distance_op{metric_arg}; + + const DataT* x_norm = nullptr; + const DataT* y_norm = nullptr; - if (isRowMajor) { - lda = k, ldb = k, ldd = n; - minkowskiUnExp( - m, n, k, lda, ldb, ldd, pA, pB, pDcast, fin_op, stream, metric_arg); - } else { - lda = n, ldb = m, ldd = m; - minkowskiUnExp( - n, m, k, lda, ldb, ldd, pB, pA, pDcast, fin_op, stream, metric_arg); - } + distance_matrix_dispatch( + distance_op, m, n, k, x, y, x_norm, y_norm, out, fin_op, stream, is_row_major); } }; // end namespace detail }; // end namespace distance From ac66e3f9bc4109eef8be103c408ff637c5f708d5 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Fri, 13 Jan 2023 21:19:38 +0100 Subject: [PATCH 26/60] Russel-Rao: use pairwise matrix dispatch --- cpp/include/raft/distance/detail/README.org | 2 +- .../detail/distance_ops/russel_rao.cuh | 67 +++++++ .../raft/distance/detail/russell_rao.cuh | 180 ++---------------- 3 files changed, 86 insertions(+), 163 deletions(-) create mode 100644 cpp/include/raft/distance/detail/distance_ops/russel_rao.cuh diff --git a/cpp/include/raft/distance/detail/README.org b/cpp/include/raft/distance/detail/README.org index a82cc9a0e3..4c2005381c 100644 --- a/cpp/include/raft/distance/detail/README.org +++ b/cpp/include/raft/distance/detail/README.org @@ -16,4 +16,4 @@ reduces the number of kernels, but may have negative consequences for run time. 
- [X] minkowski.cuh -- [ ] russell_rao.cuh +- [X] russell_rao.cuh diff --git a/cpp/include/raft/distance/detail/distance_ops/russel_rao.cuh b/cpp/include/raft/distance/detail/distance_ops/russel_rao.cuh new file mode 100644 index 0000000000..d4d1044b6e --- /dev/null +++ b/cpp/include/raft/distance/detail/distance_ops/russel_rao.cuh @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +namespace raft::distance::detail::ops { + +// Describes the computation the russel_rao distance + +template +struct russel_rao_distance_op { + IdxT_struct k; + const float one_over_k; + + russel_rao_distance_op(IdxT_struct k_) noexcept + : k(k_), + one_over_k(1.0f / k_) + { } + + // Load norms of input data + static constexpr bool use_norms = false; + + // Size of shared memory. This is normally decided by the kernel policy, but + // some ops such as correlation_distance_op use more. 
+ template + constexpr size_t shared_mem_size() + { + return Policy::SmemSize; + } + + template + DI void core(AccT& acc, DataT& x, DataT& y) const + { + acc += x * y; + }; + + template + DI void epilog(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh], + DataT* regxn, + DataT* regyn, + IdxT gridStrideX, + IdxT gridStrideY) const + { +#pragma unroll + for (int i = 0; i < Policy::AccRowsPerTh; ++i) { +#pragma unroll + for (int j = 0; j < Policy::AccColsPerTh; ++j) { + acc[i][j] = (k - acc[i][j]) * one_over_k; + } + } + } +}; + +} // namespace raft::distance::detail::ops diff --git a/cpp/include/raft/distance/detail/russell_rao.cuh b/cpp/include/raft/distance/detail/russell_rao.cuh index 5d516e7830..5e8da08b1d 100644 --- a/cpp/include/raft/distance/detail/russell_rao.cuh +++ b/cpp/include/raft/distance/detail/russell_rao.cuh @@ -15,150 +15,13 @@ */ #pragma once -#include +#include "distance_ops/russel_rao.cuh" +#include "pairwise_matrix/dispatch.cuh" namespace raft { namespace distance { namespace detail { -/** - * @brief the Russell Rao distance matrix: - * It computes the following equation: - Cij = (k - sum(x_i * y_i)) / k - * - * @tparam DataT input data-type (for A and B matrices) - * @tparam AccT accumulation data-type - * @tparam OutT output data-type (for C and D matrices) - * @tparam IdxT index data-type - * @tparam Veclen number of k-elements loaded by each thread - for every LDG call. 
details in contractions.cuh - * @tparam FinalLambda final lambda called on final distance value - * @tparam isRowMajor true if input/output is row major, - false for column major - * @param[in] x input matrix - * @param[in] y input matrix - * @param[in] m number of rows of A and C/D - * @param[in] n number of rows of B and C/D - * @param[in] k number of cols of A and B - * @param[in] lda leading dimension of A - * @param[in] ldb leading dimension of B - * @param[in] ldd leading dimension of C/D - * @param[output] dOutput output matrix - * @param[in] fin_op the final gemm epilogue lambda - * @param[in] stream cuda stream to launch work - */ -template -static void russellRaoImpl(const DataT* x, - const DataT* y, - IdxT m, - IdxT n, - IdxT k, - IdxT lda, - IdxT ldb, - IdxT ldd, - OutT* dOutput, - FinalLambda fin_op, - cudaStream_t stream) -{ - typedef typename raft::linalg::Policy4x4::Policy RowPolicy; - typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; - - typedef typename std::conditional::type KPolicy; - - dim3 blk(KPolicy::Nthreads); - - // Accumulation operation lambda - auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { acc += x * y; }; - - const float one_over_k = 1.0 / k; - // epilogue operation lambda for final value calculation - auto epilog_lambda = [k, one_over_k] __device__( - AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], - DataT * regxn, - DataT * regyn, - IdxT gridStrideX, - IdxT gridStrideY) { -#pragma unroll - for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { -#pragma unroll - for (int j = 0; j < KPolicy::AccColsPerTh; ++j) { - acc[i][j] = (k - acc[i][j]) * one_over_k; - } - } - }; - - if (isRowMajor) { - constexpr auto russellRaoRowMajor = pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, russellRaoRowMajor); - - russellRaoRowMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); - } else { - constexpr auto 
russellRaoColMajor = pairwiseDistanceMatKernel; - dim3 grid = launchConfigGenerator(m, n, KPolicy::SmemSize, russellRaoColMajor); - russellRaoColMajor<<>>( - x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); - } - - RAFT_CUDA_TRY(cudaGetLastError()); -} - -template -void russellRao(IdxT m, - IdxT n, - IdxT k, - IdxT lda, - IdxT ldb, - IdxT ldd, - const DataT* x, - const DataT* y, - OutT* dOutput, - FinalLambda fin_op, - cudaStream_t stream) -{ - size_t bytesA = sizeof(DataT) * lda; - size_t bytesB = sizeof(DataT) * ldb; - if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { - russellRaoImpl( - x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); - } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { - russellRaoImpl( - x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); - } else { - russellRaoImpl( - x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); - } -} - /** * @brief the Russell Rao distance matrix calculation * It computes the following equation: @@ -179,35 +42,28 @@ void russellRao(IdxT m, * @param stream cuda stream where to launch work * @param isRowMajor whether the input and output matrices are row major */ -template +template void russellRaoImpl(int m, int n, int k, - const InType* pA, - const InType* pB, - OutType* pD, - FinalLambda fin_op, + const DataT* x, + const DataT* y, + OutT* out, + FinOpT fin_op, cudaStream_t stream, - bool isRowMajor) + bool is_row_major) { - typedef std::is_same is_bool; - typedef typename std::conditional::type russellRaoOutType; - Index_ lda, ldb, ldd; - russellRaoOutType* pDcast = reinterpret_cast(pD); - if (isRowMajor) { - lda = k, ldb = k, ldd = n; - russellRao( - m, n, k, lda, ldb, ldd, pA, pB, pDcast, fin_op, stream); + ops::russel_rao_distance_op distance_op{k}; + + const DataT* x_norm = nullptr; + const DataT* y_norm = nullptr; - } else { - lda = n, ldb = m, ldd = m; - russellRao( - n, m, k, lda, ldb, ldd, pB, pA, pDcast, 
fin_op, stream); - } + distance_matrix_dispatch( + distance_op, m, n, k, x, y, x_norm, y_norm, out, fin_op, stream, is_row_major); } } // namespace detail } // namespace distance From a89896a456d72592a8490a057d325b4d45fea930 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Fri, 13 Jan 2023 21:45:38 +0100 Subject: [PATCH 27/60] Cosine: use pairwise matrix dispatch --- cpp/include/raft/distance/detail/README.org | 12 +- cpp/include/raft/distance/detail/cosine.cuh | 261 ++++-------------- .../distance/detail/distance_ops/cosine.cuh | 70 +++++ .../raft/distance/detail/euclidean.cuh | 1 - 4 files changed, 135 insertions(+), 209 deletions(-) create mode 100644 cpp/include/raft/distance/detail/distance_ops/cosine.cuh diff --git a/cpp/include/raft/distance/detail/README.org b/cpp/include/raft/distance/detail/README.org index 4c2005381c..03d540cb84 100644 --- a/cpp/include/raft/distance/detail/README.org +++ b/cpp/include/raft/distance/detail/README.org @@ -1,12 +1,18 @@ #+title: Readme - [X] Euclidean - - *Notes*: isRowMajor is now a runtime parameter. Was it a compile time - parameter before? + - *Notes*: + - enable_sqrt is now a runtime parameter. Was it a compile time + parameter before? + - CUTLASS fails on CUDA 12 (but prior to refactoring CUDA 12 did not work + either). I have not yet tested if everything works correctly on CUDA 11. - [X] canberra.cuh - [X] chebyshev.cuh - [X] correlation.cuh -- [ ] cosine.cuh +- [X] cosine.cuh + - *Notes*: cutlass fails on CUDA 12 (but prior to refactoring CUDA 12 did not + work either). I have not yet tested if everything works correctly on + CUDA 11. 
- [X] hamming.cuh - [X] hellinger.cuh - [X] jensen_shannon.cuh diff --git a/cpp/include/raft/distance/detail/cosine.cuh b/cpp/include/raft/distance/detail/cosine.cuh index 46a694aa51..ea1dd64933 100644 --- a/cpp/include/raft/distance/detail/cosine.cuh +++ b/cpp/include/raft/distance/detail/cosine.cuh @@ -15,181 +15,15 @@ */ #pragma once - -#include -#include #include -#include + +#include "pairwise_matrix/dispatch.cuh" +#include "distance_ops/cosine.cuh" namespace raft { namespace distance { namespace detail { -template -struct CosineOp { - __device__ CosineOp() noexcept {} - __device__ AccT operator()(DataT& aNorm, const DataT& bNorm, DataT& accVal) const noexcept - { - return static_cast(1.0) - (AccT)(accVal / (aNorm * bNorm)); - } - __device__ AccT operator()(DataT aData) const noexcept { return aData; } -}; - -/** - * @brief the cosine distance matrix calculation implementer - * It computes the following equation: - * C = 1 - op(A * B / sqrt(A^2) * sqrt(B^2))) - * @tparam DataT input data-type (for A and B matrices) - * @tparam AccT accumulation data-type - * @tparam OutT output data-type (for C and D matrices) - * @tparam IdxT index data-type - * @tparam Veclen number of k-elements loaded by each thread for every LDG call - * it makes. check contractions.cuh for details. - * @tparam FinalLambda the final lambda called on final distance value - * @tparam isRowMajor true if input/output is row major, - false for column major - * @param[in] x input matrix - * @param[in] y input matrix - * @param[in] xn row norms of input matrix A. - * @param[in] yn row norms of input matrix B. 
- * @param[in] m number of rows of A and C/D - * @param[in] n number of columns of B and C/D - * @param[in] k number of cols of A and rows of B - * @param[in] lda leading dimension of A - * @param[in] ldb leading dimension of B - * @param[in] ldd leading dimension of C/D - * @param[output] pD output matrix - * @param fin_op the final gemm epilogue lambda -* @param stream cuda stream to launch cuda operations. - */ -template -void cosineImpl(const DataT* x, - const DataT* y, - const DataT* xn, - const DataT* yn, - IdxT m, - IdxT n, - IdxT k, - IdxT lda, - IdxT ldb, - IdxT ldd, - OutT* dOutput, - FinalLambda fin_op, - cudaStream_t stream) -{ -#if (__CUDACC_VER_MAJOR__ < 12) - const auto deviceVersion = getComputeCapability(); - if (deviceVersion.first >= 8) { - using CosineOp_ = CosineOp; - CosineOp_ cosine_dist_op; - - cutlassDistanceKernel( - x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, fin_op, cosine_dist_op, stream); - - } else -#endif - { - typedef typename raft::linalg::Policy4x4::Policy RowPolicy; - typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; - - typedef typename std::conditional::type KPolicy; - - dim3 blk(KPolicy::Nthreads); - - // Accumulation operation lambda - auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { acc += x * y; }; - - // epilogue operation lambda for final value calculation - auto epilog_lambda = [] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh], - DataT * regxn, - DataT * regyn, - IdxT gridStrideX, - IdxT gridStrideY) { -#pragma unroll - for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) { -#pragma unroll - for (int j = 0; j < KPolicy::AccColsPerTh; ++j) { - acc[i][j] = 1.0 - (acc[i][j] / (regxn[i] * regyn[j])); - } - } - }; - - constexpr size_t shmemSize = - KPolicy::SmemSize + ((KPolicy::Mblk + KPolicy::Nblk) * sizeof(DataT)); - if (isRowMajor) { - auto cosineRowMajor = pairwiseDistanceMatKernelPriorToAmpere; - dim3 grid = launchConfigGenerator(m, n, shmemSize, cosineRowMajor); - 
cosineRowMajor<<>>( - x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); - } else { - auto cosineColMajor = pairwiseDistanceMatKernelPriorToAmpere; - dim3 grid = launchConfigGenerator(m, n, shmemSize, cosineColMajor); - cosineColMajor<<>>( - x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op); - } - } - - RAFT_CUDA_TRY(cudaGetLastError()); -} - -template -void cosine(IdxT m, - IdxT n, - IdxT k, - IdxT lda, - IdxT ldb, - IdxT ldd, - const DataT* x, - const DataT* y, - const DataT* xn, - const DataT* yn, - OutT* dOutput, - FinalLambda fin_op, - cudaStream_t stream) -{ - size_t bytesA = sizeof(DataT) * lda; - size_t bytesB = sizeof(DataT) * ldb; - if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) { - cosineImpl( - x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); - } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) { - cosineImpl( - x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); - } else { - cosineImpl( - x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream); - } -} - /** * @brief the expanded cosine distance matrix calculation * It computes the following equation: @@ -213,57 +47,74 @@ void cosine(IdxT m, * @param stream cuda stream where to launch work * @param isRowMajor whether the input and output matrices are row major */ -template -void cosineAlgo1(Index_ m, - Index_ n, - Index_ k, - const InType* pA, - const InType* pB, - OutType* pD, - AccType* workspace, - size_t worksize, - FinalLambda fin_op, - cudaStream_t stream, - bool isRowMajor) +template +void cosineAlgo1(IdxT m, + IdxT n, + IdxT k, + const DataT* pA, + const DataT* pB, + OutT* pD, + AccT* workspace, + size_t worksize, + FinOpT fin_op, + cudaStream_t stream, + bool isRowMajor) { // raft distance support inputs as float/double and output as uint8_t/float/double. 
- static_assert(!((sizeof(OutType) > 1) && (sizeof(AccType) != sizeof(OutType))), - "OutType can be uint8_t, float, double," - "if sizeof(OutType) > 1 then sizeof(AccType) == sizeof(OutType)."); - typedef typename std::conditional::type CosOutType; - CosOutType* pDcast = reinterpret_cast(pD); + static_assert(!((sizeof(OutT) > 1) && (sizeof(AccT) != sizeof(OutT))), + "OutT can be uint8_t, float, double," + "if sizeof(OutT) > 1 then sizeof(AccT) == sizeof(OutT)."); ASSERT( - !(((pA != pB) && (worksize < (m + n) * sizeof(AccType))) || (worksize < m * sizeof(AccType))), + !(((pA != pB) && (worksize < (m + n) * sizeof(AccT))) || (worksize < m * sizeof(AccT))), "workspace size error"); ASSERT(workspace != nullptr, "workspace is null"); - Index_ lda, ldb, ldd; - InType* col_vec = workspace; - InType* row_vec = workspace; + + DataT* norm_A = workspace; + DataT* norm_B = workspace; if (pA != pB) { - row_vec += m; + norm_B += m; raft::linalg::rowNorm( - col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor, stream, raft::sqrt_op{}); + norm_A, pA, k, m, raft::linalg::L2Norm, isRowMajor, stream, raft::sqrt_op{}); raft::linalg::rowNorm( - row_vec, pB, k, n, raft::linalg::L2Norm, isRowMajor, stream, raft::sqrt_op{}); + norm_B, pB, k, n, raft::linalg::L2Norm, isRowMajor, stream, raft::sqrt_op{}); } else { raft::linalg::rowNorm( - col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor, stream, raft::sqrt_op{}); + norm_A, pA, k, m, raft::linalg::L2Norm, isRowMajor, stream, raft::sqrt_op{}); } - if (isRowMajor) { - lda = k, ldb = k, ldd = n; - cosine( - m, n, k, lda, ldb, ldd, pA, pB, col_vec, row_vec, pDcast, fin_op, stream); + // On CUDA 12: + // - always execute normal kernel + // + // On CUDA 11 and below: + // - execute CUTLASS-based kernel on SM_80 and above + // - execute normal kernel otherwise. 
+ + if constexpr (__CUDACC_VER_MAJOR__ == 12) { + // Always execute legacy kernels on CUDA 12 + ops::cosine_distance_op distance_op{}; + distance_matrix_dispatch( + distance_op, m, n, k, pA, pB, norm_A, norm_B, pD, fin_op, stream, isRowMajor); } else { - lda = n, ldb = m, ldd = m; - cosine( - n, m, k, lda, ldb, ldd, pB, pA, row_vec, col_vec, pDcast, fin_op, stream); + const auto deviceVersion = getComputeCapability(); + if (deviceVersion.first >= 8) { + // If device is SM_80 or later, use CUTLASS-based kernel. + using Op = ops::cosine_cutlass_op; + Op distance_op{}; + + distance_matrix_cutlass_dispatch( + distance_op, m, n, k, pA, pB, norm_A, norm_B, pD, fin_op, stream, isRowMajor); + } else { + // Else use "legacy" L2 + ops::cosine_distance_op distance_op{}; + distance_matrix_dispatch( + distance_op, m, n, k, pA, pB, norm_A, norm_B, pD, fin_op, stream, isRowMajor); + } } } diff --git a/cpp/include/raft/distance/detail/distance_ops/cosine.cuh b/cpp/include/raft/distance/detail/distance_ops/cosine.cuh new file mode 100644 index 0000000000..c2679d5380 --- /dev/null +++ b/cpp/include/raft/distance/detail/distance_ops/cosine.cuh @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
 */ + +#pragma once + +namespace raft::distance::detail::ops { + +// Describes the computation of the cosine distance + +struct cosine_distance_op { + // Load norms of input data + static constexpr bool use_norms = true; + + // Size of shared memory. This is normally decided by the kernel policy, but + // some ops such as correlation_distance_op use more. + template + constexpr size_t shared_mem_size() + { + return Policy::SmemSize + ((Policy::Mblk + Policy::Nblk) * sizeof(DataT)); + } + + template + DI void core(AccT& acc, DataT& x, DataT& y) const + { + acc += x * y; + }; + + template + DI void epilog(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh], + DataT* regxn, + DataT* regyn, + IdxT gridStrideX, + IdxT gridStrideY) const + { +#pragma unroll + for (int i = 0; i < Policy::AccRowsPerTh; ++i) { +#pragma unroll + for (int j = 0; j < Policy::AccColsPerTh; ++j) { + acc[i][j] = 1.0 - (acc[i][j] / (regxn[i] * regyn[j])); + } + } + } +}; + + +template +struct cosine_cutlass_op { + __device__ cosine_cutlass_op() noexcept {} + __device__ AccT operator()(DataT& aNorm, const DataT& bNorm, DataT& accVal) const noexcept + { + return static_cast(1.0) - (AccT)(accVal / (aNorm * bNorm)); + } + __device__ AccT operator()(DataT aData) const noexcept { return aData; } +}; + + +} // namespace raft::distance::detail::ops diff --git a/cpp/include/raft/distance/detail/euclidean.cuh b/cpp/include/raft/distance/detail/euclidean.cuh index 51e2ff224f..3cdc5489a6 100644 --- a/cpp/include/raft/distance/detail/euclidean.cuh +++ b/cpp/include/raft/distance/detail/euclidean.cuh @@ -22,7 +22,6 @@ #include "distance_ops/l2_exp.cuh" #include "distance_ops/l2_unexp.cuh" - namespace raft { namespace distance { namespace detail { From 16b2acdc55da1f1877b086233399361418f539df Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Fri, 13 Jan 2023 21:49:39 +0100 Subject: [PATCH 28/60] Fix include for l1 op --- cpp/include/raft/distance/detail/distance_ops/l1.cuh | 1 + 1 file changed, 1 
insertion(+) diff --git a/cpp/include/raft/distance/detail/distance_ops/l1.cuh b/cpp/include/raft/distance/detail/distance_ops/l1.cuh index 9d31b24851..7153154588 100644 --- a/cpp/include/raft/distance/detail/distance_ops/l1.cuh +++ b/cpp/include/raft/distance/detail/distance_ops/l1.cuh @@ -15,6 +15,7 @@ */ #pragma once +#include namespace raft::distance::detail::ops { From 1326e3408254b8ad9562541c0d9d35dc00241cb2 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Fri, 10 Feb 2023 11:21:35 +0100 Subject: [PATCH 29/60] kl_divergence: Use raft::log instead of raft::myLog --- cpp/include/raft/distance/detail/kl_divergence.cuh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/include/raft/distance/detail/kl_divergence.cuh b/cpp/include/raft/distance/detail/kl_divergence.cuh index e76cd5a3b9..e2f7bf2beb 100644 --- a/cpp/include/raft/distance/detail/kl_divergence.cuh +++ b/cpp/include/raft/distance/detail/kl_divergence.cuh @@ -65,14 +65,14 @@ void klDivergenceImpl(int m, { auto unaryOp_lambda = [] __device__(DataT input) { const bool x_zero = (input == 0); - return (!x_zero) * raft::myLog(input + x_zero); }; + return (!x_zero) * raft::log(input + x_zero); }; auto unaryOp_lambda_reverse = [] __device__(DataT input) { // reverse previous log (x) back to x using (e ^ log(x)) const bool x_zero = (input == 0); - return (!x_zero) * raft::myExp(input); }; + return (!x_zero) * raft::exp(input); }; - // This op takes some shortcuts when x equals y. So it behavior changes based + // This op takes some shortcuts when x equals y. So its behavior changes based // on this. ops::kl_divergence_op kl_divergence{is_row_major, x == y}; From 0169b26f55b5a4d3ed7db3b05c72fb9a6a7f50e4 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Fri, 10 Feb 2023 11:23:53 +0100 Subject: [PATCH 30/60] distance_op: Add expensive_inner_loop marker This indicates that the operator uses expensive operations (pow, exp, log) in the inner loop. 
Therefore, unrolling and/or veclen parameters should be adjusted. --- cpp/include/raft/distance/detail/distance_ops/canberra.cuh | 3 +++ cpp/include/raft/distance/detail/distance_ops/chebyshev.cuh | 3 +++ cpp/include/raft/distance/detail/distance_ops/correlation.cuh | 3 +++ cpp/include/raft/distance/detail/distance_ops/cosine.cuh | 3 +++ cpp/include/raft/distance/detail/distance_ops/hamming.cuh | 3 +++ cpp/include/raft/distance/detail/distance_ops/hellinger.cuh | 3 +++ .../raft/distance/detail/distance_ops/jensen_shannon.cuh | 3 +++ .../raft/distance/detail/distance_ops/kl_divergence.cuh | 3 +++ cpp/include/raft/distance/detail/distance_ops/l1.cuh | 3 +++ cpp/include/raft/distance/detail/distance_ops/l2_exp.cuh | 3 +++ cpp/include/raft/distance/detail/distance_ops/l2_unexp.cuh | 3 +++ cpp/include/raft/distance/detail/distance_ops/minkowski.cuh | 3 +++ cpp/include/raft/distance/detail/distance_ops/russel_rao.cuh | 3 +++ cpp/include/raft/distance/detail/distance_ops/template.cuh | 3 +++ 14 files changed, 42 insertions(+) diff --git a/cpp/include/raft/distance/detail/distance_ops/canberra.cuh b/cpp/include/raft/distance/detail/distance_ops/canberra.cuh index 4fda825286..e9c16d6d6d 100644 --- a/cpp/include/raft/distance/detail/distance_ops/canberra.cuh +++ b/cpp/include/raft/distance/detail/distance_ops/canberra.cuh @@ -25,6 +25,9 @@ namespace raft::distance::detail::ops { struct canberra_distance_op { // Load norms of input data static constexpr bool use_norms = false; + // Whether the core function requires so many instructions that it makes sense + // to reduce loop unrolling, etc. We do this to keep compile times in check. + static constexpr bool expensive_inner_loop = true; // Size of shared memory. This is normally decided by the kernel policy, but // some ops such as correlation_distance_op use more. 
diff --git a/cpp/include/raft/distance/detail/distance_ops/chebyshev.cuh b/cpp/include/raft/distance/detail/distance_ops/chebyshev.cuh index ced9fcf6f7..a68d9fc21c 100644 --- a/cpp/include/raft/distance/detail/distance_ops/chebyshev.cuh +++ b/cpp/include/raft/distance/detail/distance_ops/chebyshev.cuh @@ -25,6 +25,9 @@ namespace raft::distance::detail::ops { struct chebyshev_distance_op { // Load norms of input data static constexpr bool use_norms = false; + // Whether the core function requires so many instructions that it makes sense + // to reduce loop unrolling, etc. We do this to keep compile times in check. + static constexpr bool expensive_inner_loop = false; // Size of shared memory. This is normally decided by the kernel policy, but // some ops such as correlation_distance_op use more. diff --git a/cpp/include/raft/distance/detail/distance_ops/correlation.cuh b/cpp/include/raft/distance/detail/distance_ops/correlation.cuh index 98d90ea0a5..eb18355ca9 100644 --- a/cpp/include/raft/distance/detail/distance_ops/correlation.cuh +++ b/cpp/include/raft/distance/detail/distance_ops/correlation.cuh @@ -54,6 +54,9 @@ struct correlation_distance_op { // Load norms of input data static constexpr bool use_norms = true; + // Whether the core function requires so many instructions that it makes sense + // to reduce loop unrolling, etc. We do this to keep compile times in check. + static constexpr bool expensive_inner_loop = false; // Size of shared memory. This is normally decided by the kernel policy, but // some ops such as correlation_distance_op use more. 
diff --git a/cpp/include/raft/distance/detail/distance_ops/cosine.cuh b/cpp/include/raft/distance/detail/distance_ops/cosine.cuh index c2679d5380..bbc1ffcba2 100644 --- a/cpp/include/raft/distance/detail/distance_ops/cosine.cuh +++ b/cpp/include/raft/distance/detail/distance_ops/cosine.cuh @@ -23,6 +23,9 @@ namespace raft::distance::detail::ops { struct cosine_distance_op { // Load norms of input data static constexpr bool use_norms = true; + // Whether the core function requires so many instructions that it makes sense + // to reduce loop unrolling, etc. We do this to keep compile times in check. + static constexpr bool expensive_inner_loop = false; // Size of shared memory. This is normally decided by the kernel policy, but // some ops such as correlation_distance_op use more. diff --git a/cpp/include/raft/distance/detail/distance_ops/hamming.cuh b/cpp/include/raft/distance/detail/distance_ops/hamming.cuh index 1f88424d70..c8b3b7658e 100644 --- a/cpp/include/raft/distance/detail/distance_ops/hamming.cuh +++ b/cpp/include/raft/distance/detail/distance_ops/hamming.cuh @@ -28,6 +28,9 @@ struct hamming_distance_op { // Load norms of input data static constexpr bool use_norms = false; + // Whether the core function requires so many instructions that it makes sense + // to reduce loop unrolling, etc. We do this to keep compile times in check. + static constexpr bool expensive_inner_loop = false; // Size of shared memory. This is normally decided by the kernel policy, but // some ops such as correlation_distance_op use more. 
diff --git a/cpp/include/raft/distance/detail/distance_ops/hellinger.cuh b/cpp/include/raft/distance/detail/distance_ops/hellinger.cuh index b01f118923..b0fae700b5 100644 --- a/cpp/include/raft/distance/detail/distance_ops/hellinger.cuh +++ b/cpp/include/raft/distance/detail/distance_ops/hellinger.cuh @@ -26,6 +26,9 @@ namespace raft::distance::detail::ops { struct hellinger_distance_op { // Load norms of input data static constexpr bool use_norms = false; + // Whether the core function requires so many instructions that it makes sense + // to reduce loop unrolling, etc. We do this to keep compile times in check. + static constexpr bool expensive_inner_loop = false; // Size of shared memory. This is normally decided by the kernel policy, but // some ops such as correlation_distance_op use more. diff --git a/cpp/include/raft/distance/detail/distance_ops/jensen_shannon.cuh b/cpp/include/raft/distance/detail/distance_ops/jensen_shannon.cuh index 116af61964..124010e96d 100644 --- a/cpp/include/raft/distance/detail/distance_ops/jensen_shannon.cuh +++ b/cpp/include/raft/distance/detail/distance_ops/jensen_shannon.cuh @@ -24,6 +24,9 @@ namespace raft::distance::detail::ops { struct jensen_shannon_distance_op { // Load norms of input data static constexpr bool use_norms = false; + // Whether the core function requires so many instructions that it makes sense + // to reduce loop unrolling, etc. We do this to keep compile times in check. + static constexpr bool expensive_inner_loop = true; // Size of shared memory. This is normally decided by the kernel policy, but // some ops such as correlation_distance_op use more. 
diff --git a/cpp/include/raft/distance/detail/distance_ops/kl_divergence.cuh b/cpp/include/raft/distance/detail/distance_ops/kl_divergence.cuh index a1f438e0d4..a97582aa5a 100644 --- a/cpp/include/raft/distance/detail/distance_ops/kl_divergence.cuh +++ b/cpp/include/raft/distance/detail/distance_ops/kl_divergence.cuh @@ -34,6 +34,9 @@ struct kl_divergence_op { // Load norms of input data static constexpr bool use_norms = false; + // Whether the core function requires so many instructions that it makes sense + // to reduce loop unrolling, etc. We do this to keep compile times in check. + static constexpr bool expensive_inner_loop = true; // Size of shared memory. This is normally decided by the kernel policy, but // some ops such as correlation_distance_op use more. diff --git a/cpp/include/raft/distance/detail/distance_ops/l1.cuh b/cpp/include/raft/distance/detail/distance_ops/l1.cuh index 7153154588..4bb4a8796c 100644 --- a/cpp/include/raft/distance/detail/distance_ops/l1.cuh +++ b/cpp/include/raft/distance/detail/distance_ops/l1.cuh @@ -23,6 +23,9 @@ namespace raft::distance::detail::ops { struct l1_distance_op { // Do not load norms of data, the computation of L1 distance does not use them. static constexpr bool use_norms = false; + // Whether the core function requires so many instructions that it makes sense + // to reduce loop unrolling, etc. We do this to keep compile times in check. + static constexpr bool expensive_inner_loop = false; // Size of shared memory. This is normally decided by the kernel policy, but // some ops such as correlation_distance_op use more. 
diff --git a/cpp/include/raft/distance/detail/distance_ops/l2_exp.cuh b/cpp/include/raft/distance/detail/distance_ops/l2_exp.cuh index 4dfb26a826..13a41190c1 100644 --- a/cpp/include/raft/distance/detail/distance_ops/l2_exp.cuh +++ b/cpp/include/raft/distance/detail/distance_ops/l2_exp.cuh @@ -28,6 +28,9 @@ struct l2_exp_distance_op { // Load norms of input data static constexpr bool use_norms = true; + // Whether the core function requires so many instructions that it makes sense + // to reduce loop unrolling, etc. We do this to keep compile times in check. + static constexpr bool expensive_inner_loop = false; // Size of shared memory. This is normally decided by the kernel policy, but // some ops such as correlation_distance_op use more. diff --git a/cpp/include/raft/distance/detail/distance_ops/l2_unexp.cuh b/cpp/include/raft/distance/detail/distance_ops/l2_unexp.cuh index 03bbd936c6..31fbd11667 100644 --- a/cpp/include/raft/distance/detail/distance_ops/l2_unexp.cuh +++ b/cpp/include/raft/distance/detail/distance_ops/l2_unexp.cuh @@ -26,6 +26,9 @@ struct l2_unexp_distance_op { // Do not load norms of data, the computation of L1 distance does not use them. static constexpr bool use_norms = false; + // Whether the core function requires so many instructions that it makes sense + // to reduce loop unrolling, etc. We do this to keep compile times in check. + static constexpr bool expensive_inner_loop = false; // Size of shared memory. This is normally decided by the kernel policy, but // some ops such as correlation_distance_op use more. 
diff --git a/cpp/include/raft/distance/detail/distance_ops/minkowski.cuh b/cpp/include/raft/distance/detail/distance_ops/minkowski.cuh index 11be4e6ae0..8deb42d1fe 100644 --- a/cpp/include/raft/distance/detail/distance_ops/minkowski.cuh +++ b/cpp/include/raft/distance/detail/distance_ops/minkowski.cuh @@ -29,6 +29,9 @@ struct minkowski_distance_op { // Load norms of input data static constexpr bool use_norms = false; + // Whether the core function requires so many instructions that it makes sense + // to reduce loop unrolling, etc. We do this to keep compile times in check. + static constexpr bool expensive_inner_loop = true; // Size of shared memory. This is normally decided by the kernel policy, but // some ops such as correlation_distance_op use more. diff --git a/cpp/include/raft/distance/detail/distance_ops/russel_rao.cuh b/cpp/include/raft/distance/detail/distance_ops/russel_rao.cuh index d4d1044b6e..f46a1a5e67 100644 --- a/cpp/include/raft/distance/detail/distance_ops/russel_rao.cuh +++ b/cpp/include/raft/distance/detail/distance_ops/russel_rao.cuh @@ -32,6 +32,9 @@ struct russel_rao_distance_op { // Load norms of input data static constexpr bool use_norms = false; + // Whether the core function requires so many instructions that it makes sense + // to reduce loop unrolling, etc. We do this to keep compile times in check. + static constexpr bool expensive_inner_loop = false; // Size of shared memory. This is normally decided by the kernel policy, but // some ops such as correlation_distance_op use more. 
diff --git a/cpp/include/raft/distance/detail/distance_ops/template.cuh b/cpp/include/raft/distance/detail/distance_ops/template.cuh index c770a575a0..d7bbfc7fca 100644 --- a/cpp/include/raft/distance/detail/distance_ops/template.cuh +++ b/cpp/include/raft/distance/detail/distance_ops/template.cuh @@ -29,6 +29,9 @@ struct template_distance_op { // Load norms of input data static constexpr bool use_norms = TODO; + // Whether the core function requires so many instructions that it makes sense + // to reduce loop unrolling, etc. We do this to keep compile times in check. + static constexpr bool expensive_inner_loop = false; // Size of shared memory. This is normally decided by the kernel policy, but // some ops such as correlation_distance_op use more. From 52e95e1f255dbddbb5dda6513ce4feff6386bc5e Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Fri, 10 Feb 2023 11:26:55 +0100 Subject: [PATCH 31/60] Update copyright notices --- cpp/include/raft/distance/detail/cosine.cuh | 2 +- cpp/include/raft/distance/detail/hamming.cuh | 2 +- .../raft/distance/detail/pairwise_distance_cutlass_base.cuh | 2 +- cpp/include/raft/distance/detail/russell_rao.cuh | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/include/raft/distance/detail/cosine.cuh b/cpp/include/raft/distance/detail/cosine.cuh index ea1dd64933..4ae0c285f5 100644 --- a/cpp/include/raft/distance/detail/cosine.cuh +++ b/cpp/include/raft/distance/detail/cosine.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * Copyright (c) 2018-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
diff --git a/cpp/include/raft/distance/detail/hamming.cuh b/cpp/include/raft/distance/detail/hamming.cuh index 9935c96a40..824e930023 100644 --- a/cpp/include/raft/distance/detail/hamming.cuh +++ b/cpp/include/raft/distance/detail/hamming.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/distance/detail/pairwise_distance_cutlass_base.cuh b/cpp/include/raft/distance/detail/pairwise_distance_cutlass_base.cuh index efd44ea4dc..0d26d940b3 100644 --- a/cpp/include/raft/distance/detail/pairwise_distance_cutlass_base.cuh +++ b/cpp/include/raft/distance/detail/pairwise_distance_cutlass_base.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * Copyright (c) 2018-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/raft/distance/detail/russell_rao.cuh b/cpp/include/raft/distance/detail/russell_rao.cuh index 5e8da08b1d..6bf5ae04bb 100644 --- a/cpp/include/raft/distance/detail/russell_rao.cuh +++ b/cpp/include/raft/distance/detail/russell_rao.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
From 28cd57bffefdf8037ff8e3ea1618efffd76e27ca Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Fri, 10 Feb 2023 11:52:05 +0100 Subject: [PATCH 32/60] Reusable dispatch mechanism --- .../detail/pairwise_matrix/dispatch.cuh | 241 ++++++++---------- 1 file changed, 104 insertions(+), 137 deletions(-) diff --git a/cpp/include/raft/distance/detail/pairwise_matrix/dispatch.cuh b/cpp/include/raft/distance/detail/pairwise_matrix/dispatch.cuh index 650c8fa805..1eb2a65d5a 100644 --- a/cpp/include/raft/distance/detail/pairwise_matrix/dispatch.cuh +++ b/cpp/include/raft/distance/detail/pairwise_matrix/dispatch.cuh @@ -15,99 +15,101 @@ */ #pragma once +#include "kernel_sm60.cuh" #include -#include -#include #include -#include "kernel_sm60.cuh" +#include +#include namespace raft::distance::detail { +/** + * @brief Computes minimal alignment of row starting elements in 2D array + * + * The 2D matrix x is assumed to be row-major. This function computes the + * minimal alignment in bytes of the first elements of each row. + * Output can be 16, 8, 4, 2, 1. + * + * @param x Base pointer of row-major input matrix + * @param stride Stride in number of elements between consecutive rows. + */ template -struct params_dispatch { - int vectorized_load_num_elem = 1; - bool row_major = true; - - template - struct params_constexpr { - static constexpr int vec_len = vl; - static constexpr bool is_row_major = rm; - }; +size_t alignment_of_2d_array(const DataT* x, size_t stride) +{ + auto base = reinterpret_cast(x); + size_t stride_bytes = sizeof(DataT) * stride; - // Turn run-time parameters into compile-time parameters. - // Call the provided function f with these compile-time parameters. - // Returns false if dispatch fails, i.e., if there is no implementation - // for the given runtime parameters. 
- template - bool dispatch_with_compile_time_params(F&& f) const - { - return convert_vectorized_load_num_elem(f); + for (int align = 16; align >= 0; align /= 2) { + bool base_aligned = base % align == 0; + bool stride_aligned = stride_bytes % align == 0; + if (base_aligned && stride_aligned) { return align; } } + return 1; +} - // Step 1: convert alignment into a compile time constant - template - bool convert_vectorized_load_num_elem(F&& f) const +template +struct alignment_tag { + static constexpr int value = n; +}; + +struct alignment_dispatch { + size_t byte_alignment = 0; + + template + alignment_dispatch(const DataT* x, const DataT* y, size_t ldx, size_t ldy) { - bool fail = false; - switch (vectorized_load_num_elem) { - case 1: return layout<1>(f); - case 2: return layout<2>(f); - case 4: return layout<4>(f); - default: return fail; - }; + size_t align_x = alignment_of_2d_array(x, ldx); + size_t align_y = alignment_of_2d_array(y, ldy); + + byte_alignment = min(align_x, align_y); } - // Step 2: convert layout into a compile time constant - template - bool layout(F&& f) const + template + auto operator()(F&& f) const { - if (row_major) { - return to_compile_time_params(f); - } else { - return to_compile_time_params(f); + switch (byte_alignment) { + case 16: f(alignment_tag<16>()); break; + case 8: f(alignment_tag<8>()); break; + case 4: f(alignment_tag<4>()); break; + case 2: f(alignment_tag<2>()); break; + default: f(alignment_tag<1>()); break; } } +}; - // Step 3: convert compile-time constant into compile-time parameter struct and invoke - // function f with these compile time parameters. 
- template - bool to_compile_time_params(F&& f) const - { - // Create compile-time parameter type and instantiate a struct; - using ct_params_T = params_constexpr; - ct_params_T compile_time_params{}; +template +struct row_major_tag { + static constexpr int value = rm; +}; - // Dispatch to f - f(compile_time_params); +struct row_major_dispatch { + bool is_row_major_; + row_major_dispatch(bool row_major) : is_row_major_(row_major) {} - bool dispatch_success = true; - return dispatch_success; + template + auto operator()(F&& f) const + { + if (is_row_major_) { + f(row_major_tag()); + } else { + f(row_major_tag()); + } } }; -// Determine the largest number of elements that can be loaded in one -// instruction without causing misalignment errors. -template -int vectorized_load_num_elem(const DataT* x, const DataT* y, IdxT ldx, IdxT ldy) +template +auto join_dispatch(F1&& f1, F2&& f2) { - auto base_x = reinterpret_cast(x); - auto base_y = reinterpret_cast(y); - size_t stride_X = sizeof(DataT) * ldx; // stride in bytes - size_t stride_Y = sizeof(DataT) * ldy; // stride in bytes - - bool base_16B_aligned = base_x % 16 == 0 && base_y % 16 == 0; - bool base_8B_aligned = base_x % 8 == 0 && base_y % 8 == 0; - - bool stride_16B_aligned = stride_X % 16 == 0 && stride_Y % 16 == 0; - bool stride_8B_aligned = stride_X % 8 == 0 && stride_Y % 8 == 0; + const auto lam = [f1, f2](auto f) { + f1([f, f2](auto... args1) { f2([f, args1...](auto... args2) { f(args1..., args2...); }); }); + }; + return lam; +} - if (16 % sizeof(DataT) == 0 && base_16B_aligned && stride_16B_aligned) { - return 16 / sizeof(DataT); - } else if (8 % sizeof(DataT) == 0 && base_8B_aligned && stride_8B_aligned) { - return 8 / sizeof(DataT); - } else { - return 1; - } +template +auto join_dispatch(F1 f1, F2 f2, Fs... fs) +{ + return join_dispatch(join_dispatch(f1, f2), std::forward(fs)...); } template run_time_params{ - vectorized_load_num_elem(x, y, ldx, ldy), // 1. 
num array elements per load instruction - is_row_major // 2. the layout of x, y, and out - }; + alignment_dispatch d_align(x, y, ldx, ldy); + row_major_dispatch d_row_major(is_row_major); + auto dispatch = join_dispatch(d_align, d_row_major); - // Turn run-time parameters into compile-time parameters. - bool dispatch_success = run_time_params.dispatch_with_compile_time_params( - // We pass a lambda that receives the compile-time parameters and can use these - // to call the correct kernel. - [&](auto p) { - // p has two constexpr members: - // - vec_len - // - is_row_major - - // There is no instruction to load 4 doubles, so we catch this situation - // and load 2 doubles. - constexpr bool load_4_doubles = sizeof(DataT) > 4 && p.vec_len == 4; - constexpr int vec_len = (load_4_doubles) ? 2 : p.vec_len; - - // Determine kernel policy using vec_len and layout - typedef typename raft::linalg::Policy4x4::Policy RowPolicy; - typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; - typedef typename std::conditional::type Policy; - - // Create compile-time template parameter - using KP_T = kernel_params_T; - - return pairwise_matrix( - distance_op, - fin_op, - x, - y, - x_norm, - y_norm, - m, - n, - k, - ldx, - ldy, - ld_out, - out, - stream); - }); - - if (!dispatch_success) { - std::printf("Dispatch error(!)\n"); - // TODO - } + dispatch([&](auto alignment_tag, auto row_major_tag) { + // Compute number of elements that can be loaded in one instruction + // without causing misalignent errors. + constexpr int vec_len_ideal = + (alignment_tag.value % sizeof(DataT) == 0) ? alignment_tag.value / sizeof(DataT) : 1; + + // To keep compile times in check, we only specialize on veclen > 1 when + // the inner loop is relatively cheap (< 5 flops). + constexpr int vec_len = distance_op.expensive_inner_loop ? 
1 : vec_len_ideal; + + typedef typename raft::linalg::Policy4x4::Policy RowPolicy; + typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; + typedef typename std::conditional::type Policy; + + // Create compile-time template parameter + using KP_T = kernel_params_T; + + return pairwise_matrix( + distance_op, fin_op, x, y, x_norm, y_norm, m, n, k, ldx, ldy, ld_out, out, stream); + }); } template run_time_params{ - vectorized_load_num_elem(x, y, ldx, ldy), - is_row_major - }; + alignment_dispatch d_align(x, y, ldx, ldy); + row_major_dispatch d_row_major(is_row_major); - bool dispatch_success = run_time_params.dispatch_with_compile_time_params( - [&](auto p) { - // Prevent loading 4 doubles in one instruction. - constexpr bool load_4_doubles = sizeof(DataT) > 4 && p.vec_len == 4; - constexpr int vec_len = (load_4_doubles) ? 2 : p.vec_len; + auto dispatch = join_dispatch(d_align, d_row_major); - cutlassDistanceKernel( - x, y, x_norm, y_norm, m, n, k, ldx, ldy, ld_out, out, fin_op, cutlass_op, stream); - }); + dispatch([&](auto alignment_tag, auto row_major_tag) { + constexpr int vec_len = + (alignment_tag.value % sizeof(DataT) == 0) ? alignment_tag.value / sizeof(DataT) : 1; - if (!dispatch_success) { - std::printf("Dispatch error(!)\n"); - // TODO - } + cutlassDistanceKernel( + x, y, x_norm, y_norm, m, n, k, ldx, ldy, ld_out, out, fin_op, cutlass_op, stream); + }); } }; // namespace raft::distance::detail From c44aecef0ba07bea2e91f7690d3552b03886ccfa Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Fri, 10 Feb 2023 14:36:49 +0100 Subject: [PATCH 33/60] Dispatch mechanism using switch statement I fear the other way was getting too complicated and possibilities for reuse were scarce anyway. 
--- .../detail/pairwise_matrix/dispatch.cuh | 152 ++++++++---------- 1 file changed, 69 insertions(+), 83 deletions(-) diff --git a/cpp/include/raft/distance/detail/pairwise_matrix/dispatch.cuh b/cpp/include/raft/distance/detail/pairwise_matrix/dispatch.cuh index 1eb2a65d5a..cf95b10960 100644 --- a/cpp/include/raft/distance/detail/pairwise_matrix/dispatch.cuh +++ b/cpp/include/raft/distance/detail/pairwise_matrix/dispatch.cuh @@ -48,68 +48,27 @@ size_t alignment_of_2d_array(const DataT* x, size_t stride) } template -struct alignment_tag { - static constexpr int value = n; -}; +using align_constant = std::integral_constant; -struct alignment_dispatch { - size_t byte_alignment = 0; - - template - alignment_dispatch(const DataT* x, const DataT* y, size_t ldx, size_t ldy) - { - size_t align_x = alignment_of_2d_array(x, ldx); - size_t align_y = alignment_of_2d_array(y, ldy); - - byte_alignment = min(align_x, align_y); - } - - template - auto operator()(F&& f) const - { +template +inline void dispatch(bool row_major, size_t byte_alignment, F&& f) { + if (row_major) { switch (byte_alignment) { - case 16: f(alignment_tag<16>()); break; - case 8: f(alignment_tag<8>()); break; - case 4: f(alignment_tag<4>()); break; - case 2: f(alignment_tag<2>()); break; - default: f(alignment_tag<1>()); break; + case 16: f(std::bool_constant(), align_constant<16>()); break; + case 8: f(std::bool_constant(), align_constant<8>()); break; + case 4: f(std::bool_constant(), align_constant<4>()); break; + case 2: f(std::bool_constant(), align_constant<2>()); break; + default: f(std::bool_constant(), align_constant<1>()); break; } - } -}; - -template -struct row_major_tag { - static constexpr int value = rm; -}; - -struct row_major_dispatch { - bool is_row_major_; - row_major_dispatch(bool row_major) : is_row_major_(row_major) {} - - template - auto operator()(F&& f) const - { - if (is_row_major_) { - f(row_major_tag()); - } else { - f(row_major_tag()); + } else { + switch (byte_alignment) { + 
case 16: f(std::bool_constant(), align_constant<16>()); break; + case 8: f(std::bool_constant(), align_constant<8>()); break; + case 4: f(std::bool_constant(), align_constant<4>()); break; + case 2: f(std::bool_constant(), align_constant<2>()); break; + default: f(std::bool_constant(), align_constant<1>()); break; } } -}; - -template -auto join_dispatch(F1&& f1, F2&& f2) -{ - const auto lam = [f1, f2](auto f) { - f1([f, f2](auto... args1) { f2([f, args1...](auto... args2) { f(args1..., args2...); }); }); - }; - return lam; -} - -template -auto join_dispatch(F1 f1, F2 f2, Fs... fs) -{ - return join_dispatch(join_dispatch(f1, f2), std::forward(fs)...); } template 1 when - // the inner loop is relatively cheap (< 5 flops). - constexpr int vec_len = distance_op.expensive_inner_loop ? 1 : vec_len_ideal; - - typedef typename raft::linalg::Policy4x4::Policy RowPolicy; - typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; - typedef typename std::conditional::type Policy; + size_t align_x = alignment_of_2d_array(x, ldx); + size_t align_y = alignment_of_2d_array(y, ldy); + size_t byte_alignment = min(align_x, align_y); + + dispatch( + is_row_major, + byte_alignment, + [&](auto row_major, auto alignment) { + // row_major and alignment are std::integral_constants of type bool and + // size_t respectively. + + // Since alignment is in bytes, it could be smaller than sizeof(DataT). + // Handle this (unlikely) case here. + if constexpr (alignment() < sizeof(DataT)) { + RAFT_EXPECTS(sizeof(DataT) <= alignment(), "Input matrix must be aligned to size of elements."); + return; + } + + // Compute number of elements that can be loaded in one instruction + // without causing misalignent errors. + constexpr int vec_len_aligned = + (alignment() % sizeof(DataT) == 0) ? alignment() / sizeof(DataT) : 1; + + // To keep compile times in check, we only specialize on veclen > 1 when + // the inner loop is relatively cheap (< 5 flops). 
+ constexpr int vec_len = distance_op.expensive_inner_loop ? 1 : vec_len_aligned; + + typedef typename raft::linalg::Policy4x4::Policy RowPolicy; + typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; + typedef typename std::conditional::type Policy; // Create compile-time template parameter - using KP_T = kernel_params_T; + using KP_T = kernel_params_T; return pairwise_matrix( distance_op, fin_op, x, y, x_norm, y_norm, m, n, k, ldx, ldy, ld_out, out, stream); @@ -201,17 +173,31 @@ void distance_matrix_cutlass_dispatch(opT cutlass_op, ldx = m, ldy = n, ld_out = n; } - alignment_dispatch d_align(x, y, ldx, ldy); - row_major_dispatch d_row_major(is_row_major); + size_t align_x = alignment_of_2d_array(x, ldx); + size_t align_y = alignment_of_2d_array(y, ldy); + size_t byte_alignment = min(align_x, align_y); + + + dispatch( + is_row_major, + byte_alignment, + [&](auto row_major, auto alignment) { + // row_major and alignment are std::integral_constants of type bool and + // size_t respectively. - auto dispatch = join_dispatch(d_align, d_row_major); + // Since alignment is in bytes, it could be smaller than sizeof(DataT). + // Handle this (unlikely) case here. + if constexpr (alignment() < sizeof(DataT)) { + RAFT_EXPECTS(sizeof(DataT) <= alignment(), "Input matrix must be aligned to size of elements."); + return; + } - dispatch([&](auto alignment_tag, auto row_major_tag) { - constexpr int vec_len = - (alignment_tag.value % sizeof(DataT) == 0) ? alignment_tag.value / sizeof(DataT) : 1; + // Compute number of elements that can be loaded in one instruction + // without causing misalignent errors. + constexpr int vec_len = (alignment() % sizeof(DataT) == 0) ? 
alignment() / sizeof(DataT) : 1; - cutlassDistanceKernel( - x, y, x_norm, y_norm, m, n, k, ldx, ldy, ld_out, out, fin_op, cutlass_op, stream); + cutlassDistanceKernel( + x, y, x_norm, y_norm, m, n, k, ldx, ldy, ld_out, out, fin_op, cutlass_op, stream); }); } From 7c3bd763bcb9761bb6022641519d404b96b992f7 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Fri, 10 Feb 2023 16:42:43 +0100 Subject: [PATCH 34/60] Remove one ".template" from kernel_sm60 --- .../raft/distance/detail/pairwise_matrix/kernel_sm60.cuh | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/cpp/include/raft/distance/detail/pairwise_matrix/kernel_sm60.cuh b/cpp/include/raft/distance/detail/pairwise_matrix/kernel_sm60.cuh index 68026414c0..eed50c36f7 100644 --- a/cpp/include/raft/distance/detail/pairwise_matrix/kernel_sm60.cuh +++ b/cpp/include/raft/distance/detail/pairwise_matrix/kernel_sm60.cuh @@ -17,8 +17,6 @@ #include #include -#include // TODO: remove - #include namespace raft::distance::detail { @@ -75,9 +73,7 @@ __global__ __launch_bounds__(KP_T::PolicyT::Nthreads, 2) // Wrap operator back into lambdas. This is temporary and should be removed. (TODO) auto core_op = [distance_op] __device__(AccT & acc, DataT & x, DataT & y) { - // use .template to disambiguate (See: - // https://en.cppreference.com/w/cpp/language/dependent_name) - distance_op.template core(acc, x, y); + distance_op.core(acc, x, y); }; auto epilog_op = [distance_op] __device__(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh], DataT * regxn, @@ -90,6 +86,7 @@ __global__ __launch_bounds__(KP_T::PolicyT::Nthreads, 2) // No support for row_epilog_op. 
auto row_epilog_op = raft::void_op(); + // Always write output constexpr bool write_out = true; constexpr bool use_norms = distance_op.use_norms; From d62eeb79e75824cd2cee6b2c155fbf7c823dae7d Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Fri, 10 Feb 2023 16:43:12 +0100 Subject: [PATCH 35/60] Dispatch on veclen instead of byte_alignment To reduce compile times. --- .../detail/pairwise_matrix/dispatch.cuh | 96 +++++++++---------- 1 file changed, 45 insertions(+), 51 deletions(-) diff --git a/cpp/include/raft/distance/detail/pairwise_matrix/dispatch.cuh b/cpp/include/raft/distance/detail/pairwise_matrix/dispatch.cuh index cf95b10960..75e557a420 100644 --- a/cpp/include/raft/distance/detail/pairwise_matrix/dispatch.cuh +++ b/cpp/include/raft/distance/detail/pairwise_matrix/dispatch.cuh @@ -48,25 +48,21 @@ size_t alignment_of_2d_array(const DataT* x, size_t stride) } template -using align_constant = std::integral_constant; +using vec_len_constant = std::integral_constant; template -inline void dispatch(bool row_major, size_t byte_alignment, F&& f) { +inline void dispatch(bool row_major, int vec_len, F&& f) { if (row_major) { - switch (byte_alignment) { - case 16: f(std::bool_constant(), align_constant<16>()); break; - case 8: f(std::bool_constant(), align_constant<8>()); break; - case 4: f(std::bool_constant(), align_constant<4>()); break; - case 2: f(std::bool_constant(), align_constant<2>()); break; - default: f(std::bool_constant(), align_constant<1>()); break; + switch (vec_len) { + case 4: f(std::bool_constant(), vec_len_constant<4>()); break; + case 2: f(std::bool_constant(), vec_len_constant<2>()); break; + default: f(std::bool_constant(), vec_len_constant<1>()); break; } } else { - switch (byte_alignment) { - case 16: f(std::bool_constant(), align_constant<16>()); break; - case 8: f(std::bool_constant(), align_constant<8>()); break; - case 4: f(std::bool_constant(), align_constant<4>()); break; - case 2: f(std::bool_constant(), align_constant<2>()); 
break; - default: f(std::bool_constant(), align_constant<1>()); break; + switch (vec_len) { + case 4: f(std::bool_constant(), vec_len_constant<4>()); break; + case 2: f(std::bool_constant(), vec_len_constant<2>()); break; + default: f(std::bool_constant(), vec_len_constant<1>()); break; } } } @@ -107,39 +103,38 @@ void distance_matrix_dispatch(opT distance_op, size_t align_y = alignment_of_2d_array(y, ldy); size_t byte_alignment = min(align_x, align_y); + // Since alignment is in bytes, it could be smaller than sizeof(DataT). + // Handle this (unlikely) case here. + RAFT_EXPECTS(sizeof(DataT) <= byte_alignment, "Input matrix must be aligned to size of elements."); + + // Compute number of elements that can be loaded in one instruction + // without causing misalignent errors. + int vec_len_aligned = (byte_alignment % sizeof(DataT) == 0) ? byte_alignment / sizeof(DataT) : 1; + dispatch( is_row_major, - byte_alignment, - [&](auto row_major, auto alignment) { - // row_major and alignment are std::integral_constants of type bool and - // size_t respectively. - - // Since alignment is in bytes, it could be smaller than sizeof(DataT). - // Handle this (unlikely) case here. - if constexpr (alignment() < sizeof(DataT)) { - RAFT_EXPECTS(sizeof(DataT) <= alignment(), "Input matrix must be aligned to size of elements."); - return; - } - - // Compute number of elements that can be loaded in one instruction - // without causing misalignent errors. - constexpr int vec_len_aligned = - (alignment() % sizeof(DataT) == 0) ? alignment() / sizeof(DataT) : 1; + vec_len_aligned, + [&](auto row_major, auto vec_len_aligned) { + // row_major and vec_len are std::integral_constants of type bool and int + // respectively. // To keep compile times in check, we only specialize on veclen > 1 when // the inner loop is relatively cheap (< 5 flops). - constexpr int vec_len = distance_op.expensive_inner_loop ? 1 : vec_len_aligned; + constexpr int vec_len_op = distance_op.expensive_inner_loop ? 
1 : vec_len_aligned(); + + // Prevent double, vec_len=4 combination (this is not supported) + constexpr int vec_len = std::min(vec_len_op, static_cast(16 / sizeof(DataT))); typedef typename raft::linalg::Policy4x4::Policy RowPolicy; typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; typedef typename std::conditional::type Policy; - // Create compile-time template parameter - using KP_T = kernel_params_T; + // Create compile-time template parameter + using KP_T = kernel_params_T; - return pairwise_matrix( - distance_op, fin_op, x, y, x_norm, y_norm, m, n, k, ldx, ldy, ld_out, out, stream); - }); + return pairwise_matrix( + distance_op, fin_op, x, y, x_norm, y_norm, m, n, k, ldx, ldy, ld_out, out, stream); + }); } template (16 / sizeof(DataT))); cutlassDistanceKernel( x, y, x_norm, y_norm, m, n, k, ldx, ldy, ld_out, out, fin_op, cutlass_op, stream); From 5c3dcafea941d1e877c8b1c714022ba254363601 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Mon, 20 Feb 2023 15:02:53 +0100 Subject: [PATCH 36/60] Use many template parameters again --- .../detail/pairwise_matrix/dispatch.cuh | 96 +++++++------- .../detail/pairwise_matrix/kernel_sm60.cuh | 121 ++++++++---------- 2 files changed, 105 insertions(+), 112 deletions(-) diff --git a/cpp/include/raft/distance/detail/pairwise_matrix/dispatch.cuh b/cpp/include/raft/distance/detail/pairwise_matrix/dispatch.cuh index 75e557a420..b3362e7647 100644 --- a/cpp/include/raft/distance/detail/pairwise_matrix/dispatch.cuh +++ b/cpp/include/raft/distance/detail/pairwise_matrix/dispatch.cuh @@ -24,9 +24,9 @@ namespace raft::distance::detail { /** - * @brief: Computes minimal alignment of row starting elements in 2D array + * @brief: Computes minimal common alignment of the rows in a 2D array in bytes * - * The 2D matrix x is assumed to be row-major. This function computes the + * The 2D matrix `x` is assumed to be row-major. This function computes the * minimal alignment in bytes of the first elements of each row. 
* Output can be 16, 8, 4, 2, 1. * @@ -50,8 +50,25 @@ size_t alignment_of_2d_array(const DataT* x, size_t stride) template using vec_len_constant = std::integral_constant; +/** + * @brief: Converts run-time arguments to compile-time arguments + * + * Converts run-time arguments row_major and vec_len to compile-time arguments + * and dispatches a lambda f with these compile-time arguments. + * + * This is equivalent to copying and pasting the lambda function `f` in each of + * the switch case statements. + * + * @tparam F Type of lambda f. + * @param row_major Boolean indicating whether input arrays have row-major layout. + * @param vec_len Integer value 1, 2, or 4 specifying the Veclen template parameter of + * the KernelPolicy. + * @param f Lambda that takes two std::integral_constant parameters representing + * row_major and vec_len. + */ template -inline void dispatch(bool row_major, int vec_len, F&& f) { +void dispatch(bool row_major, int vec_len, F&& f) +{ if (row_major) { switch (vec_len) { case 4: f(std::bool_constant(), vec_len_constant<4>()); break; @@ -67,13 +84,13 @@ inline void dispatch(bool row_major, int vec_len, F&& f) { } } -template -void distance_matrix_dispatch(opT distance_op, +void distance_matrix_dispatch(OpT distance_op, IdxT m, IdxT n, IdxT k, @@ -86,8 +103,8 @@ void distance_matrix_dispatch(opT distance_op, cudaStream_t stream, bool is_row_major) { - // Determine leading dimensions and possibly flip order of passing x and y if - // column_major. + // Determine leading dimensions and, if column-major, flip order of passing x + // and y. 
IdxT ldx, ldy, ld_out; if (is_row_major) { ldx = k, ldy = k, ld_out = n; @@ -99,42 +116,37 @@ void distance_matrix_dispatch(opT distance_op, ldx = m, ldy = n, ld_out = n; } - size_t align_x = alignment_of_2d_array(x, ldx); - size_t align_y = alignment_of_2d_array(y, ldy); + size_t align_x = alignment_of_2d_array(x, ldx); + size_t align_y = alignment_of_2d_array(y, ldy); size_t byte_alignment = min(align_x, align_y); // Since alignment is in bytes, it could be smaller than sizeof(DataT). // Handle this (unlikely) case here. - RAFT_EXPECTS(sizeof(DataT) <= byte_alignment, "Input matrix must be aligned to size of elements."); + RAFT_EXPECTS(sizeof(DataT) <= byte_alignment, + "Input matrix must be aligned to size of elements."); // Compute number of elements that can be loaded in one instruction // without causing misalignent errors. int vec_len_aligned = (byte_alignment % sizeof(DataT) == 0) ? byte_alignment / sizeof(DataT) : 1; - dispatch( - is_row_major, - vec_len_aligned, - [&](auto row_major, auto vec_len_aligned) { - // row_major and vec_len are std::integral_constants of type bool and int - // respectively. + dispatch(is_row_major, vec_len_aligned, [&](auto row_major, auto vec_len_aligned) { + // row_major and vec_len are std::integral_constants of type bool and int + // respectively. - // To keep compile times in check, we only specialize on veclen > 1 when - // the inner loop is relatively cheap (< 5 flops). - constexpr int vec_len_op = distance_op.expensive_inner_loop ? 1 : vec_len_aligned(); + // To keep compile times in check, we only specialize on veclen > 1 when + // the inner loop is relatively cheap (< 5 flops). + constexpr int vec_len_op = distance_op.expensive_inner_loop ? 
1 : vec_len_aligned(); - // Prevent double, vec_len=4 combination (this is not supported) - constexpr int vec_len = std::min(vec_len_op, static_cast(16 / sizeof(DataT))); + // Prevent double, vec_len=4 combination (this is not supported) + constexpr int vec_len = std::min(vec_len_op, static_cast(16 / sizeof(DataT))); - typedef typename raft::linalg::Policy4x4::Policy RowPolicy; - typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; - typedef typename std::conditional::type Policy; + typedef typename raft::linalg::Policy4x4::Policy RowPolicy; + typedef typename raft::linalg::Policy4x4::ColPolicy ColPolicy; + typedef typename std::conditional::type Policy; - // Create compile-time template parameter - using KP_T = kernel_params_T; - - return pairwise_matrix( - distance_op, fin_op, x, y, x_norm, y_norm, m, n, k, ldx, ldy, ld_out, out, stream); - }); + return pairwise_matrix( + distance_op, fin_op, x, y, x_norm, y_norm, m, n, k, ldx, ldy, ld_out, out, stream); + }); } template (16 / sizeof(DataT))); + // Prevent double, vec_len=4 combination (this is not supported) + constexpr int vec_len = std::min(vec_len_aligned, static_cast(16 / sizeof(DataT))); - cutlassDistanceKernel( - x, y, x_norm, y_norm, m, n, k, ldx, ldy, ld_out, out, fin_op, cutlass_op, stream); + cutlassDistanceKernel( + x, y, x_norm, y_norm, m, n, k, ldx, ldy, ld_out, out, fin_op, cutlass_op, stream); }); } diff --git a/cpp/include/raft/distance/detail/pairwise_matrix/kernel_sm60.cuh b/cpp/include/raft/distance/detail/pairwise_matrix/kernel_sm60.cuh index eed50c36f7..1e450f9289 100644 --- a/cpp/include/raft/distance/detail/pairwise_matrix/kernel_sm60.cuh +++ b/cpp/include/raft/distance/detail/pairwise_matrix/kernel_sm60.cuh @@ -21,54 +21,28 @@ namespace raft::distance::detail { -template -struct kernel_params_T { - using DataT = data_type; - using AccT = accumulate_type; - using OutT = out_type; - using IdxT = index_type; - using PolicyT = policy; - using opT = op_type; - using FinOpT = 
final_op_type; - static constexpr bool is_row_major = row_major; -}; - -template -__global__ __launch_bounds__(KP_T::PolicyT::Nthreads, 2) - - void pairwise_matrix_kernel(const typename KP_T::DataT* x, - const typename KP_T::DataT* y, - const typename KP_T::DataT* _xn, - const typename KP_T::DataT* _yn, - typename KP_T::IdxT m, - typename KP_T::IdxT n, - typename KP_T::IdxT k, - typename KP_T::IdxT lda, - typename KP_T::IdxT ldb, - typename KP_T::IdxT ldd, - typename KP_T::OutT* dOutput, - typename KP_T::opT distance_op, - typename KP_T::FinOpT fin_op) +template +__global__ __launch_bounds__(Policy::Nthreads, 2) void pairwise_matrix_kernel(const DataT* x, + const DataT* y, + const DataT* _xn, + const DataT* _yn, + IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + OutT* dOutput, + opT distance_op, + FinOpT fin_op) { - using AccT = typename KP_T::AccT; - using DataT = typename KP_T::DataT; - using OutT = typename KP_T::OutT; - using IdxT = typename KP_T::IdxT; - - using Policy = typename KP_T::PolicyT; - - // Instantiate compile time parameters to access constexpr members. - KP_T compile_time_params{}; - extern __shared__ char smem[]; // Wrap operator back into lambdas. This is temporary and should be removed. 
(TODO) @@ -80,6 +54,8 @@ __global__ __launch_bounds__(KP_T::PolicyT::Nthreads, 2) DataT * regyn, IdxT gridStrideX, IdxT gridStrideY) { + // Use .template to disambiguate (See: + // https://en.cppreference.com/w/cpp/language/dependent_name) distance_op.template epilog( acc, regxn, regyn, gridStrideX, gridStrideY); }; @@ -100,7 +76,7 @@ __global__ __launch_bounds__(KP_T::PolicyT::Nthreads, 2) decltype(epilog_op), decltype(fin_op), decltype(row_epilog_op), - compile_time_params.is_row_major, + row_major, write_out> obj(x, y, @@ -121,32 +97,39 @@ __global__ __launch_bounds__(KP_T::PolicyT::Nthreads, 2) obj.run(); } -template -static void pairwise_matrix(typename KP_T::opT distance_op, - typename KP_T::FinOpT fin_op, - const typename KP_T::DataT* x, - const typename KP_T::DataT* y, - const typename KP_T::DataT* _xn, - const typename KP_T::DataT* _yn, - typename KP_T::IdxT m, - typename KP_T::IdxT n, - typename KP_T::IdxT k, - typename KP_T::IdxT lda, - typename KP_T::IdxT ldb, - typename KP_T::IdxT ldd, - typename KP_T::OutT* dOutput, - cudaStream_t stream) +template +void pairwise_matrix(OpT distance_op, + FinOpT fin_op, + const DataT* x, + const DataT* y, + const DataT* _xn, + const DataT* _yn, + IdxT m, + IdxT n, + IdxT k, + IdxT lda, + IdxT ldb, + IdxT ldd, + OutT* dOutput, + cudaStream_t stream) { - using Policy = typename KP_T::PolicyT; - using DataT = typename KP_T::DataT; - dim3 blk(Policy::Nthreads); + // Use .template to disambiguate (See: + // https://en.cppreference.com/w/cpp/language/dependent_name) size_t smem_size = distance_op.template shared_mem_size(); - dim3 grid = launchConfigGenerator(m, n, smem_size, pairwise_matrix_kernel); + // Obtain function pointer to kernel + auto kernel = pairwise_matrix_kernel; + dim3 grid = launchConfigGenerator(m, n, smem_size, kernel); - pairwise_matrix_kernel<<>>( + kernel<<>>( x, y, _xn, _yn, m, n, k, lda, ldb, ldd, dOutput, distance_op, fin_op); - RAFT_CUDA_TRY(cudaGetLastError()); } From 
2613e8a72d69278d7e2b50e6e5404e9c457dd685 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Mon, 20 Feb 2023 16:03:23 +0100 Subject: [PATCH 37/60] Remove duplicate DistanceType enum definition --- cpp/include/raft/distance/detail/distance.cuh | 48 +------------------ 1 file changed, 1 insertion(+), 47 deletions(-) diff --git a/cpp/include/raft/distance/detail/distance.cuh b/cpp/include/raft/distance/detail/distance.cuh index b459c73bee..5887155401 100644 --- a/cpp/include/raft/distance/detail/distance.cuh +++ b/cpp/include/raft/distance/detail/distance.cuh @@ -17,6 +17,7 @@ #pragma once #include +#include #include #include #include @@ -37,53 +38,6 @@ namespace raft { namespace distance { namespace detail { -/** enum to tell how to compute distance */ -enum DistanceType : unsigned short { - - /** evaluate as dist_ij = sum(x_ik^2) + sum(y_ij)^2 - 2*sum(x_ik * y_jk) */ - L2Expanded = 0, - /** same as above, but inside the epilogue, perform square root operation */ - L2SqrtExpanded = 1, - /** cosine distance */ - CosineExpanded = 2, - /** L1 distance */ - L1 = 3, - /** evaluate as dist_ij += (x_ik - y-jk)^2 */ - L2Unexpanded = 4, - /** same as above, but inside the epilogue, perform square root operation */ - L2SqrtUnexpanded = 5, - /** basic inner product **/ - InnerProduct = 6, - /** Chebyshev (Linf) distance **/ - Linf = 7, - /** Canberra distance **/ - Canberra = 8, - /** Generalized Minkowski distance **/ - LpUnexpanded = 9, - /** Correlation distance **/ - CorrelationExpanded = 10, - /** Jaccard distance **/ - JaccardExpanded = 11, - /** Hellinger distance **/ - HellingerExpanded = 12, - /** Haversine distance **/ - Haversine = 13, - /** Bray-Curtis distance **/ - BrayCurtis = 14, - /** Jensen-Shannon distance**/ - JensenShannon = 15, - /** Hamming distance **/ - HammingUnexpanded = 16, - /** KLDivergence **/ - KLDivergence = 17, - /** RusselRao **/ - RusselRaoExpanded = 18, - /** Dice-Sorensen distance **/ - DiceExpanded = 19, - /** Precomputed (special 
value) **/ - Precomputed = 100 -}; - namespace { template Date: Mon, 20 Feb 2023 17:38:24 +0100 Subject: [PATCH 38/60] Remove pairwiseDistanceMatKernel Has been replaced by pairwise_matrix_kernel --- .../detail/pairwise_distance_base.cuh | 164 ------------------ 1 file changed, 164 deletions(-) diff --git a/cpp/include/raft/distance/detail/pairwise_distance_base.cuh b/cpp/include/raft/distance/detail/pairwise_distance_base.cuh index 140664f394..5acdf91c67 100644 --- a/cpp/include/raft/distance/detail/pairwise_distance_base.cuh +++ b/cpp/include/raft/distance/detail/pairwise_distance_base.cuh @@ -268,170 +268,6 @@ struct PairwiseDistances : public BaseClass { } }; // struct PairwiseDistances -/** - * @brief the distance matrix calculation kernel for L1, L2 and cosine - * @tparam useNorms whether norms are needed - * @tparam DataT input data-type (for A and B matrices) - * @tparam AccT accumulation data-type - * @tparam OutT output data-type (for C and D matrices) - * @tparam IdxT index data-type - * @tparam Policy struct which tunes the Contraction kernel - * @tparam CoreLambda lambda which implements accumulation operation - * @tparam EpilogueLambda lambda which implements operation for calculating - final value. - * @tparam FinalLambda final lambda called on final distance value - * @tparam isRowMajor true if input/output is row major(default), - false for column major - * - * @param[in] x input matrix - * @param[in] y input matrix - * @param[in] xn row norms of input matrix A. - * @param[in] yn row norms of input matrix B. 
- * @param[in] m number of rows of A and C/D - * @param[in] n number of columns of B and C/D - * @param[in] k number of cols of A and rows of B - * @param[in] lda leading dimension of A - * @param[in] ldb leading dimension of B - * @param[in] ldd leading dimension of C/D - * @param[output] pD output matrix - * @param core_op the core lambda - * @param epilog_op the epilogue lambda - * @param fin_op the final gemm epilogue lambda - */ - -template -__global__ __launch_bounds__(Policy::Nthreads, 2) - - void pairwiseDistanceMatKernel(const DataT* x, - const DataT* y, - const DataT* _xn, - const DataT* _yn, - IdxT m, - IdxT n, - IdxT k, - IdxT lda, - IdxT ldb, - IdxT ldd, - OutT* dOutput, - CoreLambda core_op, - EpilogueLambda epilog_op, - FinalLambda fin_op) -{ - extern __shared__ char smem[]; - auto rowEpilog = raft::void_op(); - - PairwiseDistances - obj( - x, y, m, n, k, lda, ldb, ldd, _xn, _yn, dOutput, smem, core_op, epilog_op, fin_op, rowEpilog); - obj.run(); -} - -/** - * @brief the distance matrix calculation kernel for L2 and cosine - * for GPU arch < SM 8.0, this version is to make sure we don't recompile - * these kernels for ampere or higher as we use cutlass kernel for it. - * @tparam useNorms whether norms are needed - * @tparam DataT input data-type (for A and B matrices) - * @tparam AccT accumulation data-type - * @tparam OutT output data-type (for C and D matrices) - * @tparam IdxT index data-type - * @tparam Policy struct which tunes the Contraction kernel - * @tparam CoreLambda lambda which implements accumulation operation - * @tparam EpilogueLambda lambda which implements operation for calculating - final value. - * @tparam FinalLambda final lambda called on final distance value - * @tparam isRowMajor true if input/output is row major(default), - false for column major - * - * @param[in] x input matrix - * @param[in] y input matrix - * @param[in] xn row norms of input matrix A. - * @param[in] yn row norms of input matrix B. 
- * @param[in] m number of rows of A and C/D - * @param[in] n number of columns of B and C/D - * @param[in] k number of cols of A and rows of B - * @param[in] lda leading dimension of A - * @param[in] ldb leading dimension of B - * @param[in] ldd leading dimension of C/D - * @param[output] pD output matrix - * @param core_op the core lambda - * @param epilog_op the epilogue lambda - * @param fin_op the final gemm epilogue lambda - */ - -template -__global__ __launch_bounds__(Policy::Nthreads, 2) - - void pairwiseDistanceMatKernelPriorToAmpere(const DataT* x, - const DataT* y, - const DataT* _xn, - const DataT* _yn, - IdxT m, - IdxT n, - IdxT k, - IdxT lda, - IdxT ldb, - IdxT ldd, - OutT* dOutput, - CoreLambda core_op, - EpilogueLambda epilog_op, - FinalLambda fin_op) -{ - //#if __CUDA_ARCH__ < 800 - // TODO: re-enable the CUDA_ARCH guard for below Ampere once cutlass based - // kernels are enabled for CUDA 12.0 - extern __shared__ char smem[]; - auto rowEpilog = raft::void_op(); - - PairwiseDistances - obj( - x, y, m, n, k, lda, ldb, ldd, _xn, _yn, dOutput, smem, core_op, epilog_op, fin_op, rowEpilog); - obj.run(); - //#endif -} - template dim3 launchConfigGenerator(IdxT m, IdxT n, std::size_t sMemSize, T func) { From c334ba33df43c9eb8e29cde0821aa63d3d531a8e Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Mon, 20 Feb 2023 19:26:25 +0100 Subject: [PATCH 39/60] Remove distance::detail::pairwise_distance_impl --- cpp/include/raft/distance/detail/distance.cuh | 38 -------- cpp/include/raft/distance/distance.cuh | 88 +++++++++---------- 2 files changed, 41 insertions(+), 85 deletions(-) diff --git a/cpp/include/raft/distance/detail/distance.cuh b/cpp/include/raft/distance/detail/distance.cuh index 5887155401..8d4155356b 100644 --- a/cpp/include/raft/distance/detail/distance.cuh +++ b/cpp/include/raft/distance/detail/distance.cuh @@ -17,7 +17,6 @@ #pragma once #include -#include #include #include #include @@ -648,43 +647,6 @@ size_t getWorkspaceSize(const InType* 
x, const InType* y, Index_ m, Index_ n, In return worksize; } -/** - * @defgroup pairwise_distance pairwise distance prims - * @{ - * @brief Convenience wrapper around 'distance' prim to convert runtime metric - * into compile time for the purpose of dispatch - * @tparam Type input/accumulation/output data-type - * @tparam Index_ indexing type - * @param x first set of points - * @param y second set of points - * @param dist output distance matrix - * @param m number of points in x - * @param n number of points in y - * @param k dimensionality - * @param workspace temporary workspace buffer which can get resized as per the - * needed workspace size - * @param metric distance metric - * @param stream cuda stream - * @param isRowMajor whether the matrices are row-major or col-major - */ -template -void pairwise_distance_impl(const Type* x, - const Type* y, - Type* dist, - Index_ m, - Index_ n, - Index_ k, - rmm::device_uvector& workspace, - cudaStream_t stream, - bool isRowMajor, - Type metric_arg = 2.0f) -{ - auto worksize = getWorkspaceSize(x, y, m, n, k); - workspace.resize(worksize, stream); - distance( - x, y, dist, m, n, k, workspace.data(), worksize, stream, isRowMajor, metric_arg); -} -/** @} */ }; // namespace detail }; // namespace distance }; // namespace raft diff --git a/cpp/include/raft/distance/distance.cuh b/cpp/include/raft/distance/distance.cuh index 93a5ce7f1a..90eeb90d38 100644 --- a/cpp/include/raft/distance/distance.cuh +++ b/cpp/include/raft/distance/distance.cuh @@ -22,6 +22,7 @@ #include #include #include +#include #include @@ -250,67 +251,60 @@ void pairwise_distance(raft::device_resources const& handle, bool isRowMajor = true, Type metric_arg = 2.0f) { + auto stream = handle.get_stream(); + + auto dispatch = [&](auto distance_type) { + auto worksize = getWorkspaceSize(x, y, m, n, k); + workspace.resize(worksize, stream); + detail::distance( + x, y, dist, m, n, k, workspace.data(), worksize, stream, isRowMajor, metric_arg); + }; + switch 
(metric) { - case raft::distance::DistanceType::L2Expanded: - detail::pairwise_distance_impl( - x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor); + case DistanceType::Canberra: + dispatch(std::integral_constant{}); break; - case raft::distance::DistanceType::L2SqrtExpanded: - detail::pairwise_distance_impl( - x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor); + case DistanceType::CorrelationExpanded: + dispatch(std::integral_constant{}); break; - case raft::distance::DistanceType::CosineExpanded: - detail::pairwise_distance_impl( - x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor); + case DistanceType::CosineExpanded: + dispatch(std::integral_constant{}); break; - case raft::distance::DistanceType::L1: - detail::pairwise_distance_impl( - x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor); + case DistanceType::HammingUnexpanded: + dispatch(std::integral_constant{}); break; - case raft::distance::DistanceType::L2Unexpanded: - detail::pairwise_distance_impl( - x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor); + case DistanceType::HellingerExpanded: + dispatch(std::integral_constant{}); break; - case raft::distance::DistanceType::L2SqrtUnexpanded: - detail::pairwise_distance_impl( - x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor); + case DistanceType::JensenShannon: + dispatch(std::integral_constant{}); break; - case raft::distance::DistanceType::Linf: - detail::pairwise_distance_impl( - x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor); + case DistanceType::KLDivergence: + dispatch(std::integral_constant{}); break; - case raft::distance::DistanceType::HellingerExpanded: - detail::pairwise_distance_impl( - x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor); + case DistanceType::L1: + dispatch(std::integral_constant{}); break; - case raft::distance::DistanceType::LpUnexpanded: - detail::pairwise_distance_impl( - x, y, dist, m, n, k, workspace, 
handle.get_stream(), isRowMajor, metric_arg); + case DistanceType::L2Expanded: + dispatch(std::integral_constant{}); break; - case raft::distance::DistanceType::Canberra: - detail::pairwise_distance_impl( - x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor); + case DistanceType::L2SqrtExpanded: + dispatch(std::integral_constant{}); break; - case raft::distance::DistanceType::HammingUnexpanded: - detail::pairwise_distance_impl( - x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor); + case DistanceType::L2SqrtUnexpanded: + dispatch(std::integral_constant{}); break; - case raft::distance::DistanceType::JensenShannon: - detail::pairwise_distance_impl( - x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor); + case DistanceType::L2Unexpanded: + dispatch(std::integral_constant{}); break; - case raft::distance::DistanceType::RusselRaoExpanded: - detail::pairwise_distance_impl( - x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor); + case DistanceType::Linf: + dispatch(std::integral_constant{}); break; - case raft::distance::DistanceType::KLDivergence: - detail::pairwise_distance_impl( - x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor); + case DistanceType::LpUnexpanded: + dispatch(std::integral_constant{}); break; - case raft::distance::DistanceType::CorrelationExpanded: - detail:: - pairwise_distance_impl( - x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor); + case DistanceType::RusselRaoExpanded: + dispatch(std::integral_constant{}); break; default: THROW("Unknown or unsupported distance metric '%d'!", (int)metric); }; @@ -481,4 +475,4 @@ void pairwise_distance(raft::device_resources const& handle, }; // namespace distance }; // namespace raft -#endif \ No newline at end of file +#endif From 8e432383eae1b2017b0c531270956a6a69638ce9 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Tue, 21 Feb 2023 09:57:03 +0100 Subject: [PATCH 40/60] distance_ops: Include cuda_utils.cuh --- 
cpp/include/raft/distance/detail/distance_ops/correlation.cuh | 2 ++ cpp/include/raft/distance/detail/distance_ops/cosine.cuh | 2 ++ cpp/include/raft/distance/detail/distance_ops/hamming.cuh | 2 ++ cpp/include/raft/distance/detail/distance_ops/l2_exp.cuh | 2 ++ cpp/include/raft/distance/detail/distance_ops/l2_unexp.cuh | 2 ++ cpp/include/raft/distance/detail/distance_ops/russel_rao.cuh | 2 ++ cpp/include/raft/distance/detail/distance_ops/template.cuh | 2 ++ 7 files changed, 14 insertions(+) diff --git a/cpp/include/raft/distance/detail/distance_ops/correlation.cuh b/cpp/include/raft/distance/detail/distance_ops/correlation.cuh index eb18355ca9..f17d67953e 100644 --- a/cpp/include/raft/distance/detail/distance_ops/correlation.cuh +++ b/cpp/include/raft/distance/detail/distance_ops/correlation.cuh @@ -16,6 +16,8 @@ #pragma once +#include + namespace raft::distance::detail::ops { // Describes the computation the correlation distance diff --git a/cpp/include/raft/distance/detail/distance_ops/cosine.cuh b/cpp/include/raft/distance/detail/distance_ops/cosine.cuh index bbc1ffcba2..aa2eac01bc 100644 --- a/cpp/include/raft/distance/detail/distance_ops/cosine.cuh +++ b/cpp/include/raft/distance/detail/distance_ops/cosine.cuh @@ -16,6 +16,8 @@ #pragma once +#include + namespace raft::distance::detail::ops { // Describes the computation the cosine distance diff --git a/cpp/include/raft/distance/detail/distance_ops/hamming.cuh b/cpp/include/raft/distance/detail/distance_ops/hamming.cuh index c8b3b7658e..b4f610be0a 100644 --- a/cpp/include/raft/distance/detail/distance_ops/hamming.cuh +++ b/cpp/include/raft/distance/detail/distance_ops/hamming.cuh @@ -16,6 +16,8 @@ #pragma once +#include + namespace raft::distance::detail::ops { // Describes the computation the hamming distance diff --git a/cpp/include/raft/distance/detail/distance_ops/l2_exp.cuh b/cpp/include/raft/distance/detail/distance_ops/l2_exp.cuh index 13a41190c1..523019f417 100644 --- 
a/cpp/include/raft/distance/detail/distance_ops/l2_exp.cuh +++ b/cpp/include/raft/distance/detail/distance_ops/l2_exp.cuh @@ -16,6 +16,8 @@ #pragma once +#include + namespace raft::distance::detail::ops { // Describes the computation the l2 expanded distance diff --git a/cpp/include/raft/distance/detail/distance_ops/l2_unexp.cuh b/cpp/include/raft/distance/detail/distance_ops/l2_unexp.cuh index 31fbd11667..f5e2f278b7 100644 --- a/cpp/include/raft/distance/detail/distance_ops/l2_unexp.cuh +++ b/cpp/include/raft/distance/detail/distance_ops/l2_unexp.cuh @@ -16,6 +16,8 @@ #pragma once +#include + namespace raft::distance::detail::ops { // Describes the computation the l2 unexpanded distance diff --git a/cpp/include/raft/distance/detail/distance_ops/russel_rao.cuh b/cpp/include/raft/distance/detail/distance_ops/russel_rao.cuh index f46a1a5e67..e114ef8224 100644 --- a/cpp/include/raft/distance/detail/distance_ops/russel_rao.cuh +++ b/cpp/include/raft/distance/detail/distance_ops/russel_rao.cuh @@ -16,6 +16,8 @@ #pragma once +#include + namespace raft::distance::detail::ops { // Describes the computation the russel_rao distance diff --git a/cpp/include/raft/distance/detail/distance_ops/template.cuh b/cpp/include/raft/distance/detail/distance_ops/template.cuh index d7bbfc7fca..378bcf0c9f 100644 --- a/cpp/include/raft/distance/detail/distance_ops/template.cuh +++ b/cpp/include/raft/distance/detail/distance_ops/template.cuh @@ -16,6 +16,8 @@ #pragma once +#include + namespace raft::distance::detail::ops { // Describes the computation the template distance From e176351d9d885d8f0a918851220317f74a587e73 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Tue, 21 Feb 2023 10:33:14 +0100 Subject: [PATCH 41/60] Replace DistanceImpl with method overloads --- cpp/include/raft/distance/detail/distance.cuh | 1205 ++++++++++------- .../detail/pairwise_matrix/dispatch.cuh | 26 +- 2 files changed, 705 insertions(+), 526 deletions(-) diff --git 
a/cpp/include/raft/distance/detail/distance.cuh b/cpp/include/raft/distance/detail/distance.cuh index 8d4155356b..58c4dbd275 100644 --- a/cpp/include/raft/distance/detail/distance.cuh +++ b/cpp/include/raft/distance/detail/distance.cuh @@ -16,19 +16,26 @@ #pragma once +#include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + #include #include #include @@ -37,526 +44,710 @@ namespace raft { namespace distance { namespace detail { -namespace { -template -struct DistanceImpl { - void run(const InType* x, - const InType* y, - OutType* dist, - Index_ m, - Index_ n, - Index_ k, - void* workspace, - size_t worksize, - FinalLambda fin_op, - cudaStream_t stream, - bool isRowMajor, - InType metric_arg = 2.0f) - { - } -}; +/** + * @brief: A tag type for overload resolution based on DistanceType + * + * It is not possible to partially specialize function templates on a single + * parameter. Intead, it is often easier to use a combination of conventional + * method overloading and a parameter with a specific tag type. The following + * type is used to help method overloading based on the DistanceType enum. + */ +template +using distance_tag = std::integral_constant; -template -struct DistanceImpl { - void run(const InType* x, - const InType* y, - OutType* dist, - Index_ m, - Index_ n, - Index_ k, - void* workspace, - size_t worksize, - FinalLambda fin_op, - cudaStream_t stream, - bool isRowMajor, - InType) - { - raft::distance::detail::euclideanAlgo1( - m, n, k, x, y, dist, false, (AccType*)workspace, worksize, fin_op, stream, isRowMajor); - } -}; +/** + * @brief Implement pairwise_matrix for specific distance + * + * There are multiple overloads for this function, one for each distance type. + * They are implemented below. 
The documentation of this function serves as + * documentation for all functions. The following overloads are defined: + * + * - DistanceType::Canberra: + * - DistanceType::CorrelationExpanded: + * - DistanceType::CosineExpanded: + * - DistanceType::HammingUnexpanded: + * - DistanceType::HellingerExpanded: + * - DistanceType::JensenShannon: + * - DistanceType::KLDivergence: + * - DistanceType::L1: + * - DistanceType::L2Expanded: + * - DistanceType::L2SqrtExpanded: + * - DistanceType::L2Unexpanded: + * - DistanceType::L2SqrtUnexpanded: + * - DistanceType::Linf: + * - DistanceType::LpUnexpanded: + * - DistanceType::RusselRaoExpanded: + * + * @tparam DataT Input data type + * @tparam AccT Accumulation data type + * @tparam OutT Output data type + * @tparam FinOpT Type of final operation + * @tparam IdxT Index type + * + * @param distance_type A tag type to indicate which distance is calculated. + * @param x First set of points + * @param y Second set of points + * @param out Output distance matrix + * @param m Number of points in x + * @param n Number of points in y + * @param k Dimensionality of points in x, y + * @param workspace Temporary workspace needed for computations + * @param worksize Number of bytes of the workspace + * @param stream CUDA stream + * @param is_row_major Whether the matrices are row-major or col-major + * @param metric_arg The `p` argument for Lp. 
+ */ +template +void distance_impl( + distance_tag distance_type, + const DataT* x, + const DataT* y, + OutT* out, + IdxT m, + IdxT n, + IdxT k, + AccT* workspace, // unused + size_t worksize, // unused + FinOpT fin_op, + cudaStream_t stream, + bool is_row_major, + DataT metric_arg) // unused +{ + ops::canberra_distance_op distance_op{}; -template -struct DistanceImpl { - void run(const InType* x, - const InType* y, - OutType* dist, - Index_ m, - Index_ n, - Index_ k, - void* workspace, - size_t worksize, - FinalLambda fin_op, - cudaStream_t stream, - bool isRowMajor, - InType) - { - raft::distance::detail::euclideanAlgo1( - m, n, k, x, y, dist, true, (AccType*)workspace, worksize, fin_op, stream, isRowMajor); - } -}; + const DataT* x_norm = nullptr; + const DataT* y_norm = nullptr; -template -struct DistanceImpl { - void run(const InType* x, - const InType* y, - OutType* dist, - Index_ m, - Index_ n, - Index_ k, - void* workspace, - size_t worksize, - FinalLambda fin_op, - cudaStream_t stream, - bool isRowMajor, - InType) - { - raft::distance::detail::cosineAlgo1( - m, n, k, x, y, dist, (AccType*)workspace, worksize, fin_op, stream, isRowMajor); - } -}; + distance_matrix_dispatch( + distance_op, m, n, k, x, y, x_norm, y_norm, out, fin_op, stream, is_row_major); +} -template -struct DistanceImpl { - void run(const InType* x, - const InType* y, - OutType* dist, - Index_ m, - Index_ n, - Index_ k, - void*, - size_t, - FinalLambda fin_op, - cudaStream_t stream, - bool isRowMajor, - InType) - { - raft::distance::detail::euclideanAlgo2( - m, n, k, x, y, dist, false, fin_op, stream, isRowMajor); - } -}; +template +void distance_impl( + distance_tag distance_type, + const DataT* x, + const DataT* y, + OutT* out, + IdxT m, + IdxT n, + IdxT k, + AccT* workspace, + size_t worksize, + FinOpT fin_op, + cudaStream_t stream, + bool is_row_major, + DataT) // unused +{ + ASSERT(!(((x != y) && (worksize < 2 * (m + n) * sizeof(AccT))) || + (worksize < 2 * m * sizeof(AccT))), + 
"workspace size error"); + ASSERT(workspace != nullptr, "workspace is null"); -template -struct DistanceImpl { - void run(const InType* x, - const InType* y, - OutType* dist, - Index_ m, - Index_ n, - Index_ k, - void*, - size_t, - FinalLambda fin_op, - cudaStream_t stream, - bool isRowMajor, - InType) - { - raft::distance::detail::euclideanAlgo2( - m, n, k, x, y, dist, true, fin_op, stream, isRowMajor); - } -}; + AccT* norm_col_vec = workspace; + AccT* norm_row_vec = workspace; + AccT* sq_norm_col_vec = workspace; + AccT* sq_norm_row_vec = workspace; + if (x != y) { + norm_row_vec += m; -template -struct DistanceImpl { - void run(const InType* x, - const InType* y, - OutType* dist, - Index_ m, - Index_ n, - Index_ k, - void*, - size_t, - FinalLambda fin_op, - cudaStream_t stream, - bool isRowMajor, - InType) - { - raft::distance::detail::l1Impl( - m, n, k, x, y, dist, fin_op, stream, isRowMajor); - } -}; + raft::linalg::reduce(norm_col_vec, + x, + k, + m, + (AccT)0, + is_row_major, + true, + stream, + false, + raft::identity_op(), + raft::add_op()); + raft::linalg::reduce(norm_row_vec, + y, + k, + n, + (AccT)0, + is_row_major, + true, + stream, + false, + raft::identity_op(), + raft::add_op()); -template -struct DistanceImpl { - void run(const InType* x, - const InType* y, - OutType* dist, - Index_ m, - Index_ n, - Index_ k, - void*, - size_t, - FinalLambda fin_op, - cudaStream_t stream, - bool isRowMajor, - InType) - { - raft::distance::detail::chebyshevImpl( - m, n, k, x, y, dist, fin_op, stream, isRowMajor); + sq_norm_col_vec += (m + n); + sq_norm_row_vec = sq_norm_col_vec + m; + raft::linalg::rowNorm(sq_norm_col_vec, x, k, m, raft::linalg::L2Norm, is_row_major, stream); + raft::linalg::rowNorm(sq_norm_row_vec, y, k, n, raft::linalg::L2Norm, is_row_major, stream); + } else { + raft::linalg::reduce(norm_col_vec, + x, + k, + m, + (AccT)0, + is_row_major, + true, + stream, + false, + raft::identity_op(), + raft::add_op()); + sq_norm_col_vec += m; + sq_norm_row_vec 
= sq_norm_col_vec; + raft::linalg::rowNorm(sq_norm_col_vec, x, k, m, raft::linalg::L2Norm, is_row_major, stream); } -}; -template -struct DistanceImpl { - void run(const InType* x, - const InType* y, - OutType* dist, - Index_ m, - Index_ n, - Index_ k, - void*, - size_t, - FinalLambda fin_op, - cudaStream_t stream, - bool isRowMajor, - InType) - { - raft::distance::detail::hellingerImpl( - m, n, k, x, y, dist, fin_op, stream, isRowMajor); + using CorrOp = ops::correlation_distance_op; + CorrOp corr_op(is_row_major, sq_norm_col_vec, sq_norm_row_vec, m, n, k); + distance_matrix_dispatch( + corr_op, m, n, k, x, y, norm_col_vec, norm_row_vec, out, fin_op, stream, is_row_major); +} + +template +void distance_impl( + distance_tag distance_type, + const DataT* x, + const DataT* y, + OutT* out, + IdxT m, + IdxT n, + IdxT k, + AccT* workspace, + size_t worksize, + FinOpT fin_op, + cudaStream_t stream, + bool is_row_major, + DataT) // unused +{ + // raft distance support inputs as float/double and output as uint8_t/float/double. 
+ static_assert(!((sizeof(OutT) > 1) && (sizeof(AccT) != sizeof(OutT))), + "OutT can be uint8_t, float, double," + "if sizeof(OutT) > 1 then sizeof(AccT) == sizeof(OutT)."); + + ASSERT( + !(((x != y) && (worksize < (m + n) * sizeof(AccT))) || (worksize < m * sizeof(AccT))), + "workspace size error"); + ASSERT(workspace != nullptr, "workspace is null"); + + + DataT* norm_A = workspace; + DataT* norm_B = workspace; + if (x != y) { + norm_B += m; + raft::linalg::rowNorm( + norm_A, x, k, m, raft::linalg::L2Norm, is_row_major, stream, raft::sqrt_op{}); + raft::linalg::rowNorm( + norm_B, y, k, n, raft::linalg::L2Norm, is_row_major, stream, raft::sqrt_op{}); + } else { + raft::linalg::rowNorm( + norm_A, x, k, m, raft::linalg::L2Norm, is_row_major, stream, raft::sqrt_op{}); } -}; -template -struct DistanceImpl { - void run(const InType* x, - const InType* y, - OutType* dist, - Index_ m, - Index_ n, - Index_ k, - void*, - size_t, - FinalLambda fin_op, - cudaStream_t stream, - bool isRowMajor, - InType metric_arg) - { - raft::distance::detail::minkowskiImpl( - m, n, k, x, y, dist, fin_op, stream, isRowMajor, metric_arg); + // On CUDA 12: + // - always execute normal kernel + // + // On CUDA 11 and below: + // - execute CUTLASS-based kernel on SM_80 and above + // - execute normal kernel otherwise. + + if constexpr (__CUDACC_VER_MAJOR__ == 12) { + // Always execute legacy kernels on CUDA 12 + ops::cosine_distance_op distance_op{}; + distance_matrix_dispatch( + distance_op, m, n, k, x, y, norm_A, norm_B, out, fin_op, stream, is_row_major); + } else { + const auto deviceVersion = getComputeCapability(); + if (deviceVersion.first >= 8) { + // If device is SM_80 or later, use CUTLASS-based kernel. 
+ using Op = ops::cosine_cutlass_op; + Op distance_op{}; + + distance_matrix_cutlass_dispatch( + distance_op, m, n, k, x, y, norm_A, norm_B, out, fin_op, stream, is_row_major); + } else { + // Else use "legacy" L2 + ops::cosine_distance_op distance_op{}; + distance_matrix_dispatch( + distance_op, m, n, k, x, y, norm_A, norm_B, out, fin_op, stream, is_row_major); + } } -}; +} -template -struct DistanceImpl { - void run(const InType* x, - const InType* y, - OutType* dist, - Index_ m, - Index_ n, - Index_ k, - void*, - size_t, - FinalLambda fin_op, - cudaStream_t stream, - bool isRowMajor, - InType) - { - raft::distance::detail::canberraImpl( - m, n, k, x, y, dist, fin_op, stream, isRowMajor); +template +void distance_impl( + distance_tag distance_type, + const DataT* x, + const DataT* y, + OutT* out, + IdxT m, + IdxT n, + IdxT k, + AccT*, // workspace unused + size_t, // worksize unused + FinOpT fin_op, + cudaStream_t stream, + bool is_row_major, + DataT) // metric_arg unused +{ + ops::hamming_distance_op distance_op{k}; + + const DataT* x_norm = nullptr; + const DataT* y_norm = nullptr; + + distance_matrix_dispatch( + distance_op, m, n, k, x, y, x_norm, y_norm, out, fin_op, stream, is_row_major); +} + +template +void distance_impl( + distance_tag distance_type, + const DataT* x, + const DataT* y, + OutT* out, + IdxT m, + IdxT n, + IdxT k, + AccT*, // workspace unused + size_t, // worksize unused + FinOpT fin_op, + cudaStream_t stream, + bool is_row_major, + DataT) // metric_arg unused +{ + // First sqrt x and y + const auto raft_sqrt = raft::linalg::unaryOp; + + raft_sqrt((DataT*)x, x, m * k, raft::sqrt_op{}, stream); + if (x != y) { + raft_sqrt((DataT*)y, y, n * k, raft::sqrt_op{}, stream); } -}; -template -struct DistanceImpl { - void run(const InType* x, - const InType* y, - OutType* dist, - Index_ m, - Index_ n, - Index_ k, - void*, - size_t, - FinalLambda fin_op, - cudaStream_t stream, - bool isRowMajor, - InType) - { - 
raft::distance::detail::hammingUnexpandedImpl( - m, n, k, x, y, dist, fin_op, stream, isRowMajor); + // Then calculate Hellinger distance + ops::hellinger_distance_op distance_op{}; + + const DataT* x_norm = nullptr; + const DataT* y_norm = nullptr; + + distance_matrix_dispatch( + distance_op, m, n, k, x, y, x_norm, y_norm, out, fin_op, stream, is_row_major); + + // Finally revert sqrt of x and y + raft_sqrt((DataT*)x, x, m * k, raft::sqrt_op{}, stream); + if (x != y) { + raft_sqrt((DataT*)y, y, n * k, raft::sqrt_op{}, stream); } -}; -template -struct DistanceImpl { - void run(const InType* x, - const InType* y, - OutType* dist, - Index_ m, - Index_ n, - Index_ k, - void*, - size_t, - FinalLambda fin_op, - cudaStream_t stream, - bool isRowMajor, - InType) - { - raft::distance::detail::jensenShannonImpl( - m, n, k, x, y, dist, fin_op, stream, isRowMajor); + RAFT_CUDA_TRY(cudaGetLastError()); +} + +template +void distance_impl( + distance_tag distance_type, + const DataT* x, + const DataT* y, + OutT* out, + IdxT m, + IdxT n, + IdxT k, + AccT*, // workspace unused + size_t, // worksize unused + FinOpT fin_op, + cudaStream_t stream, + bool is_row_major, + DataT) // metric_arg unused +{ + ops::jensen_shannon_distance_op distance_op{}; + + const DataT* x_norm = nullptr; + const DataT* y_norm = nullptr; + + distance_matrix_dispatch( + distance_op, m, n, k, x, y, x_norm, y_norm, out, fin_op, stream, is_row_major); +} + +template +void distance_impl( + distance_tag distance_type, + const DataT* x, + const DataT* y, + OutT* out, + IdxT m, + IdxT n, + IdxT k, + AccT*, // workspace unused + size_t, // worksize unused + FinOpT fin_op, + cudaStream_t stream, + bool is_row_major, + DataT) // metric_arg unused +{ + auto unaryOp_lambda = [] __device__(DataT input) { + const bool x_zero = (input == 0); + return (!x_zero) * raft::log(input + x_zero); }; + + auto unaryOp_lambda_reverse = [] __device__(DataT input) { + // reverse previous log (x) back to x using (e ^ log(x)) + const 
bool x_zero = (input == 0); + return (!x_zero) * raft::exp(input); }; + + // This op takes some shortcuts when x equals y. So its behavior changes based + // on this. + ops::kl_divergence_op kl_divergence{is_row_major, x == y}; + + if (x != y) { + raft::linalg::unaryOp( + (DataT*)y, y, n * k, unaryOp_lambda, stream); } -}; -template -struct DistanceImpl { - void run(const InType* x, - const InType* y, - OutType* dist, - Index_ m, - Index_ n, - Index_ k, - void*, - size_t, - FinalLambda fin_op, - cudaStream_t stream, - bool isRowMajor, - InType) - { - raft::distance::detail::russellRaoImpl( - m, n, k, x, y, dist, fin_op, stream, isRowMajor); + const DataT* x_norm = nullptr; + const DataT* y_norm = nullptr; + + distance_matrix_dispatch( + kl_divergence, m, n, k, x, y, x_norm, y_norm, out, fin_op, stream, is_row_major); + + if (x != y) { + // Now reverse previous log (x) back to x using (e ^ log(x)) + raft::linalg::unaryOp( + (DataT*)y, y, n * k, unaryOp_lambda_reverse, stream); } -}; +} -template -struct DistanceImpl { - void run(const InType* x, - const InType* y, - OutType* dist, - Index_ m, - Index_ n, - Index_ k, - void*, - size_t, - FinalLambda fin_op, - cudaStream_t stream, - bool isRowMajor, - InType) - { - raft::distance::detail::klDivergenceImpl( - m, n, k, x, y, dist, fin_op, stream, isRowMajor); + +template +void distance_impl( + distance_tag distance_type, + const DataT* x, + const DataT* y, + OutT* out, + IdxT m, + IdxT n, + IdxT k, + AccT*, // workspace unused + size_t, // worksize unused + FinOpT fin_op, + cudaStream_t stream, + bool is_row_major, + DataT) // metric_arg unused +{ + ops::l1_distance_op distance_op{}; + + const DataT* x_norm = nullptr; + const DataT* y_norm = nullptr; + + distance_matrix_dispatch( + distance_op, m, n, k, x, y, x_norm, y_norm, out, fin_op, stream, is_row_major); +} + +template +void distance_impl_l2_expanded( // NOTE: different name + bool perform_sqrt, // dispatch on sqrt + const DataT* x, + const DataT* y, + OutT* out, 
+ IdxT m, + IdxT n, + IdxT k, + AccT* workspace, + size_t worksize, + FinOpT fin_op, + cudaStream_t stream, + bool is_row_major) +{ + // raft distance support inputs as float/double and output as uint8_t/float/double. + static_assert(!((sizeof(OutT) > 1) && (sizeof(AccT) != sizeof(OutT))), + "OutT can be uint8_t, float, double," + "if sizeof(OutT) > 1 then sizeof(AccT) == sizeof(OutT)."); + + ASSERT( + !(((x != y) && (worksize < (m + n) * sizeof(AccT))) || (worksize < m * sizeof(AccT))), + "workspace size error"); + ASSERT(workspace != nullptr, "workspace is null"); + + DataT* norm_A = workspace; + DataT* norm_B = workspace; + if (x != y) { + norm_B += m; + raft::linalg::rowNorm( + norm_A, x, k, m, raft::linalg::L2Norm, is_row_major, stream, raft::identity_op{}); + raft::linalg::rowNorm( + norm_B, y, k, n, raft::linalg::L2Norm, is_row_major, stream, raft::identity_op{}); + } else { + raft::linalg::rowNorm( + norm_A, x, k, m, raft::linalg::L2Norm, is_row_major, stream, raft::identity_op{}); } -}; -template -struct DistanceImpl { - void run(const InType* x, - const InType* y, - OutType* dist, - Index_ m, - Index_ n, - Index_ k, - void* workspace, - size_t worksize, - FinalLambda fin_op, - cudaStream_t stream, - bool isRowMajor, - InType) - { - raft::distance::detail::correlationImpl( - m, n, k, x, y, dist, (AccType*)workspace, worksize, fin_op, stream, isRowMajor); + // On CUDA 12: + // - always execute normal kernel + // + // On CUDA 11 and below: + // - execute CUTLASS-based kernel on SM_80 and above + // - execute normal kernel otherwise. + + if constexpr (__CUDACC_VER_MAJOR__ == 12) { + // Always execute legacy kernels on CUDA 12 + ops::l2_exp_distance_op l2_op(perform_sqrt); + distance_matrix_dispatch( + l2_op, m, n, k, x, y, norm_A, norm_B, out, fin_op, stream, is_row_major); + } else { + const auto deviceVersion = getComputeCapability(); + if (deviceVersion.first >= 8) { + // If device is SM_80 or later, use CUTLASS-based kernel. 
+ using L2Op = ops::l2_exp_cutlass_op; + L2Op l2_op(perform_sqrt); + + distance_matrix_cutlass_dispatch( + l2_op, m, n, k, x, y, norm_A, norm_B, out, fin_op, stream, is_row_major); + } else { + // Else use "legacy" L2 + ops::l2_exp_distance_op l2_op(perform_sqrt); + distance_matrix_dispatch( + l2_op, m, n, k, x, y, norm_A, norm_B, out, fin_op, stream, is_row_major); + } } -}; +} -} // anonymous namespace +template +void distance_impl( + distance_tag distance_type, + const DataT* x, + const DataT* y, + OutT* out, + IdxT m, + IdxT n, + IdxT k, + AccT* workspace, + size_t worksize, + FinOpT fin_op, + cudaStream_t stream, + bool is_row_major, + DataT) // metric_arg unused +{ + bool perform_sqrt = false; + distance_impl_l2_expanded(perform_sqrt, x, y, out, m, n, k, workspace, worksize, fin_op, stream, is_row_major); +} -/** - * @brief Evaluate pairwise distances with the user epilogue lamba allowed - * @tparam DistanceType which distance to evaluate - * @tparam InType input argument type - * @tparam AccType accumulation type - * @tparam OutType output type - * @tparam FinalLambda user-defined epilogue lamba - * @tparam Index_ Index type - * @param x first set of points - * @param y second set of points - * @param dist output distance matrix - * @param m number of points in x - * @param n number of points in y - * @param k dimensionality - * @param workspace temporary workspace needed for computations - * @param worksize number of bytes of the workspace - * @param fin_op the final gemm epilogue lambda - * @param stream cuda stream - * @param isRowMajor whether the matrices are row-major or col-major - * - * @note fin_op: This is a device lambda which is supposed to operate upon the - * input which is AccType and returns the output in OutType. It's signature is - * as follows:
OutType fin_op(AccType in, int g_idx);
. If one needs - * any other parameters, feel free to pass them via closure. - */ -template -void distance(const InType* x, - const InType* y, - OutType* dist, - Index_ m, - Index_ n, - Index_ k, - void* workspace, - size_t worksize, - FinalLambda fin_op, - cudaStream_t stream, - bool isRowMajor = true, - InType metric_arg = 2.0f) +template +void distance_impl( + distance_tag distance_type, + const DataT* x, + const DataT* y, + OutT* out, + IdxT m, + IdxT n, + IdxT k, + AccT* workspace, + size_t worksize, + FinOpT fin_op, + cudaStream_t stream, + bool is_row_major, + DataT) // metric_arg unused { - DistanceImpl distImpl; - distImpl.run(x, y, dist, m, n, k, workspace, worksize, fin_op, stream, isRowMajor, metric_arg); - RAFT_CUDA_TRY(cudaPeekAtLastError()); + bool perform_sqrt = true; + distance_impl_l2_expanded(perform_sqrt, x, y, out, m, n, k, workspace, worksize, fin_op, stream, is_row_major); +} + +template +void distance_impl( + distance_tag distance_type, + const DataT* x, + const DataT* y, + OutT* out, + IdxT m, + IdxT n, + IdxT k, + AccT*, // workspace unused + size_t, // worksize unused + FinOpT fin_op, + cudaStream_t stream, + bool is_row_major, + DataT) // metric_arg unused +{ + bool perform_sqrt = false; + ops::l2_unexp_distance_op l2_op(perform_sqrt); + + // The unexpanded L2 does not require the norms of a and b to be calculated. + const DataT* norm_A = nullptr; + const DataT* norm_B = nullptr; + + distance_matrix_dispatch( + l2_op, m, n, k, x, y, norm_A, norm_B, out, fin_op, stream, is_row_major); +} + +template +void distance_impl( + distance_tag distance_type, + const DataT* x, + const DataT* y, + OutT* out, + IdxT m, + IdxT n, + IdxT k, + AccT*, // workspace unused + size_t, // worksize unused + FinOpT fin_op, + cudaStream_t stream, + bool is_row_major, + DataT) // metric_arg unused +{ + bool perform_sqrt = true; + ops::l2_unexp_distance_op l2_op(perform_sqrt); + + // The unexpanded L2 does not require the norms of a and b to be calculated. 
+ const DataT* norm_A = nullptr; + const DataT* norm_B = nullptr; + + distance_matrix_dispatch( + l2_op, m, n, k, x, y, norm_A, norm_B, out, fin_op, stream, is_row_major); +} + +template +void distance_impl( + distance_tag distance_type, + const DataT* x, + const DataT* y, + OutT* out, + IdxT m, + IdxT n, + IdxT k, + AccT*, // workspace unused + size_t, // worksize unused + FinOpT fin_op, + cudaStream_t stream, + bool is_row_major, + DataT) // metric_arg unused +{ + ops::chebyshev_distance_op distance_op{}; + + const DataT* x_norm = nullptr; + const DataT* y_norm = nullptr; + + distance_matrix_dispatch( + distance_op, m, n, k, x, y, x_norm, y_norm, out, fin_op, stream, is_row_major); +} + +template +void distance_impl( + distance_tag distance_type, + const DataT* x, + const DataT* y, + OutT* out, + IdxT m, + IdxT n, + IdxT k, + AccT*, // workspace unused + size_t, // worksize unused + FinOpT fin_op, + cudaStream_t stream, + bool is_row_major, + DataT metric_arg) +{ + ops::minkowski_distance_op distance_op{metric_arg}; + + const DataT* x_norm = nullptr; + const DataT* y_norm = nullptr; + + distance_matrix_dispatch( + distance_op, m, n, k, x, y, x_norm, y_norm, out, fin_op, stream, is_row_major); +} + +template +void distance_impl( + distance_tag distance_type, + const DataT* x, + const DataT* y, + OutT* out, + IdxT m, + IdxT n, + IdxT k, + AccT*, // workspace unused + size_t, // worksize unused + FinOpT fin_op, + cudaStream_t stream, + bool is_row_major, + DataT) // metric_arg unused +{ + ops::russel_rao_distance_op distance_op{k}; + + const DataT* x_norm = nullptr; + const DataT* y_norm = nullptr; + + distance_matrix_dispatch( + distance_op, m, n, k, x, y, x_norm, y_norm, out, fin_op, stream, is_row_major); } /** - * @brief Evaluate pairwise distances for the simple use case + * @brief Evaluate pairwise distances and write to matrix + * * @tparam DistanceType which distance to evaluate * @tparam InType input argument type * @tparam AccType accumulation type * 
@tparam OutType output type * @tparam Index_ Index type + * * @param x first set of points * @param y second set of points - * @param dist output distance matrix + * @param out output distance matrix * @param m number of points in x * @param n number of points in y * @param k dimensionality @@ -568,19 +759,6 @@ void distance(const InType* x, * @note if workspace is passed as nullptr, this will return in * worksize, the number of bytes of workspace required */ - -// Default final op functor which facilitates elementwise operation on -// final distance value if any. -template -struct default_fin_op { - __host__ __device__ default_fin_op() noexcept {}; - // functor signature. - __host__ __device__ OutType operator()(AccType d_val, Index g_d_idx) const noexcept - { - return d_val; - } -}; - template void distance(const InType* x, const InType* y, - OutType* dist, + OutType* out, Index_ m, Index_ n, Index_ k, @@ -598,15 +776,16 @@ void distance(const InType* x, bool isRowMajor = true, InType metric_arg = 2.0f) { - using final_op_type = default_fin_op; - final_op_type fin_op; + auto fin_op = raft::identity_op(); // raft distance support inputs as float/double and output as uint8_t/float/double. 
static_assert(!((sizeof(OutType) > 1) && (sizeof(AccType) != sizeof(OutType))), "OutType can be uint8_t, float, double," "if sizeof(OutType) > 1 then sizeof(AccType) == sizeof(OutType)."); - distance( - x, y, dist, m, n, k, workspace, worksize, fin_op, stream, isRowMajor, metric_arg); + + distance_impl( + distance_tag{}, + x, y, out, m, n, k, reinterpret_cast(workspace), worksize, fin_op, stream, isRowMajor, metric_arg); RAFT_CUDA_TRY(cudaPeekAtLastError()); } diff --git a/cpp/include/raft/distance/detail/pairwise_matrix/dispatch.cuh b/cpp/include/raft/distance/detail/pairwise_matrix/dispatch.cuh index b3362e7647..4a7c1f999f 100644 --- a/cpp/include/raft/distance/detail/pairwise_matrix/dispatch.cuh +++ b/cpp/include/raft/distance/detail/pairwise_matrix/dispatch.cuh @@ -69,19 +69,19 @@ using vec_len_constant = std::integral_constant; template void dispatch(bool row_major, int vec_len, F&& f) { - if (row_major) { - switch (vec_len) { - case 4: f(std::bool_constant(), vec_len_constant<4>()); break; - case 2: f(std::bool_constant(), vec_len_constant<2>()); break; - default: f(std::bool_constant(), vec_len_constant<1>()); break; - } - } else { - switch (vec_len) { - case 4: f(std::bool_constant(), vec_len_constant<4>()); break; - case 2: f(std::bool_constant(), vec_len_constant<2>()); break; - default: f(std::bool_constant(), vec_len_constant<1>()); break; - } - } + // if (row_major) { + // switch (vec_len) { + // case 4: f(std::bool_constant(), vec_len_constant<4>()); break; + // case 2: f(std::bool_constant(), vec_len_constant<2>()); break; + // default: f(std::bool_constant(), vec_len_constant<1>()); break; + // } + // } else { + // switch (vec_len) { + // case 4: f(std::bool_constant(), vec_len_constant<4>()); break; + // case 2: f(std::bool_constant(), vec_len_constant<2>()); break; + // default: f(std::bool_constant(), vec_len_constant<1>()); break; + // } + // } } template Date: Tue, 21 Feb 2023 11:17:24 +0100 Subject: [PATCH 42/60] Remove impl files and move doc 
strings --- cpp/include/raft/distance/detail/canberra.cuh | 71 --- .../raft/distance/detail/chebyshev.cuh | 68 --- .../raft/distance/detail/correlation.cuh | 127 ---- cpp/include/raft/distance/detail/cosine.cuh | 123 ---- cpp/include/raft/distance/detail/distance.cuh | 569 ++++++++---------- .../distance/detail/distance_ops/canberra.cuh | 9 +- .../detail/distance_ops/chebyshev.cuh | 9 +- .../detail/distance_ops/correlation.cuh | 32 +- .../distance/detail/distance_ops/cosine.cuh | 11 +- .../distance/detail/distance_ops/hamming.cuh | 10 +- .../detail/distance_ops/hellinger.cuh | 12 +- .../detail/distance_ops/jensen_shannon.cuh | 8 + .../detail/distance_ops/kl_divergence.cuh | 19 +- .../raft/distance/detail/distance_ops/l1.cuh | 8 +- .../distance/detail/distance_ops/l2_exp.cuh | 11 +- .../distance/detail/distance_ops/l2_unexp.cuh | 8 +- .../detail/distance_ops/minkowski.cuh | 11 +- .../detail/distance_ops/russel_rao.cuh | 14 +- .../raft/distance/detail/euclidean.cuh | 169 ------ cpp/include/raft/distance/detail/hamming.cuh | 71 --- .../raft/distance/detail/hellinger.cuh | 94 --- .../raft/distance/detail/jensen_shannon.cuh | 72 --- .../raft/distance/detail/kl_divergence.cuh | 98 --- cpp/include/raft/distance/detail/l1.cuh | 51 -- .../raft/distance/detail/minkowski.cuh | 70 --- .../raft/distance/detail/russell_rao.cuh | 70 --- 26 files changed, 361 insertions(+), 1454 deletions(-) delete mode 100644 cpp/include/raft/distance/detail/canberra.cuh delete mode 100644 cpp/include/raft/distance/detail/chebyshev.cuh delete mode 100644 cpp/include/raft/distance/detail/correlation.cuh delete mode 100644 cpp/include/raft/distance/detail/cosine.cuh delete mode 100644 cpp/include/raft/distance/detail/euclidean.cuh delete mode 100644 cpp/include/raft/distance/detail/hamming.cuh delete mode 100644 cpp/include/raft/distance/detail/hellinger.cuh delete mode 100644 cpp/include/raft/distance/detail/jensen_shannon.cuh delete mode 100644 cpp/include/raft/distance/detail/kl_divergence.cuh 
delete mode 100644 cpp/include/raft/distance/detail/l1.cuh delete mode 100644 cpp/include/raft/distance/detail/minkowski.cuh delete mode 100644 cpp/include/raft/distance/detail/russell_rao.cuh diff --git a/cpp/include/raft/distance/detail/canberra.cuh b/cpp/include/raft/distance/detail/canberra.cuh deleted file mode 100644 index 3f0c2fa268..0000000000 --- a/cpp/include/raft/distance/detail/canberra.cuh +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include "distance_ops/canberra.cuh" -#include "pairwise_matrix/dispatch.cuh" - -namespace raft { -namespace distance { -namespace detail { - - -/** - * @brief the canberra distance matrix calculation - * It computes the following equation: cij = max(cij, op(ai-bj)) - * @tparam DataT input data-type (for A and B matrices) - * @tparam AccT accumulation data-type - * @tparam OutT output data-type (for C and D matrices) - * @tparam FinOpT user-defined epilogue lamba - * @tparam IdxT Index type - * @param[in] m number of rows of A and C/D - * @param[in] n number of rows of B and cols of C/D - * @param[in] k number of cols of A and B - * @param[in] pA input matrix - * @param[in] pB input matrix - * @param[out] pD output matrix - * @param[in] fin_op the final element-wise epilogue lambda - * @param[in] stream cuda stream to launch work - * @param[in] isRowMajor whether the input and output matrices are row major - */ -template -void canberraImpl(int m, - int n, - int k, - const DataT* x, - const DataT* y, - OutT* out, - FinOpT fin_op, - cudaStream_t stream, - bool is_row_major) -{ - ops::canberra_distance_op distance_op{}; - - const DataT* x_norm = nullptr; - const DataT* y_norm = nullptr; - - distance_matrix_dispatch( - distance_op, m, n, k, x, y, x_norm, y_norm, out, fin_op, stream, is_row_major); -} - -} // namespace detail -} // namespace distance -} // namespace raft diff --git a/cpp/include/raft/distance/detail/chebyshev.cuh b/cpp/include/raft/distance/detail/chebyshev.cuh deleted file mode 100644 index 9f49660301..0000000000 --- a/cpp/include/raft/distance/detail/chebyshev.cuh +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once -#include "distance_ops/chebyshev.cuh" -#include "pairwise_matrix/dispatch.cuh" - -namespace raft { -namespace distance { -namespace detail { - -/** - * @brief the chebyshev distance matrix calculation - * It computes the following equation: cij = max(cij, op(ai-bj)) - * @tparam InType input data-type (for A and B matrices) - * @tparam AccType accumulation data-type - * @tparam OutType output data-type (for C and D matrices) - * @tparam FinalLambda user-defined epilogue lamba - * @tparam Index_ Index type - * @param[in] m number of rows of A and C/D - * @param[in] n number of rows of B and cols of C/D - * @param[in] k number of cols of A and B - * @param[in] pA input matrix - * @param[in] pB input matrix - * @param[out] pD output matrix - * @param[in] fin_op the final element-wise epilogue lambda - * @param[in] stream cuda stream to launch work - * @param[in] isRowMajor whether the input and output matrices are row major - */ -template -void chebyshevImpl(int m, - int n, - int k, - const DataT* x, - const DataT* y, - OutT* out, - FinOpT fin_op, - cudaStream_t stream, - bool is_row_major) -{ - ops::chebyshev_distance_op distance_op{}; - - const DataT* x_norm = nullptr; - const DataT* y_norm = nullptr; - - distance_matrix_dispatch( - distance_op, m, n, k, x, y, x_norm, y_norm, out, fin_op, stream, is_row_major); -} -} // namespace detail -} // namespace distance -} // namespace raft diff --git a/cpp/include/raft/distance/detail/correlation.cuh b/cpp/include/raft/distance/detail/correlation.cuh deleted file mode 100644 index 
89828c9ba2..0000000000 --- a/cpp/include/raft/distance/detail/correlation.cuh +++ /dev/null @@ -1,127 +0,0 @@ -/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -#include "pairwise_matrix/dispatch.cuh" -#include "distance_ops/correlation.cuh" - -namespace raft { -namespace distance { -namespace detail { - -/** - * @brief the Correlation distance matrix calculation - * - * @tparam InType input data-type (for A and B matrices) - * @tparam AccType accumulation data-type - * @tparam OutType output data-type (for C and D matrices) - * @tparam FinalLambda user-defined epilogue lamba - * @tparam Index_ Index type - * @param m number of rows of A and C/D - * @param n number of columns of B and C/D - * @param k number of cols of A and rows of B - * @param pA input matrix - * @param pB input matrix - * @param pD output matrix - * @param fin_op the final element-wise epilogue lambda - * @param stream cuda stream where to launch work - * @param isRowMajor whether the input and output matrices are row major - */ -template -void correlationImpl(int m, - int n, - int k, - const InType* pA, - const InType* pB, - OutType* pD, - AccType* workspace, - size_t& worksize, - FinalLambda fin_op, - cudaStream_t stream, - bool isRowMajor) -{ - ASSERT(!(((pA != pB) && (worksize < 2 * (m + n) * sizeof(AccType))) || - (worksize < 2 * m * sizeof(AccType))), - "workspace size error"); - ASSERT(workspace 
!= nullptr, "workspace is null"); - - AccType* norm_col_vec = workspace; - AccType* norm_row_vec = workspace; - AccType* sq_norm_col_vec = workspace; - AccType* sq_norm_row_vec = workspace; - if (pA != pB) { - norm_row_vec += m; - - raft::linalg::reduce(norm_col_vec, - pA, - k, - m, - (AccType)0, - isRowMajor, - true, - stream, - false, - raft::identity_op(), - raft::add_op()); - raft::linalg::reduce(norm_row_vec, - pB, - k, - n, - (AccType)0, - isRowMajor, - true, - stream, - false, - raft::identity_op(), - raft::add_op()); - - sq_norm_col_vec += (m + n); - sq_norm_row_vec = sq_norm_col_vec + m; - raft::linalg::rowNorm(sq_norm_col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor, stream); - raft::linalg::rowNorm(sq_norm_row_vec, pB, k, n, raft::linalg::L2Norm, isRowMajor, stream); - } else { - raft::linalg::reduce(norm_col_vec, - pA, - k, - m, - (AccType)0, - isRowMajor, - true, - stream, - false, - raft::identity_op(), - raft::add_op()); - sq_norm_col_vec += m; - sq_norm_row_vec = sq_norm_col_vec; - raft::linalg::rowNorm(sq_norm_col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor, stream); - } - - using CorrOp = ops::correlation_distance_op; - CorrOp corr_op(isRowMajor, sq_norm_col_vec, sq_norm_row_vec, m, n, k); - distance_matrix_dispatch( - corr_op, m, n, k, pA, pB, norm_col_vec, norm_row_vec, pD, fin_op, stream, isRowMajor); -} - -} // namespace detail -} // namespace distance -} // namespace raft diff --git a/cpp/include/raft/distance/detail/cosine.cuh b/cpp/include/raft/distance/detail/cosine.cuh deleted file mode 100644 index 4ae0c285f5..0000000000 --- a/cpp/include/raft/distance/detail/cosine.cuh +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Copyright (c) 2018-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once -#include - -#include "pairwise_matrix/dispatch.cuh" -#include "distance_ops/cosine.cuh" - -namespace raft { -namespace distance { -namespace detail { - -/** - * @brief the expanded cosine distance matrix calculation - * It computes the following equation: - * C = 1 - op(A * B / sqrt(A^2) * sqrt(B^2))) - * @tparam IType input data-type (for A and B matrices) - * @tparam AccType accumulation data-type - * @tparam OType output data-type (for C and D matrices) - * @tparam OutputTile_ output tile size for the thread block - * @tparam FinalLambda user-defined epilogue lamba - * @tparam Index_ Index type - * @param m number of rows of A and C/D - * @param n number of columns of B and C/D - * @param k number of cols of A and rows of B - * @param pA input matrix - * @param pB input matrix - * @param pD output matrix - * @tparam in_params user-defined input parameter - * @param workspace temporary workspace needed for computations - * @param worksize number of bytes of the workspace - * @param fin_op the final gemm epilogue lambda - * @param stream cuda stream where to launch work - * @param isRowMajor whether the input and output matrices are row major - */ -template -void cosineAlgo1(IdxT m, - IdxT n, - IdxT k, - const DataT* pA, - const DataT* pB, - OutT* pD, - AccT* workspace, - size_t worksize, - FinOpT fin_op, - cudaStream_t stream, - bool isRowMajor) -{ - // raft distance support inputs as float/double and output as uint8_t/float/double. 
- static_assert(!((sizeof(OutT) > 1) && (sizeof(AccT) != sizeof(OutT))), - "OutT can be uint8_t, float, double," - "if sizeof(OutT) > 1 then sizeof(AccT) == sizeof(OutT)."); - - ASSERT( - !(((pA != pB) && (worksize < (m + n) * sizeof(AccT))) || (worksize < m * sizeof(AccT))), - "workspace size error"); - ASSERT(workspace != nullptr, "workspace is null"); - - - DataT* norm_A = workspace; - DataT* norm_B = workspace; - if (pA != pB) { - norm_B += m; - raft::linalg::rowNorm( - norm_A, pA, k, m, raft::linalg::L2Norm, isRowMajor, stream, raft::sqrt_op{}); - raft::linalg::rowNorm( - norm_B, pB, k, n, raft::linalg::L2Norm, isRowMajor, stream, raft::sqrt_op{}); - } else { - raft::linalg::rowNorm( - norm_A, pA, k, m, raft::linalg::L2Norm, isRowMajor, stream, raft::sqrt_op{}); - } - - // On CUDA 12: - // - always execute normal kernel - // - // On CUDA 11 and below: - // - execute CUTLASS-based kernel on SM_80 and above - // - execute normal kernel otherwise. - - if constexpr (__CUDACC_VER_MAJOR__ == 12) { - // Always execute legacy kernels on CUDA 12 - ops::cosine_distance_op distance_op{}; - distance_matrix_dispatch( - distance_op, m, n, k, pA, pB, norm_A, norm_B, pD, fin_op, stream, isRowMajor); - } else { - const auto deviceVersion = getComputeCapability(); - if (deviceVersion.first >= 8) { - // If device is SM_80 or later, use CUTLASS-based kernel. 
- using Op = ops::cosine_cutlass_op; - Op distance_op{}; - - distance_matrix_cutlass_dispatch( - distance_op, m, n, k, pA, pB, norm_A, norm_B, pD, fin_op, stream, isRowMajor); - } else { - // Else use "legacy" L2 - ops::cosine_distance_op distance_op{}; - distance_matrix_dispatch( - distance_op, m, n, k, pA, pB, norm_A, norm_B, pD, fin_op, stream, isRowMajor); - } - } -} - -}; // end namespace detail -}; // end namespace distance -}; // end namespace raft diff --git a/cpp/include/raft/distance/detail/distance.cuh b/cpp/include/raft/distance/detail/distance.cuh index 58c4dbd275..5275d26ab2 100644 --- a/cpp/include/raft/distance/detail/distance.cuh +++ b/cpp/include/raft/distance/detail/distance.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * Copyright (c) 2018-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,11 +16,17 @@ #pragma once -#include #include +#include + +#include +#include +#include + #include #include +#include #include #include #include @@ -30,7 +36,6 @@ #include #include #include -#include #include #include @@ -48,7 +53,7 @@ namespace detail { * @brief: A tag type for overload resolution based on DistanceType * * It is not possible to partially specialize function templates on a single - * parameter. Intead, it is often easier to use a combination of conventional + * parameter. Instead, it is often easier to use a combination of conventional * method overloading and a parameter with a specific tag type. The following * type is used to help method overloading based on the DistanceType enum. */ @@ -97,25 +102,20 @@ using distance_tag = std::integral_constant; * @param is_row_major Whether the matrices are row-major or col-major * @param metric_arg The `p` argument for Lp. 
*/ -template -void distance_impl( - distance_tag distance_type, - const DataT* x, - const DataT* y, - OutT* out, - IdxT m, - IdxT n, - IdxT k, - AccT* workspace, // unused - size_t worksize, // unused - FinOpT fin_op, - cudaStream_t stream, - bool is_row_major, - DataT metric_arg) // unused +template +void distance_impl(distance_tag distance_type, + const DataT* x, + const DataT* y, + OutT* out, + IdxT m, + IdxT n, + IdxT k, + AccT* workspace, // unused + size_t worksize, // unused + FinOpT fin_op, + cudaStream_t stream, + bool is_row_major, + DataT metric_arg) // unused { ops::canberra_distance_op distance_op{}; @@ -126,29 +126,24 @@ void distance_impl( distance_op, m, n, k, x, y, x_norm, y_norm, out, fin_op, stream, is_row_major); } -template -void distance_impl( - distance_tag distance_type, - const DataT* x, - const DataT* y, - OutT* out, - IdxT m, - IdxT n, - IdxT k, - AccT* workspace, - size_t worksize, - FinOpT fin_op, - cudaStream_t stream, - bool is_row_major, - DataT) // unused +template +void distance_impl(distance_tag distance_type, + const DataT* x, + const DataT* y, + OutT* out, + IdxT m, + IdxT n, + IdxT k, + AccT* workspace, + size_t worksize, + FinOpT fin_op, + cudaStream_t stream, + bool is_row_major, + DataT) // unused { - ASSERT(!(((x != y) && (worksize < 2 * (m + n) * sizeof(AccT))) || - (worksize < 2 * m * sizeof(AccT))), - "workspace size error"); + ASSERT( + !(((x != y) && (worksize < 2 * (m + n) * sizeof(AccT))) || (worksize < 2 * m * sizeof(AccT))), + "workspace size error"); ASSERT(workspace != nullptr, "workspace is null"); AccT* norm_col_vec = workspace; @@ -208,37 +203,30 @@ void distance_impl( corr_op, m, n, k, x, y, norm_col_vec, norm_row_vec, out, fin_op, stream, is_row_major); } -template -void distance_impl( - distance_tag distance_type, - const DataT* x, - const DataT* y, - OutT* out, - IdxT m, - IdxT n, - IdxT k, - AccT* workspace, - size_t worksize, - FinOpT fin_op, - cudaStream_t stream, - bool is_row_major, - DataT) // unused 
+template +void distance_impl(distance_tag distance_type, + const DataT* x, + const DataT* y, + OutT* out, + IdxT m, + IdxT n, + IdxT k, + AccT* workspace, + size_t worksize, + FinOpT fin_op, + cudaStream_t stream, + bool is_row_major, + DataT) // unused { // raft distance support inputs as float/double and output as uint8_t/float/double. static_assert(!((sizeof(OutT) > 1) && (sizeof(AccT) != sizeof(OutT))), "OutT can be uint8_t, float, double," "if sizeof(OutT) > 1 then sizeof(AccT) == sizeof(OutT)."); - ASSERT( - !(((x != y) && (worksize < (m + n) * sizeof(AccT))) || (worksize < m * sizeof(AccT))), - "workspace size error"); + ASSERT(!(((x != y) && (worksize < (m + n) * sizeof(AccT))) || (worksize < m * sizeof(AccT))), + "workspace size error"); ASSERT(workspace != nullptr, "workspace is null"); - DataT* norm_A = workspace; DataT* norm_B = workspace; if (x != y) { @@ -282,25 +270,20 @@ void distance_impl( } } -template -void distance_impl( - distance_tag distance_type, - const DataT* x, - const DataT* y, - OutT* out, - IdxT m, - IdxT n, - IdxT k, - AccT*, // workspace unused - size_t, // worksize unused - FinOpT fin_op, - cudaStream_t stream, - bool is_row_major, - DataT) // metric_arg unused +template +void distance_impl(distance_tag distance_type, + const DataT* x, + const DataT* y, + OutT* out, + IdxT m, + IdxT n, + IdxT k, + AccT*, // workspace unused + size_t, // worksize unused + FinOpT fin_op, + cudaStream_t stream, + bool is_row_major, + DataT) // metric_arg unused { ops::hamming_distance_op distance_op{k}; @@ -311,33 +294,26 @@ void distance_impl( distance_op, m, n, k, x, y, x_norm, y_norm, out, fin_op, stream, is_row_major); } -template -void distance_impl( - distance_tag distance_type, - const DataT* x, - const DataT* y, - OutT* out, - IdxT m, - IdxT n, - IdxT k, - AccT*, // workspace unused - size_t, // worksize unused - FinOpT fin_op, - cudaStream_t stream, - bool is_row_major, - DataT) // metric_arg unused +template +void distance_impl(distance_tag 
distance_type, + const DataT* x, + const DataT* y, + OutT* out, + IdxT m, + IdxT n, + IdxT k, + AccT*, // workspace unused + size_t, // worksize unused + FinOpT fin_op, + cudaStream_t stream, + bool is_row_major, + DataT) // metric_arg unused { // First sqrt x and y const auto raft_sqrt = raft::linalg::unaryOp; raft_sqrt((DataT*)x, x, m * k, raft::sqrt_op{}, stream); - if (x != y) { - raft_sqrt((DataT*)y, y, n * k, raft::sqrt_op{}, stream); - } + if (x != y) { raft_sqrt((DataT*)y, y, n * k, raft::sqrt_op{}, stream); } // Then calculate Hellinger distance ops::hellinger_distance_op distance_op{}; @@ -350,32 +326,25 @@ void distance_impl( // Finally revert sqrt of x and y raft_sqrt((DataT*)x, x, m * k, raft::sqrt_op{}, stream); - if (x != y) { - raft_sqrt((DataT*)y, y, n * k, raft::sqrt_op{}, stream); - } + if (x != y) { raft_sqrt((DataT*)y, y, n * k, raft::sqrt_op{}, stream); } RAFT_CUDA_TRY(cudaGetLastError()); } -template -void distance_impl( - distance_tag distance_type, - const DataT* x, - const DataT* y, - OutT* out, - IdxT m, - IdxT n, - IdxT k, - AccT*, // workspace unused - size_t, // worksize unused - FinOpT fin_op, - cudaStream_t stream, - bool is_row_major, - DataT) // metric_arg unused +template +void distance_impl(distance_tag distance_type, + const DataT* x, + const DataT* y, + OutT* out, + IdxT m, + IdxT n, + IdxT k, + AccT*, // workspace unused + size_t, // worksize unused + FinOpT fin_op, + cudaStream_t stream, + bool is_row_major, + DataT) // metric_arg unused { ops::jensen_shannon_distance_op distance_op{}; @@ -386,34 +355,31 @@ void distance_impl( distance_op, m, n, k, x, y, x_norm, y_norm, out, fin_op, stream, is_row_major); } -template -void distance_impl( - distance_tag distance_type, - const DataT* x, - const DataT* y, - OutT* out, - IdxT m, - IdxT n, - IdxT k, - AccT*, // workspace unused - size_t, // worksize unused - FinOpT fin_op, - cudaStream_t stream, - bool is_row_major, - DataT) // metric_arg unused +template +void 
distance_impl(distance_tag distance_type, + const DataT* x, + const DataT* y, + OutT* out, + IdxT m, + IdxT n, + IdxT k, + AccT*, // workspace unused + size_t, // worksize unused + FinOpT fin_op, + cudaStream_t stream, + bool is_row_major, + DataT) // metric_arg unused { auto unaryOp_lambda = [] __device__(DataT input) { - const bool x_zero = (input == 0); - return (!x_zero) * raft::log(input + x_zero); }; + const bool x_zero = (input == 0); + return (!x_zero) * raft::log(input + x_zero); + }; auto unaryOp_lambda_reverse = [] __device__(DataT input) { - // reverse previous log (x) back to x using (e ^ log(x)) - const bool x_zero = (input == 0); - return (!x_zero) * raft::exp(input); }; + // reverse previous log (x) back to x using (e ^ log(x)) + const bool x_zero = (input == 0); + return (!x_zero) * raft::exp(input); + }; // This op takes some shortcuts when x equals y. So its behavior changes based // on this. @@ -437,26 +403,20 @@ void distance_impl( } } - -template -void distance_impl( - distance_tag distance_type, - const DataT* x, - const DataT* y, - OutT* out, - IdxT m, - IdxT n, - IdxT k, - AccT*, // workspace unused - size_t, // worksize unused - FinOpT fin_op, - cudaStream_t stream, - bool is_row_major, - DataT) // metric_arg unused +template +void distance_impl(distance_tag distance_type, + const DataT* x, + const DataT* y, + OutT* out, + IdxT m, + IdxT n, + IdxT k, + AccT*, // workspace unused + size_t, // worksize unused + FinOpT fin_op, + cudaStream_t stream, + bool is_row_major, + DataT) // metric_arg unused { ops::l1_distance_op distance_op{}; @@ -472,8 +432,8 @@ template -void distance_impl_l2_expanded( // NOTE: different name - bool perform_sqrt, // dispatch on sqrt +void distance_impl_l2_expanded( // NOTE: different name + bool perform_sqrt, // dispatch on sqrt const DataT* x, const DataT* y, OutT* out, @@ -491,9 +451,8 @@ void distance_impl_l2_expanded( // NOTE: different name "OutT can be uint8_t, float, double," "if sizeof(OutT) > 1 then 
sizeof(AccT) == sizeof(OutT)."); - ASSERT( - !(((x != y) && (worksize < (m + n) * sizeof(AccT))) || (worksize < m * sizeof(AccT))), - "workspace size error"); + ASSERT(!(((x != y) && (worksize < (m + n) * sizeof(AccT))) || (worksize < m * sizeof(AccT))), + "workspace size error"); ASSERT(workspace != nullptr, "workspace is null"); DataT* norm_A = workspace; @@ -539,73 +498,60 @@ void distance_impl_l2_expanded( // NOTE: different name } } -template -void distance_impl( - distance_tag distance_type, - const DataT* x, - const DataT* y, - OutT* out, - IdxT m, - IdxT n, - IdxT k, - AccT* workspace, - size_t worksize, - FinOpT fin_op, - cudaStream_t stream, - bool is_row_major, - DataT) // metric_arg unused +template +void distance_impl(distance_tag distance_type, + const DataT* x, + const DataT* y, + OutT* out, + IdxT m, + IdxT n, + IdxT k, + AccT* workspace, + size_t worksize, + FinOpT fin_op, + cudaStream_t stream, + bool is_row_major, + DataT) // metric_arg unused { bool perform_sqrt = false; - distance_impl_l2_expanded(perform_sqrt, x, y, out, m, n, k, workspace, worksize, fin_op, stream, is_row_major); + distance_impl_l2_expanded( + perform_sqrt, x, y, out, m, n, k, workspace, worksize, fin_op, stream, is_row_major); } -template -void distance_impl( - distance_tag distance_type, - const DataT* x, - const DataT* y, - OutT* out, - IdxT m, - IdxT n, - IdxT k, - AccT* workspace, - size_t worksize, - FinOpT fin_op, - cudaStream_t stream, - bool is_row_major, - DataT) // metric_arg unused +template +void distance_impl(distance_tag distance_type, + const DataT* x, + const DataT* y, + OutT* out, + IdxT m, + IdxT n, + IdxT k, + AccT* workspace, + size_t worksize, + FinOpT fin_op, + cudaStream_t stream, + bool is_row_major, + DataT) // metric_arg unused { bool perform_sqrt = true; - distance_impl_l2_expanded(perform_sqrt, x, y, out, m, n, k, workspace, worksize, fin_op, stream, is_row_major); + distance_impl_l2_expanded( + perform_sqrt, x, y, out, m, n, k, workspace, 
worksize, fin_op, stream, is_row_major); } -template -void distance_impl( - distance_tag distance_type, - const DataT* x, - const DataT* y, - OutT* out, - IdxT m, - IdxT n, - IdxT k, - AccT*, // workspace unused - size_t, // worksize unused - FinOpT fin_op, - cudaStream_t stream, - bool is_row_major, - DataT) // metric_arg unused +template +void distance_impl(distance_tag distance_type, + const DataT* x, + const DataT* y, + OutT* out, + IdxT m, + IdxT n, + IdxT k, + AccT*, // workspace unused + size_t, // worksize unused + FinOpT fin_op, + cudaStream_t stream, + bool is_row_major, + DataT) // metric_arg unused { bool perform_sqrt = false; ops::l2_unexp_distance_op l2_op(perform_sqrt); @@ -618,25 +564,20 @@ void distance_impl( l2_op, m, n, k, x, y, norm_A, norm_B, out, fin_op, stream, is_row_major); } -template -void distance_impl( - distance_tag distance_type, - const DataT* x, - const DataT* y, - OutT* out, - IdxT m, - IdxT n, - IdxT k, - AccT*, // workspace unused - size_t, // worksize unused - FinOpT fin_op, - cudaStream_t stream, - bool is_row_major, - DataT) // metric_arg unused +template +void distance_impl(distance_tag distance_type, + const DataT* x, + const DataT* y, + OutT* out, + IdxT m, + IdxT n, + IdxT k, + AccT*, // workspace unused + size_t, // worksize unused + FinOpT fin_op, + cudaStream_t stream, + bool is_row_major, + DataT) // metric_arg unused { bool perform_sqrt = true; ops::l2_unexp_distance_op l2_op(perform_sqrt); @@ -649,25 +590,20 @@ void distance_impl( l2_op, m, n, k, x, y, norm_A, norm_B, out, fin_op, stream, is_row_major); } -template -void distance_impl( - distance_tag distance_type, - const DataT* x, - const DataT* y, - OutT* out, - IdxT m, - IdxT n, - IdxT k, - AccT*, // workspace unused - size_t, // worksize unused - FinOpT fin_op, - cudaStream_t stream, - bool is_row_major, - DataT) // metric_arg unused +template +void distance_impl(distance_tag distance_type, + const DataT* x, + const DataT* y, + OutT* out, + IdxT m, + IdxT n, + 
IdxT k, + AccT*, // workspace unused + size_t, // worksize unused + FinOpT fin_op, + cudaStream_t stream, + bool is_row_major, + DataT) // metric_arg unused { ops::chebyshev_distance_op distance_op{}; @@ -678,25 +614,20 @@ void distance_impl( distance_op, m, n, k, x, y, x_norm, y_norm, out, fin_op, stream, is_row_major); } -template -void distance_impl( - distance_tag distance_type, - const DataT* x, - const DataT* y, - OutT* out, - IdxT m, - IdxT n, - IdxT k, - AccT*, // workspace unused - size_t, // worksize unused - FinOpT fin_op, - cudaStream_t stream, - bool is_row_major, - DataT metric_arg) +template +void distance_impl(distance_tag distance_type, + const DataT* x, + const DataT* y, + OutT* out, + IdxT m, + IdxT n, + IdxT k, + AccT*, // workspace unused + size_t, // worksize unused + FinOpT fin_op, + cudaStream_t stream, + bool is_row_major, + DataT metric_arg) { ops::minkowski_distance_op distance_op{metric_arg}; @@ -707,25 +638,20 @@ void distance_impl( distance_op, m, n, k, x, y, x_norm, y_norm, out, fin_op, stream, is_row_major); } -template -void distance_impl( - distance_tag distance_type, - const DataT* x, - const DataT* y, - OutT* out, - IdxT m, - IdxT n, - IdxT k, - AccT*, // workspace unused - size_t, // worksize unused - FinOpT fin_op, - cudaStream_t stream, - bool is_row_major, - DataT) // metric_arg unused +template +void distance_impl(distance_tag distance_type, + const DataT* x, + const DataT* y, + OutT* out, + IdxT m, + IdxT n, + IdxT k, + AccT*, // workspace unused + size_t, // worksize unused + FinOpT fin_op, + cudaStream_t stream, + bool is_row_major, + DataT) // metric_arg unused { ops::russel_rao_distance_op distance_op{k}; @@ -785,7 +711,18 @@ void distance(const InType* x, distance_impl( distance_tag{}, - x, y, out, m, n, k, reinterpret_cast(workspace), worksize, fin_op, stream, isRowMajor, metric_arg); + x, + y, + out, + m, + n, + k, + reinterpret_cast(workspace), + worksize, + fin_op, + stream, + isRowMajor, + metric_arg); 
RAFT_CUDA_TRY(cudaPeekAtLastError()); } diff --git a/cpp/include/raft/distance/detail/distance_ops/canberra.cuh b/cpp/include/raft/distance/detail/distance_ops/canberra.cuh index e9c16d6d6d..6491b24e3d 100644 --- a/cpp/include/raft/distance/detail/distance_ops/canberra.cuh +++ b/cpp/include/raft/distance/detail/distance_ops/canberra.cuh @@ -20,8 +20,13 @@ namespace raft::distance::detail::ops { -// Describes the computation the canberra distance - +/** + * @brief The canberra distance matrix calculation + * + * It computes the following equation: + * + * c_ij = sum_k |x_ik - y_kj| / ( |x_ik| + |y_kj| ) + */ struct canberra_distance_op { // Load norms of input data static constexpr bool use_norms = false; diff --git a/cpp/include/raft/distance/detail/distance_ops/chebyshev.cuh b/cpp/include/raft/distance/detail/distance_ops/chebyshev.cuh index a68d9fc21c..d390f75460 100644 --- a/cpp/include/raft/distance/detail/distance_ops/chebyshev.cuh +++ b/cpp/include/raft/distance/detail/distance_ops/chebyshev.cuh @@ -20,8 +20,13 @@ namespace raft::distance::detail::ops { -// Describes the computation the chebyshev distance - +/** + * @brief the L_inf (Chebyshev) distance matrix calculation + * + * It computes the following equation: + * + * c_ij = max_k | x_ik - y_kj | + */ struct chebyshev_distance_op { // Load norms of input data static constexpr bool use_norms = false; diff --git a/cpp/include/raft/distance/detail/distance_ops/correlation.cuh b/cpp/include/raft/distance/detail/distance_ops/correlation.cuh index f17d67953e..11cc3ed4f4 100644 --- a/cpp/include/raft/distance/detail/distance_ops/correlation.cuh +++ b/cpp/include/raft/distance/detail/distance_ops/correlation.cuh @@ -20,9 +20,14 @@ namespace raft::distance::detail::ops { -// Describes the computation the correlation distance - - +/** @brief The correlation distance + * + * It computes the following equation: + * + * d(x, y) = ((x - mean(x)) ⋅ (y - mean(y))) + * / + * (|| x - mean(x) ||_2 || y - mean(y) ||_2) + */ 
template struct correlation_distance_op { const DataT_struct* x2n; @@ -31,19 +36,13 @@ struct correlation_distance_op { IdxT_struct n; IdxT_struct k; - correlation_distance_op( - bool is_row_major, - const DataT_struct* x2n_, - const DataT_struct* y2n_, - IdxT_struct m_, - IdxT_struct n_, - IdxT_struct k_ - ) noexcept - : x2n(x2n_), - y2n(y2n_), - m(m_), - n(n_), - k(k_) + correlation_distance_op(bool is_row_major, + const DataT_struct* x2n_, + const DataT_struct* y2n_, + IdxT_struct m_, + IdxT_struct n_, + IdxT_struct k_) noexcept + : x2n(x2n_), y2n(y2n_), m(m_), n(n_), k(k_) { // The distance op is typically created before the row-major/col-major // swapping has been done. So we do it here. @@ -53,7 +52,6 @@ struct correlation_distance_op { } } - // Load norms of input data static constexpr bool use_norms = true; // Whether the core function requires so many instructions that it makes sense diff --git a/cpp/include/raft/distance/detail/distance_ops/cosine.cuh b/cpp/include/raft/distance/detail/distance_ops/cosine.cuh index aa2eac01bc..d26b5aeda0 100644 --- a/cpp/include/raft/distance/detail/distance_ops/cosine.cuh +++ b/cpp/include/raft/distance/detail/distance_ops/cosine.cuh @@ -20,8 +20,13 @@ namespace raft::distance::detail::ops { -// Describes the computation the cosine distance - +/** + * @brief the expanded cosine distance matrix calculation + * + * It computes the following equation: + * + * d(x, y) = 1 - (x ⋅ y) / ( ||x||_2 ||y||_2) + */ struct cosine_distance_op { // Load norms of input data static constexpr bool use_norms = true; @@ -60,7 +65,6 @@ struct cosine_distance_op { } }; - template struct cosine_cutlass_op { __device__ cosine_cutlass_op() noexcept {} @@ -71,5 +75,4 @@ struct cosine_cutlass_op { __device__ AccT operator()(DataT aData) const noexcept { return aData; } }; - } // namespace raft::distance::detail::ops diff --git a/cpp/include/raft/distance/detail/distance_ops/hamming.cuh b/cpp/include/raft/distance/detail/distance_ops/hamming.cuh 
index b4f610be0a..02087e2874 100644 --- a/cpp/include/raft/distance/detail/distance_ops/hamming.cuh +++ b/cpp/include/raft/distance/detail/distance_ops/hamming.cuh @@ -20,13 +20,17 @@ namespace raft::distance::detail::ops { -// Describes the computation the hamming distance - +/** + * @brief the Hamming Unexpanded distance matrix calculation + * It computes the following equation: + * + * c_ij = sum_k (x_ik != y_kj) / k + */ template struct hamming_distance_op { IdxT_struct k; - hamming_distance_op(IdxT_struct k_) noexcept : k(k_) { } + hamming_distance_op(IdxT_struct k_) noexcept : k(k_) {} // Load norms of input data static constexpr bool use_norms = false; diff --git a/cpp/include/raft/distance/detail/distance_ops/hellinger.cuh b/cpp/include/raft/distance/detail/distance_ops/hellinger.cuh index b0fae700b5..0314565a03 100644 --- a/cpp/include/raft/distance/detail/distance_ops/hellinger.cuh +++ b/cpp/include/raft/distance/detail/distance_ops/hellinger.cuh @@ -19,10 +19,14 @@ namespace raft::distance::detail::ops { -// Describes the computation the hellinger distance -// -// Fill in the TODO items. 
- +/** + * @brief the Hellinger distance matrix calculation + * + * It computes the following equation: + * + * c_ij = sqrt(1 - sum_k sqrt(x_ik * y_kj)) + * + */ struct hellinger_distance_op { // Load norms of input data static constexpr bool use_norms = false; diff --git a/cpp/include/raft/distance/detail/distance_ops/jensen_shannon.cuh b/cpp/include/raft/distance/detail/distance_ops/jensen_shannon.cuh index 124010e96d..5e00faef74 100644 --- a/cpp/include/raft/distance/detail/distance_ops/jensen_shannon.cuh +++ b/cpp/include/raft/distance/detail/distance_ops/jensen_shannon.cuh @@ -21,6 +21,14 @@ namespace raft::distance::detail::ops { // Describes the computation the jensen_shannon distance +/** + * @brief the Jensen Shannon distance matrix calculation + * + * It computes the following equation: + * + * c_ij = sqrt(0.5 * sum( -x_i * (log(0.5 * (x_i + y_i)) - log(x_i)) + * + (-y_i * (log(0.5 * (x_i + y_i)) - log(y_i))))) + */ struct jensen_shannon_distance_op { // Load norms of input data static constexpr bool use_norms = false; diff --git a/cpp/include/raft/distance/detail/distance_ops/kl_divergence.cuh b/cpp/include/raft/distance/detail/distance_ops/kl_divergence.cuh index a97582aa5a..fe6e0dbbe1 100644 --- a/cpp/include/raft/distance/detail/distance_ops/kl_divergence.cuh +++ b/cpp/include/raft/distance/detail/distance_ops/kl_divergence.cuh @@ -19,18 +19,21 @@ namespace raft::distance::detail::ops { -// Describes the computation of the kl_divergence +/** + * @brief the KL Divergence distance matrix calculation + * + * It computes the following equation: + * + * c_ij = 0.5 * sum(x * log (x / y)); + */ struct kl_divergence_op { const bool is_row_major; const bool x_equal_y; - kl_divergence_op( - bool row_major_, - bool x_equal_y_=false - ) noexcept - : is_row_major(row_major_), - x_equal_y(x_equal_y_) - { } + kl_divergence_op(bool row_major_, bool x_equal_y_ = false) noexcept + : is_row_major(row_major_), x_equal_y(x_equal_y_) + { + } // Load norms of input data 
static constexpr bool use_norms = false; diff --git a/cpp/include/raft/distance/detail/distance_ops/l1.cuh b/cpp/include/raft/distance/detail/distance_ops/l1.cuh index 4bb4a8796c..bb71a7801f 100644 --- a/cpp/include/raft/distance/detail/distance_ops/l1.cuh +++ b/cpp/include/raft/distance/detail/distance_ops/l1.cuh @@ -19,7 +19,13 @@ namespace raft::distance::detail::ops { -// Describes the computation the l1 distance +/** + * @brief the L1 distance matrix calculation + * + * It computes the following equation: + * + * c_ij = sum_k abs(x_ik - y_kj) + */ struct l1_distance_op { // Do not load norms of data, the computation of L1 distance does not use them. static constexpr bool use_norms = false; diff --git a/cpp/include/raft/distance/detail/distance_ops/l2_exp.cuh b/cpp/include/raft/distance/detail/distance_ops/l2_exp.cuh index 523019f417..d491493a63 100644 --- a/cpp/include/raft/distance/detail/distance_ops/l2_exp.cuh +++ b/cpp/include/raft/distance/detail/distance_ops/l2_exp.cuh @@ -20,9 +20,14 @@ namespace raft::distance::detail::ops { -// Describes the computation the l2 expanded distance -// -// TODO: more explanation. 
+/** + * @brief the expanded euclidean distance matrix calculation + * + * It computes the following equation: + * + * c_ij = - 2 sum_k x_ik * y_kj + ||x_i.||_2 + ||y_.j||_2 + * + */ struct l2_exp_distance_op { bool sqrt; diff --git a/cpp/include/raft/distance/detail/distance_ops/l2_unexp.cuh b/cpp/include/raft/distance/detail/distance_ops/l2_unexp.cuh index f5e2f278b7..6e75cc95e8 100644 --- a/cpp/include/raft/distance/detail/distance_ops/l2_unexp.cuh +++ b/cpp/include/raft/distance/detail/distance_ops/l2_unexp.cuh @@ -20,7 +20,13 @@ namespace raft::distance::detail::ops { -// Describes the computation the l2 unexpanded distance +/** + * @brief the unexpanded euclidean distance matrix calculation + * + * It computes the following equation: + * + * c_ij = optional_sqrt ( sum_k (x_ik - y_kj)^2 ) + */ struct l2_unexp_distance_op { bool sqrt; diff --git a/cpp/include/raft/distance/detail/distance_ops/minkowski.cuh b/cpp/include/raft/distance/detail/distance_ops/minkowski.cuh index 8deb42d1fe..0640cc72a7 100644 --- a/cpp/include/raft/distance/detail/distance_ops/minkowski.cuh +++ b/cpp/include/raft/distance/detail/distance_ops/minkowski.cuh @@ -19,13 +19,18 @@ namespace raft::distance::detail::ops { -// Describes the computation the minkowski distance - +/** + * @brief the unexpanded Lp (Minkowski) distance matrix calculation + * + * It computes the following equation: + * + * c_ij = (sum_k |x_ik - y_jk|^p)^(1/p) + */ template struct minkowski_distance_op { DataT_struct p; - minkowski_distance_op(DataT_struct p_) noexcept : p(p_) { } + minkowski_distance_op(DataT_struct p_) noexcept : p(p_) {} // Load norms of input data static constexpr bool use_norms = false; diff --git a/cpp/include/raft/distance/detail/distance_ops/russel_rao.cuh b/cpp/include/raft/distance/detail/distance_ops/russel_rao.cuh index e114ef8224..f9fbc7221b 100644 --- a/cpp/include/raft/distance/detail/distance_ops/russel_rao.cuh +++ b/cpp/include/raft/distance/detail/distance_ops/russel_rao.cuh @@ 
-20,17 +20,19 @@ namespace raft::distance::detail::ops { -// Describes the computation the russel_rao distance - +/** + * @brief the Russell Rao distance matrix calculation + * + * It computes the following equation: + * + * c_ij = (k - (sum_k x_ik * y_kj)) / k + */ template struct russel_rao_distance_op { IdxT_struct k; const float one_over_k; - russel_rao_distance_op(IdxT_struct k_) noexcept - : k(k_), - one_over_k(1.0f / k_) - { } + russel_rao_distance_op(IdxT_struct k_) noexcept : k(k_), one_over_k(1.0f / k_) {} // Load norms of input data static constexpr bool use_norms = false; diff --git a/cpp/include/raft/distance/detail/euclidean.cuh b/cpp/include/raft/distance/detail/euclidean.cuh deleted file mode 100644 index 3cdc5489a6..0000000000 --- a/cpp/include/raft/distance/detail/euclidean.cuh +++ /dev/null @@ -1,169 +0,0 @@ -/* - * Copyright (c) 2018-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include - -#include "pairwise_matrix/dispatch.cuh" -#include "distance_ops/l2_exp.cuh" -#include "distance_ops/l2_unexp.cuh" - -namespace raft { -namespace distance { -namespace detail { - -// /** -// * @brief the expanded euclidean distance matrix calculation -// * It computes the following equation: C = op(A^2 + B^2 - 2AB) -// * @tparam InType input data-type (for A and B matrices) -// * @tparam AccType accumulation data-type -// * @tparam OutType output data-type (for C and D matrices) -// * @tparam FinalLambda the final lambda called by FragmentMultiplyAdd_ -// * @tparam Index_ index type -// * @param m number of rows of A and C/D -// * @param n number of columns of B and C/D -// * @param k number of cols of A and rows of B -// * @param pA input matrix -// * @param pB input matrix -// * @param pD output matrix -// * @param enable_sqrt if the square root is computed or not -// * @param workspace temporary workspace needed for computations -// * @param worksize number of bytes of the workspace -// * @param fin_op the final gemm epilogue lambda -// * @param stream cuda stream where to launch work -// * @param isRowMajor whether the input and output matrices are row major -// */ -template -void euclideanAlgo1(IdxT m, - IdxT n, - IdxT k, - const DataT* pA, - const DataT* pB, - OutT* pD, - bool enable_sqrt, - AccT* workspace, - size_t& worksize, - FinOpT fin_op, - cudaStream_t stream, - bool isRowMajor) -{ - // raft distance support inputs as float/double and output as uint8_t/float/double. 
- static_assert(!((sizeof(OutT) > 1) && (sizeof(AccT) != sizeof(OutT))), - "OutT can be uint8_t, float, double," - "if sizeof(OutT) > 1 then sizeof(AccT) == sizeof(OutT)."); - - ASSERT( - !(((pA != pB) && (worksize < (m + n) * sizeof(AccT))) || (worksize < m * sizeof(AccT))), - "workspace size error"); - ASSERT(workspace != nullptr, "workspace is null"); - - DataT* norm_A = workspace; - DataT* norm_B = workspace; - if (pA != pB) { - norm_B += m; - raft::linalg::rowNorm( - norm_A, pA, k, m, raft::linalg::L2Norm, isRowMajor, stream, raft::identity_op{}); - raft::linalg::rowNorm( - norm_B, pB, k, n, raft::linalg::L2Norm, isRowMajor, stream, raft::identity_op{}); - } else { - raft::linalg::rowNorm( - norm_A, pA, k, m, raft::linalg::L2Norm, isRowMajor, stream, raft::identity_op{}); - } - - // On CUDA 12: - // - always execute normal kernel - // - // On CUDA 11 and below: - // - execute CUTLASS-based kernel on SM_80 and above - // - execute normal kernel otherwise. - - if constexpr (__CUDACC_VER_MAJOR__ == 12) { - // Always execute legacy kernels on CUDA 12 - ops::l2_exp_distance_op l2_op(enable_sqrt); - distance_matrix_dispatch( - l2_op, m, n, k, pA, pB, norm_A, norm_B, pD, fin_op, stream, isRowMajor); - } else { - const auto deviceVersion = getComputeCapability(); - if (deviceVersion.first >= 8) { - // If device is SM_80 or later, use CUTLASS-based kernel. 
- using L2Op = ops::l2_exp_cutlass_op; - L2Op l2_op(enable_sqrt); - - distance_matrix_cutlass_dispatch( - l2_op, m, n, k, pA, pB, norm_A, norm_B, pD, fin_op, stream, isRowMajor); - } else { - // Else use "legacy" L2 - ops::l2_exp_distance_op l2_op(enable_sqrt); - distance_matrix_dispatch( - l2_op, m, n, k, pA, pB, norm_A, norm_B, pD, fin_op, stream, isRowMajor); - } - } -} - - -/** - * @brief the unexpanded euclidean distance matrix calculation - * It computes the following equation: cij = op((ai-bj)^2) - * @tparam InType input data-type (for A and B matrices) - * @tparam AccType accumulation data-type - * @tparam OutType output data-type (for C and D matrices) - * @tparam FinalLambda user-defined epilogue lamba - * @tparam Index_ index type - * @param m number of rows of A and C/D - * @param n number of columns of B and C/D - * @param k number of cols of A and rows of B - * @param pA input matrix - * @param pB input matrix - * @param pD output matrix - * @param enable_sqrt if the square root is computed or not - * @param fin_op the final gemm epilogue lambda - * @param stream cuda stream where to launch work - * @param isRowMajor whether the input and output matrices are row major - */ -template -void euclideanAlgo2(IdxT m, - IdxT n, - IdxT k, - const DataT* pA, - const DataT* pB, - OutT* pD, - bool enable_sqrt, - FinOpT fin_op, - cudaStream_t stream, - bool isRowMajor) -{ - ops::l2_unexp_distance_op l2_op(enable_sqrt); - - // The unexpanded L2 does not require the norms of a and b to be calculated. 
- const DataT* norm_A = nullptr; - const DataT* norm_B = nullptr; - - distance_matrix_dispatch( - l2_op, m, n, k, pA, pB, norm_A, norm_B, pD, fin_op, stream, isRowMajor); -} - -}; // end namespace detail -}; // end namespace distance -}; // end namespace raft diff --git a/cpp/include/raft/distance/detail/hamming.cuh b/cpp/include/raft/distance/detail/hamming.cuh deleted file mode 100644 index 824e930023..0000000000 --- a/cpp/include/raft/distance/detail/hamming.cuh +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once -#include "distance_ops/hamming.cuh" -#include "pairwise_matrix/dispatch.cuh" - -namespace raft { -namespace distance { -namespace detail { - -/** - * @brief the Hamming Unexpanded distance matrix calculation - * It computes the following equation: - Cij = sum(x_i != y_i) / k - * - * @tparam InType input data-type (for A and B matrices) - * @tparam AccType accumulation data-type - * @tparam OutType output data-type (for C and D matrices) - * @tparam FinalLambda user-defined epilogue lamba - * @tparam Index_ Index type - * @param m number of rows of A and C/D - * @param n number of columns of B and C/D - * @param k number of cols of A and rows of B - * @param pA input matrix - * @param pB input matrix - * @param pD output matrix - * @param fin_op the final element-wise epilogue lambda - * @param stream cuda stream where to launch work - * @param isRowMajor whether the input and output matrices are row major - */ -template -void hammingUnexpandedImpl(int m, - int n, - int k, - const DataT* x, - const DataT* y, - OutT* out, - FinOpT fin_op, - cudaStream_t stream, - bool is_row_major) -{ - ops::hamming_distance_op distance_op{k}; - - const DataT* x_norm = nullptr; - const DataT* y_norm = nullptr; - - distance_matrix_dispatch( - distance_op, m, n, k, x, y, x_norm, y_norm, out, fin_op, stream, is_row_major); -} - -} // namespace detail -} // namespace distance -} // namespace raft diff --git a/cpp/include/raft/distance/detail/hellinger.cuh b/cpp/include/raft/distance/detail/hellinger.cuh deleted file mode 100644 index 306977f266..0000000000 --- a/cpp/include/raft/distance/detail/hellinger.cuh +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once -#include -#include - -#include "pairwise_matrix/dispatch.cuh" -#include "distance_ops/hellinger.cuh" - -namespace raft { -namespace distance { -namespace detail { - -/** - * @brief the Hellinger distance matrix calculation - * It computes the following equation: - sqrt(1 - sum(sqrt(x_k * y_k)) - * This distance computation modifies A and B by computing a sqrt - * and then performing a `pow(x, 2)` to convert it back. Because of this, - * it is possible that the values in A and B might differ slightly - * after this is invoked. - * - * @tparam InType input data-type (for A and B matrices) - * @tparam AccType accumulation data-type - * @tparam OutType output data-type (for C and D matrices) - * @tparam FinalLambda user-defined epilogue lamba - * @tparam Index_ Index type - * @param m number of rows of A and C/D - * @param n number of columns of B and C/D - * @param k number of cols of A and rows of B - * @param pA input matrix - * @param pB input matrix - * @param pD output matrix - * @param fin_op the final element-wise epilogue lambda - * @param stream cuda stream where to launch work - * @param isRowMajor whether the input and output matrices are row major - */ -template -void hellingerImpl(int m, - int n, - int k, - const DataT* x, - const DataT* y, - OutT* out, - FinOpT fin_op, - cudaStream_t stream, - bool is_row_major) -{ - // First sqrt x and y - const auto raft_sqrt = raft::linalg::unaryOp; - - raft_sqrt((DataT*)x, x, m * k, raft::sqrt_op{}, stream); - if (x != y) { - raft_sqrt((DataT*)y, y, n * k, raft::sqrt_op{}, 
stream); - } - - // Then calculate Hellinger distance - ops::hellinger_distance_op distance_op{}; - - const DataT* x_norm = nullptr; - const DataT* y_norm = nullptr; - - distance_matrix_dispatch( - distance_op, m, n, k, x, y, x_norm, y_norm, out, fin_op, stream, is_row_major); - - // Finally revert sqrt of x and y - raft_sqrt((DataT*)x, x, m * k, raft::sqrt_op{}, stream); - if (x != y) { - raft_sqrt((DataT*)y, y, n * k, raft::sqrt_op{}, stream); - } - - RAFT_CUDA_TRY(cudaGetLastError()); -} -} // namespace detail -} // namespace distance -} // namespace raft diff --git a/cpp/include/raft/distance/detail/jensen_shannon.cuh b/cpp/include/raft/distance/detail/jensen_shannon.cuh deleted file mode 100644 index 71339e0c1a..0000000000 --- a/cpp/include/raft/distance/detail/jensen_shannon.cuh +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once -#include "distance_ops/jensen_shannon.cuh" -#include "pairwise_matrix/dispatch.cuh" - - -namespace raft { -namespace distance { -namespace detail { - -/** - * @brief the Jensen Shannon distance matrix calculation - * It computes the following equation: - Cij = sqrt(0.5 * sum( -x_i * (log(0.5 * (x_i + y_i)) - log(x_i)) - + (-y_i * (log(0.5 * (x_i + y_i)) - log(y_i))))) - * - * @tparam InType input data-type (for A and B matrices) - * @tparam AccType accumulation data-type - * @tparam OutType output data-type (for C and D matrices) - * @tparam FinalLambda user-defined epilogue lamba - * @tparam Index_ Index type - * @param m number of rows of A and C/D - * @param n number of columns of B and C/D - * @param k number of cols of A and rows of B - * @param pA input matrix - * @param pB input matrix - * @param pD output matrix - * @param fin_op the final element-wise epilogue lambda - * @param stream cuda stream where to launch work - * @param isRowMajor whether the input and output matrices are row major - */ -template -void jensenShannonImpl(int m, - int n, - int k, - const DataT* x, - const DataT* y, - OutT* out, - FinOpT fin_op, - cudaStream_t stream, - bool is_row_major) -{ - ops::jensen_shannon_distance_op distance_op{}; - - const DataT* x_norm = nullptr; - const DataT* y_norm = nullptr; - - distance_matrix_dispatch( - distance_op, m, n, k, x, y, x_norm, y_norm, out, fin_op, stream, is_row_major); -} -} // namespace detail -} // namespace distance -} // namespace raft diff --git a/cpp/include/raft/distance/detail/kl_divergence.cuh b/cpp/include/raft/distance/detail/kl_divergence.cuh deleted file mode 100644 index e2f7bf2beb..0000000000 --- a/cpp/include/raft/distance/detail/kl_divergence.cuh +++ /dev/null @@ -1,98 +0,0 @@ -/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once -#include -#include - -#include "distance_ops/kl_divergence.cuh" -#include "pairwise_matrix/dispatch.cuh" - -namespace raft { -namespace distance { -namespace detail { - -/** - * @brief the KL Divergence distance matrix calculation - * It computes the following equation: - Cij = 0.5 * sum(x * log (x / y)); - * This distance computation modifies A or B by computing a log(x) - * and then performing a `pow(e, log(x))` to convert it back. Because of this, - * it is possible that the values in A or B might differ slightly - * after this is invoked. - * @tparam InType input data-type (for A and B matrices) - * @tparam AccType accumulation data-type - * @tparam OutType output data-type (for C and D matrices) - * @tparam FinalLambda user-defined epilogue lamba - * @tparam Index_ Index type - * @param m number of rows of A and C/D - * @param n number of columns of B and C/D - * @param k number of cols of A and rows of B - * @param pA input matrix - * @param pB input matrix - * @param pD output matrix - * @param fin_op the final element-wise epilogue lambda - * @param stream cuda stream where to launch work - * @param isRowMajor whether the input and output matrices are row major - */ -template -void klDivergenceImpl(int m, - int n, - int k, - const DataT* x, - const DataT* y, - OutT* out, - FinOpT fin_op, - cudaStream_t stream, - bool is_row_major) -{ - auto unaryOp_lambda = [] __device__(DataT input) { - const bool x_zero = (input == 0); - return (!x_zero) * raft::log(input + x_zero); }; - - auto unaryOp_lambda_reverse = [] 
__device__(DataT input) { - // reverse previous log (x) back to x using (e ^ log(x)) - const bool x_zero = (input == 0); - return (!x_zero) * raft::exp(input); }; - - // This op takes some shortcuts when x equals y. So its behavior changes based - // on this. - ops::kl_divergence_op kl_divergence{is_row_major, x == y}; - - if (x != y) { - raft::linalg::unaryOp( - (DataT*)y, y, n * k, unaryOp_lambda, stream); - } - - const DataT* x_norm = nullptr; - const DataT* y_norm = nullptr; - - distance_matrix_dispatch( - kl_divergence, m, n, k, x, y, x_norm, y_norm, out, fin_op, stream, is_row_major); - - if (x != y) { - // Now reverse previous log (x) back to x using (e ^ log(x)) - raft::linalg::unaryOp( - (DataT*)y, y, n * k, unaryOp_lambda_reverse, stream); - } -} -} // namespace detail -} // namespace distance -} // namespace raft diff --git a/cpp/include/raft/distance/detail/l1.cuh b/cpp/include/raft/distance/detail/l1.cuh deleted file mode 100644 index cceb432c7d..0000000000 --- a/cpp/include/raft/distance/detail/l1.cuh +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Copyright (c) 2018-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once -#include "distance_ops/l1.cuh" -#include "pairwise_matrix/dispatch.cuh" - -namespace raft { -namespace distance { -namespace detail { - -template -void l1Impl(int m, - int n, - int k, - const DataT* x, - const DataT* y, - OutT* out, - FinOpT fin_op, - cudaStream_t stream, - bool is_row_major) -{ - ops::l1_distance_op distance_op{}; - - const DataT* x_norm = nullptr; - const DataT* y_norm = nullptr; - - distance_matrix_dispatch( - distance_op, m, n, k, x, y, x_norm, y_norm, out, fin_op, stream, is_row_major); -} - -} // namespace detail -} // namespace distance -} // namespace raft diff --git a/cpp/include/raft/distance/detail/minkowski.cuh b/cpp/include/raft/distance/detail/minkowski.cuh deleted file mode 100644 index 778ceb45cf..0000000000 --- a/cpp/include/raft/distance/detail/minkowski.cuh +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once -#include "pairwise_matrix/dispatch.cuh" -#include "distance_ops/minkowski.cuh" - -namespace raft { -namespace distance { -namespace detail { - -/** - * @brief the unexpanded minkowski distance matrix calculation - * It computes the following equation: cij = sum(|x - y|^p)^(1/p) - * @tparam InType input data-type (for A and B matrices) - * @tparam AccType accumulation data-type - * @tparam OutType output data-type (for C and D matrices) - * @tparam FinalLambda user-defined epilogue lamba - * @tparam Index_ index type - * @param[in] m number of rows of A and C/D - * @param[in] n number of rows of B and cols of C/D - * @param[in] k number of cols of A and B - * @param[in] pA input matrix - * @param[in] pB input matrix - * @param[out] pD output matrix - * @param[in] fin_op the final gemm epilogue lambda - * @param[in] stream cuda stream to launch work - * @param[in] isRowMajor whether the input and output matrices are row major - * @param[in] metric_arg the value of `p` for Minkowski (l-p) distances. - */ -template -void minkowskiImpl(IdxT m, - IdxT n, - IdxT k, - const DataT* x, - const DataT* y, - OutT* out, - FinOpT fin_op, - cudaStream_t stream, - bool is_row_major, - DataT metric_arg) -{ - ops::minkowski_distance_op distance_op{metric_arg}; - - const DataT* x_norm = nullptr; - const DataT* y_norm = nullptr; - - distance_matrix_dispatch( - distance_op, m, n, k, x, y, x_norm, y_norm, out, fin_op, stream, is_row_major); -} -}; // end namespace detail -}; // end namespace distance -}; // end namespace raft diff --git a/cpp/include/raft/distance/detail/russell_rao.cuh b/cpp/include/raft/distance/detail/russell_rao.cuh deleted file mode 100644 index 6bf5ae04bb..0000000000 --- a/cpp/include/raft/distance/detail/russell_rao.cuh +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once -#include "distance_ops/russel_rao.cuh" -#include "pairwise_matrix/dispatch.cuh" - -namespace raft { -namespace distance { -namespace detail { - -/** - * @brief the Russell Rao distance matrix calculation - * It computes the following equation: - Cij = (k - sum(x_i * y_i)) / k - * - * @tparam InType input data-type (for A and B matrices) - * @tparam AccType accumulation data-type - * @tparam OutType output data-type (for C and D matrices) - * @tparam FinalLambda user-defined epilogue lamba - * @tparam Index_ Index type - * @param m number of rows of A and C/D - * @param n number of columns of B and C/D - * @param k number of cols of A and rows of B - * @param pA input matrix - * @param pB input matrix - * @param pD output matrix - * @param fin_op the final element-wise epilogue lambda - * @param stream cuda stream where to launch work - * @param isRowMajor whether the input and output matrices are row major - */ -template -void russellRaoImpl(int m, - int n, - int k, - const DataT* x, - const DataT* y, - OutT* out, - FinOpT fin_op, - cudaStream_t stream, - bool is_row_major) -{ - ops::russel_rao_distance_op distance_op{k}; - - const DataT* x_norm = nullptr; - const DataT* y_norm = nullptr; - - distance_matrix_dispatch( - distance_op, m, n, k, x, y, x_norm, y_norm, out, fin_op, stream, is_row_major); -} -} // namespace detail -} // namespace distance -} // namespace raft From 34ccddc0f09c36190f6da783249925cbd4cd2791 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Tue, 21 Feb 2023 11:18:33 +0100 Subject: [PATCH 43/60] 
Update readme --- cpp/include/raft/distance/detail/README.org | 53 ++++++++++++--------- 1 file changed, 30 insertions(+), 23 deletions(-) diff --git a/cpp/include/raft/distance/detail/README.org b/cpp/include/raft/distance/detail/README.org index 03d540cb84..99e59547d0 100644 --- a/cpp/include/raft/distance/detail/README.org +++ b/cpp/include/raft/distance/detail/README.org @@ -1,25 +1,32 @@ #+title: Readme -- [X] Euclidean - - *Notes*: - - enable_sqrt is now a runtime parameter. Was it a compile time - parameter before? - - CUTLASS fails on CUDA 12 (but prior to refactoring CUDA 12 did not work - either). I have not yet tested if everything works correctly on CUDA 11. -- [X] canberra.cuh -- [X] chebyshev.cuh -- [X] correlation.cuh -- [X] cosine.cuh - - *Notes*: cutlass fails on CUDA 12 (but prior to refactoring CUDA 12 did not - work either). I have not yet tested if everything works correctly on - CUDA 11. -- [X] hamming.cuh -- [X] hellinger.cuh -- [X] jensen_shannon.cuh -- [X] kl_divergence.cuh - - *Notes*: the isRowMajor and x_equal_y boolean parameters where previously - template / constexpr parameters. Now they are passed by value. This greatly - reduces the number of kernels, but may have negative consequences for run - time. 
-- [X] minkowski.cuh -- [X] russell_rao.cuh +* Overview + +| Metric | Epilog | Uses norms | Has params | Pre- & post-processing | Expensive inner loop | Depends on row_major | CUTLASS | +|----------------+--------+------------+---------------------------+------------------------+----------------------+----------------------+---------| +| Canberra | | | | | x | | | +| Chebyshev | | | | | | | | +| Correlation | x | x (twice) | x (many) | compute norms | | x | | +| Cosine | x | x | | compute norms | | | x | +| Hamming | x | | x (k) | | | | | +| Hellinger | x | | | sqrt and square | | | | +| Jensen Shannon | x | | | | x | | | +| KL divergence | x | | x (row_major, x_equals_y) | yes | x | x | | +| L1 | | | | | | | | +| L2 expanded | x | x | x (sqrt) | compute norms | | | x | +| L2 unexpanded | x | | x (sqrt) | | | | | +| Minkowski | x | | x (p) | | x | | | +| Russel-Rao | x | | x (k, 1/k) | | | | | + +* Tasks + +** TODO Architecture-conditional compilation +** TODO Clean up template arguments for kernel +** TODO Can we remove DataT_struct? +** TODO Include raft_cuda_utils +** TODO rename chebyshev -> Linf +** TODO remove this note about workspace + +: * @note if workspace is passed as nullptr, this will return in +: * worksize, the number of bytes of workspace required +** TODO Think of something wrt templates of distance_ops From 6a12ded58ba14f14b66b5d54dab266888b9b6e2e Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Tue, 21 Feb 2023 12:23:45 +0100 Subject: [PATCH 44/60] Reenable device code generation Some code in dispatch was commented out in a futile attempt to keep compile times limited. 
--- .../detail/pairwise_matrix/dispatch.cuh | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/cpp/include/raft/distance/detail/pairwise_matrix/dispatch.cuh b/cpp/include/raft/distance/detail/pairwise_matrix/dispatch.cuh index 4a7c1f999f..b3362e7647 100644 --- a/cpp/include/raft/distance/detail/pairwise_matrix/dispatch.cuh +++ b/cpp/include/raft/distance/detail/pairwise_matrix/dispatch.cuh @@ -69,19 +69,19 @@ using vec_len_constant = std::integral_constant; template void dispatch(bool row_major, int vec_len, F&& f) { - // if (row_major) { - // switch (vec_len) { - // case 4: f(std::bool_constant(), vec_len_constant<4>()); break; - // case 2: f(std::bool_constant(), vec_len_constant<2>()); break; - // default: f(std::bool_constant(), vec_len_constant<1>()); break; - // } - // } else { - // switch (vec_len) { - // case 4: f(std::bool_constant(), vec_len_constant<4>()); break; - // case 2: f(std::bool_constant(), vec_len_constant<2>()); break; - // default: f(std::bool_constant(), vec_len_constant<1>()); break; - // } - // } + if (row_major) { + switch (vec_len) { + case 4: f(std::bool_constant(), vec_len_constant<4>()); break; + case 2: f(std::bool_constant(), vec_len_constant<2>()); break; + default: f(std::bool_constant(), vec_len_constant<1>()); break; + } + } else { + switch (vec_len) { + case 4: f(std::bool_constant(), vec_len_constant<4>()); break; + case 2: f(std::bool_constant(), vec_len_constant<2>()); break; + default: f(std::bool_constant(), vec_len_constant<1>()); break; + } + } } template Date: Tue, 21 Feb 2023 21:37:14 +0100 Subject: [PATCH 45/60] Readd overload of raft::distance::detail::distance --- cpp/include/raft/distance/detail/distance.cuh | 62 ++++++++++++++++--- cpp/include/raft/distance/distance.cuh | 2 +- 2 files changed, 56 insertions(+), 8 deletions(-) diff --git a/cpp/include/raft/distance/detail/distance.cuh b/cpp/include/raft/distance/detail/distance.cuh index 35a5b798b3..3e5b676294 100644 --- 
a/cpp/include/raft/distance/detail/distance.cuh +++ b/cpp/include/raft/distance/detail/distance.cuh @@ -722,12 +722,12 @@ void distance_impl(raft::resources const& handle, } /** - * @brief Evaluate pairwise distances and write to matrix - * + * @brief Evaluate pairwise distances with the user epilogue lamba allowed * @tparam DistanceType which distance to evaluate * @tparam InType input argument type * @tparam AccType accumulation type * @tparam OutType output type + * @tparam FinalLambda user-defined epilogue lamba * @tparam Index_ Index type * * @param x first set of points @@ -738,15 +738,20 @@ void distance_impl(raft::resources const& handle, * @param k dimensionality * @param workspace temporary workspace needed for computations * @param worksize number of bytes of the workspace + * @param fin_op the final gemm epilogue lambda + * @param stream cuda stream * @param isRowMajor whether the matrices are row-major or col-major * - * @note if workspace is passed as nullptr, this will return in - * worksize, the number of bytes of workspace required + * @note fin_op: This is a device lambda which is supposed to operate upon the + * input which is AccType and returns the output in OutType. It's signature is + * as follows:
OutType fin_op(AccType in, int g_idx);
. If one needs + * any other parameters, feel free to pass them via closure. */ template void distance(raft::resources const& handle, const InType* x, @@ -757,17 +762,16 @@ void distance(raft::resources const& handle, Index_ k, void* workspace, size_t worksize, + FinalLambda fin_op, bool isRowMajor = true, InType metric_arg = 2.0f) { - auto fin_op = raft::identity_op(); - // raft distance support inputs as float/double and output as uint8_t/float/double. static_assert(!((sizeof(OutType) > 1) && (sizeof(AccType) != sizeof(OutType))), "OutType can be uint8_t, float, double," "if sizeof(OutType) > 1 then sizeof(AccType) == sizeof(OutType)."); - distance_impl( + distance_impl( handle, distance_tag{}, x, @@ -784,6 +788,50 @@ void distance(raft::resources const& handle, RAFT_CUDA_TRY(cudaPeekAtLastError()); } +/** + * @brief Evaluate pairwise distances for the simple use case + * @tparam DistanceType which distance to evaluate + * @tparam InType input argument type + * @tparam AccType accumulation type + * @tparam OutType output type + * @tparam Index_ Index type + * @param x first set of points + * @param y second set of points + * @param dist output distance matrix + * @param m number of points in x + * @param n number of points in y + * @param k dimensionality + * @param workspace temporary workspace needed for computations + * @param worksize number of bytes of the workspace + * @param stream cuda stream + * @param isRowMajor whether the matrices are row-major or col-major + * + * @note if workspace is passed as nullptr, this will return in + * worksize, the number of bytes of workspace required + */ +template +void distance(raft::resources const& handle, + const InType* x, + const InType* y, + OutType* out, + Index_ m, + Index_ n, + Index_ k, + void* workspace, + size_t worksize, + bool isRowMajor = true, + InType metric_arg = 2.0f) +{ + auto fin_op = raft::identity_op(); + + distance( + handle, x, y, out, m, n, k, workspace, worksize, fin_op, isRowMajor, 
metric_arg); +} + /** * @brief Return the exact workspace size to compute the distance * @tparam DistanceType which distance to evaluate diff --git a/cpp/include/raft/distance/distance.cuh b/cpp/include/raft/distance/distance.cuh index 59bf52a2ca..ddda68f789 100644 --- a/cpp/include/raft/distance/distance.cuh +++ b/cpp/include/raft/distance/distance.cuh @@ -253,7 +253,7 @@ void pairwise_distance(raft::resources const& handle, bool isRowMajor = true, Type metric_arg = 2.0f) { - auto stream = handle.get_stream(); + cudaStream_t stream = raft::resource::get_cuda_stream(handle); auto dispatch = [&](auto distance_type) { auto worksize = getWorkspaceSize(x, y, m, n, k); From ca29e2d008827743748c0e4416b2717b1ea844e3 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Tue, 21 Feb 2023 22:02:55 +0100 Subject: [PATCH 46/60] Fix style --- cpp/include/raft/distance/detail/distance.cuh | 29 +++++++++---------- .../distance/detail/distance_ops/template.cuh | 2 +- .../detail/pairwise_distance_cutlass_base.cuh | 6 ++-- 3 files changed, 18 insertions(+), 19 deletions(-) diff --git a/cpp/include/raft/distance/detail/distance.cuh b/cpp/include/raft/distance/detail/distance.cuh index 3e5b676294..573d5c2778 100644 --- a/cpp/include/raft/distance/detail/distance.cuh +++ b/cpp/include/raft/distance/detail/distance.cuh @@ -304,7 +304,6 @@ void distance_impl(raft::resources const& handle, distance_op, m, n, k, x, y, x_norm, y_norm, out, fin_op, stream, is_row_major); } - template void distance_impl(raft::resources const& handle, distance_tag distance_type, @@ -320,18 +319,18 @@ void distance_impl(raft::resources const& handle, bool is_row_major, DataT) // metric_arg unused { - cudaStream_t stream = raft::resource::get_cuda_stream(handle); - raft::linalg::gemm(handle, - out, - const_cast(x), - const_cast(y), - m, - n, - k, - !is_row_major, - !is_row_major, - is_row_major, - stream); + cudaStream_t stream = raft::resource::get_cuda_stream(handle); + raft::linalg::gemm(handle, + out, + 
const_cast(x), + const_cast(y), + m, + n, + k, + !is_row_major, + !is_row_major, + is_row_major, + stream); } template @@ -560,7 +559,7 @@ void distance_impl(raft::resources const& handle, bool is_row_major, DataT) // metric_arg unused { - bool perform_sqrt = false; + bool perform_sqrt = false; cudaStream_t stream = raft::resource::get_cuda_stream(handle); distance_impl_l2_expanded( perform_sqrt, x, y, out, m, n, k, workspace, worksize, fin_op, stream, is_row_major); @@ -581,7 +580,7 @@ void distance_impl(raft::resources const& handle, bool is_row_major, DataT) // metric_arg unused { - bool perform_sqrt = true; + bool perform_sqrt = true; cudaStream_t stream = raft::resource::get_cuda_stream(handle); distance_impl_l2_expanded( perform_sqrt, x, y, out, m, n, k, workspace, worksize, fin_op, stream, is_row_major); diff --git a/cpp/include/raft/distance/detail/distance_ops/template.cuh b/cpp/include/raft/distance/detail/distance_ops/template.cuh index 378bcf0c9f..1d2d681b18 100644 --- a/cpp/include/raft/distance/detail/distance_ops/template.cuh +++ b/cpp/include/raft/distance/detail/distance_ops/template.cuh @@ -27,7 +27,7 @@ namespace raft::distance::detail::ops { struct template_distance_op { TODO member; - template_distance_op(TODO member_) noexcept : member(member_) { } + template_distance_op(TODO member_) noexcept : member(member_) {} // Load norms of input data static constexpr bool use_norms = TODO; diff --git a/cpp/include/raft/distance/detail/pairwise_distance_cutlass_base.cuh b/cpp/include/raft/distance/detail/pairwise_distance_cutlass_base.cuh index 0d26d940b3..2ab5c69b0d 100644 --- a/cpp/include/raft/distance/detail/pairwise_distance_cutlass_base.cuh +++ b/cpp/include/raft/distance/detail/pairwise_distance_cutlass_base.cuh @@ -169,8 +169,8 @@ void cutlassDistanceKernel(const DataT* x, CUTLASS_CHECK(status); } -}; // namespace detail -}; // namespace distance -}; // namespace raft +}; // namespace detail +}; // namespace distance +}; // namespace raft 
#pragma GCC diagnostic pop From 28c95a12901cc311d73f30ab9fe2b5596ff00ff3 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Wed, 22 Feb 2023 11:31:11 +0100 Subject: [PATCH 47/60] Fix 11.8 compilation error --- cpp/include/raft/distance/detail/pairwise_matrix/dispatch.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/raft/distance/detail/pairwise_matrix/dispatch.cuh b/cpp/include/raft/distance/detail/pairwise_matrix/dispatch.cuh index b3362e7647..23d0f34489 100644 --- a/cpp/include/raft/distance/detail/pairwise_matrix/dispatch.cuh +++ b/cpp/include/raft/distance/detail/pairwise_matrix/dispatch.cuh @@ -198,7 +198,7 @@ void distance_matrix_cutlass_dispatch(opT cutlass_op, // respectively. // Prevent double, vec_len=4 combination (this is not supported) - constexpr int vec_len = std::min(vec_len_aligned, static_cast(16 / sizeof(DataT))); + constexpr int vec_len = std::min(vec_len_aligned(), static_cast(16 / sizeof(DataT))); cutlassDistanceKernel( x, y, x_norm, y_norm, m, n, k, ldx, ldy, ld_out, out, fin_op, cutlass_op, stream); From a5592b9d2165fbd49efecbcf08c94d83b305200c Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Wed, 22 Feb 2023 11:40:41 +0100 Subject: [PATCH 48/60] Rename minkowski -> lp_unexp --- cpp/include/raft/distance/detail/distance.cuh | 4 ++-- .../detail/distance_ops/{minkowski.cuh => lp_unexp.cuh} | 4 ++-- cpp/test/CMakeLists.txt | 2 +- cpp/test/distance/{dist_minkowski.cu => dist_lp_unexp.cu} | 0 4 files changed, 5 insertions(+), 5 deletions(-) rename cpp/include/raft/distance/detail/distance_ops/{minkowski.cuh => lp_unexp.cuh} (96%) rename cpp/test/distance/{dist_minkowski.cu => dist_lp_unexp.cu} (100%) diff --git a/cpp/include/raft/distance/detail/distance.cuh b/cpp/include/raft/distance/detail/distance.cuh index 573d5c2778..6d14fcca28 100644 --- a/cpp/include/raft/distance/detail/distance.cuh +++ b/cpp/include/raft/distance/detail/distance.cuh @@ -38,7 +38,7 @@ #include #include #include -#include 
+#include #include #include @@ -683,7 +683,7 @@ void distance_impl(raft::resources const& handle, bool is_row_major, DataT metric_arg) { - ops::minkowski_distance_op distance_op{metric_arg}; + ops::lp_unexp_distance_op distance_op{metric_arg}; const DataT* x_norm = nullptr; const DataT* y_norm = nullptr; diff --git a/cpp/include/raft/distance/detail/distance_ops/minkowski.cuh b/cpp/include/raft/distance/detail/distance_ops/lp_unexp.cuh similarity index 96% rename from cpp/include/raft/distance/detail/distance_ops/minkowski.cuh rename to cpp/include/raft/distance/detail/distance_ops/lp_unexp.cuh index 0640cc72a7..4af6888ddf 100644 --- a/cpp/include/raft/distance/detail/distance_ops/minkowski.cuh +++ b/cpp/include/raft/distance/detail/distance_ops/lp_unexp.cuh @@ -27,10 +27,10 @@ namespace raft::distance::detail::ops { * c_ij = (sum_k |x_ik - y_jk|^p)^(1/p) */ template -struct minkowski_distance_op { +struct lp_unexp_distance_op { DataT_struct p; - minkowski_distance_op(DataT_struct p_) noexcept : p(p_) {} + lp_unexp_distance_op(DataT_struct p_) noexcept : p(p_) {} // Load norms of input data static constexpr bool use_norms = false; diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index 575e8cf84b..928412568a 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -121,7 +121,7 @@ if(BUILD_TESTS) test/distance/dist_jensen_shannon.cu test/distance/dist_kl_divergence.cu test/distance/dist_l1.cu - test/distance/dist_minkowski.cu + test/distance/dist_lp_unexp.cu test/distance/dist_russell_rao.cu test/distance/masked_nn.cu test/distance/masked_nn_compress_to_bits.cu diff --git a/cpp/test/distance/dist_minkowski.cu b/cpp/test/distance/dist_lp_unexp.cu similarity index 100% rename from cpp/test/distance/dist_minkowski.cu rename to cpp/test/distance/dist_lp_unexp.cu From 265ba0718aed82f8d5107119040ed8f7e4e53888 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Wed, 22 Feb 2023 12:11:08 +0100 Subject: [PATCH 49/60] Rename Chebyshev -> l_inf 
--- cpp/CMakeLists.txt | 4 ++-- cpp/include/raft/distance/detail/distance.cuh | 4 ++-- .../distance/detail/distance_ops/{chebyshev.cuh => l_inf.cuh} | 2 +- .../specializations/detail/{chebyshev.cuh => l_inf.cuh} | 0 cpp/include/raft/distance/specializations/distance.cuh | 2 +- ...double_double_int.cu => l_inf_double_double_double_int.cu} | 0 ...loat_float_float_int.cu => l_inf_float_float_float_int.cu} | 0 cpp/test/CMakeLists.txt | 2 +- cpp/test/distance/{dist_chebyshev.cu => dist_l_inf.cu} | 0 9 files changed, 7 insertions(+), 7 deletions(-) rename cpp/include/raft/distance/detail/distance_ops/{chebyshev.cuh => l_inf.cuh} (98%) rename cpp/include/raft/distance/specializations/detail/{chebyshev.cuh => l_inf.cuh} (100%) rename cpp/src/distance/distance/specializations/detail/{chebyshev_double_double_double_int.cu => l_inf_double_double_double_int.cu} (100%) rename cpp/src/distance/distance/specializations/detail/{chebyshev_float_float_float_int.cu => l_inf_float_float_float_int.cu} (100%) rename cpp/test/distance/{dist_chebyshev.cu => dist_l_inf.cu} (100%) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 7e5b10b227..679a1747c1 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -317,8 +317,6 @@ if(RAFT_COMPILE_DIST_LIBRARY) src/distance/cluster/kmeans_init_plus_plus_float.cu src/distance/distance/specializations/detail/canberra_double_double_double_int.cu src/distance/distance/specializations/detail/canberra_float_float_float_int.cu - src/distance/distance/specializations/detail/chebyshev_double_double_double_int.cu - src/distance/distance/specializations/detail/chebyshev_float_float_float_int.cu src/distance/distance/specializations/detail/correlation_double_double_double_int.cu src/distance/distance/specializations/detail/correlation_float_float_float_int.cu src/distance/distance/specializations/detail/cosine_double_double_double_int.cu @@ -352,6 +350,8 @@ if(RAFT_COMPILE_DIST_LIBRARY) 
src/distance/distance/specializations/detail/l2_sqrt_unexpanded_double_double_double_int.cu src/distance/distance/specializations/detail/l2_unexpanded_double_double_double_int.cu src/distance/distance/specializations/detail/l2_unexpanded_float_float_float_int.cu + src/distance/distance/specializations/detail/l_inf_double_double_double_int.cu + src/distance/distance/specializations/detail/l_inf_float_float_float_int.cu src/distance/distance/specializations/detail/lp_unexpanded_double_double_double_int.cu src/distance/distance/specializations/detail/lp_unexpanded_float_float_float_int.cu src/distance/distance/specializations/detail/russel_rao_double_double_double_int.cu diff --git a/cpp/include/raft/distance/detail/distance.cuh b/cpp/include/raft/distance/detail/distance.cuh index 6d14fcca28..95cc9afa42 100644 --- a/cpp/include/raft/distance/detail/distance.cuh +++ b/cpp/include/raft/distance/detail/distance.cuh @@ -28,7 +28,6 @@ #include #include -#include #include #include #include @@ -38,6 +37,7 @@ #include #include #include +#include #include #include @@ -657,7 +657,7 @@ void distance_impl(raft::resources const& handle, bool is_row_major, DataT) // metric_arg unused { - ops::chebyshev_distance_op distance_op{}; + ops::l_inf_distance_op distance_op{}; const DataT* x_norm = nullptr; const DataT* y_norm = nullptr; diff --git a/cpp/include/raft/distance/detail/distance_ops/chebyshev.cuh b/cpp/include/raft/distance/detail/distance_ops/l_inf.cuh similarity index 98% rename from cpp/include/raft/distance/detail/distance_ops/chebyshev.cuh rename to cpp/include/raft/distance/detail/distance_ops/l_inf.cuh index d390f75460..0d515faa23 100644 --- a/cpp/include/raft/distance/detail/distance_ops/chebyshev.cuh +++ b/cpp/include/raft/distance/detail/distance_ops/l_inf.cuh @@ -27,7 +27,7 @@ namespace raft::distance::detail::ops { * * c_ij = max_k | x_ik - y_kj | */ -struct chebyshev_distance_op { +struct l_inf_distance_op { // Load norms of input data static constexpr bool 
use_norms = false; // Whether the core function requires so many instructions that it makes sense diff --git a/cpp/include/raft/distance/specializations/detail/chebyshev.cuh b/cpp/include/raft/distance/specializations/detail/l_inf.cuh similarity index 100% rename from cpp/include/raft/distance/specializations/detail/chebyshev.cuh rename to cpp/include/raft/distance/specializations/detail/l_inf.cuh diff --git a/cpp/include/raft/distance/specializations/distance.cuh b/cpp/include/raft/distance/specializations/distance.cuh index a0c35ca9a8..8daa398b49 100644 --- a/cpp/include/raft/distance/specializations/distance.cuh +++ b/cpp/include/raft/distance/specializations/distance.cuh @@ -17,7 +17,6 @@ #pragma once #include -#include #include #include #include @@ -31,6 +30,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/src/distance/distance/specializations/detail/chebyshev_double_double_double_int.cu b/cpp/src/distance/distance/specializations/detail/l_inf_double_double_double_int.cu similarity index 100% rename from cpp/src/distance/distance/specializations/detail/chebyshev_double_double_double_int.cu rename to cpp/src/distance/distance/specializations/detail/l_inf_double_double_double_int.cu diff --git a/cpp/src/distance/distance/specializations/detail/chebyshev_float_float_float_int.cu b/cpp/src/distance/distance/specializations/detail/l_inf_float_float_float_int.cu similarity index 100% rename from cpp/src/distance/distance/specializations/detail/chebyshev_float_float_float_int.cu rename to cpp/src/distance/distance/specializations/detail/l_inf_float_float_float_int.cu diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index 928412568a..f0347b09be 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -109,7 +109,6 @@ if(BUILD_TESTS) PATH test/distance/dist_adj.cu test/distance/dist_canberra.cu - test/distance/dist_chebyshev.cu test/distance/dist_correlation.cu test/distance/dist_cos.cu 
test/distance/dist_euc_exp.cu @@ -121,6 +120,7 @@ if(BUILD_TESTS) test/distance/dist_jensen_shannon.cu test/distance/dist_kl_divergence.cu test/distance/dist_l1.cu + test/distance/dist_l_inf.cu test/distance/dist_lp_unexp.cu test/distance/dist_russell_rao.cu test/distance/masked_nn.cu diff --git a/cpp/test/distance/dist_chebyshev.cu b/cpp/test/distance/dist_l_inf.cu similarity index 100% rename from cpp/test/distance/dist_chebyshev.cu rename to cpp/test/distance/dist_l_inf.cu From 7ccb8a7e5c428c7d8acda207f5194a82a2e275a5 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Wed, 22 Feb 2023 12:14:27 +0100 Subject: [PATCH 50/60] Rename euc -> l2 --- cpp/test/CMakeLists.txt | 6 +++--- cpp/test/distance/{dist_euc_exp.cu => dist_l2_exp.cu} | 0 .../distance/{dist_eucsqrt_exp.cu => dist_l2_sqrt_exp.cu} | 0 cpp/test/distance/{dist_euc_unexp.cu => dist_l2_unexp.cu} | 0 4 files changed, 3 insertions(+), 3 deletions(-) rename cpp/test/distance/{dist_euc_exp.cu => dist_l2_exp.cu} (100%) rename cpp/test/distance/{dist_eucsqrt_exp.cu => dist_l2_sqrt_exp.cu} (100%) rename cpp/test/distance/{dist_euc_unexp.cu => dist_l2_unexp.cu} (100%) diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index f0347b09be..aa4487e9d5 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -111,15 +111,15 @@ if(BUILD_TESTS) test/distance/dist_canberra.cu test/distance/dist_correlation.cu test/distance/dist_cos.cu - test/distance/dist_euc_exp.cu - test/distance/dist_euc_unexp.cu - test/distance/dist_eucsqrt_exp.cu test/distance/dist_hamming.cu test/distance/dist_hellinger.cu test/distance/dist_inner_product.cu test/distance/dist_jensen_shannon.cu test/distance/dist_kl_divergence.cu test/distance/dist_l1.cu + test/distance/dist_l2_exp.cu + test/distance/dist_l2_unexp.cu + test/distance/dist_l2_sqrt_exp.cu test/distance/dist_l_inf.cu test/distance/dist_lp_unexp.cu test/distance/dist_russell_rao.cu diff --git a/cpp/test/distance/dist_euc_exp.cu 
b/cpp/test/distance/dist_l2_exp.cu similarity index 100% rename from cpp/test/distance/dist_euc_exp.cu rename to cpp/test/distance/dist_l2_exp.cu diff --git a/cpp/test/distance/dist_eucsqrt_exp.cu b/cpp/test/distance/dist_l2_sqrt_exp.cu similarity index 100% rename from cpp/test/distance/dist_eucsqrt_exp.cu rename to cpp/test/distance/dist_l2_sqrt_exp.cu diff --git a/cpp/test/distance/dist_euc_unexp.cu b/cpp/test/distance/dist_l2_unexp.cu similarity index 100% rename from cpp/test/distance/dist_euc_unexp.cu rename to cpp/test/distance/dist_l2_unexp.cu From 874d014ccb6c7e6816ffdfedeb3927c91f6883f2 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Wed, 22 Feb 2023 12:22:40 +0100 Subject: [PATCH 51/60] Update copyright headers Files have moved --- cpp/test/distance/dist_l2_exp.cu | 2 +- cpp/test/distance/dist_l2_sqrt_exp.cu | 2 +- cpp/test/distance/dist_l2_unexp.cu | 2 +- cpp/test/distance/dist_l_inf.cu | 2 +- cpp/test/distance/dist_lp_unexp.cu | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/test/distance/dist_l2_exp.cu b/cpp/test/distance/dist_l2_exp.cu index 567e279691..ae67215e51 100644 --- a/cpp/test/distance/dist_l2_exp.cu +++ b/cpp/test/distance/dist_l2_exp.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * Copyright (c) 2018-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/test/distance/dist_l2_sqrt_exp.cu b/cpp/test/distance/dist_l2_sqrt_exp.cu index d717158649..94d254f44b 100644 --- a/cpp/test/distance/dist_l2_sqrt_exp.cu +++ b/cpp/test/distance/dist_l2_sqrt_exp.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * Copyright (c) 2018-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
diff --git a/cpp/test/distance/dist_l2_unexp.cu b/cpp/test/distance/dist_l2_unexp.cu index 311ad190e2..d74a41d2a4 100644 --- a/cpp/test/distance/dist_l2_unexp.cu +++ b/cpp/test/distance/dist_l2_unexp.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * Copyright (c) 2018-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/test/distance/dist_l_inf.cu b/cpp/test/distance/dist_l_inf.cu index abad828de7..b9d6413a10 100644 --- a/cpp/test/distance/dist_l_inf.cu +++ b/cpp/test/distance/dist_l_inf.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * Copyright (c) 2018-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/test/distance/dist_lp_unexp.cu b/cpp/test/distance/dist_lp_unexp.cu index af2661da3a..9d6f5921a7 100644 --- a/cpp/test/distance/dist_lp_unexp.cu +++ b/cpp/test/distance/dist_lp_unexp.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * Copyright (c) 2018-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
From 757fb44f304979e6a4b4dcacb0adccba905f4952 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Wed, 22 Feb 2023 12:25:29 +0100 Subject: [PATCH 52/60] Remove misleading note about workspace nullptr --- cpp/include/raft/distance/detail/distance.cuh | 3 --- cpp/include/raft/distance/distance.cuh | 3 --- 2 files changed, 6 deletions(-) diff --git a/cpp/include/raft/distance/detail/distance.cuh b/cpp/include/raft/distance/detail/distance.cuh index 95cc9afa42..bea5ced976 100644 --- a/cpp/include/raft/distance/detail/distance.cuh +++ b/cpp/include/raft/distance/detail/distance.cuh @@ -804,9 +804,6 @@ void distance(raft::resources const& handle, * @param worksize number of bytes of the workspace * @param stream cuda stream * @param isRowMajor whether the matrices are row-major or col-major - * - * @note if workspace is passed as nullptr, this will return in - * worksize, the number of bytes of workspace required */ template Date: Wed, 22 Feb 2023 12:26:51 +0100 Subject: [PATCH 53/60] Remove notes file --- cpp/include/raft/distance/detail/README.org | 32 --------------------- 1 file changed, 32 deletions(-) delete mode 100644 cpp/include/raft/distance/detail/README.org diff --git a/cpp/include/raft/distance/detail/README.org b/cpp/include/raft/distance/detail/README.org deleted file mode 100644 index 99e59547d0..0000000000 --- a/cpp/include/raft/distance/detail/README.org +++ /dev/null @@ -1,32 +0,0 @@ -#+title: Readme - -* Overview - -| Metric | Epilog | Uses norms | Has params | Pre- & post-processing | Expensive inner loop | Depends on row_major | CUTLASS | -|----------------+--------+------------+---------------------------+------------------------+----------------------+----------------------+---------| -| Canberra | | | | | x | | | -| Chebyshev | | | | | | | | -| Correlation | x | x (twice) | x (many) | compute norms | | x | | -| Cosine | x | x | | compute norms | | | x | -| Hamming | x | | x (k) | | | | | -| Hellinger | x | | | sqrt and square | | | | -| Jensen 
Shannon | x | | | | x | | | -| KL divergence | x | | x (row_major, x_equals_y) | yes | x | x | | -| L1 | | | | | | | | -| L2 expanded | x | x | x (sqrt) | compute norms | | | x | -| L2 unexpanded | x | | x (sqrt) | | | | | -| Minkowski | x | | x (p) | | x | | | -| Russel-Rao | x | | x (k, 1/k) | | | | | - -* Tasks - -** TODO Architecture-conditional compilation -** TODO Clean up template arguments for kernel -** TODO Can we remove DataT_struct? -** TODO Include raft_cuda_utils -** TODO rename chebyshev -> Linf -** TODO remove this note about workspace - -: * @note if workspace is passed as nullptr, this will return in -: * worksize, the number of bytes of workspace required -** TODO Think of something wrt templates of distance_ops From 885bda66bd94f39b0e39053cedf627d1d3a6e6c2 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Wed, 22 Feb 2023 14:02:17 +0100 Subject: [PATCH 54/60] Put template on struct instead of methods --- cpp/include/raft/distance/detail/distance.cuh | 36 +++++++++---------- .../distance/detail/distance_ops/canberra.cuh | 6 ++-- .../detail/distance_ops/correlation.cuh | 29 ++++++++------- .../distance/detail/distance_ops/cosine.cuh | 6 ++-- .../distance/detail/distance_ops/hamming.cuh | 11 +++--- .../detail/distance_ops/hellinger.cuh | 6 ++-- .../detail/distance_ops/jensen_shannon.cuh | 6 ++-- .../detail/distance_ops/kl_divergence.cuh | 6 ++-- .../raft/distance/detail/distance_ops/l1.cuh | 6 ++-- .../distance/detail/distance_ops/l2_exp.cuh | 6 ++-- .../distance/detail/distance_ops/l2_unexp.cuh | 6 ++-- .../distance/detail/distance_ops/l_inf.cuh | 6 ++-- .../distance/detail/distance_ops/lp_unexp.cuh | 11 +++--- .../detail/distance_ops/russel_rao.cuh | 11 +++--- .../distance/detail/distance_ops/template.cuh | 4 +-- .../detail/pairwise_matrix/kernel_sm60.cuh | 4 +-- 16 files changed, 78 insertions(+), 82 deletions(-) diff --git a/cpp/include/raft/distance/detail/distance.cuh b/cpp/include/raft/distance/detail/distance.cuh index 
bea5ced976..621e2d15b9 100644 --- a/cpp/include/raft/distance/detail/distance.cuh +++ b/cpp/include/raft/distance/detail/distance.cuh @@ -120,7 +120,7 @@ void distance_impl(raft::resources const& handle, bool is_row_major, DataT metric_arg) // unused { - ops::canberra_distance_op distance_op{}; + ops::canberra_distance_op distance_op{}; const DataT* x_norm = nullptr; const DataT* y_norm = nullptr; @@ -203,8 +203,8 @@ void distance_impl(raft::resources const& handle, raft::linalg::rowNorm(sq_norm_col_vec, x, k, m, raft::linalg::L2Norm, is_row_major, stream); } - using CorrOp = ops::correlation_distance_op; - CorrOp corr_op(is_row_major, sq_norm_col_vec, sq_norm_row_vec, m, n, k); + using OpT = ops::correlation_distance_op; + OpT corr_op(is_row_major, sq_norm_col_vec, sq_norm_row_vec, m, n, k); distance_matrix_dispatch( corr_op, m, n, k, x, y, norm_col_vec, norm_row_vec, out, fin_op, stream, is_row_major); } @@ -257,7 +257,7 @@ void distance_impl(raft::resources const& handle, if constexpr (__CUDACC_VER_MAJOR__ == 12) { // Always execute legacy kernels on CUDA 12 - ops::cosine_distance_op distance_op{}; + ops::cosine_distance_op distance_op{}; distance_matrix_dispatch( distance_op, m, n, k, x, y, norm_A, norm_B, out, fin_op, stream, is_row_major); } else { @@ -271,7 +271,7 @@ void distance_impl(raft::resources const& handle, distance_op, m, n, k, x, y, norm_A, norm_B, out, fin_op, stream, is_row_major); } else { // Else use "legacy" L2 - ops::cosine_distance_op distance_op{}; + ops::cosine_distance_op distance_op{}; distance_matrix_dispatch( distance_op, m, n, k, x, y, norm_A, norm_B, out, fin_op, stream, is_row_major); } @@ -293,7 +293,7 @@ void distance_impl(raft::resources const& handle, bool is_row_major, DataT) // metric_arg unused { - ops::hamming_distance_op distance_op{k}; + ops::hamming_distance_op distance_op{k}; const DataT* x_norm = nullptr; const DataT* y_norm = nullptr; @@ -357,7 +357,7 @@ void distance_impl(raft::resources const& handle, if (x != y) { 
raft_sqrt((DataT*)y, y, n * k, raft::sqrt_op{}, stream); } // Then calculate Hellinger distance - ops::hellinger_distance_op distance_op{}; + ops::hellinger_distance_op distance_op{}; const DataT* x_norm = nullptr; const DataT* y_norm = nullptr; @@ -387,7 +387,7 @@ void distance_impl(raft::resources const& handle, bool is_row_major, DataT) // metric_arg unused { - ops::jensen_shannon_distance_op distance_op{}; + ops::jensen_shannon_distance_op distance_op{}; const DataT* x_norm = nullptr; const DataT* y_norm = nullptr; @@ -428,7 +428,7 @@ void distance_impl(raft::resources const& handle, // This op takes some shortcuts when x equals y. So its behavior changes based // on this. - ops::kl_divergence_op kl_divergence{is_row_major, x == y}; + ops::kl_divergence_op kl_divergence{is_row_major, x == y}; if (x != y) { raft::linalg::unaryOp( @@ -463,13 +463,13 @@ void distance_impl(raft::resources const& handle, bool is_row_major, DataT) // metric_arg unused { - ops::l1_distance_op distance_op{}; + ops::l1_distance_op distance_op{}; const DataT* x_norm = nullptr; const DataT* y_norm = nullptr; cudaStream_t stream = raft::resource::get_cuda_stream(handle); - distance_matrix_dispatch( + distance_matrix_dispatch( distance_op, m, n, k, x, y, x_norm, y_norm, out, fin_op, stream, is_row_major); } @@ -523,7 +523,7 @@ void distance_impl_l2_expanded( // NOTE: different name if constexpr (__CUDACC_VER_MAJOR__ == 12) { // Always execute legacy kernels on CUDA 12 - ops::l2_exp_distance_op l2_op(perform_sqrt); + ops::l2_exp_distance_op l2_op(perform_sqrt); distance_matrix_dispatch( l2_op, m, n, k, x, y, norm_A, norm_B, out, fin_op, stream, is_row_major); } else { @@ -537,7 +537,7 @@ void distance_impl_l2_expanded( // NOTE: different name l2_op, m, n, k, x, y, norm_A, norm_B, out, fin_op, stream, is_row_major); } else { // Else use "legacy" L2 - ops::l2_exp_distance_op l2_op(perform_sqrt); + ops::l2_exp_distance_op l2_op(perform_sqrt); distance_matrix_dispatch( l2_op, m, n, k, x, y, 
norm_A, norm_B, out, fin_op, stream, is_row_major); } @@ -602,7 +602,7 @@ void distance_impl(raft::resources const& handle, DataT) // metric_arg unused { bool perform_sqrt = false; - ops::l2_unexp_distance_op l2_op(perform_sqrt); + ops::l2_unexp_distance_op l2_op(perform_sqrt); // The unexpanded L2 does not require the norms of a and b to be calculated. const DataT* norm_A = nullptr; @@ -630,7 +630,7 @@ void distance_impl(raft::resources const& handle, DataT) // metric_arg unused { bool perform_sqrt = true; - ops::l2_unexp_distance_op l2_op(perform_sqrt); + ops::l2_unexp_distance_op l2_op(perform_sqrt); // The unexpanded L2 does not require the norms of a and b to be calculated. const DataT* norm_A = nullptr; @@ -657,7 +657,7 @@ void distance_impl(raft::resources const& handle, bool is_row_major, DataT) // metric_arg unused { - ops::l_inf_distance_op distance_op{}; + ops::l_inf_distance_op distance_op{}; const DataT* x_norm = nullptr; const DataT* y_norm = nullptr; @@ -683,7 +683,7 @@ void distance_impl(raft::resources const& handle, bool is_row_major, DataT metric_arg) { - ops::lp_unexp_distance_op distance_op{metric_arg}; + ops::lp_unexp_distance_op distance_op{metric_arg}; const DataT* x_norm = nullptr; const DataT* y_norm = nullptr; @@ -709,7 +709,7 @@ void distance_impl(raft::resources const& handle, bool is_row_major, DataT) // metric_arg unused { - ops::russel_rao_distance_op distance_op{k}; + ops::russel_rao_distance_op distance_op{k}; const DataT* x_norm = nullptr; const DataT* y_norm = nullptr; diff --git a/cpp/include/raft/distance/detail/distance_ops/canberra.cuh b/cpp/include/raft/distance/detail/distance_ops/canberra.cuh index 6491b24e3d..5ddf02e705 100644 --- a/cpp/include/raft/distance/detail/distance_ops/canberra.cuh +++ b/cpp/include/raft/distance/detail/distance_ops/canberra.cuh @@ -27,6 +27,7 @@ namespace raft::distance::detail::ops { * * c_ij = sum_k |x_ik - y_kj| / ( |x_ik| + |y_kj| ) */ +template struct canberra_distance_op { // Load norms of 
input data static constexpr bool use_norms = false; @@ -36,13 +37,12 @@ struct canberra_distance_op { // Size of shared memory. This is normally decided by the kernel policy, but // some ops such as correlation_distance_op use more. - template + template constexpr size_t shared_mem_size() { return Policy::SmemSize; } - template DI void core(AccT& acc, DataT& x, DataT& y) const { const auto diff = raft::abs(x - y); @@ -52,7 +52,7 @@ struct canberra_distance_op { acc += ((add != 0) * diff / (add + (add == 0))); }; - template + template DI void epilog(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh], DataT* regxn, DataT* regyn, diff --git a/cpp/include/raft/distance/detail/distance_ops/correlation.cuh b/cpp/include/raft/distance/detail/distance_ops/correlation.cuh index 11cc3ed4f4..d46cbf6718 100644 --- a/cpp/include/raft/distance/detail/distance_ops/correlation.cuh +++ b/cpp/include/raft/distance/detail/distance_ops/correlation.cuh @@ -28,26 +28,26 @@ namespace raft::distance::detail::ops { * / * (|| x - mean(x) ||_2 || y - mean(y) ||_2) */ -template +template struct correlation_distance_op { - const DataT_struct* x2n; - const DataT_struct* y2n; - IdxT_struct m; - IdxT_struct n; - IdxT_struct k; + const DataT* x2n; + const DataT* y2n; + IdxT m; + IdxT n; + IdxT k; correlation_distance_op(bool is_row_major, - const DataT_struct* x2n_, - const DataT_struct* y2n_, - IdxT_struct m_, - IdxT_struct n_, - IdxT_struct k_) noexcept + const DataT* x2n_, + const DataT* y2n_, + IdxT m_, + IdxT n_, + IdxT k_) noexcept : x2n(x2n_), y2n(y2n_), m(m_), n(n_), k(k_) { // The distance op is typically created before the row-major/col-major // swapping has been done. So we do it here. if (!is_row_major) { - std::swap(x2n, y2n); + std::swap(x2n, y2n); std::swap(m, n); } } @@ -60,19 +60,18 @@ struct correlation_distance_op { // Size of shared memory. This is normally decided by the kernel policy, but // some ops such as correlation_distance_op use more. 
- template + template constexpr size_t shared_mem_size() { return Policy::SmemSize + (2 * (Policy::Mblk + Policy::Nblk) * sizeof(DataT)); } - template DI void core(AccT& acc, DataT& x, DataT& y) const { acc += x * y; }; - template + template DI void epilog(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh], DataT* regxn, DataT* regyn, diff --git a/cpp/include/raft/distance/detail/distance_ops/cosine.cuh b/cpp/include/raft/distance/detail/distance_ops/cosine.cuh index d26b5aeda0..422ec4a3aa 100644 --- a/cpp/include/raft/distance/detail/distance_ops/cosine.cuh +++ b/cpp/include/raft/distance/detail/distance_ops/cosine.cuh @@ -27,6 +27,7 @@ namespace raft::distance::detail::ops { * * d(x, y) = 1 - (x ⋅ y) / ( ||x||_2 ||y||_2) */ +template struct cosine_distance_op { // Load norms of input data static constexpr bool use_norms = true; @@ -36,19 +37,18 @@ struct cosine_distance_op { // Size of shared memory. This is normally decided by the kernel policy, but // some ops such as correlation_distance_op use more. 
- template + template constexpr size_t shared_mem_size() { return Policy::SmemSize + ((Policy::Mblk + Policy::Nblk) * sizeof(DataT)); } - template DI void core(AccT& acc, DataT& x, DataT& y) const { acc += x * y; }; - template + template DI void epilog(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh], DataT* regxn, DataT* regyn, diff --git a/cpp/include/raft/distance/detail/distance_ops/hamming.cuh b/cpp/include/raft/distance/detail/distance_ops/hamming.cuh index 02087e2874..6d050154d7 100644 --- a/cpp/include/raft/distance/detail/distance_ops/hamming.cuh +++ b/cpp/include/raft/distance/detail/distance_ops/hamming.cuh @@ -26,11 +26,11 @@ namespace raft::distance::detail::ops { * * c_ij = sum_k (x_ik != y_kj) / k */ -template +template struct hamming_distance_op { - IdxT_struct k; + IdxT k; - hamming_distance_op(IdxT_struct k_) noexcept : k(k_) {} + hamming_distance_op(IdxT k_) noexcept : k(k_) {} // Load norms of input data static constexpr bool use_norms = false; @@ -40,19 +40,18 @@ struct hamming_distance_op { // Size of shared memory. This is normally decided by the kernel policy, but // some ops such as correlation_distance_op use more. 
- template + template constexpr size_t shared_mem_size() { return Policy::SmemSize; } - template DI void core(AccT& acc, DataT& x, DataT& y) const { acc += (x != y); }; - template + template DI void epilog(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh], DataT* regxn, DataT* regyn, diff --git a/cpp/include/raft/distance/detail/distance_ops/hellinger.cuh b/cpp/include/raft/distance/detail/distance_ops/hellinger.cuh index 0314565a03..c5e2b84ac2 100644 --- a/cpp/include/raft/distance/detail/distance_ops/hellinger.cuh +++ b/cpp/include/raft/distance/detail/distance_ops/hellinger.cuh @@ -27,6 +27,7 @@ namespace raft::distance::detail::ops { * c_ij = sqrt(1 - sum_k sqrt(x_ik * y_kj)) * */ +template struct hellinger_distance_op { // Load norms of input data static constexpr bool use_norms = false; @@ -36,13 +37,12 @@ struct hellinger_distance_op { // Size of shared memory. This is normally decided by the kernel policy, but // some ops such as correlation_distance_op use more. - template + template constexpr size_t shared_mem_size() { return Policy::SmemSize; } - template DI void core(AccT& acc, DataT& x, DataT& y) const { // This is sqrt(x) * sqrt(y). 
@@ -50,7 +50,7 @@ struct hellinger_distance_op { acc += product; }; - template + template DI void epilog(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh], DataT* regxn, DataT* regyn, diff --git a/cpp/include/raft/distance/detail/distance_ops/jensen_shannon.cuh b/cpp/include/raft/distance/detail/distance_ops/jensen_shannon.cuh index 5e00faef74..df5aadcf3b 100644 --- a/cpp/include/raft/distance/detail/distance_ops/jensen_shannon.cuh +++ b/cpp/include/raft/distance/detail/distance_ops/jensen_shannon.cuh @@ -29,6 +29,7 @@ namespace raft::distance::detail::ops { * c_ij = sqrt(0.5 * sum( -x_i * (log(0.5 * (x_i + y_i)) - log(x_i)) * + (-y_i * (log(0.5 * (x_i + y_i)) - log(y_i))))) */ +template struct jensen_shannon_distance_op { // Load norms of input data static constexpr bool use_norms = false; @@ -38,13 +39,12 @@ struct jensen_shannon_distance_op { // Size of shared memory. This is normally decided by the kernel policy, but // some ops such as correlation_distance_op use more. - template + template constexpr size_t shared_mem_size() { return Policy::SmemSize; } - template DI void core(AccT& acc, DataT& x, DataT& y) const { const DataT m = 0.5f * (x + y); @@ -56,7 +56,7 @@ struct jensen_shannon_distance_op { acc += (-x * (logM - raft::log(x + x_zero))) + (-y * (logM - raft::log(y + y_zero))); }; - template + template DI void epilog(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh], DataT* regxn, DataT* regyn, diff --git a/cpp/include/raft/distance/detail/distance_ops/kl_divergence.cuh b/cpp/include/raft/distance/detail/distance_ops/kl_divergence.cuh index fe6e0dbbe1..526927243f 100644 --- a/cpp/include/raft/distance/detail/distance_ops/kl_divergence.cuh +++ b/cpp/include/raft/distance/detail/distance_ops/kl_divergence.cuh @@ -26,6 +26,7 @@ namespace raft::distance::detail::ops { * * c_ij = 0.5 * sum(x * log (x / y)); */ +template struct kl_divergence_op { const bool is_row_major; const bool x_equal_y; @@ -43,13 +44,12 @@ struct kl_divergence_op { // Size of 
shared memory. This is normally decided by the kernel policy, but // some ops such as correlation_distance_op use more. - template + template constexpr size_t shared_mem_size() { return Policy::SmemSize; } - template DI void core(AccT& acc, DataT& x, DataT& y) const { // TODO: make sure that these branches get hoisted out of main loop.. Could @@ -75,7 +75,7 @@ struct kl_divergence_op { } }; - template + template DI void epilog(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh], DataT* regxn, DataT* regyn, diff --git a/cpp/include/raft/distance/detail/distance_ops/l1.cuh b/cpp/include/raft/distance/detail/distance_ops/l1.cuh index bb71a7801f..f152f1d83a 100644 --- a/cpp/include/raft/distance/detail/distance_ops/l1.cuh +++ b/cpp/include/raft/distance/detail/distance_ops/l1.cuh @@ -26,6 +26,7 @@ namespace raft::distance::detail::ops { * * c_ij = sum_k abs(x_ik - y_kj) */ +template struct l1_distance_op { // Do not load norms of data, the computation of L1 distance does not use them. static constexpr bool use_norms = false; @@ -35,19 +36,18 @@ struct l1_distance_op { // Size of shared memory. This is normally decided by the kernel policy, but // some ops such as correlation_distance_op use more. 
- template + template constexpr size_t shared_mem_size() { return Policy::SmemSize; } - template DI void core(AccT& acc, DataT& x, DataT& y) const { acc += raft::abs(x - y); }; - template + template DI void epilog(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh], DataT* regxn, DataT* regyn, diff --git a/cpp/include/raft/distance/detail/distance_ops/l2_exp.cuh b/cpp/include/raft/distance/detail/distance_ops/l2_exp.cuh index d491493a63..785e7804d6 100644 --- a/cpp/include/raft/distance/detail/distance_ops/l2_exp.cuh +++ b/cpp/include/raft/distance/detail/distance_ops/l2_exp.cuh @@ -28,6 +28,7 @@ namespace raft::distance::detail::ops { * c_ij = - 2 sum_k x_ik * y_kj + ||x_i.||_2 + ||y_.j||_2 * */ +template struct l2_exp_distance_op { bool sqrt; @@ -41,19 +42,18 @@ struct l2_exp_distance_op { // Size of shared memory. This is normally decided by the kernel policy, but // some ops such as correlation_distance_op use more. - template + template constexpr size_t shared_mem_size() { return Policy::SmemSize + ((Policy::Mblk + Policy::Nblk) * sizeof(DataT)); } - template DI void core(AccT& acc, DataT& x, DataT& y) const { acc += x * y; }; - template + template DI void epilog(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh], DataT* regxn, DataT* regyn, diff --git a/cpp/include/raft/distance/detail/distance_ops/l2_unexp.cuh b/cpp/include/raft/distance/detail/distance_ops/l2_unexp.cuh index 6e75cc95e8..e03eb0a97e 100644 --- a/cpp/include/raft/distance/detail/distance_ops/l2_unexp.cuh +++ b/cpp/include/raft/distance/detail/distance_ops/l2_unexp.cuh @@ -27,6 +27,7 @@ namespace raft::distance::detail::ops { * * c_ij = optional_sqrt ( sum_k (x_ik - y_kj)^2 ) */ +template struct l2_unexp_distance_op { bool sqrt; @@ -40,20 +41,19 @@ struct l2_unexp_distance_op { // Size of shared memory. This is normally decided by the kernel policy, but // some ops such as correlation_distance_op use more. 
- template + template constexpr size_t shared_mem_size() { return Policy::SmemSize; } - template DI void core(AccT& acc, DataT& x, DataT& y) const { const auto diff = x - y; acc += diff * diff; }; - template + template DI void epilog(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh], DataT* regxn, DataT* regyn, diff --git a/cpp/include/raft/distance/detail/distance_ops/l_inf.cuh b/cpp/include/raft/distance/detail/distance_ops/l_inf.cuh index 0d515faa23..caa1379133 100644 --- a/cpp/include/raft/distance/detail/distance_ops/l_inf.cuh +++ b/cpp/include/raft/distance/detail/distance_ops/l_inf.cuh @@ -27,6 +27,7 @@ namespace raft::distance::detail::ops { * * c_ij = max_k | x_ik - y_kj | */ +template struct l_inf_distance_op { // Load norms of input data static constexpr bool use_norms = false; @@ -36,20 +37,19 @@ struct l_inf_distance_op { // Size of shared memory. This is normally decided by the kernel policy, but // some ops such as correlation_distance_op use more. - template + template constexpr size_t shared_mem_size() { return Policy::SmemSize; } - template DI void core(AccT& acc, DataT& x, DataT& y) const { const auto diff = raft::abs(x - y); acc = raft::max(acc, diff); }; - template + template DI void epilog(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh], DataT* regxn, DataT* regyn, diff --git a/cpp/include/raft/distance/detail/distance_ops/lp_unexp.cuh b/cpp/include/raft/distance/detail/distance_ops/lp_unexp.cuh index 4af6888ddf..a4a090d058 100644 --- a/cpp/include/raft/distance/detail/distance_ops/lp_unexp.cuh +++ b/cpp/include/raft/distance/detail/distance_ops/lp_unexp.cuh @@ -26,11 +26,11 @@ namespace raft::distance::detail::ops { * * c_ij = (sum_k |x_ik - y_jk|^p)^(1/p) */ -template +template struct lp_unexp_distance_op { - DataT_struct p; + DataT p; - lp_unexp_distance_op(DataT_struct p_) noexcept : p(p_) {} + lp_unexp_distance_op(DataT p_) noexcept : p(p_) {} // Load norms of input data static constexpr bool use_norms = false; @@ -40,20 +40,19 @@ 
struct lp_unexp_distance_op { // Size of shared memory. This is normally decided by the kernel policy, but // some ops such as correlation_distance_op use more. - template + template constexpr size_t shared_mem_size() { return Policy::SmemSize; } - template DI void core(AccT& acc, DataT& x, DataT& y) const { const auto diff = raft::abs(x - y); acc += raft::pow(diff, p); }; - template + template DI void epilog(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh], DataT* regxn, DataT* regyn, diff --git a/cpp/include/raft/distance/detail/distance_ops/russel_rao.cuh b/cpp/include/raft/distance/detail/distance_ops/russel_rao.cuh index f9fbc7221b..0bac3beaff 100644 --- a/cpp/include/raft/distance/detail/distance_ops/russel_rao.cuh +++ b/cpp/include/raft/distance/detail/distance_ops/russel_rao.cuh @@ -27,12 +27,12 @@ namespace raft::distance::detail::ops { * * c_ij = (k - (sum_k x_ik * y_kj)) / k */ -template +template struct russel_rao_distance_op { - IdxT_struct k; + IdxT k; const float one_over_k; - russel_rao_distance_op(IdxT_struct k_) noexcept : k(k_), one_over_k(1.0f / k_) {} + russel_rao_distance_op(IdxT k_) noexcept : k(k_), one_over_k(1.0f / k_) {} // Load norms of input data static constexpr bool use_norms = false; @@ -42,19 +42,18 @@ struct russel_rao_distance_op { // Size of shared memory. This is normally decided by the kernel policy, but // some ops such as correlation_distance_op use more. 
- template + template constexpr size_t shared_mem_size() { return Policy::SmemSize; } - template DI void core(AccT& acc, DataT& x, DataT& y) const { acc += x * y; }; - template + template DI void epilog(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh], DataT* regxn, DataT* regyn, diff --git a/cpp/include/raft/distance/detail/distance_ops/template.cuh b/cpp/include/raft/distance/detail/distance_ops/template.cuh index 1d2d681b18..b978cf2a36 100644 --- a/cpp/include/raft/distance/detail/distance_ops/template.cuh +++ b/cpp/include/raft/distance/detail/distance_ops/template.cuh @@ -24,6 +24,7 @@ namespace raft::distance::detail::ops { // // Fill in the TODO items. +template struct template_distance_op { TODO member; @@ -43,13 +44,12 @@ struct template_distance_op { return Policy::SmemSize + TODO; } - template DI void core(AccT& acc, DataT& x, DataT& y) const { TODO; }; - template + template DI void epilog(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh], DataT* regxn, DataT* regyn, diff --git a/cpp/include/raft/distance/detail/pairwise_matrix/kernel_sm60.cuh b/cpp/include/raft/distance/detail/pairwise_matrix/kernel_sm60.cuh index 1e450f9289..6856c09c37 100644 --- a/cpp/include/raft/distance/detail/pairwise_matrix/kernel_sm60.cuh +++ b/cpp/include/raft/distance/detail/pairwise_matrix/kernel_sm60.cuh @@ -56,7 +56,7 @@ __global__ __launch_bounds__(Policy::Nthreads, 2) void pairwise_matrix_kernel(co IdxT gridStrideY) { // Use .template to disambiguate (See: // https://en.cppreference.com/w/cpp/language/dependent_name) - distance_op.template epilog( + distance_op.template epilog( acc, regxn, regyn, gridStrideX, gridStrideY); }; @@ -123,7 +123,7 @@ void pairwise_matrix(OpT distance_op, dim3 blk(Policy::Nthreads); // Use .template to disambiguate (See: // https://en.cppreference.com/w/cpp/language/dependent_name) - size_t smem_size = distance_op.template shared_mem_size(); + size_t smem_size = distance_op.template shared_mem_size(); // Obtain function pointer to kernel 
auto kernel = pairwise_matrix_kernel; dim3 grid = launchConfigGenerator(m, n, smem_size, kernel); From cd38ec646a6166f8263b8e0bad98aef39ed65898 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Wed, 22 Feb 2023 14:35:24 +0100 Subject: [PATCH 55/60] Fix style --- .../distance/detail/distance_ops/correlation.cuh | 13 +++---------- .../raft/distance/detail/distance_ops/cosine.cuh | 5 +---- .../raft/distance/detail/distance_ops/hamming.cuh | 5 +---- .../raft/distance/detail/distance_ops/l1.cuh | 5 +---- .../raft/distance/detail/distance_ops/l2_exp.cuh | 5 +---- .../distance/detail/distance_ops/russel_rao.cuh | 5 +---- .../raft/distance/detail/distance_ops/template.cuh | 5 +---- .../distance/detail/pairwise_matrix/kernel_sm60.cuh | 3 +-- 8 files changed, 10 insertions(+), 36 deletions(-) diff --git a/cpp/include/raft/distance/detail/distance_ops/correlation.cuh b/cpp/include/raft/distance/detail/distance_ops/correlation.cuh index d46cbf6718..3832104280 100644 --- a/cpp/include/raft/distance/detail/distance_ops/correlation.cuh +++ b/cpp/include/raft/distance/detail/distance_ops/correlation.cuh @@ -36,12 +36,8 @@ struct correlation_distance_op { IdxT n; IdxT k; - correlation_distance_op(bool is_row_major, - const DataT* x2n_, - const DataT* y2n_, - IdxT m_, - IdxT n_, - IdxT k_) noexcept + correlation_distance_op( + bool is_row_major, const DataT* x2n_, const DataT* y2n_, IdxT m_, IdxT n_, IdxT k_) noexcept : x2n(x2n_), y2n(y2n_), m(m_), n(n_), k(k_) { // The distance op is typically created before the row-major/col-major @@ -66,10 +62,7 @@ struct correlation_distance_op { return Policy::SmemSize + (2 * (Policy::Mblk + Policy::Nblk) * sizeof(DataT)); } - DI void core(AccT& acc, DataT& x, DataT& y) const - { - acc += x * y; - }; + DI void core(AccT& acc, DataT& x, DataT& y) const { acc += x * y; }; template DI void epilog(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh], diff --git a/cpp/include/raft/distance/detail/distance_ops/cosine.cuh 
b/cpp/include/raft/distance/detail/distance_ops/cosine.cuh index 422ec4a3aa..c3f3b75e62 100644 --- a/cpp/include/raft/distance/detail/distance_ops/cosine.cuh +++ b/cpp/include/raft/distance/detail/distance_ops/cosine.cuh @@ -43,10 +43,7 @@ struct cosine_distance_op { return Policy::SmemSize + ((Policy::Mblk + Policy::Nblk) * sizeof(DataT)); } - DI void core(AccT& acc, DataT& x, DataT& y) const - { - acc += x * y; - }; + DI void core(AccT& acc, DataT& x, DataT& y) const { acc += x * y; }; template DI void epilog(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh], diff --git a/cpp/include/raft/distance/detail/distance_ops/hamming.cuh b/cpp/include/raft/distance/detail/distance_ops/hamming.cuh index 6d050154d7..98acf11560 100644 --- a/cpp/include/raft/distance/detail/distance_ops/hamming.cuh +++ b/cpp/include/raft/distance/detail/distance_ops/hamming.cuh @@ -46,10 +46,7 @@ struct hamming_distance_op { return Policy::SmemSize; } - DI void core(AccT& acc, DataT& x, DataT& y) const - { - acc += (x != y); - }; + DI void core(AccT& acc, DataT& x, DataT& y) const { acc += (x != y); }; template DI void epilog(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh], diff --git a/cpp/include/raft/distance/detail/distance_ops/l1.cuh b/cpp/include/raft/distance/detail/distance_ops/l1.cuh index f152f1d83a..b02971bac7 100644 --- a/cpp/include/raft/distance/detail/distance_ops/l1.cuh +++ b/cpp/include/raft/distance/detail/distance_ops/l1.cuh @@ -42,10 +42,7 @@ struct l1_distance_op { return Policy::SmemSize; } - DI void core(AccT& acc, DataT& x, DataT& y) const - { - acc += raft::abs(x - y); - }; + DI void core(AccT& acc, DataT& x, DataT& y) const { acc += raft::abs(x - y); }; template DI void epilog(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh], diff --git a/cpp/include/raft/distance/detail/distance_ops/l2_exp.cuh b/cpp/include/raft/distance/detail/distance_ops/l2_exp.cuh index 785e7804d6..b68c44c8ba 100644 --- a/cpp/include/raft/distance/detail/distance_ops/l2_exp.cuh +++ 
b/cpp/include/raft/distance/detail/distance_ops/l2_exp.cuh @@ -48,10 +48,7 @@ struct l2_exp_distance_op { return Policy::SmemSize + ((Policy::Mblk + Policy::Nblk) * sizeof(DataT)); } - DI void core(AccT& acc, DataT& x, DataT& y) const - { - acc += x * y; - }; + DI void core(AccT& acc, DataT& x, DataT& y) const { acc += x * y; }; template DI void epilog(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh], diff --git a/cpp/include/raft/distance/detail/distance_ops/russel_rao.cuh b/cpp/include/raft/distance/detail/distance_ops/russel_rao.cuh index 0bac3beaff..7acd858e49 100644 --- a/cpp/include/raft/distance/detail/distance_ops/russel_rao.cuh +++ b/cpp/include/raft/distance/detail/distance_ops/russel_rao.cuh @@ -48,10 +48,7 @@ struct russel_rao_distance_op { return Policy::SmemSize; } - DI void core(AccT& acc, DataT& x, DataT& y) const - { - acc += x * y; - }; + DI void core(AccT& acc, DataT& x, DataT& y) const { acc += x * y; }; template DI void epilog(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh], diff --git a/cpp/include/raft/distance/detail/distance_ops/template.cuh b/cpp/include/raft/distance/detail/distance_ops/template.cuh index b978cf2a36..b0f40123aa 100644 --- a/cpp/include/raft/distance/detail/distance_ops/template.cuh +++ b/cpp/include/raft/distance/detail/distance_ops/template.cuh @@ -44,10 +44,7 @@ struct template_distance_op { return Policy::SmemSize + TODO; } - DI void core(AccT& acc, DataT& x, DataT& y) const - { - TODO; - }; + DI void core(AccT& acc, DataT& x, DataT& y) const { TODO; }; template DI void epilog(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh], diff --git a/cpp/include/raft/distance/detail/pairwise_matrix/kernel_sm60.cuh b/cpp/include/raft/distance/detail/pairwise_matrix/kernel_sm60.cuh index 6856c09c37..db7ceb64f4 100644 --- a/cpp/include/raft/distance/detail/pairwise_matrix/kernel_sm60.cuh +++ b/cpp/include/raft/distance/detail/pairwise_matrix/kernel_sm60.cuh @@ -56,8 +56,7 @@ __global__ __launch_bounds__(Policy::Nthreads, 
2) void pairwise_matrix_kernel(co IdxT gridStrideY) { // Use .template to disambiguate (See: // https://en.cppreference.com/w/cpp/language/dependent_name) - distance_op.template epilog( - acc, regxn, regyn, gridStrideX, gridStrideY); + distance_op.template epilog(acc, regxn, regyn, gridStrideX, gridStrideY); }; // No support for row_epilog_op. From 646722114f872d13f5c0fcdc3d911e3f0abbad66 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Mon, 6 Mar 2023 17:59:04 +0100 Subject: [PATCH 56/60] Update cpp/include/raft/distance/detail/distance_ops/canberra.cuh Co-authored-by: Tamas Bela Feher --- cpp/include/raft/distance/detail/distance_ops/canberra.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/raft/distance/detail/distance_ops/canberra.cuh b/cpp/include/raft/distance/detail/distance_ops/canberra.cuh index 5ddf02e705..45bea08a95 100644 --- a/cpp/include/raft/distance/detail/distance_ops/canberra.cuh +++ b/cpp/include/raft/distance/detail/distance_ops/canberra.cuh @@ -48,7 +48,7 @@ struct canberra_distance_op { const auto diff = raft::abs(x - y); const auto add = raft::abs(x) + raft::abs(y); // deal with potential for 0 in denominator by - // forcing 1/0 instead + // forcing 0/1 instead acc += ((add != 0) * diff / (add + (add == 0))); }; From a83461e816417da12e313018e9b5a08437207044 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Mon, 6 Mar 2023 18:04:39 +0100 Subject: [PATCH 57/60] Update cpp/include/raft/distance/detail/distance.cuh Co-authored-by: Tamas Bela Feher --- cpp/include/raft/distance/detail/distance.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/raft/distance/detail/distance.cuh b/cpp/include/raft/distance/detail/distance.cuh index 621e2d15b9..7887eb96be 100644 --- a/cpp/include/raft/distance/detail/distance.cuh +++ b/cpp/include/raft/distance/detail/distance.cuh @@ -270,7 +270,7 @@ void distance_impl(raft::resources const& handle, distance_matrix_cutlass_dispatch( 
distance_op, m, n, k, x, y, norm_A, norm_B, out, fin_op, stream, is_row_major); } else { - // Else use "legacy" L2 + // Else use "legacy" cosine kernel ops::cosine_distance_op distance_op{}; distance_matrix_dispatch( distance_op, m, n, k, x, y, norm_A, norm_B, out, fin_op, stream, is_row_major); From 393edf337c43594ef233799ae63e73ccc3fd3451 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Mon, 6 Mar 2023 18:25:14 +0100 Subject: [PATCH 58/60] Add note about alignment in case of byte input --- .../raft/distance/detail/pairwise_matrix/dispatch.cuh | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/cpp/include/raft/distance/detail/pairwise_matrix/dispatch.cuh b/cpp/include/raft/distance/detail/pairwise_matrix/dispatch.cuh index 23d0f34489..c95241cd0d 100644 --- a/cpp/include/raft/distance/detail/pairwise_matrix/dispatch.cuh +++ b/cpp/include/raft/distance/detail/pairwise_matrix/dispatch.cuh @@ -127,7 +127,15 @@ void distance_matrix_dispatch(OpT distance_op, // Compute number of elements that can be loaded in one instruction // without causing misalignent errors. - int vec_len_aligned = (byte_alignment % sizeof(DataT) == 0) ? byte_alignment / sizeof(DataT) : 1; + int vec_len_aligned; + if (byte_alignment % sizeof(DataT) == 0) { + // In the future, we might support `int8_t` input. In that case, + // byte_alignment / sizeof(DataT) might exceed 4. We maximize at 4 here, to + // prevent adding more cases in dispatch (which are expensive to compile). 
+ vec_len_aligned = min(4, byte_alignment / sizeof(DataT)); + } else { + vec_len_aligned = 1; + } dispatch(is_row_major, vec_len_aligned, [&](auto row_major, auto vec_len_aligned) { // row_major and vec_len are std::integral_constants of type bool and int From 48a0c21ce78454b1cb10d7875bcf3985c27df3f7 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Tue, 7 Mar 2023 09:46:03 +0100 Subject: [PATCH 59/60] Fix --- cpp/include/raft/distance/detail/pairwise_matrix/dispatch.cuh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/include/raft/distance/detail/pairwise_matrix/dispatch.cuh b/cpp/include/raft/distance/detail/pairwise_matrix/dispatch.cuh index c95241cd0d..9def354600 100644 --- a/cpp/include/raft/distance/detail/pairwise_matrix/dispatch.cuh +++ b/cpp/include/raft/distance/detail/pairwise_matrix/dispatch.cuh @@ -16,6 +16,7 @@ #pragma once #include "kernel_sm60.cuh" +#include #include #include #include @@ -132,7 +133,7 @@ void distance_matrix_dispatch(OpT distance_op, // In the future, we might support `int8_t` input. In that case, // byte_alignment / sizeof(DataT) might exceed 4. We maximize at 4 here, to // prevent adding more cases in dispatch (which are expensive to compile). 
- vec_len_aligned = min(4, byte_alignment / sizeof(DataT)); + vec_len_aligned = std::min(4, int(byte_alignment / sizeof(DataT))); } else { vec_len_aligned = 1; } From 569b2c214845b152b2c9252465af38eba8df1260 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Thu, 9 Mar 2023 09:52:17 +0100 Subject: [PATCH 60/60] Add issue for TODO --- .../raft/distance/detail/pairwise_matrix/kernel_sm60.cuh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/include/raft/distance/detail/pairwise_matrix/kernel_sm60.cuh b/cpp/include/raft/distance/detail/pairwise_matrix/kernel_sm60.cuh index db7ceb64f4..7c1052d726 100644 --- a/cpp/include/raft/distance/detail/pairwise_matrix/kernel_sm60.cuh +++ b/cpp/include/raft/distance/detail/pairwise_matrix/kernel_sm60.cuh @@ -45,7 +45,8 @@ __global__ __launch_bounds__(Policy::Nthreads, 2) void pairwise_matrix_kernel(co { extern __shared__ char smem[]; - // Wrap operator back into lambdas. This is temporary and should be removed. (TODO) + // Wrap operator back into lambdas. This is temporary and should be removed. + // See: https://github.com/rapidsai/raft/issues/1323 auto core_op = [distance_op] __device__(AccT & acc, DataT & x, DataT & y) { distance_op.core(acc, x, y); };