From 7376902dde9d0c0a3c3b41814a147201f86d1416 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Fri, 26 Aug 2022 13:18:32 +0200 Subject: [PATCH 1/5] fusedL2NN: Optimize when data is skinny --- cpp/bench/spatial/fused_l2_nn.cu | 31 ++++++++-- .../raft/distance/detail/fused_l2_nn.cuh | 5 +- cpp/include/raft/distance/fused_l2_nn.cuh | 59 ++++++++++++++++--- cpp/include/raft/linalg/contractions.cuh | 26 ++++++++ 4 files changed, 108 insertions(+), 13 deletions(-) diff --git a/cpp/bench/spatial/fused_l2_nn.cu b/cpp/bench/spatial/fused_l2_nn.cu index e5b5dc377a..9e88f23e73 100644 --- a/cpp/bench/spatial/fused_l2_nn.cu +++ b/cpp/bench/spatial/fused_l2_nn.cu @@ -22,9 +22,8 @@ #include #include -#if defined RAFT_NN_COMPILED -#include -#endif +// Note: do not include raft/spatial/knn/specializations.hpp based on +// RAFT_NN_COMPILED, as fusedL2NN is not specialized and not defined there. namespace raft::bench::spatial { @@ -73,6 +72,30 @@ struct fused_l2_nn : public fixture { false, stream); }); + + // Num distance calculations + int64_t num_dist_calcs = (int64_t)params.n * (int64_t)params.m; + + int64_t num_flops = 3 * num_dist_calcs * params.k; + + int64_t read_elts = (int64_t)params.n * params.k + (int64_t)params.m * params.k; + int64_t write_elts = (int64_t)params.n; + + state.counters["D/s"] = benchmark::Counter(num_dist_calcs, + benchmark::Counter::kIsIterationInvariantRate, + benchmark::Counter::OneK::kIs1000); + + state.counters["FLOP/s"] = benchmark::Counter( + num_flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::OneK::kIs1000); + + state.counters["BW Wr"] = benchmark::Counter(write_elts * sizeof(cub::KeyValuePair), + benchmark::Counter::kIsIterationInvariantRate, + benchmark::Counter::OneK::kIs1000); + state.counters["BW Rd"] = benchmark::Counter(read_elts * sizeof(float), + benchmark::Counter::kIsIterationInvariantRate, + benchmark::Counter::OneK::kIs1000); + + state.counters["K"] = benchmark::Counter(params.k); } private: @@ -88,9 +111,9 @@ const std::vector fused_l2_nn_input_vecs = { {32, 16384, 16384}, {64, 16384, 16384}, {128, 16384, 16384}, {256, 16384, 16384}, {512, 16384, 16384}, {1024, 16384, 16384}, {16384, 32, 16384}, {16384, 64, 16384}, {16384, 128, 16384}, {16384, 256, 16384}, {16384, 512, 16384}, {16384, 1024, 16384}, + {16384, 16384, 2}, {16384, 16384, 4}, {16384, 16384, 8}, {16384, 16384, 16}, {16384, 16384, 32}, {16384, 16384, 64}, {16384, 16384, 128}, {16384, 16384, 256}, {16384, 16384, 512}, {16384, 16384, 1024}, {16384, 16384, 16384}, - }; RAFT_BENCH_REGISTER(fused_l2_nn, "", fused_l2_nn_input_vecs); diff --git a/cpp/include/raft/distance/detail/fused_l2_nn.cuh b/cpp/include/raft/distance/detail/fused_l2_nn.cuh index 81d02c410c..39bd1508f8 100644 --- a/cpp/include/raft/distance/detail/fused_l2_nn.cuh +++ b/cpp/include/raft/distance/detail/fused_l2_nn.cuh @@ -261,7 +261,7 @@ __global__ __launch_bounds__(P::Nthreads, 2) void fusedL2NNkernel(OutT* min, template void fusedL2NNImpl(OutT* min, @@ -279,7 +279,8 @@ void fusedL2NNImpl(OutT* min, bool initOutBuffer, cudaStream_t stream) { - typedef typename linalg::Policy4x4::Policy P; + // The kernel policy is determined by fusedL2NN. 
+ typedef Policy P; dim3 blk(P::Nthreads); auto nblks = raft::ceildiv(m, P::Nthreads); diff --git a/cpp/include/raft/distance/fused_l2_nn.cuh b/cpp/include/raft/distance/fused_l2_nn.cuh index ac8895c9ce..121ccbf60d 100644 --- a/cpp/include/raft/distance/fused_l2_nn.cuh +++ b/cpp/include/raft/distance/fused_l2_nn.cuh @@ -24,6 +24,7 @@ #include #include #include +#include #include namespace raft { @@ -99,20 +100,64 @@ void fusedL2NN(OutT* min, bool initOutBuffer, cudaStream_t stream) { + // When k is smaller than 32, the Policy4x4 results in redundant calculations + // as it uses tiles that have k=32. Therefore, use a "skinny" policy instead + // that uses tiles with a smaller value of k. + bool is_skinny = k < 32; + size_t bytes = sizeof(DataT) * k; if (16 % sizeof(DataT) == 0 && bytes % 16 == 0) { - detail::fusedL2NNImpl( - min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt, initOutBuffer, stream); + if (is_skinny) { + detail::fusedL2NNImpl::Policy, + ReduceOpT>( + min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt, initOutBuffer, stream); + } else { + detail::fusedL2NNImpl::Policy, + ReduceOpT>( + min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt, initOutBuffer, stream); + } } else if (8 % sizeof(DataT) == 0 && bytes % 8 == 0) { - detail::fusedL2NNImpl( - min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt, initOutBuffer, stream); + if (is_skinny) { + detail::fusedL2NNImpl::Policy, + ReduceOpT>( + min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt, initOutBuffer, stream); + } else { + detail::fusedL2NNImpl::Policy, + ReduceOpT>( + min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt, initOutBuffer, stream); + } } else { - detail::fusedL2NNImpl( - min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt, initOutBuffer, stream); + if (is_skinny) { + detail::fusedL2NNImpl::Policy, + ReduceOpT>( + min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt, initOutBuffer, stream); + } else { + detail::fusedL2NNImpl::Policy, + ReduceOpT>( + min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt, initOutBuffer, stream); + } } } } // namespace distance } // namespace raft -#endif \ No newline at end of file +#endif diff --git a/cpp/include/raft/linalg/contractions.cuh b/cpp/include/raft/linalg/contractions.cuh index 5ccbd15c3d..800632ada5 100644 --- a/cpp/include/raft/linalg/contractions.cuh +++ b/cpp/include/raft/linalg/contractions.cuh @@ -167,6 +167,32 @@ struct Policy4x4 { }; /** @} */ +/** + * @defgroup Policy4x4Skinny + * + * A smaller k-block (8 instead of 32) with fewer threads per block (8x8 instead + * of 16x16), which is faster for raft::distance::fusedL2NN on skinny matrices, + * i.e., matrices with a small k dimension. + * + * @{ + */ +template +struct Policy4x4Skinny { +}; + +template +struct Policy4x4Skinny { + typedef KernelPolicy Policy; + typedef ColKernelPolicy ColPolicy; +}; + +template +struct Policy4x4Skinny { + typedef KernelPolicy Policy; + typedef ColKernelPolicy ColPolicy; +}; +/** @} */ + /** * @defgroup Policy2x8 16 elements per thread Policy with k-block = 16 * @{ From d6027b83e1ec6d27c72f94c51a7c945318292dd9 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Thu, 1 Sep 2022 14:01:12 +0200 Subject: [PATCH 2/5] fusedL2NN: Add test cases for skinny matrices There were very few test cases for skinny matrices. They have now been added. 
--- cpp/test/distance/fused_l2_nn.cu | 84 +++++++++++++++++++++++++++----- 1 file changed, 72 insertions(+), 12 deletions(-) diff --git a/cpp/test/distance/fused_l2_nn.cu b/cpp/test/distance/fused_l2_nn.cu index 192f0c9a74..c01e8c74fc 100644 --- a/cpp/test/distance/fused_l2_nn.cu +++ b/cpp/test/distance/fused_l2_nn.cu @@ -102,6 +102,23 @@ struct Inputs { DataT tolerance; int m, n, k; unsigned long long int seed; + + friend std::ostream& operator<<(std::ostream& os, const Inputs& p) + { + return os << "m: " << p.m + << ", " + "n: " + << p.n + << ", " + "k: " + << p.k + << ", " + "seed: " + << p.seed + << ", " + "tol: " + << p.tolerance; + } }; template @@ -231,19 +248,62 @@ template } const std::vector> inputsf = { - {0.001f, 32, 32, 32, 1234ULL}, {0.001f, 32, 64, 32, 1234ULL}, {0.001f, 64, 32, 32, 1234ULL}, - {0.001f, 64, 64, 32, 1234ULL}, {0.001f, 128, 32, 32, 1234ULL}, {0.001f, 128, 64, 32, 1234ULL}, - {0.001f, 128, 128, 64, 1234ULL}, {0.001f, 64, 128, 128, 1234ULL}, - - {0.001f, 32, 32, 34, 1234ULL}, {0.001f, 32, 64, 34, 1234ULL}, {0.001f, 64, 32, 34, 1234ULL}, - {0.001f, 64, 64, 34, 1234ULL}, {0.001f, 128, 32, 34, 1234ULL}, {0.001f, 128, 64, 34, 1234ULL}, - {0.001f, 128, 128, 66, 1234ULL}, {0.001f, 64, 128, 130, 1234ULL}, - - {0.001f, 32, 32, 33, 1234ULL}, {0.001f, 32, 64, 33, 1234ULL}, {0.001f, 64, 32, 33, 1234ULL}, - {0.001f, 64, 64, 33, 1234ULL}, {0.001f, 128, 32, 33, 1234ULL}, {0.001f, 128, 64, 33, 1234ULL}, - {0.001f, 128, 128, 65, 1234ULL}, {0.001f, 64, 128, 129, 1234ULL}, - + {0.001f, 32, 32, 32, 1234ULL}, + {0.001f, 32, 64, 32, 1234ULL}, + {0.001f, 64, 32, 32, 1234ULL}, + {0.001f, 64, 64, 32, 1234ULL}, + {0.001f, 128, 32, 32, 1234ULL}, + {0.001f, 128, 64, 32, 1234ULL}, + {0.001f, 128, 128, 64, 1234ULL}, + {0.001f, 64, 128, 128, 1234ULL}, + + {0.001f, 32, 32, 34, 1234ULL}, + {0.001f, 32, 64, 34, 1234ULL}, + {0.001f, 64, 32, 34, 1234ULL}, + {0.001f, 64, 64, 34, 1234ULL}, + {0.001f, 128, 32, 34, 1234ULL}, + {0.001f, 128, 64, 34, 1234ULL}, + {0.001f, 128, 128, 66, 1234ULL}, + {0.001f, 64, 128, 130, 1234ULL}, + + {0.001f, 32, 32, 33, 1234ULL}, + {0.001f, 32, 64, 33, 1234ULL}, + {0.001f, 64, 32, 33, 1234ULL}, + {0.001f, 64, 64, 33, 1234ULL}, + {0.001f, 128, 32, 33, 1234ULL}, + {0.001f, 128, 64, 33, 1234ULL}, + {0.001f, 128, 128, 65, 1234ULL}, + {0.001f, 64, 128, 129, 1234ULL}, {0.006f, 1805, 134, 2, 1234ULL}, + + // Repeat with smaller values of k + {0.006f, 32, 32, 1, 1234ULL}, + {0.001f, 32, 64, 2, 1234ULL}, + {0.001f, 64, 32, 3, 1234ULL}, + {0.001f, 64, 64, 4, 1234ULL}, + {0.001f, 128, 32, 5, 1234ULL}, + {0.001f, 128, 64, 6, 1234ULL}, + {0.001f, 128, 128, 7, 1234ULL}, + {0.001f, 64, 128, 8, 1234ULL}, + + {0.001f, 32, 32, 9, 1234ULL}, + {0.001f, 32, 64, 10, 1234ULL}, + {0.001f, 64, 32, 11, 1234ULL}, + {0.001f, 64, 64, 12, 1234ULL}, + {0.001f, 128, 32, 13, 1234ULL}, + {0.001f, 128, 64, 14, 1234ULL}, + {0.001f, 128, 128, 15, 1234ULL}, + {0.001f, 64, 128, 16, 1234ULL}, + + {0.001f, 32, 32, 17, 1234ULL}, + {0.001f, 32, 64, 18, 1234ULL}, + {0.001f, 64, 32, 19, 1234ULL}, + {0.001f, 64, 64, 20, 1234ULL}, + {0.001f, 128, 32, 21, 1234ULL}, + {0.001f, 128, 64, 22, 1234ULL}, + {0.001f, 128, 128, 23, 1234ULL}, + {0.00001, 64, 128, 24, 1234ULL}, + {0.001f, 1805, 134, 25, 1234ULL}, }; typedef FusedL2NNTest FusedL2NNTestF_Sq; TEST_P(FusedL2NNTestF_Sq, Result) From 89cbb7f6cac8464f5052d085ca17c50ce8d1ea66 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Thu, 1 Sep 2022 14:02:42 +0200 Subject: [PATCH 3/5] fusedL2NN: Fix updateReDucedVal with >2 rows/warp In updateReDucedVal, a single warp can 
contain multiple rows (in registers). A single thread within the warp uses the
first element of each row to update an output array (atomically).

In the previous implementation, a shuffle was used to move the head of each row
into the first thread of the warp. Unfortunately, this would overwrite the
values of all other rows. This strategy, however, worked when the number of
rows per warp equalled 2. Hence, the bug never triggered.

In a recent commit, the number of rows per warp was increased to four in
certain situations (skinny matrices). Hence, this bug triggered.

In the new implementation, the values are not shuffled into the first thread of
the warp any more. Instead, the threads that contain the first element of a row
update the output in sequential order. The sequential ordering is necessary to
avoid deadlock on the Pascal architecture.
---
 cpp/include/raft/distance/detail/fused_l2_nn.cuh | 16 ++++------------
 1 file changed, 4 insertions(+), 12 deletions(-)

diff --git a/cpp/include/raft/distance/detail/fused_l2_nn.cuh b/cpp/include/raft/distance/detail/fused_l2_nn.cuh
index 39bd1508f8..78becc1b5b 100644
--- a/cpp/include/raft/distance/detail/fused_l2_nn.cuh
+++ b/cpp/include/raft/distance/detail/fused_l2_nn.cuh
@@ -92,14 +92,14 @@ DI void updateReducedVal(
   const auto lid = threadIdx.x % raft::WarpSize;
   const auto accrowid = threadIdx.x / P::AccThCols;

-  // for now have first lane from each warp update a unique output row. This
-  // will resolve hang issues with pre-Volta architectures
+  // Update each output row in order within a warp. This will resolve hang
+  // issues with pre-Volta architectures
 #pragma unroll
   for (int j = 0; j < (raft::WarpSize / P::AccThCols); j++) {
-    if (lid == 0) {
+    if (lid == j * P::AccThCols) {
 #pragma unroll
       for (int i = 0; i < P::AccRowsPerTh; ++i) {
-        auto rid = gridStrideY + accrowid + j + i * P::AccThRows;
+        auto rid = gridStrideY + accrowid + i * P::AccThRows;
         if (rid < m) {
           auto value = val[i];
           while (atomicCAS(mutex + rid, 0, 1) == 1)
@@ -111,14 +111,6 @@
         }
       }
     }
-    if (j < (raft::WarpSize / P::AccThCols) - 1) {
-#pragma unroll
-      for (int i = 0; i < P::AccRowsPerTh; ++i) {
-        auto tmpkey = raft::shfl(val[i].key, (j + 1) * P::AccThCols);
-        auto tmpvalue = raft::shfl(val[i].value, (j + 1) * P::AccThCols);
-        val[i] = {tmpkey, tmpvalue};
-      }
-    }
   }
 }


From 87319de8d0925f6f72258ad12f7eaee3c1c594eb Mon Sep 17 00:00:00 2001
From: Allard Hendriksen
Date: Thu, 1 Sep 2022 14:13:19 +0200
Subject: [PATCH 4/5] fusedL2NN: Preventatively reduce shfl_sync width

In the current implementation, it looks like values from different rows are
mixed together in what should be a row-wise warp reduce. All tests do pass,
however. Just in case, I have added a width parameter to the shuffle so that it
only shuffles within a row within the warp.
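As an illustration only (not part of the patch), a minimal standalone CUDA
sketch of how the width argument of a warp shuffle behaves; the kernel name,
launch setup, and width value of 8 are made up for the example. The point is
that with width = 8 the warp acts as four independent 8-lane segments, and an
out-of-range source lane wraps within its own segment, which is what a
row-wise warp reduce relies on.

  #include <cstdio>

  __global__ void shfl_width_demo(int* out)
  {
    int lane = threadIdx.x & 31;
    // With width = 8, the source index (lane + 1) wraps within each 8-lane
    // segment, so lane 7 reads from lane 0, lane 15 from lane 8, and so on.
    out[lane] = __shfl_sync(0xffffffffu, lane, lane + 1, 8);
  }

  int main()
  {
    int* d_out;
    int h_out[32];
    cudaMalloc(&d_out, 32 * sizeof(int));
    shfl_width_demo<<<1, 32>>>(d_out);
    cudaMemcpy(h_out, d_out, sizeof(h_out), cudaMemcpyDeviceToHost);
    for (int i = 0; i < 32; i++) printf("%d ", h_out[i]);  // 1 2 ... 7 0 9 ... 15 8 ...
    printf("\n");
    cudaFree(d_out);
    return 0;
  }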
--- cpp/include/raft/distance/detail/fused_l2_nn.cuh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/include/raft/distance/detail/fused_l2_nn.cuh b/cpp/include/raft/distance/detail/fused_l2_nn.cuh index 78becc1b5b..6a51bdcf1a 100644 --- a/cpp/include/raft/distance/detail/fused_l2_nn.cuh +++ b/cpp/include/raft/distance/detail/fused_l2_nn.cuh @@ -202,8 +202,8 @@ __global__ __launch_bounds__(P::Nthreads, 2) void fusedL2NNkernel(OutT* min, for (int i = 0; i < P::AccRowsPerTh; ++i) { #pragma unroll for (int j = P::AccThCols / 2; j > 0; j >>= 1) { - auto tmpkey = raft::shfl(val[i].key, lid + j); - auto tmpvalue = raft::shfl(val[i].value, lid + j); + auto tmpkey = raft::shfl(val[i].key, lid + j, P::AccThCols); + auto tmpvalue = raft::shfl(val[i].value, lid + j, P::AccThCols); KVPair tmp = {tmpkey, tmpvalue}; val[i] = pairRed_op(accrowid + i * P::AccThRows + gridStrideY, tmp, val[i]); } From 36e5865b4f84b2a322e2a5df8ec6fd88496fff30 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Mon, 5 Sep 2022 11:08:25 +0200 Subject: [PATCH 5/5] Fix doxygen-related CI failure There was a problem with defgroup syntax. --- cpp/bench/spatial/fused_l2_nn.cu | 8 ++++++-- cpp/include/raft/distance/detail/fused_l2_nn.cuh | 2 ++ cpp/include/raft/linalg/contractions.cuh | 4 ---- cpp/test/distance/fused_l2_nn.cu | 7 +++++++ 4 files changed, 15 insertions(+), 6 deletions(-) diff --git a/cpp/bench/spatial/fused_l2_nn.cu b/cpp/bench/spatial/fused_l2_nn.cu index 9e88f23e73..aa36483145 100644 --- a/cpp/bench/spatial/fused_l2_nn.cu +++ b/cpp/bench/spatial/fused_l2_nn.cu @@ -22,8 +22,12 @@ #include #include -// Note: do not include raft/spatial/knn/specializations.hpp based on -// RAFT_NN_COMPILED, as fusedL2NN is not specialized and not defined there. +// TODO: Once fusedL2NN is specialized in the raft_distance shared library, add +// back +// +// #if defined RAFT_NN_COMPILED +// #include +// #endif namespace raft::bench::spatial { diff --git a/cpp/include/raft/distance/detail/fused_l2_nn.cuh b/cpp/include/raft/distance/detail/fused_l2_nn.cuh index 6a51bdcf1a..308f8a096a 100644 --- a/cpp/include/raft/distance/detail/fused_l2_nn.cuh +++ b/cpp/include/raft/distance/detail/fused_l2_nn.cuh @@ -202,6 +202,8 @@ __global__ __launch_bounds__(P::Nthreads, 2) void fusedL2NNkernel(OutT* min, for (int i = 0; i < P::AccRowsPerTh; ++i) { #pragma unroll for (int j = P::AccThCols / 2; j > 0; j >>= 1) { + // Actually, the srcLane (lid +j) should be (lid +j) % P:AccThCols, + // but the shfl op applies the modulo internally. auto tmpkey = raft::shfl(val[i].key, lid + j, P::AccThCols); auto tmpvalue = raft::shfl(val[i].value, lid + j, P::AccThCols); KVPair tmp = {tmpkey, tmpvalue}; diff --git a/cpp/include/raft/linalg/contractions.cuh b/cpp/include/raft/linalg/contractions.cuh index 800632ada5..8aed0cb4be 100644 --- a/cpp/include/raft/linalg/contractions.cuh +++ b/cpp/include/raft/linalg/contractions.cuh @@ -168,13 +168,10 @@ struct Policy4x4 { /** @} */ /** - * @defgroup Policy4x4Skinny - * * A smaller k-block (8 instead of 32) with fewer threads per block (8x8 instead * of 16x16), which is faster for raft::distance::fusedL2NN on skinny matrices, * i.e., matrices with a small k dimension. 
* - * @{ */ template struct Policy4x4Skinny { @@ -191,7 +188,6 @@ struct Policy4x4Skinny { typedef KernelPolicy Policy; typedef ColKernelPolicy ColPolicy; }; -/** @} */ /** * @defgroup Policy2x8 16 elements per thread Policy with k-block = 16 diff --git a/cpp/test/distance/fused_l2_nn.cu b/cpp/test/distance/fused_l2_nn.cu index c01e8c74fc..2a5b30e01f 100644 --- a/cpp/test/distance/fused_l2_nn.cu +++ b/cpp/test/distance/fused_l2_nn.cu @@ -23,6 +23,13 @@ #include #include +// TODO: Once fusedL2NN is specialized in the raft_distance shared library, add +// the following: +// +// #if defined RAFT_NN_COMPILED +// #include +// #endif + namespace raft { namespace distance {
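
As a closing, self-contained sketch of the policy selection introduced in
patch 1: the code below only illustrates the "regular vs. skinny tiles"
dispatch idea; all names (launchFusedL2NN, fusedL2NNDispatch) and the exact
policy template parameters are placeholders, not the actual RAFT API.

  #include <cstddef>

  // Stand-ins for the RAFT policy types and kernel launcher.
  template <typename DataT, int VecLen> struct Policy4x4       { struct Policy {}; };
  template <typename DataT, int VecLen> struct Policy4x4Skinny { struct Policy {}; };

  template <typename Policy, typename DataT>
  void launchFusedL2NN(/* ... kernel arguments ... */) { /* launch kernel for Policy */ }

  template <typename DataT>
  void fusedL2NNDispatch(std::size_t k)
  {
    // Policy4x4 tiles assume a k-block of 32, so for k < 32 they perform
    // redundant work; choose the smaller "skinny" tiles instead.
    bool is_skinny = k < 32;

    // Pick the widest aligned vector load (16-byte, 8-byte, or element-wise),
    // then pick regular vs. skinny tiles.
    std::size_t bytes = sizeof(DataT) * k;
    if (16 % sizeof(DataT) == 0 && bytes % 16 == 0) {
      if (is_skinny) launchFusedL2NN<typename Policy4x4Skinny<DataT, 16 / sizeof(DataT)>::Policy, DataT>();
      else           launchFusedL2NN<typename Policy4x4<DataT, 16 / sizeof(DataT)>::Policy, DataT>();
    } else if (8 % sizeof(DataT) == 0 && bytes % 8 == 0) {
      if (is_skinny) launchFusedL2NN<typename Policy4x4Skinny<DataT, 8 / sizeof(DataT)>::Policy, DataT>();
      else           launchFusedL2NN<typename Policy4x4<DataT, 8 / sizeof(DataT)>::Policy, DataT>();
    } else {
      if (is_skinny) launchFusedL2NN<typename Policy4x4Skinny<DataT, 1>::Policy, DataT>();
      else           launchFusedL2NN<typename Policy4x4<DataT, 1>::Policy, DataT>();
    }
  }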