From 88f85f91e417443f937e236993645e2aa4b0e55d Mon Sep 17 00:00:00 2001
From: Mahesh Doijade
Date: Thu, 9 Dec 2021 16:15:11 +0000
Subject: [PATCH 1/2] Fix NaN issues introduced in L2 expanded sqrt distances
 caused by very small negative zeros produced by fusedL2Knn; re-enable fused
 L2 kNN

---
 .../knn/detail/knn_brute_force_faiss.cuh      | 123 +++++++++---
 1 file changed, 63 insertions(+), 60 deletions(-)

diff --git a/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh b/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh
index d154e5f92a..c20582df72 100644
--- a/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh
+++ b/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh
@@ -294,67 +294,66 @@ void brute_force_knn_impl(
     cudaStream_t stream = raft::select_stream(userStream, internalStreams, n_int_streams, i);
 
     //
     // TODO: Enable this once we figure out why it's causing pytest failures in cuml.
-    // if (k <= 64 && rowMajorQuery == rowMajorIndex && rowMajorQuery == true &&
-    //     (metric == raft::distance::DistanceType::L2Unexpanded ||
-    //      metric == raft::distance::DistanceType::L2SqrtUnexpanded //||
-    //      // metric == raft::distance::DistanceType::L2Expanded ||
-    //      // metric == raft::distance::DistanceType::L2SqrtExpanded)
-    //      )) {
-    //   fusedL2Knn(D,
-    //              out_i_ptr,
-    //              out_d_ptr,
-    //              input[i],
-    //              search_items,
-    //              sizes[i],
-    //              n,
-    //              k,
-    //              rowMajorIndex,
-    //              rowMajorQuery,
-    //              stream,
-    //              metric);
-    // } else {
-    switch (metric) {
-      case raft::distance::DistanceType::Haversine:
-
-        ASSERT(D == 2,
-               "Haversine distance requires 2 dimensions "
-               "(latitude / longitude).");
-
-        haversine_knn(out_i_ptr, out_d_ptr, input[i], search_items, sizes[i], n, k, stream);
-        break;
-      default:
-        faiss::MetricType m = build_faiss_metric(metric);
-
-        faiss::gpu::StandardGpuResources gpu_res;
-
-        gpu_res.noTempMemory();
-        gpu_res.setDefaultStream(device, stream);
-
-        faiss::gpu::GpuDistanceParams args;
-        args.metric = m;
-        args.metricArg = metricArg;
-        args.k = k;
-        args.dims = D;
-        args.vectors = input[i];
-        args.vectorsRowMajor = rowMajorIndex;
-        args.numVectors = sizes[i];
-        args.queries = search_items;
-        args.queriesRowMajor = rowMajorQuery;
-        args.numQueries = n;
-        args.outDistances = out_d_ptr;
-        args.outIndices = out_i_ptr;
-
-        /**
-         * @todo: Until FAISS supports pluggable allocation strategies,
-         * we will not reap the benefits of the pool allocator for
-         * avoiding device-wide synchronizations from cudaMalloc/cudaFree
-         */
-        bfKnn(&gpu_res, args);
+    if (k <= 64 && rowMajorQuery == rowMajorIndex && rowMajorQuery == true &&
+        (metric == raft::distance::DistanceType::L2Unexpanded ||
+         metric == raft::distance::DistanceType::L2SqrtUnexpanded ||
+         metric == raft::distance::DistanceType::L2Expanded ||
+         metric == raft::distance::DistanceType::L2SqrtExpanded)) {
+      fusedL2Knn(D,
+                 out_i_ptr,
+                 out_d_ptr,
+                 input[i],
+                 search_items,
+                 sizes[i],
+                 n,
+                 k,
+                 rowMajorIndex,
+                 rowMajorQuery,
+                 stream,
+                 metric);
+    } else {
+      switch (metric) {
+        case raft::distance::DistanceType::Haversine:
+
+          ASSERT(D == 2,
+                 "Haversine distance requires 2 dimensions "
+                 "(latitude / longitude).");
+
+          haversine_knn(out_i_ptr, out_d_ptr, input[i], search_items, sizes[i], n, k, stream);
+          break;
+        default:
+          faiss::MetricType m = build_faiss_metric(metric);
+
+          faiss::gpu::StandardGpuResources gpu_res;
+
+          gpu_res.noTempMemory();
+          gpu_res.setDefaultStream(device, stream);
+
+          faiss::gpu::GpuDistanceParams args;
+          args.metric = m;
+          args.metricArg = metricArg;
+          args.k = k;
+          args.dims = D;
args.vectors = input[i]; + args.vectorsRowMajor = rowMajorIndex; + args.numVectors = sizes[i]; + args.queries = search_items; + args.queriesRowMajor = rowMajorQuery; + args.numQueries = n; + args.outDistances = out_d_ptr; + args.outIndices = out_i_ptr; + + /** + * @todo: Until FAISS supports pluggable allocation strategies, + * we will not reap the benefits of the pool allocator for + * avoiding device-wide synchronizations from cudaMalloc/cudaFree + */ + bfKnn(&gpu_res, args); + } } - } - CUDA_CHECK(cudaPeekAtLastError()); - // } + CUDA_CHECK(cudaPeekAtLastError()); + } // Sync internal streams if used. We don't need to // sync the user stream because we'll already have @@ -379,7 +378,11 @@ void brute_force_knn_impl( float p = 0.5; // standard l2 if (metric == raft::distance::DistanceType::LpUnexpanded) p = 1.0 / metricArg; raft::linalg::unaryOp( - res_D, res_D, n * k, [p] __device__(float input) { return powf(input, p); }, userStream); + res_D, + res_D, + n * k, + [p] __device__(float input) { return powf(fabsf(input), p); }, + userStream); } query_metric_processor->revert(search_items); From fec6b806c9945b69295cb2ae370b0d07601087ff Mon Sep 17 00:00:00 2001 From: Mahesh Doijade Date: Fri, 17 Dec 2021 13:57:57 +0000 Subject: [PATCH 2/2] fix incorrect output in prod-cons code detected by UMAP test on GV100, the reason seems to be not using volatile and syncthreads, there is some reg spills perhaps due to cuda 11.4 this needs to be tracked further --- .../raft/spatial/knn/detail/fused_l2_knn.cuh | 89 +++++++++---------- 1 file changed, 41 insertions(+), 48 deletions(-) diff --git a/cpp/include/raft/spatial/knn/detail/fused_l2_knn.cuh b/cpp/include/raft/spatial/knn/detail/fused_l2_knn.cuh index 47fc62066d..92c1918638 100644 --- a/cpp/include/raft/spatial/knn/detail/fused_l2_knn.cuh +++ b/cpp/include/raft/spatial/knn/detail/fused_l2_knn.cuh @@ -90,8 +90,8 @@ DI void storeWarpQShmem(myWarpSelect& heapArr, template DI void storeWarpQGmem(myWarpSelect& heapArr, - OutT* out_dists, - IdxT* out_inds, + volatile OutT* out_dists, + volatile IdxT* out_inds, const IdxT m, const unsigned int numOfNN, const IdxT starty) @@ -115,8 +115,8 @@ DI void storeWarpQGmem(myWarpSelect& heapArr, template DI void loadPrevTopKsGmemWarpQ(myWarpSelect& heapArr, - OutT* out_dists, - IdxT* out_inds, + volatile OutT* out_dists, + volatile IdxT* out_inds, const IdxT m, const unsigned int numOfNN, const IdxT starty) @@ -207,9 +207,9 @@ __global__ __launch_bounds__(Policy::Nthreads, 2) void fusedL2kNN(const DataT* x FinalLambda fin_op, bool sqrt, unsigned int numOfNN, - int* mutexes, - OutT* out_dists, - IdxT* out_inds) + volatile int* mutexes, + volatile OutT* out_dists, + volatile IdxT* out_inds) { extern __shared__ char smem[]; @@ -225,8 +225,6 @@ __global__ __launch_bounds__(Policy::Nthreads, 2) void fusedL2kNN(const DataT* x IdxT gridStrideY) { if (gridDim.x == 1) { return; } - volatile int* mutex = mutexes; - Pair* shDumpKV = nullptr; if (useNorms) { shDumpKV = (Pair*)(&smem[Policy::SmemSize + ((Policy::Mblk + Policy::Nblk) * sizeof(DataT))]); @@ -240,7 +238,7 @@ __global__ __launch_bounds__(Policy::Nthreads, 2) void fusedL2kNN(const DataT* x // 0 -> consumer done consuming the buffer. 
// -1 -> consumer started consuming the buffer // -2 -> producer done filling the buffer - // blockIdx.x -> prod started to fill the buffer + // 1 -> prod acquired to fill the buffer if (blockIdx.x == 0) { auto cta_processed = 0; myWarpSelect heapArr1(identity, keyMax, numOfNN); @@ -252,45 +250,41 @@ __global__ __launch_bounds__(Policy::Nthreads, 2) void fusedL2kNN(const DataT* x while (cta_processed < gridDim.x - 1) { if (threadIdx.x == 0) { - int32_t old = -3; - while (old != -1) { - old = atomicCAS((int*)&mutex[gridStrideY / Policy::Mblk], -2, -1); - } - __threadfence(); + while (atomicCAS((int*)&mutexes[gridStrideY / Policy::Mblk], -2, -1) != -2) + ; } + __threadfence(); __syncthreads(); #pragma unroll for (int i = 0; i < Policy::AccRowsPerTh; ++i) { - const auto rowId = starty + i * Policy::AccThRows; - const auto shMemRowId = (threadIdx.x / Policy::AccThCols) + i * Policy::AccThRows; + const auto rowId = starty + i * Policy::AccThRows; + if (rowId < m) { #pragma unroll - for (int j = 0; j < heapArr[i]->kNumWarpQRegisters; ++j) { - Pair otherKV; - otherKV.value = identity; - otherKV.key = keyMax; - const auto idx = j * warpSize + lid; - if (idx < numOfNN && rowId < m) { - otherKV.value = out_dists[rowId * numOfNN + idx]; - otherKV.key = (uint32_t)out_inds[rowId * numOfNN + idx]; - shDumpKV[shMemRowId * numOfNN + idx] = otherKV; + for (int j = 0; j < heapArr[i]->kNumWarpQRegisters; ++j) { + Pair otherKV; + otherKV.value = identity; + otherKV.key = keyMax; + const auto idx = j * warpSize + lid; + if (idx < numOfNN) { + otherKV.value = out_dists[rowId * numOfNN + idx]; + otherKV.key = (uint32_t)out_inds[rowId * numOfNN + idx]; + const auto shMemRowId = (threadIdx.x / Policy::AccThCols) + i * Policy::AccThRows; + shDumpKV[shMemRowId * numOfNN + idx] = otherKV; + } } } } __threadfence(); + __syncthreads(); - if (threadIdx.x == 0) { - mutex[gridStrideY / Policy::Mblk] = 0; - __threadfence(); - } + if (threadIdx.x == 0) { atomicExch((int*)&mutexes[gridStrideY / Policy::Mblk], 0); } + __threadfence(); // Perform merging of otherKV with topk's across warp. 
- __syncwarp(); - #pragma unroll for (int i = 0; i < Policy::AccRowsPerTh; ++i) { - const auto rowId = starty + i * Policy::AccThRows; - const auto shMemRowId = (threadIdx.x / Policy::AccThCols) + i * Policy::AccThRows; + const auto rowId = starty + i * Policy::AccThRows; if (rowId < m) { #pragma unroll for (int j = 0; j < heapArr[i]->kNumWarpQRegisters; ++j) { @@ -298,7 +292,10 @@ __global__ __launch_bounds__(Policy::Nthreads, 2) void fusedL2kNN(const DataT* x otherKV.value = identity; otherKV.key = keyMax; const auto idx = j * warpSize + lid; - if (idx < numOfNN) { otherKV = shDumpKV[shMemRowId * numOfNN + idx]; } + if (idx < numOfNN) { + const auto shMemRowId = (threadIdx.x / Policy::AccThCols) + i * Policy::AccThRows; + otherKV = shDumpKV[shMemRowId * numOfNN + idx]; + } heapArr[i]->add(otherKV.value, otherKV.key); } } @@ -317,33 +314,29 @@ __global__ __launch_bounds__(Policy::Nthreads, 2) void fusedL2kNN(const DataT* x storeWarpQGmem(heapArr, out_dists, out_inds, m, numOfNN, starty); } else { if (threadIdx.x == 0) { - int32_t old = -1; - int32_t blkIdX = (int32_t)blockIdx.x; - while (old != blkIdX) { - old = atomicCAS((int*)&mutex[gridStrideY / Policy::Mblk], 0, blkIdX); - } - __threadfence(); + while (atomicCAS((int*)&mutexes[gridStrideY / Policy::Mblk], 0, 1) != 0) + ; } + __threadfence(); __syncthreads(); #pragma unroll for (int i = 0; i < Policy::AccRowsPerTh; ++i) { - const auto rowId = starty + i * Policy::AccThRows; - const auto shMemRowId = (threadIdx.x / Policy::AccThCols) + i * Policy::AccThRows; + const auto rowId = starty + i * Policy::AccThRows; if (rowId < m) { for (int idx = lid; idx < numOfNN; idx += warpSize) { - Pair KVPair = shDumpKV[shMemRowId * numOfNN + idx]; + const auto shMemRowId = (threadIdx.x / Policy::AccThCols) + i * Policy::AccThRows; + Pair KVPair = shDumpKV[shMemRowId * numOfNN + idx]; out_dists[rowId * numOfNN + idx] = KVPair.value; out_inds[rowId * numOfNN + idx] = (IdxT)KVPair.key; } } } __threadfence(); + __syncthreads(); - if (threadIdx.x == 0) { - mutex[gridStrideY / Policy::Mblk] = -2; - __threadfence(); - } + if (threadIdx.x == 0) { atomicExch((int*)&mutexes[gridStrideY / Policy::Mblk], -2); } + __threadfence(); } };
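
A note on the first patch (illustration only, not part of the patch series): the expanded form ||x - y||^2 = ||x||^2 + ||y||^2 - 2*x.y can round to a tiny negative number when the true distance is essentially zero, and powf()/sqrtf() of a negative argument returns NaN; that is why the unaryOp post-processing above now clamps with fabsf(). The standalone CUDA sketch below reproduces the failure and the fix; the kernel name, the file name in the comment, and the host driver are invented for the example and are not RAFT code.

// example_negative_sqrt.cu (hypothetical file name, illustration only)
#include <cstdio>

__global__ void sqrt_post_process(float* d, int n, bool clamp)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    float v = clamp ? fabsf(d[i]) : d[i];  // the patch adds the fabsf() clamp
    d[i]    = powf(v, 0.5f);               // sqrt step of the L2Sqrt* post-processing
  }
}

int main()
{
  float h = -1e-7f;  // tiny negative value produced by the expanded form
  float* d;
  cudaMalloc(&d, sizeof(float));

  cudaMemcpy(d, &h, sizeof(float), cudaMemcpyHostToDevice);
  sqrt_post_process<<<1, 1>>>(d, 1, /*clamp=*/false);
  cudaMemcpy(&h, d, sizeof(float), cudaMemcpyDeviceToHost);
  printf("without fabsf: %f\n", h);  // prints nan

  h = -1e-7f;
  cudaMemcpy(d, &h, sizeof(float), cudaMemcpyHostToDevice);
  sqrt_post_process<<<1, 1>>>(d, 1, /*clamp=*/true);
  cudaMemcpy(&h, d, sizeof(float), cudaMemcpyDeviceToHost);
  printf("with fabsf:    %f\n", h);  // prints a value close to 0

  cudaFree(d);
  return 0;
}

In the patch itself the clamp is applied inside the existing raft::linalg::unaryOp lambda, so no additional kernel launch is needed.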
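
A note on the second patch (again, only an illustration): block 0 along the x grid dimension merges the partial top-k results produced by the other blocks for the same row tile, and the hand-off goes through a per-row-tile flag with the states listed in the comments above (0 = buffer free/consumed, 1 = a producer acquired it, -2 = producer done filling, -1 = consumer reading), with volatile buffers and __threadfence()/__syncthreads() ordering the data accesses around the flag transitions. The toy kernel below sketches that handshake under the assumption that all launched blocks are co-resident, so the spin loops cannot deadlock; prod_cons, the single-int payload, and the launch shape are made up for the example and are not the RAFT kernel.

// example_prod_cons.cu (hypothetical file name, illustration only)
// Flag states: 0 = buffer free, 1 = producer filling, -2 = producer done, -1 = consumer reading.
#include <cstdio>

__global__ void prod_cons(volatile int* buf, int* flag, int* sum)
{
  if (blockIdx.x == 0) {
    // Consumer block: drain one hand-off per producer block.
    for (int done = 0; done < gridDim.x - 1; ++done) {
      if (threadIdx.x == 0) {
        while (atomicCAS(flag, -2, -1) != -2)  // wait until some producer finished filling
          ;
      }
      __threadfence();
      __syncthreads();  // in the real kernel the whole block reads the buffer after this point
      if (threadIdx.x == 0) {
        *sum += buf[0];       // consume through the volatile pointer
        __threadfence();      // make sure the read happened before the buffer is released
        atomicExch(flag, 0);  // mark the buffer free again
      }
      __syncthreads();
    }
  } else if (threadIdx.x == 0) {
    // Producer blocks: each publishes its blockIdx.x exactly once.
    while (atomicCAS(flag, 0, 1) != 0)  // acquire the empty buffer
      ;
    buf[0] = blockIdx.x;  // produce through the volatile pointer
    __threadfence();      // flush the data before signalling
    atomicExch(flag, -2); // hand the buffer over to the consumer
  }
}

int main()
{
  int *flag, *sum, *buf;
  cudaMalloc(&flag, sizeof(int));
  cudaMalloc(&sum, sizeof(int));
  cudaMalloc(&buf, sizeof(int));
  cudaMemset(flag, 0, sizeof(int));
  cudaMemset(sum, 0, sizeof(int));

  // Small launch so all blocks are co-resident and the spin loops make progress.
  prod_cons<<<8, 32>>>(buf, flag, sum);

  int h = 0;
  cudaMemcpy(&h, sum, sizeof(int), cudaMemcpyDeviceToHost);
  printf("sum of producer block ids = %d (expected 28)\n", h);

  cudaFree(buf);
  cudaFree(flag);
  cudaFree(sum);
  return 0;
}

The patch additionally replaces the plain volatile store of the flag with atomicExch() and moves the __threadfence() out of the threadIdx.x == 0 branch, which is the structure the toy kernel mirrors.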