Create cub-based argmin primitive and replace argmin_along_rows in ANN kmeans #912

Merged: 6 commits merged on Nov 9, 2022

Changes from 3 commits
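This PR adds a reusable cub-based argmin primitive to raft::matrix and switches the balanced k-means code in the ANN utilities over to it, removing the bespoke argmin_along_rows helper. A minimal usage sketch of the new primitive (buffer names and sizes are illustrative, not from the PR):

// Treats `in` as a column-major D x N matrix and writes the row index of
// each column's minimum into `out` (one entry per column).
// rmm::device_uvector<float> in(D * N, stream);
// rmm::device_uvector<uint32_t> out(N, stream);
raft::matrix::argmin(in.data(), D, N, out.data(), stream);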
2 changes: 1 addition & 1 deletion build.sh
@@ -73,7 +73,7 @@ COMPILE_DIST_LIBRARY=OFF
ENABLE_NN_DEPENDENCIES=OFF

TEST_TARGETS="CLUSTER_TEST;CORE_TEST;DISTANCE_TEST;LABEL_TEST;LINALG_TEST;MATRIX_TEST;RANDOM_TEST;SOLVERS_TEST;SPARSE_TEST;SPARSE_DIST_TEST;SPARSE_NN_TEST;SPATIAL_TEST;STATS_TEST;UTILS_TEST"
BENCH_TARGETS="CLUSTER_BENCH;SPATIAL_BENCH;DISTANCE_BENCH;LINALG_BENCH;SPARSE_BENCH;RANDOM_BENCH"
BENCH_TARGETS="CLUSTER_BENCH;SPATIAL_BENCH;DISTANCE_BENCH;LINALG_BENCH;MATRIX_BENCH;SPARSE_BENCH;RANDOM_BENCH"
ENABLE_thrust_DEPENDENCY=ON

CACHE_ARGS=""
6 changes: 6 additions & 0 deletions cpp/bench/CMakeLists.txt
@@ -100,6 +100,12 @@ if(BUILD_BENCH)
bench/main.cpp
)

ConfigureBench(NAME MATRIX_BENCH
PATH
bench/matrix/argmin.cu
bench/main.cpp
)

ConfigureBench(NAME RANDOM_BENCH
PATH
bench/random/make_blobs.cu
59 changes: 59 additions & 0 deletions cpp/bench/matrix/argmin.cu
@@ -0,0 +1,59 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <common/benchmark.hpp>
#include <raft/matrix/math.cuh>

#include <rmm/device_uvector.hpp>

namespace raft::bench::linalg {

struct ArgminParams {
int64_t rows, cols;
};

template <typename T, typename OutT>
struct Argmin : public fixture {
Argmin(const ArgminParams& p)
: params(p), matrix(p.rows * p.cols, stream), indices(p.cols, stream)
{
}

void run_benchmark(::benchmark::State& state) override
{
loop_on_state(state, [this]() {
raft::matrix::argmin(matrix.data(), params.rows, params.cols, indices.data(), stream);
});
}

private:
ArgminParams params;
rmm::device_uvector<T> matrix;
rmm::device_uvector<OutT> indices;
}; // struct Argmin

const std::vector<ArgminParams> kInputSizes{
{64, 1000}, {128, 1000}, {256, 1000}, {512, 1000}, {1024, 1000},
{64, 10000}, {128, 10000}, {256, 10000}, {512, 10000}, {1024, 10000},
{64, 100000}, {128, 100000}, {256, 100000}, {512, 100000}, {1024, 100000},
{64, 1000000}, {128, 1000000}, {256, 1000000}, {512, 1000000}, {1024, 1000000},
{64, 10000000}, {128, 10000000}, {256, 10000000}, {512, 10000000}, {1024, 10000000},
};

RAFT_BENCH_REGISTER((Argmin<float, uint32_t>), "", kInputSizes);
RAFT_BENCH_REGISTER((Argmin<double, uint32_t>), "", kInputSizes);

} // namespace raft::bench::linalg
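The fixture is registered once per (input, output) type pair. Covering another output width would be one more registration line, e.g. (hypothetical, not part of this PR):

RAFT_BENCH_REGISTER((Argmin<float, int64_t>), "", kInputSizes);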
55 changes: 34 additions & 21 deletions cpp/include/raft/matrix/detail/math.cuh
@@ -362,45 +362,58 @@ void matrixVectorBinarySub(Type* data,
stream);
}

// Computes the argmax(d_in) column-wise in a DxN matrix
template <typename T, typename IdxT, int TPB>
__global__ void argmaxKernel(const T* d_in, int D, int N, IdxT* argmax)
// Computes an argmin/argmax column-wise in a DxN matrix
template <typename RedOp, int TPB, typename T, typename OutT, typename IdxT>
__global__ void argReduceKernel(const T* d_in, IdxT D, IdxT N, OutT* out)
{
typedef cub::BlockReduce<cub::KeyValuePair<int, T>, TPB> BlockReduce;
typedef cub::
BlockReduce<cub::KeyValuePair<IdxT, T>, TPB, cub::BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY>
BlockReduce;
__shared__ typename BlockReduce::TempStorage temp_storage;

// compute maxIndex=argMax index for column
using KVP = cub::KeyValuePair<int, T>;
int rowStart = blockIdx.x * D;
KVP thread_data(-1, -raft::myInf<T>());
using KVP = cub::KeyValuePair<IdxT, T>;
IdxT rowStart = static_cast<IdxT>(blockIdx.x) * D;
KVP thread_data(0, std::is_same_v<RedOp, cub::ArgMax> ? -raft::myInf<T>() : raft::myInf<T>());

for (int i = threadIdx.x; i < D; i += TPB) {
int idx = rowStart + i;
thread_data = cub::ArgMax()(thread_data, KVP(i, d_in[idx]));
for (IdxT i = threadIdx.x; i < D; i += TPB) {
IdxT idx = rowStart + i;
thread_data = RedOp()(thread_data, KVP(i, d_in[idx]));
}

auto maxKV = BlockReduce(temp_storage).Reduce(thread_data, cub::ArgMax());
auto maxKV = BlockReduce(temp_storage).Reduce(thread_data, RedOp());

if (threadIdx.x == 0) { argmax[blockIdx.x] = maxKV.key; }
if (threadIdx.x == 0) { out[blockIdx.x] = maxKV.key; }
}

template <typename math_t, typename idx_t>
void argmax(const math_t* in, int n_rows, int n_cols, idx_t* out, cudaStream_t stream)
template <typename RedOp, typename math_t, typename out_t, typename idx_t>
inline void argReduce(const math_t* in, idx_t n_rows, idx_t n_cols, out_t* out, cudaStream_t stream)
{
int D = n_rows;
int N = n_cols;
idx_t D = n_rows;
idx_t N = n_cols;
if (D <= 32) {
argmaxKernel<math_t, idx_t, 32><<<N, 32, 0, stream>>>(in, D, N, out);
argReduceKernel<RedOp, 32><<<N, 32, 0, stream>>>(in, D, N, out);
} else if (D <= 64) {
argmaxKernel<math_t, idx_t, 64><<<N, 64, 0, stream>>>(in, D, N, out);
argReduceKernel<RedOp, 64><<<N, 64, 0, stream>>>(in, D, N, out);
} else if (D <= 128) {
argmaxKernel<math_t, idx_t, 128><<<N, 128, 0, stream>>>(in, D, N, out);
argReduceKernel<RedOp, 128><<<N, 128, 0, stream>>>(in, D, N, out);
} else {
argmaxKernel<math_t, idx_t, 256><<<N, 256, 0, stream>>>(in, D, N, out);
argReduceKernel<RedOp, 256><<<N, 256, 0, stream>>>(in, D, N, out);
}
RAFT_CUDA_TRY(cudaPeekAtLastError());
}

template <typename math_t, typename out_t, typename idx_t>
void argmin(const math_t* in, idx_t n_rows, idx_t n_cols, out_t* out, cudaStream_t stream)
{
argReduce<cub::ArgMin>(in, n_rows, n_cols, out, stream);
}

template <typename math_t, typename out_t, typename idx_t>
void argmax(const math_t* in, idx_t n_rows, idx_t n_cols, out_t* out, cudaStream_t stream)
{
argReduce<cub::ArgMax>(in, n_rows, n_cols, out, stream);
}
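The kernel above leans on cub's ArgMin/ArgMax functors over KeyValuePair: each combine keeps the pair with the better value and, on equal values, the smaller key, so seeding thread_data with key 0 and the identity value (plus or minus infinity) can never produce an out-of-range index. A host-side illustration of that contract (my sketch, assuming cub's thread operators are host-callable, as they are marked in recent CUB releases):

cub::KeyValuePair<int, float> a(3, 1.5f);
cub::KeyValuePair<int, float> b(7, 1.5f);
// Equal values: ArgMin breaks the tie toward the smaller key.
cub::KeyValuePair<int, float> winner = cub::ArgMin()(a, b);  // winner.key == 3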

// Utility kernel needed for signFlip.
// Computes the argmax(abs(d_in)) column-wise in a DxN matrix followed by
// flipping the sign if the |max| value for each column is negative.
18 changes: 16 additions & 2 deletions cpp/include/raft/matrix/math.cuh
@@ -302,6 +302,20 @@ void ratio(

/** @} */

/**
* @brief Argmin: find the row idx with minimum value for each column
* @param in: input matrix
* @param n_rows: number of rows of input matrix
* @param n_cols: number of columns of input matrix
* @param out: output vector of size n_cols
* @param stream: cuda stream
*/
template <typename math_t, typename out_t, typename idx_t = int>
void argmin(const math_t* in, idx_t n_rows, idx_t n_cols, out_t* out, cudaStream_t stream)
{
detail::argmin(in, n_rows, n_cols, out, stream);
}

/**
* @brief Argmax: find the row idx with maximum value for each column
* @param in: input matrix
@@ -310,8 +324,8 @@ void ratio(
* @param out: output vector of size n_cols
* @param stream: cuda stream
*/
template <typename math_t, typename idx_t>
void argmax(const math_t* in, int n_rows, int n_cols, idx_t* out, cudaStream_t stream)
template <typename math_t, typename out_t, typename idx_t = int>
void argmax(const math_t* in, idx_t n_rows, idx_t n_cols, out_t* out, cudaStream_t stream)
{
detail::argmax(in, n_rows, n_cols, out, stream);
}
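With out_t decoupled from idx_t, the output element type no longer has to match the type of the extents, and idx_t defaults to int for backward compatibility. A short usage sketch (variable names are illustrative, not from the PR):

// Row index of each column's maximum; out_t = uint32_t, idx_t deduced as int.
rmm::device_uvector<float> scores(n_rows * n_cols, stream);
rmm::device_uvector<uint32_t> best_row(n_cols, stream);
raft::matrix::argmax(scores.data(), n_rows, n_cols, best_row.data(), stream);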
4 changes: 2 additions & 2 deletions cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh
@@ -30,6 +30,7 @@
#include <raft/linalg/gemm.cuh>
#include <raft/linalg/norm.cuh>
#include <raft/linalg/unary_op.cuh>
#include <raft/matrix/math.cuh>
#include <raft/matrix/matrix.cuh>
#include <raft/util/cuda_utils.cuh>

@@ -144,8 +145,7 @@ inline void predict_float_core(const handle_t& handle,
distances.data(),
n_clusters,
stream);
utils::argmin_along_rows(
n_rows, static_cast<IdxT>(n_clusters), distances.data(), labels, stream);
raft::matrix::argmin(distances.data(), (IdxT)n_clusters, n_rows, labels, stream);
break;
}
default: {
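Note the argument order at this call site: distances is row-major [n_rows, n_clusters], so one sample's distances are contiguous, and the primitive reduces over D consecutive elements per output. Passing D = n_clusters and N = n_rows therefore yields one label per sample; the equivalent sequential reduction (a sketch of my reading of the kernel):

// for (IdxT i = 0; i < n_rows; ++i)
//   labels[i] = argmin over j < n_clusters of distances[i * n_clusters + j];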
60 changes: 0 additions & 60 deletions cpp/include/raft/spatial/knn/detail/ann_utils.cuh
@@ -151,66 +151,6 @@ inline void memzero(T* ptr, IdxT n_elems, rmm::cuda_stream_view stream)
}
}

template <typename IdxT, typename OutT>
__global__ void argmin_along_rows_kernel(IdxT n_rows, uint32_t n_cols, const float* a, OutT* out)
{
__shared__ OutT shm_ids[1024]; // NOLINT
__shared__ float shm_vals[1024]; // NOLINT
IdxT i = blockIdx.x;
if (i >= n_rows) return;
OutT min_idx = n_cols;
float min_val = raft::upper_bound<float>();
for (OutT j = threadIdx.x; j < n_cols; j += blockDim.x) {
if (min_val > a[j + n_cols * i]) {
min_val = a[j + n_cols * i];
min_idx = j;
}
}
shm_vals[threadIdx.x] = min_val;
shm_ids[threadIdx.x] = min_idx;
__syncthreads();
for (IdxT offset = blockDim.x / 2; offset > 0; offset >>= 1) {
if (threadIdx.x < offset) {
if (shm_vals[threadIdx.x] < shm_vals[threadIdx.x + offset]) {
} else if (shm_vals[threadIdx.x] > shm_vals[threadIdx.x + offset]) {
shm_vals[threadIdx.x] = shm_vals[threadIdx.x + offset];
shm_ids[threadIdx.x] = shm_ids[threadIdx.x + offset];
} else if (shm_ids[threadIdx.x] > shm_ids[threadIdx.x + offset]) {
shm_ids[threadIdx.x] = shm_ids[threadIdx.x + offset];
}
}
__syncthreads();
}
if (threadIdx.x == 0) { out[i] = shm_ids[0]; }
}

/**
* @brief Find index of the smallest element in each row.
*
* NB: device-only function
* TODO: specialize select_k for the case of `k == 1` and use that one instead.
*
* @tparam IdxT index type
* @tparam OutT output type
*
* @param n_rows
* @param n_cols
* @param[in] a device pointer to the row-major matrix [n_rows, n_cols]
* @param[out] out device pointer to the vector of selected indices [n_rows]
* @param stream
*/
template <typename IdxT, typename OutT>
inline void argmin_along_rows(
IdxT n_rows, IdxT n_cols, const float* a, OutT* out, rmm::cuda_stream_view stream)
{
IdxT block_dim = 1024;
while (block_dim > n_cols) {
block_dim /= 2;
}
block_dim = max(block_dim, (IdxT)128);
argmin_along_rows_kernel<IdxT, OutT><<<n_rows, block_dim, 0, stream>>>(n_rows, n_cols, a, out);
}
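The removed helper and the new primitive agree on semantics, including the tie-break toward the smaller index: the shared-memory reduction above keeps the lower id on equal values, and cub::ArgMin does the same. A host-side reference of that contract (a sketch for verification, not part of the PR):

template <typename OutT>
void argmin_along_rows_ref(int n_rows, int n_cols, const float* a, OutT* out)
{
  for (int i = 0; i < n_rows; ++i) {
    OutT best = 0;  // strict < keeps the first (smallest) index on ties
    for (int j = 1; j < n_cols; ++j) {
      if (a[size_t(i) * n_cols + j] < a[size_t(i) * n_cols + best]) best = j;
    }
    out[i] = best;
  }
}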

template <typename IdxT>
__global__ void dots_along_rows_kernel(IdxT n_rows, IdxT n_cols, const float* a, float* out)
{