From 7f11225c061482ec5a2f0afb1e3f60bdeaaab6c8 Mon Sep 17 00:00:00 2001 From: Louis Sugy Date: Fri, 7 Oct 2022 18:35:46 +0200 Subject: [PATCH 01/25] Integrate accumulate_into_selected into raft prims --- cpp/bench/CMakeLists.txt | 1 + cpp/bench/common/benchmark.hpp | 3 +- cpp/bench/linalg/reduce_rows_by_key.cu | 88 ++++++ .../raft/linalg/detail/reduce_rows_by_key.cuh | 280 ++++++++---------- .../raft/linalg/reduce_rows_by_key.cuh | 65 ++-- .../knn/detail/ann_kmeans_balanced.cuh | 103 ++++++- .../raft/spatial/knn/detail/ann_utils.cuh | 6 +- .../spatial/knn/detail/ivf_flat_build.cuh | 3 +- cpp/test/linalg/reduce_rows_by_key.cu | 2 +- 9 files changed, 349 insertions(+), 202 deletions(-) create mode 100644 cpp/bench/linalg/reduce_rows_by_key.cu diff --git a/cpp/bench/CMakeLists.txt b/cpp/bench/CMakeLists.txt index 51170e4265..7551bff830 100644 --- a/cpp/bench/CMakeLists.txt +++ b/cpp/bench/CMakeLists.txt @@ -96,6 +96,7 @@ if(BUILD_BENCH) bench/linalg/add.cu bench/linalg/map_then_reduce.cu bench/linalg/matrix_vector_op.cu + bench/linalg/reduce_rows_by_key.cu bench/linalg/reduce.cu bench/main.cpp ) diff --git a/cpp/bench/common/benchmark.hpp b/cpp/bench/common/benchmark.hpp index adfe5218e2..13ca40a033 100644 --- a/cpp/bench/common/benchmark.hpp +++ b/cpp/bench/common/benchmark.hpp @@ -53,8 +53,9 @@ struct using_pool_memory_res { rmm::mr::set_current_device_resource(&pool_res_); } - using_pool_memory_res() : using_pool_memory_res(size_t(1) << size_t(30), size_t(16) << size_t(30)) + using_pool_memory_res() : orig_res_(rmm::mr::get_current_device_resource()), pool_res_(&cuda_res_) { + rmm::mr::set_current_device_resource(&pool_res_); } ~using_pool_memory_res() { rmm::mr::set_current_device_resource(orig_res_); } diff --git a/cpp/bench/linalg/reduce_rows_by_key.cu b/cpp/bench/linalg/reduce_rows_by_key.cu new file mode 100644 index 0000000000..075bc7c8c4 --- /dev/null +++ b/cpp/bench/linalg/reduce_rows_by_key.cu @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include <common/benchmark.hpp>
+#include <raft/linalg/reduce_rows_by_key.cuh>
+#include <raft/random/rng.cuh>
+
+#include <rmm/device_uvector.hpp>
+
+namespace raft::bench::linalg {
+
+struct rrbk_params {
+  int64_t rows, cols;
+  int64_t keys;
+};
+
+template <typename T, typename KeyT>
+struct reduce_rows_by_key : public fixture {
+  reduce_rows_by_key(const rrbk_params& p)
+    : params(p),
+      in(p.rows * p.cols, stream),
+      out(p.keys * p.cols, stream),
+      keys(p.rows, stream),
+      workspace(p.rows, stream)
+  {
+    raft::random::RngState rng{42};
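+    // Keys are drawn uniformly from [0, keys), so every output row accumulates
+    // roughly rows/keys input rows on average.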
+    raft::random::uniformInt(rng, keys.data(), p.rows, (KeyT)0, (KeyT)p.keys, stream);
+  }
+
+  void run_benchmark(::benchmark::State& state) override
+  {
+    loop_on_state(state, [this]() {
+      raft::linalg::reduce_rows_by_key(in.data(),
+                                       params.cols,
+                                       keys.data(),
+                                       workspace.data(),
+                                       params.rows,
+                                       params.cols,
+                                       params.keys,
+                                       out.data(),
+                                       stream,
+                                       false);
+    });
+  }
+
+ protected:
+  rrbk_params params;
+  rmm::device_uvector<T> in, out;
+  rmm::device_uvector<KeyT> keys;
+  rmm::device_uvector<char> workspace;
+};  // struct reduce_rows_by_key
+
+const std::vector<rrbk_params> kInputSizes{
+  {10000, 128, 64},
+  {100000, 128, 64},
+  {1000000, 128, 64},
+  {10000000, 128, 64},
+  {10000, 128, 256},
+  {100000, 128, 256},
+  {1000000, 128, 256},
+  {10000000, 128, 256},
+  {10000, 128, 1024},
+  {100000, 128, 1024},
+  {1000000, 128, 1024},
+  {10000000, 128, 1024},
+  {10000, 128, 4096},
+  {100000, 128, 4096},
+  {1000000, 128, 4096},
+  {10000000, 128, 4096},
+};
+
+RAFT_BENCH_REGISTER((reduce_rows_by_key<float, uint32_t>), "", kInputSizes);
+RAFT_BENCH_REGISTER((reduce_rows_by_key<double, uint32_t>), "", kInputSizes);
+
+}  // namespace raft::bench::linalg
diff --git a/cpp/include/raft/linalg/detail/reduce_rows_by_key.cuh b/cpp/include/raft/linalg/detail/reduce_rows_by_key.cuh
index 9ddcbae20b..56c1142aff 100644
--- a/cpp/include/raft/linalg/detail/reduce_rows_by_key.cuh
+++ b/cpp/include/raft/linalg/detail/reduce_rows_by_key.cuh
@@ -92,48 +92,48 @@ struct quadSum {
 #define SUM_ROWS_SMALL_K_DIMX 256
 #define SUM_ROWS_BY_KEY_SMALL_K_MAX_K 4
 
-template <typename DataIteratorT, typename WeightT>
+template <typename DataIteratorT, typename WeightT, typename SumsT, typename IdxT>
 __launch_bounds__(SUM_ROWS_SMALL_K_DIMX, 4)
-  __global__ void sum_rows_by_key_small_nkeys_kernel(const DataIteratorT* d_A,
-                                                     int lda,
+  __global__ void sum_rows_by_key_small_nkeys_kernel(const DataIteratorT d_A,
+                                                     IdxT lda,
                                                      const char* d_keys,
                                                      const WeightT* d_weights,
-                                                     int nrows,
-                                                     int ncols,
-                                                     int nkeys,
-                                                     DataIteratorT* d_sums)
+                                                     IdxT nrows,
+                                                     IdxT ncols,
+                                                     IdxT nkeys,
+                                                     SumsT* d_sums)
 {
-  typedef typename std::iterator_traits<DataIteratorT*>::value_type DataType;
-  typedef cub::BlockReduce<quad<DataType>, SUM_ROWS_SMALL_K_DIMX> BlockReduce;
+  typedef typename std::iterator_traits<DataIteratorT>::value_type DataType;
+  typedef cub::BlockReduce<quad<SumsT>, SUM_ROWS_SMALL_K_DIMX> BlockReduce;
   __shared__ typename BlockReduce::TempStorage temp_storage;
 
-  for (int idim = static_cast<int>(blockIdx.y); idim < ncols; idim += gridDim.y) {
-    if (idim != static_cast<int>(blockIdx.y)) __syncthreads();  // we're reusing temp_storage
+  for (IdxT idim = static_cast<IdxT>(blockIdx.y); idim < ncols; idim += gridDim.y) {
+    if (idim != static_cast<IdxT>(blockIdx.y)) __syncthreads();  // we're reusing temp_storage
 
     // threadIdx.x stores partial sum for current dim and key=threadIdx.x in this reg
-    quad<DataType> thread_sums;
+    quad<SumsT> thread_sums;
     thread_sums.x = 0.0;
     thread_sums.y = 0.0;
     thread_sums.z = 0.0;
     thread_sums.w = 0.0;
 
     // May use vectorized load - not necessary for doubles
-    for (int block_offset_irow = blockIdx.x * blockDim.x;
+    for (IdxT block_offset_irow = blockIdx.x * blockDim.x;
          block_offset_irow < nrows;  // we will syncthreads() inside the loop, no CTA divergence
          block_offset_irow += blockDim.x * gridDim.x) {
-      int irow     = block_offset_irow + threadIdx.x;
+      IdxT irow    = block_offset_irow + threadIdx.x;
       DataType val = (irow < nrows) ? d_A[irow * lda + idim] : 0.0;
       if (d_weights && irow < nrows) { val = val * d_weights[irow]; }
       // we are not reusing the keys - after profiling
       // d_keys is mainly loaded from L2, and this kernel is DRAM BW bounded
       // (experimentation gave a 10% speed up - not worth the many code lines added)
-      int row_key = (irow < nrows) ? d_keys[irow] : -1;
+      IdxT row_key = (irow < nrows) ? d_keys[irow] : -1;
 
-      thread_sums.x += (row_key == 0) ? val : 0.0;
-      thread_sums.y += (row_key == 1) ? val : 0.0;
-      thread_sums.z += (row_key == 2) ? val : 0.0;
-      thread_sums.w += (row_key == 3) ? val : 0.0;
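+      // Accumulate into the one quad slot matching this row's key (at most
+      // SUM_ROWS_BY_KEY_SMALL_K_MAX_K = 4 keys on this path); slots whose key does
+      // not match, and padding rows with row_key == -1, contribute 0.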
+      thread_sums.x += (row_key == 0) ? static_cast<SumsT>(val) : 0.0;
+      thread_sums.y += (row_key == 1) ? static_cast<SumsT>(val) : 0.0;
+      thread_sums.z += (row_key == 2) ? static_cast<SumsT>(val) : 0.0;
+      thread_sums.w += (row_key == 3) ? static_cast<SumsT>(val) : 0.0;
     }
 
     // End of column
@@ -142,12 +142,12 @@ __launch_bounds__(SUM_ROWS_SMALL_K_DIMX, 4)
     // Strided access
 
     // Reducing by key
-    thread_sums = BlockReduce(temp_storage).Reduce(thread_sums, quadSum<DataType>());
+    thread_sums = BlockReduce(temp_storage).Reduce(thread_sums, quadSum<SumsT>());
 
     if (threadIdx.x < 32) {
       // We only need 4
       thread_sums = cub::ShuffleIndex<32>(thread_sums, 0, 0xffffffff);
-      if (static_cast<int>(threadIdx.x) < nkeys) {
+      if (static_cast<IdxT>(threadIdx.x) < nkeys) {
         if (threadIdx.x == 0) raft::myAtomicAdd(&d_sums[threadIdx.x * ncols + idim], thread_sums.x);
         if (threadIdx.x == 1) raft::myAtomicAdd(&d_sums[threadIdx.x * ncols + idim], thread_sums.y);
         if (threadIdx.x == 2) raft::myAtomicAdd(&d_sums[threadIdx.x * ncols + idim], thread_sums.z);
@@ -157,22 +157,22 @@ __launch_bounds__(SUM_ROWS_SMALL_K_DIMX, 4)
   }
 }
 
-template <typename DataIteratorT, typename WeightT>
-void sum_rows_by_key_small_nkeys(const DataIteratorT* d_A,
-                                 int lda,
+template <typename DataIteratorT, typename WeightT, typename SumsT, typename IdxT>
+void sum_rows_by_key_small_nkeys(const DataIteratorT d_A,
+                                 IdxT lda,
                                  const char* d_keys,
                                  const WeightT* d_weights,
-                                 int nrows,
-                                 int ncols,
-                                 int nkeys,
-                                 DataIteratorT* d_sums,
+                                 IdxT nrows,
+                                 IdxT ncols,
+                                 IdxT nkeys,
+                                 SumsT* d_sums,
                                  cudaStream_t st)
 {
   dim3 grid, block;
   block.x = SUM_ROWS_SMALL_K_DIMX;
   block.y = 1;  // Necessary
 
-  grid.x = raft::ceildiv(nrows, (int)block.x);
+  grid.x = raft::ceildiv(nrows, (IdxT)block.x);
   grid.x = std::min(grid.x, 32u);
   grid.y = ncols;
   grid.y = std::min(grid.y, MAX_BLOCKS);
@@ -188,45 +188,49 @@ void sum_rows_by_key_small_nkeys(const DataIteratorT* d_A,
 
 #define SUM_ROWS_BY_KEY_LARGE_K_MAX_K 1024
 
-template <typename DataIteratorT, typename KeysIteratorT, typename WeightT>
-__global__ void sum_rows_by_key_large_nkeys_kernel_colmajor(const DataIteratorT* d_A,
-                                                            int lda,
+template <typename DataIteratorT,
+          typename KeysIteratorT,
+          typename WeightT,
+          typename SumsT,
+          typename IdxT>
+__global__ void sum_rows_by_key_large_nkeys_kernel_colmajor(const DataIteratorT d_A,
+                                                            IdxT lda,
                                                             KeysIteratorT d_keys,
                                                             const WeightT* d_weights,
-                                                            int nrows,
-                                                            int ncols,
+                                                            IdxT nrows,
+                                                            IdxT ncols,
                                                             int key_offset,
-                                                            int nkeys,
-                                                            DataIteratorT* d_sums)
+                                                            IdxT nkeys,
+                                                            SumsT* d_sums)
 {
   typedef typename std::iterator_traits<KeysIteratorT>::value_type KeyType;
-  typedef typename std::iterator_traits<DataIteratorT*>::value_type DataType;
-  __shared__ DataType local_sums[SUM_ROWS_BY_KEY_LARGE_K_MAX_K];
+  typedef typename std::iterator_traits<DataIteratorT>::value_type DataType;
+  __shared__ SumsT local_sums[SUM_ROWS_BY_KEY_LARGE_K_MAX_K];
 
-  for (int local_key = threadIdx.x; local_key < nkeys; local_key += blockDim.x)
+  for (IdxT local_key = threadIdx.x; local_key < nkeys; local_key += blockDim.x)
     local_sums[local_key] = 0.0;
 
-  for (int idim = blockIdx.y; idim < ncols; idim += gridDim.y) {
+  for (IdxT idim = blockIdx.y; idim < ncols; idim += gridDim.y) {
     __syncthreads();  // local_sums
 
     // At this point local_sums is full of zeros
 
-    for (int irow = blockIdx.x * blockDim.x + threadIdx.x; irow < nrows;
+    for (IdxT irow = blockIdx.x * blockDim.x + threadIdx.x; irow < nrows;
          irow += blockDim.x * gridDim.x) {
       // Branch div in this loop - not an issue with current code
       DataType val = d_A[idim * lda + irow];
       if (d_weights) val = val * d_weights[irow];
 
-      int local_key = d_keys[irow] - key_offset;
+      IdxT local_key = d_keys[irow] - key_offset;
 
       // We could load next val here
-      raft::myAtomicAdd(&local_sums[local_key], val);
+      raft::myAtomicAdd(&local_sums[local_key], static_cast<SumsT>(val));
     }
 
     __syncthreads();  // local_sums
 
-    for (int local_key = threadIdx.x; local_key < nkeys; local_key += blockDim.x) {
-      DataType local_sum = local_sums[local_key];
+    for (IdxT local_key = threadIdx.x; local_key < nkeys; local_key += blockDim.x) {
+      SumsT local_sum = local_sums[local_key];
 
       if (local_sum != 0.0) {
         KeyType global_key = key_offset + local_key;
@@ -237,22 +241,22 @@ __global__ void sum_rows_by_key_large_nkeys_kernel_colmajor(const DataIteratorT*
     }
   }
 }
 
-template <typename DataIteratorT, typename KeysIteratorT>
-void sum_rows_by_key_large_nkeys_colmajor(const DataIteratorT* d_A,
-                                          int lda,
+template <typename DataIteratorT, typename KeysIteratorT, typename SumsT, typename IdxT>
+void sum_rows_by_key_large_nkeys_colmajor(const DataIteratorT d_A,
+                                          IdxT lda,
                                           KeysIteratorT d_keys,
-                                          int nrows,
-                                          int ncols,
+                                          IdxT nrows,
+                                          IdxT ncols,
                                           int key_offset,
-                                          int nkeys,
-                                          DataIteratorT* d_sums,
+                                          IdxT nkeys,
+                                          SumsT* d_sums,
                                           cudaStream_t st)
 {
   dim3 grid, block;
   block.x = SUM_ROWS_SMALL_K_DIMX;
   block.y = 1;  // Necessary
 
-  grid.x = raft::ceildiv(nrows, (int)block.x);
+  grid.x = raft::ceildiv(nrows, (IdxT)block.x);
   grid.x = std::min(grid.x, 32u);
   grid.y = ncols;
   grid.y = std::min(grid.y, MAX_BLOCKS);
@@ -260,91 +264,47 @@ void sum_rows_by_key_large_nkeys_colmajor(const DataIteratorT* d_A,
     d_A, lda, d_keys, nrows, ncols, key_offset, nkeys, d_sums);
 }
 
-#define RRBK_SHMEM_SZ 32
-
-//#define RRBK_SHMEM
-template <typename DataIteratorT, typename KeysIteratorT, typename WeightT>
-__global__ void sum_rows_by_key_large_nkeys_kernel_rowmajor(const DataIteratorT* d_A,
-                                                            int lda,
+template <typename DataIteratorT,
+          typename KeysIteratorT,
+          typename WeightT,
+          typename SumsT,
+          typename IdxT>
+__global__ void sum_rows_by_key_large_nkeys_kernel_rowmajor(const DataIteratorT d_A,
+                                                            IdxT lda,
                                                             const WeightT* d_weights,
                                                             KeysIteratorT d_keys,
-                                                            int nrows,
-                                                            int ncols,
-                                                            int key_offset,
-                                                            int nkeys,
-                                                            DataIteratorT* d_sums)
+                                                            IdxT nrows,
+                                                            IdxT ncols,
+                                                            SumsT* d_sums)
 {
-  typedef typename std::iterator_traits<KeysIteratorT>::value_type KeyType;
-  typedef typename std::iterator_traits<DataIteratorT*>::value_type DataType;
-
-#ifdef RRBK_SHMEM
-  __shared__ KeyType sh_keys[RRBK_SHMEM_SZ];
-#endif
-  int rows_per_partition = nrows / gridDim.z + 1;
-  int start_row          = blockIdx.z * rows_per_partition;
-  int end_row            = start_row + rows_per_partition;
-  end_row                = end_row > nrows ? nrows : end_row;
-
-  KeyType local_key = blockIdx.y;
-  if (local_key >= nkeys) return;
-  int this_col = threadIdx.x + blockIdx.x * blockDim.x;
-  if (this_col >= ncols) return;
-
-  DataType sum       = 0.0;
-  KeyType global_key = key_offset + local_key;
-#ifdef RRBK_SHMEM
-  int sh_key_inx = 0;
-#endif
-  for (int r = start_row; r < end_row; r++) {
-#ifdef RRBK_SHMEM
-    if (0 == sh_key_inx % RRBK_SHMEM_SZ) {
-      for (int x = threadIdx.x; x < RRBK_SHMEM_SZ; x += blockDim.x)
-        sh_keys[x] = d_keys[r + x];
-      __syncthreads();
-    }
-    if (sh_keys[sh_key_inx] != global_key) continue;  // No divergence since global_key is the
-                                                      // same for the whole block
-    sh_key_inx++;
-#else
-    if (d_keys[r] != global_key)
-      continue;  // No divergence since global_key is the
-                 // same for the whole block
-#endif
-    // if ((end_row-start_row) / (r-start_row) != global_key) continue;
-    DataType val = __ldcg(&d_A[r * lda + this_col]);
-    if (d_weights) { val = val * d_weights[r]; }
-    sum += val;
-  }
-
-  if (sum != 0.0) raft::myAtomicAdd(&d_sums[global_key * ncols + this_col], sum);
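+  // One thread per matrix element: recover (row i, column j) from the flat thread id,
+  // look up the row's key l, and atomically add the (optionally weighted) value into
+  // row l of the output.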
+  IdxT gid = threadIdx.x + (blockDim.x * static_cast<IdxT>(blockIdx.x));
+  IdxT j   = gid % ncols;
+  IdxT i   = gid / ncols;
+  if (i >= nrows) return;
+  IdxT l    = static_cast<IdxT>(d_keys[i]);
+  SumsT val = d_A[j + lda * i];
+  if (d_weights != nullptr) val *= d_weights[i];
+  raft::myAtomicAdd(&d_sums[j + ncols * l], val);
 }
 
-template <typename DataIteratorT, typename KeysIteratorT, typename WeightT>
-void sum_rows_by_key_large_nkeys_rowmajor(const DataIteratorT* d_A,
-                                          int lda,
-                                          KeysIteratorT d_keys,
+template <typename DataIteratorT,
+          typename KeysIteratorT,
+          typename WeightT,
+          typename SumsT,
+          typename IdxT>
+void sum_rows_by_key_large_nkeys_rowmajor(const DataIteratorT d_A,
+                                          IdxT lda,
+                                          const KeysIteratorT d_keys,
                                           const WeightT* d_weights,
-                                          int nrows,
-                                          int ncols,
-                                          int key_offset,
-                                          int nkeys,
-                                          DataIteratorT* d_sums,
+                                          IdxT nrows,
+                                          IdxT ncols,
+                                          SumsT* d_sums,
                                           cudaStream_t st)
 {
-  // x-dim refers to the column in the input data
-  // y-dim refers to the key
-  // z-dim refers to a partitioning of the rows among the threadblocks
-  dim3 grid, block;
-  block.x = 256;  // Adjust me!
-  block.y = 1;    // Don't adjust me!
-  grid.x  = raft::ceildiv(ncols, (int)block.x);
-  grid.y  = nkeys;
-  grid.z  = std::max(40960000 / nkeys / ncols, (int)1);  // Adjust me!
-  grid.z  = std::min(grid.z, (unsigned int)nrows);
-  grid.z  = std::min(grid.z, MAX_BLOCKS);
-
-  sum_rows_by_key_large_nkeys_kernel_rowmajor<<<grid, block, 0, st>>>(
-    d_A, lda, d_weights, d_keys, nrows, ncols, key_offset, nkeys, d_sums);
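+  // Flat 1D launch covering all nrows * ncols elements, one thread per element.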
+  uint32_t block_dim = 128;
+  auto grid_dim      = static_cast<uint32_t>(ceildiv(nrows * ncols, (IdxT)block_dim));
+  sum_rows_by_key_large_nkeys_kernel_rowmajor<<<grid_dim, block_dim, 0, st>>>(
+    d_A, lda, d_weights, d_keys, nrows, ncols, d_sums);
 }
 
 /**
@@ -354,6 +314,8 @@ void sum_rows_by_key_large_nkeys_rowmajor(const DataIteratorT* d_A,
  *                     (may be a simple pointer type)
  * @tparam KeysIteratorT Random-access iterator type, for reading input keys
  *                     (may be a simple pointer type)
+ * @tparam SumsT Type of the output sums
+ * @tparam IdxT Index type
  *
 * @param[in]  d_A    Input data array (lda x nrows)
 * @param[in]  lda    Real row size for input data, d_A
@@ -365,26 +327,31 @@ void sum_rows_by_key_large_nkeys_rowmajor(const DataIteratorT* d_A,
 * @param[in]  nkeys  Number of unique keys in d_keys
 * @param[out] d_sums Row sums by key (ncols x d_keys)
 * @param[in]  stream CUDA stream
+ * @param[in]  reset_sums Whether to reset the output sums to zero before reducing
  */
-template <typename DataIteratorT, typename KeysIteratorT, typename WeightT>
-void reduce_rows_by_key(const DataIteratorT* d_A,
-                        int lda,
+template <typename DataIteratorT,
+          typename KeysIteratorT,
+          typename WeightT,
+          typename SumsT,
+          typename IdxT>
+void reduce_rows_by_key(const DataIteratorT d_A,
+                        IdxT lda,
                         KeysIteratorT d_keys,
                         const WeightT* d_weights,
                         char* d_keys_char,
-                        int nrows,
-                        int ncols,
-                        int nkeys,
-                        DataIteratorT* d_sums,
-                        cudaStream_t stream)
+                        IdxT nrows,
+                        IdxT ncols,
+                        IdxT nkeys,
+                        SumsT* d_sums,
+                        cudaStream_t stream,
+                        bool reset_sums)
 {
   typedef typename std::iterator_traits<KeysIteratorT>::value_type KeyType;
-  typedef typename std::iterator_traits<DataIteratorT*>::value_type DataType;
 
   // Following kernel needs memset
-  cudaMemsetAsync(d_sums, 0, ncols * nkeys * sizeof(DataType), stream);
+  if (reset_sums) { cudaMemsetAsync(d_sums, 0, ncols * nkeys * sizeof(SumsT), stream); }
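+  // Dispatch: the specialized 4-key kernel needs the d_keys_char scratch buffer to
+  // convert the keys; callers that pass nullptr always take the general row-major path.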
 
-  if (nkeys <= SUM_ROWS_BY_KEY_SMALL_K_MAX_K) {
+  if (d_keys_char != nullptr && nkeys <= SUM_ROWS_BY_KEY_SMALL_K_MAX_K) {
    // sum_rows_by_key_small_k is BW bounded. d_keys is loaded ncols times - avoiding wasting BW
    // with doubles we have ~20% speed up - with floats we can hope something around 2x
    // Converting d_keys to char
@@ -392,12 +359,7 @@ void reduce_rows_by_key(const DataIteratorT* d_A,
     sum_rows_by_key_small_nkeys(
       d_A, lda, d_keys_char, d_weights, nrows, ncols, nkeys, d_sums, stream);
   } else {
-    for (KeyType key_offset = 0; key_offset < static_cast<KeyType>(nkeys);
-         key_offset += SUM_ROWS_BY_KEY_LARGE_K_MAX_K) {
-      KeyType this_call_nkeys = std::min(SUM_ROWS_BY_KEY_LARGE_K_MAX_K, nkeys);
-      sum_rows_by_key_large_nkeys_rowmajor(
-        d_A, lda, d_keys, d_weights, nrows, ncols, key_offset, this_call_nkeys, d_sums, stream);
-    }
+    sum_rows_by_key_large_nkeys_rowmajor(d_A, lda, d_keys, d_weights, nrows, ncols, d_sums, stream);
   }
 }
 
@@ -407,6 +369,8 @@
  * @tparam DataIteratorT Random-access iterator type, for reading input data (may be a simple
  *                       pointer type)
  * @tparam KeysIteratorT Random-access iterator type, for reading input keys (may be a simple
  *                       pointer type)
+ * @tparam SumsT Type of the output sums
+ * @tparam IdxT Index type
 * @param[in]  d_A    Input data array (lda x nrows)
 * @param[in]  lda    Real row size for input data, d_A
 * @param[in]  d_keys Keys for each row (1 x nrows)
 * @param[out] d_keys_char Scratch memory for conversion of keys to char
 * @param[in]  nrows  Number of rows in d_A
 * @param[in]  ncols  Number of data columns in d_A
 * @param[in]  nkeys  Number of unique keys in d_keys
 * @param[out] d_sums Row sums by key (ncols x d_keys)
 * @param[in]  stream CUDA stream
  */
-template <typename DataIteratorT, typename KeysIteratorT>
-void reduce_rows_by_key(const DataIteratorT* d_A,
-                        int lda,
+template <typename DataIteratorT, typename KeysIteratorT, typename SumsT, typename IdxT>
+void reduce_rows_by_key(const DataIteratorT d_A,
+                        IdxT lda,
                         KeysIteratorT d_keys,
                         char* d_keys_char,
-                        int nrows,
-                        int ncols,
-                        int nkeys,
-                        DataIteratorT* d_sums,
-                        cudaStream_t stream)
+                        IdxT nrows,
+                        IdxT ncols,
+                        IdxT nkeys,
+                        SumsT* d_sums,
+                        cudaStream_t stream,
+                        bool reset_sums)
 {
-  typedef typename std::iterator_traits<DataIteratorT*>::value_type DataType;
+  typedef typename std::iterator_traits<DataIteratorT>::value_type DataType;
   reduce_rows_by_key(d_A,
                      lda,
                      d_keys,
@@ -438,7 +403,8 @@ void reduce_rows_by_key(const DataIteratorT* d_A,
                      ncols,
                      nkeys,
                      d_sums,
-                     stream);
+                     stream,
+                     reset_sums);
 }
 
 };  // end namespace detail
diff --git a/cpp/include/raft/linalg/reduce_rows_by_key.cuh b/cpp/include/raft/linalg/reduce_rows_by_key.cuh
index 39c54e8b0c..1dabd92087 100644
--- a/cpp/include/raft/linalg/reduce_rows_by_key.cuh
+++ b/cpp/include/raft/linalg/reduce_rows_by_key.cuh
@@ -43,6 +43,8 @@ void convert_array(IteratorT1 dst, IteratorT2 src, int n, cudaStream_t st)
  *                     (may be a simple pointer type)
  * @tparam KeysIteratorT Random-access iterator type, for reading input keys
  *                     (may be a simple pointer type)
+ * @tparam SumsT Type of the output sums
+ * @tparam IdxT Index type
  *
 * @param[in]  d_A    Input data array (lda x nrows)
 * @param[in]  lda    Real row size for input data, d_A
@@ -54,21 +56,27 @@ void convert_array(IteratorT1 dst, IteratorT2 src, int n, cudaStream_t st)
 * @param[in]  nkeys  Number of unique keys in d_keys
 * @param[out] d_sums Row sums by key (ncols x d_keys)
 * @param[in]  stream CUDA stream
+ * @param[in]  reset_sums Whether to reset the output sums to zero before reducing
  */
-template <typename DataIteratorT, typename KeysIteratorT, typename WeightT>
-void reduce_rows_by_key(const DataIteratorT* d_A,
-                        int lda,
+template <typename DataIteratorT,
+          typename KeysIteratorT,
+          typename WeightT,
+          typename SumsT,
+          typename IdxT>
+void reduce_rows_by_key(const DataIteratorT d_A,
+                        IdxT lda,
                         const KeysIteratorT d_keys,
                         const WeightT* d_weights,
                         char* d_keys_char,
-                        int nrows,
-                        int ncols,
-                        int nkeys,
-                        DataIteratorT* d_sums,
-                        cudaStream_t stream)
+                        IdxT nrows,
+                        IdxT ncols,
+                        IdxT nkeys,
+                        SumsT* d_sums,
+                        cudaStream_t stream,
+                        bool reset_sums = true)
 {
   detail::reduce_rows_by_key(
-    d_A, lda, d_keys, d_weights, d_keys_char, nrows, ncols, nkeys, d_sums, stream);
+    d_A, lda,
d_keys, d_weights, d_keys_char, nrows, ncols, nkeys, d_sums, stream, reset_sums); } /** @@ -77,6 +85,8 @@ void reduce_rows_by_key(const DataIteratorT* d_A, * pointer type) * @tparam KeysIteratorT Random-access iterator type, for reading input keys (may be a simple * pointer type) + * @tparam SumsT Type of the output sums + * @tparam IdxT Index type * @param[in] d_A Input data array (lda x nrows) * @param[in] lda Real row size for input data, d_A * @param[in] d_keys Keys for each row (1 x nrows) @@ -86,19 +96,21 @@ void reduce_rows_by_key(const DataIteratorT* d_A, * @param[in] nkeys Number of unique keys in d_keys * @param[out] d_sums Row sums by key (ncols x d_keys) * @param[in] stream CUDA stream + * @param[in] reset_sums Whether to reset the output sums to zero before reducing */ -template -void reduce_rows_by_key(const DataIteratorT* d_A, - int lda, - KeysIteratorT d_keys, +template +void reduce_rows_by_key(const DataIteratorT d_A, + IdxT lda, + const KeysIteratorT d_keys, char* d_keys_char, - int nrows, - int ncols, - int nkeys, - DataIteratorT* d_sums, - cudaStream_t stream) + IdxT nrows, + IdxT ncols, + IdxT nkeys, + SumsT* d_sums, + cudaStream_t stream, + bool reset_sums = true) { - typedef typename std::iterator_traits::value_type DataType; + typedef typename std::iterator_traits::value_type DataType; reduce_rows_by_key(d_A, lda, d_keys, @@ -108,7 +120,8 @@ void reduce_rows_by_key(const DataIteratorT* d_A, ncols, nkeys, d_sums, - stream); + stream, + reset_sums); } /** @@ -128,9 +141,10 @@ void reduce_rows_by_key(const DataIteratorT* d_A, * @param[in] d_keys Keys for each row raft::device_vector_view (1 x nrows) * @param[out] d_sums Row sums by key raft::device_matrix_view (ncols x d_keys) * @param[in] n_unique_keys Number of unique keys in d_keys + * @param[out] d_keys_char Scratch memory for conversion of keys to char, raft::device_vector_view * @param[in] d_weights Weights for each observation in d_A raft::device_vector_view optional (1 * x nrows) - * @param[out] d_keys_char Scratch memory for conversion of keys to char, raft::device_vector_view + * @param[in] reset_sums Whether to reset the output sums to zero before reducing */ template void reduce_rows_by_key( @@ -140,7 +154,8 @@ void reduce_rows_by_key( raft::device_matrix_view d_sums, IndexType n_unique_keys, raft::device_vector_view d_keys_char, - std::optional> d_weights = std::nullopt) + std::optional> d_weights = std::nullopt, + bool reset_sums = true) { RAFT_EXPECTS(d_A.extent(0) == d_A.extent(0) && d_sums.extent(1) == n_unique_keys, "Output is not of size ncols * n_unique_keys"); @@ -158,7 +173,8 @@ void reduce_rows_by_key( d_A.extent(0), n_unique_keys, d_sums.data_handle(), - handle.get_stream()); + handle.get_stream(), + reset_sums); } else { reduce_rows_by_key(d_A.data_handle(), d_A.extent(0), @@ -168,7 +184,8 @@ void reduce_rows_by_key( d_A.extent(0), n_unique_keys, d_sums.data_handle(), - handle.get_stream()); + handle.get_stream(), + reset_sums); } } diff --git a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh index bf0df065b2..5ddc10542b 100644 --- a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh +++ b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh @@ -21,13 +21,16 @@ #include #include +#include #include #include #include #include #include #include +#include #include +#include #include #include #include @@ -231,9 +234,11 @@ constexpr inline auto calc_minibatch_size(uint32_t n_clusters, * When set to `false`, this 
function may be used to update existing centers and sizes using * the weighted average principle. * @param stream + * @param mr (optional) memory resource to use for temporary allocations */ template -void calc_centers_and_sizes(float* centers, +void calc_centers_and_sizes(const handle_t& handle, + float* centers, uint32_t* cluster_sizes, uint32_t n_clusters, uint32_t dim, @@ -241,12 +246,12 @@ void calc_centers_and_sizes(float* centers, IdxT n_rows, const LabelT* labels, bool reset_counters, - rmm::cuda_stream_view stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = nullptr) { - if (reset_counters) { - utils::memzero(centers, n_clusters * dim, stream); - utils::memzero(cluster_sizes, n_clusters, stream); - } else { + if (mr == nullptr) { mr = rmm::mr::get_current_device_resource(); } + + if (!reset_counters) { utils::map_along_rows( n_clusters, dim, @@ -255,13 +260,68 @@ void calc_centers_and_sizes(float* centers, [] __device__(float c, uint32_t s) -> float { return c * s; }, stream); } - utils::accumulate_into_selected(n_rows, dim, centers, cluster_sizes, dataset, labels, stream); - utils::map_along_rows( - n_clusters, - dim, + + rmm::device_uvector workspace(0, stream, mr); + rmm::device_uvector cluster_sizes_f(n_clusters, stream, mr); + float* sizes_f = cluster_sizes_f.data(); + + // If we reset the counters, we can compute directly the new sizes in cluster_sizes. + // If we don't reset, we compute in a temporary buffer and add in a separate step. + rmm::device_uvector temp_cluster_sizes(0, stream, mr); + uint32_t* temp_sizes = cluster_sizes; + if (!reset_counters) { + temp_cluster_sizes.resize(n_clusters, stream); + temp_sizes = temp_cluster_sizes.data(); + } + + utils::mapping mapping_op; + cub::TransformInputIterator, const T*> mapping_itr(dataset, + mapping_op); + + // todo(lsugy): use iterator from KV output of fusedL2NN + raft::linalg::reduce_rows_by_key(mapping_itr, + (int64_t)dim, + labels, + nullptr, + (int64_t)n_rows, + (int64_t)dim, + (int64_t)n_clusters, + centers, + stream, + reset_counters); + + // Compute weight of each cluster + // todo(lsugy): larger index type? + raft::cluster::detail::countLabels( + handle, labels, temp_sizes, (uint32_t)n_rows, (uint32_t)n_clusters, workspace); + + // Add previous sizes if necessary and cast to float + auto counting = thrust::make_counting_iterator(0); + thrust::for_each( + handle.get_thrust_policy(), counting, counting + n_clusters, [=] __device__(int idx) { + uint32_t temp_size = temp_sizes[idx]; + if (!reset_counters) { + temp_size += cluster_sizes[idx]; + cluster_sizes[idx] = temp_size; + } + sizes_f[idx] = static_cast(temp_size); + }); + + // todo(lsugy): custom iterator for float conversion? + raft::linalg::matrixVectorOp( + centers, centers, - cluster_sizes, - [] __device__(float c, uint32_t s) -> float { return s == 0 ? 
0.0f : c / float(s); }, + sizes_f, + (int64_t)dim, + (int64_t)n_clusters, + true, + false, + [=] __device__(float mat, float vec) { + if (vec == 0.0f) + return 0.0f; + else + return mat / vec; + }, stream); } @@ -627,7 +687,8 @@ void balancing_em_iters(const handle_t& handle, device_memory, dataset_norm); // M: Maximization step - calculate optimal cluster centers - calc_centers_and_sizes(cluster_centers, + calc_centers_and_sizes(handle, + cluster_centers, cluster_sizes, n_clusters, dim, @@ -635,7 +696,8 @@ void balancing_em_iters(const handle_t& handle, n_rows, cluster_labels, true, - stream); + stream, + device_memory); } } @@ -666,8 +728,17 @@ void build_clusters(const handle_t& handle, linalg::writeOnlyUnaryOp(cluster_labels, n_rows, f, stream); // update centers to match the initialized labels. - calc_centers_and_sizes( - cluster_centers, cluster_sizes, n_clusters, dim, dataset, n_rows, cluster_labels, true, stream); + calc_centers_and_sizes(handle, + cluster_centers, + cluster_sizes, + n_clusters, + dim, + dataset, + n_rows, + cluster_labels, + true, + stream, + device_memory); // run EM balancing_em_iters(handle, diff --git a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh index 8dda574314..11313b6ed5 100644 --- a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh +++ b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh @@ -112,13 +112,13 @@ struct mapping { * @{ */ template - HDI auto operator()(const S& x) -> std::enable_if_t, T> + HDI auto operator()(const S& x) const -> std::enable_if_t, T> { return x; }; template - HDI auto operator()(const S& x) -> std::enable_if_t, T> + HDI auto operator()(const S& x) const -> std::enable_if_t, T> { constexpr double kMult = config::kDivisor / config::kDivisor; if constexpr (std::is_floating_point_v) { return static_cast(x * static_cast(kMult)); } @@ -383,6 +383,8 @@ __global__ void map_along_rows_kernel( * @brief Map a binary function over a matrix and a vector element-wise, broadcasting the vector * values along rows: `m[i, j] = op(m[i,j], v[i])` * + * @todo(lsugy): replace with matrix_vector_op + * * NB: device-only function * * @tparam IdxT index type diff --git a/cpp/include/raft/spatial/knn/detail/ivf_flat_build.cuh b/cpp/include/raft/spatial/knn/detail/ivf_flat_build.cuh index af1cb97d36..14f5ae4516 100644 --- a/cpp/include/raft/spatial/knn/detail/ivf_flat_build.cuh +++ b/cpp/include/raft/spatial/knn/detail/ivf_flat_build.cuh @@ -144,7 +144,8 @@ inline auto extend(const handle_t& handle, raft::copy( list_sizes_ptr, orig_index.list_sizes().data_handle(), ext_index.list_sizes().size(), stream); - kmeans::calc_centers_and_sizes(centers_ptr, + kmeans::calc_centers_and_sizes(handle, + centers_ptr, list_sizes_ptr, n_lists, dim, diff --git a/cpp/test/linalg/reduce_rows_by_key.cu b/cpp/test/linalg/reduce_rows_by_key.cu index e575f37dd6..7b124cb7bb 100644 --- a/cpp/test/linalg/reduce_rows_by_key.cu +++ b/cpp/test/linalg/reduce_rows_by_key.cu @@ -103,7 +103,7 @@ class ReduceRowTest : public ::testing::TestWithParam> { raft::random::RngState r(params.seed); raft::random::RngState r_int(params.seed); - int nobs = params.nobs; + uint32_t nobs = params.nobs; uint32_t cols = params.cols; uint32_t nkeys = params.nkeys; uniform(handle, r, in.data(), nobs * cols, T(0.0), T(2.0 / nobs)); From 529931dc10c5d8303f6eaeff85b2f149b05b2173 Mon Sep 17 00:00:00 2001 From: Louis Sugy Date: Mon, 10 Oct 2022 11:57:28 +0200 Subject: [PATCH 02/25] Remove accumulate_into_selected --- 
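Notes: `utils::accumulate_into_selected` lost its only caller in patch 01, which rewrote
`calc_centers_and_sizes` on top of the shared prims. As a sketch, the replacement in that
function boils down to (types as in patch 01: float sums, uint32_t counters):

    // keyed reduction of the (mapped) dataset rows into the cluster centers
    raft::linalg::reduce_rows_by_key(mapping_itr, (int64_t)dim, labels, nullptr,
                                     (int64_t)n_rows, (int64_t)dim, (int64_t)n_clusters,
                                     centers, stream, reset_counters);
    // histogram of the labels gives the cluster sizes
    raft::cluster::detail::countLabels(
      handle, labels, temp_sizes, (uint32_t)n_rows, (uint32_t)n_clusters, workspace);

i.e. one keyed reduction for the sums plus one histogram for the counts, so the fused
accumulate-and-count kernel is dead code and can be dropped.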
.../knn/detail/ann_kmeans_balanced.cuh | 2 - .../raft/spatial/knn/detail/ann_utils.cuh | 66 ------------------- 2 files changed, 68 deletions(-) diff --git a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh index 5ddc10542b..d322be0c0a 100644 --- a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh +++ b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh @@ -291,7 +291,6 @@ void calc_centers_and_sizes(const handle_t& handle, reset_counters); // Compute weight of each cluster - // todo(lsugy): larger index type? raft::cluster::detail::countLabels( handle, labels, temp_sizes, (uint32_t)n_rows, (uint32_t)n_clusters, workspace); @@ -307,7 +306,6 @@ void calc_centers_and_sizes(const handle_t& handle, sizes_f[idx] = static_cast(temp_size); }); - // todo(lsugy): custom iterator for float conversion? raft::linalg::matrixVectorOp( centers, centers, diff --git a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh index 11313b6ed5..f302b3d4a0 100644 --- a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh +++ b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh @@ -259,72 +259,6 @@ inline void dots_along_rows( */ } -template -__global__ void accumulate_into_selected_kernel(IdxT n_rows, - uint32_t n_cols, - float* output, - uint32_t* selection_counters, - const T* input, - const LabelT* row_ids) -{ - IdxT gid = threadIdx.x + (blockDim.x * static_cast(blockIdx.x)); - IdxT j = gid % n_cols; - IdxT i = gid / n_cols; - if (i >= n_rows) return; - IdxT l = static_cast(row_ids[i]); - if (j == 0) { atomicAdd(&(selection_counters[l]), 1); } - atomicAdd(&(output[j + n_cols * l]), mapping{}(input[gid])); -} - -/** - * @brief Add all rows of input matrix into a selection of rows in the output matrix - * (cast and possibly scale the data input type). Count the number of times every output - * row was selected along the way. 
- * - * @tparam T element type - * @tparam IdxT index type - * @tparam LabelT label type - * - * @param n_cols number of columns in all matrices - * @param[out] output output matrix [..., n_cols] - * @param[inout] selection_counters number of occurrences of each row id in row_ids [..., n_cols] - * @param n_rows number of rows in the input - * @param[in] input row-major input matrix [n_rows, n_cols] - * @param[in] row_ids row indices in the output matrix [n_rows] - */ -template -void accumulate_into_selected(IdxT n_rows, - uint32_t n_cols, - float* output, - uint32_t* selection_counters, - const T* input, - const LabelT* row_ids, - rmm::cuda_stream_view stream) -{ - switch (check_pointer_residency(output, input, selection_counters, row_ids)) { - case pointer_residency::host_and_device: - case pointer_residency::device_only: { - uint32_t block_dim = 128; - auto grid_dim = - static_cast(ceildiv(n_rows * static_cast(n_cols), block_dim)); - accumulate_into_selected_kernel<<>>( - n_rows, n_cols, output, selection_counters, input, row_ids); - } break; - case pointer_residency::host_only: { - stream.synchronize(); - for (IdxT i = 0; i < n_rows; i++) { - IdxT l = static_cast(row_ids[i]); - selection_counters[l]++; - for (IdxT j = 0; j < n_cols; j++) { - output[j + n_cols * l] += mapping{}(input[j + n_cols * i]); - } - } - stream.synchronize(); - } break; - default: RAFT_FAIL("All pointers must reside on the same side, host or device."); - } -} - template __global__ void normalize_rows_kernel(IdxT n_rows, IdxT n_cols, float* a) { From 6a61537d08b65ec8be122227d63e8353007ce57f Mon Sep 17 00:00:00 2001 From: Louis Sugy Date: Mon, 10 Oct 2022 19:20:59 +0200 Subject: [PATCH 03/25] Replace map_along_rows with matrixVectorOp --- .../raft/linalg/detail/matrix_vector_op.cuh | 30 +++++++--- cpp/include/raft/linalg/matrix_vector_op.cuh | 39 +++++++++---- .../raft/matrix/detail/linewise_op.cuh | 58 ++++++++++--------- cpp/include/raft/matrix/matrix.cuh | 2 +- .../knn/detail/ann_kmeans_balanced.cuh | 21 +++---- .../raft/spatial/knn/detail/ann_utils.cuh | 41 ------------- 6 files changed, 93 insertions(+), 98 deletions(-) diff --git a/cpp/include/raft/linalg/detail/matrix_vector_op.cuh b/cpp/include/raft/linalg/detail/matrix_vector_op.cuh index 4cfccdcaa3..d00014e56f 100644 --- a/cpp/include/raft/linalg/detail/matrix_vector_op.cuh +++ b/cpp/include/raft/linalg/detail/matrix_vector_op.cuh @@ -74,6 +74,7 @@ __global__ void matrixVectorOpKernel(Type* out, mat.store(out, idx); } +// todo(lsugy): remove this function? It isn't used. 
template void matrixVectorOpImpl(Type* out, const Type* matrix, @@ -92,10 +93,15 @@ void matrixVectorOpImpl(Type* out, RAFT_CUDA_TRY(cudaPeekAtLastError()); } -template -void matrixVectorOp(Type* out, - const Type* matrix, - const Type* vec, +template +void matrixVectorOp(OutT* out, + const MatT* matrix, + const VecT* vec, IdxType D, IdxType N, bool rowMajor, @@ -109,11 +115,17 @@ void matrixVectorOp(Type* out, out, matrix, stride, nLines, rowMajor == bcastAlongRows, op, stream, vec); } -template -void matrixVectorOp(Type* out, - const Type* matrix, - const Type* vec1, - const Type* vec2, +template +void matrixVectorOp(OutT* out, + const MatT* matrix, + const Vec1T* vec1, + const Vec2T* vec2, IdxType D, IdxType N, bool rowMajor, diff --git a/cpp/include/raft/linalg/matrix_vector_op.cuh b/cpp/include/raft/linalg/matrix_vector_op.cuh index 1438a09bd3..f9e9f370ab 100644 --- a/cpp/include/raft/linalg/matrix_vector_op.cuh +++ b/cpp/include/raft/linalg/matrix_vector_op.cuh @@ -35,8 +35,10 @@ namespace linalg { * Note : the function will also check that the size of the window of accesses * is a multiple of the number of elements processed by a thread in order to * enable faster processing - * @tparam Type the matrix/vector type + * @tparam OutT the output type * @tparam Lambda a device function which represents a binary operator + * @tparam MatT the input matrix type + * @tparam VecT the input vector type * @tparam IdxType Integer type used to for addressing * @tparam TPB threads per block of the cuda kernel launched * @param out the output matrix (passing out = matrix makes it in-place) @@ -50,10 +52,15 @@ namespace linalg { * @param op the mathematical operation * @param stream cuda stream where to launch work */ -template -void matrixVectorOp(Type* out, - const Type* matrix, - const Type* vec, +template +void matrixVectorOp(OutT* out, + const MatT* matrix, + const VecT* vec, IdxType D, IdxType N, bool rowMajor, @@ -72,7 +79,11 @@ void matrixVectorOp(Type* out, * Note : the function will also check that the size of the window of accesses * is a multiple of the number of elements processed by a thread in order to * enable faster processing - * @tparam Type the matrix/vector type + * @tparam OutT the output type + * @tparam Lambda a device function which represents a binary operator + * @tparam MatT the input matrix type + * @tparam Vec1T the first input vector type + * @tparam Vec2T the second input vector type * @tparam Lambda a device function which represents a binary operator * @tparam IdxType Integer type used to for addressing * @tparam TPB threads per block of the cuda kernel launched @@ -88,11 +99,17 @@ void matrixVectorOp(Type* out, * @param op the mathematical operation * @param stream cuda stream where to launch work */ -template -void matrixVectorOp(Type* out, - const Type* matrix, - const Type* vec1, - const Type* vec2, +template +void matrixVectorOp(OutT* out, + const MatT* matrix, + const Vec1T* vec1, + const Vec2T* vec2, IdxType D, IdxType N, bool rowMajor, diff --git a/cpp/include/raft/matrix/detail/linewise_op.cuh b/cpp/include/raft/matrix/detail/linewise_op.cuh index 15f5204382..1be5dbc91c 100644 --- a/cpp/include/raft/matrix/detail/linewise_op.cuh +++ b/cpp/include/raft/matrix/detail/linewise_op.cuh @@ -76,7 +76,7 @@ struct Linewise { IdxType rowDiv, IdxType rowMod, Lambda op, - Vecs... vecs) noexcept + const Vecs*... 
vecs) noexcept { constexpr IdxType warpPad = (AlignWarp::Value - 1) * VecElems; Type args[sizeof...(Vecs)]; @@ -157,11 +157,14 @@ struct Linewise { * @param [in] rowLen the length of a vector. * @return a contiguous chunk of a vector, suitable for `vectorRows`. */ - static __device__ __forceinline__ Vec loadVec(Type* shm, - const Type* p, - const IdxType blockOffset, - const IdxType rowLen) noexcept + template + static __device__ __forceinline__ raft::TxN_t loadVec( + VecT* shm, const VecT* p, const IdxType blockOffset, const IdxType rowLen) noexcept { + static_assert( + sizeof(Type) == sizeof(VecT), + "linewiseOp currently requires that the size of the matrix and vector types be the same"); + IdxType j = blockOffset + threadIdx.x; #pragma unroll VecElems for (int k = threadIdx.x; k < VecElems * BlockSize; k += BlockSize, j += BlockSize) { @@ -171,8 +174,9 @@ struct Linewise { } __syncthreads(); { - Vec out; - *out.vectorized_data() = reinterpret_cast(shm)[threadIdx.x]; + raft::TxN_t out; + *out.vectorized_data() = + reinterpret_cast::io_t*>(shm)[threadIdx.x]; return out; } } @@ -209,7 +213,7 @@ __global__ void __launch_bounds__(BlockSize) const IdxType len, const IdxType elemsPerThread, Lambda op, - Vecs... vecs) + const Vecs*... vecs) { typedef Linewise L; @@ -253,7 +257,7 @@ __global__ void __launch_bounds__(MaxOffset, 2) const IdxType rowLen, const IdxType len, Lambda op, - Vecs... vecs) + const Vecs*... vecs) { // Note, L::VecElems == 1 typedef Linewise L; @@ -309,7 +313,7 @@ __global__ void __launch_bounds__(BlockSize) const IdxType rowLen, const IdxType len, Lambda op, - Vecs... vecs) + const Vecs*... vecs) { typedef Linewise L; constexpr uint workSize = L::VecElems * BlockSize; @@ -322,7 +326,7 @@ __global__ void __launch_bounds__(BlockSize) reinterpret_cast(in), L::AlignElems::div(len), op, - (workOffset ^= workSize, L::loadVec(shm + workOffset, vecs, blockOffset, rowLen))...); + (workOffset ^= workSize, L::loadVec((Vecs*)(shm + workOffset), vecs, blockOffset, rowLen))...); } /** @@ -351,7 +355,7 @@ __global__ void __launch_bounds__(MaxOffset, 2) const IdxType rowLen, const IdxType len, Lambda op, - Vecs... vecs) + const Vecs*... vecs) { // Note, L::VecElems == 1 constexpr uint workSize = MaxOffset; @@ -360,20 +364,21 @@ __global__ void __launch_bounds__(MaxOffset, 2) typedef Linewise L; if (blockIdx.x == 0) { // first block: offset = 0, length = arrOffset - L::vectorRows(reinterpret_cast(out), - reinterpret_cast(in), - arrOffset, - op, - (workOffset ^= workSize, L::loadVec(shm + workOffset, vecs, 0, rowLen))...); + L::vectorRows( + reinterpret_cast(out), + reinterpret_cast(in), + arrOffset, + op, + (workOffset ^= workSize, L::loadVec((Vecs*)(shm + workOffset), vecs, 0, rowLen))...); } else { // second block: offset = arrTail, length = len - arrTail // NB: I substract MaxOffset (= blockDim.x) to get the correct indexing for block 1 - L::vectorRows( - reinterpret_cast(out + arrTail - MaxOffset), - reinterpret_cast(in + arrTail - MaxOffset), - len - arrTail + MaxOffset, - op, - (workOffset ^= workSize, L::loadVec(shm + workOffset, vecs, arrTail % rowLen, rowLen))...); + L::vectorRows(reinterpret_cast(out + arrTail - MaxOffset), + reinterpret_cast(in + arrTail - MaxOffset), + len - arrTail + MaxOffset, + op, + (workOffset ^= workSize, + L::loadVec((Vecs*)(shm + workOffset), vecs, arrTail % rowLen, rowLen))...); } } @@ -409,7 +414,7 @@ void matrixLinewiseVecCols(Type* out, const IdxType nRows, Lambda op, cudaStream_t stream, - Vecs... vecs) + const Vecs*... 
vecs) { typedef raft::Pow2 AlignBytes; constexpr std::size_t VecElems = VecBytes / sizeof(Type); @@ -456,7 +461,7 @@ void matrixLinewiseVecRows(Type* out, const IdxType nRows, Lambda op, cudaStream_t stream, - Vecs... vecs) + const Vecs*... vecs) { typedef raft::Pow2 AlignBytes; constexpr std::size_t VecElems = VecBytes / sizeof(Type); @@ -527,8 +532,9 @@ struct MatrixLinewiseOp { const bool alongLines, Lambda op, cudaStream_t stream, - Vecs... vecs) + const Vecs*... vecs) { + // todo(lsugy): add test with different vector types if constexpr (VecBytes > sizeof(Type)) { if (!raft::Pow2::areSameAlignOffsets(in, out)) return MatrixLinewiseOp> 1), sizeof(Type)), BlockSize>::run( diff --git a/cpp/include/raft/matrix/matrix.cuh b/cpp/include/raft/matrix/matrix.cuh index 3a7e0dad47..cd6c4fa219 100644 --- a/cpp/include/raft/matrix/matrix.cuh +++ b/cpp/include/raft/matrix/matrix.cuh @@ -289,7 +289,7 @@ void linewiseOp(m_t* out, const bool alongLines, Lambda op, cudaStream_t stream, - Vecs... vecs) + const Vecs*... vecs) { common::nvtx::range fun_scope("linewiseOp-%c-%zu (%zu, %zu)", alongLines ? 'l' : 'x', diff --git a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh index d322be0c0a..8167827dfb 100644 --- a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh +++ b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh @@ -252,18 +252,19 @@ void calc_centers_and_sizes(const handle_t& handle, if (mr == nullptr) { mr = rmm::mr::get_current_device_resource(); } if (!reset_counters) { - utils::map_along_rows( - n_clusters, - dim, + raft::linalg::matrixVectorOp( + centers, centers, cluster_sizes, - [] __device__(float c, uint32_t s) -> float { return c * s; }, + (int64_t)dim, + (int64_t)n_clusters, + true, + false, + [=] __device__(float c, uint32_t s) -> float { return c * s; }, stream); } rmm::device_uvector workspace(0, stream, mr); - rmm::device_uvector cluster_sizes_f(n_clusters, stream, mr); - float* sizes_f = cluster_sizes_f.data(); // If we reset the counters, we can compute directly the new sizes in cluster_sizes. // If we don't reset, we compute in a temporary buffer and add in a separate step. 
@@ -295,6 +296,7 @@ void calc_centers_and_sizes(const handle_t& handle, handle, labels, temp_sizes, (uint32_t)n_rows, (uint32_t)n_clusters, workspace); // Add previous sizes if necessary and cast to float + // todo(lsugy): replace with add wrapped in if auto counting = thrust::make_counting_iterator(0); thrust::for_each( handle.get_thrust_policy(), counting, counting + n_clusters, [=] __device__(int idx) { @@ -303,19 +305,18 @@ void calc_centers_and_sizes(const handle_t& handle, temp_size += cluster_sizes[idx]; cluster_sizes[idx] = temp_size; } - sizes_f[idx] = static_cast(temp_size); }); raft::linalg::matrixVectorOp( centers, centers, - sizes_f, + cluster_sizes, (int64_t)dim, (int64_t)n_clusters, true, false, - [=] __device__(float mat, float vec) { - if (vec == 0.0f) + [=] __device__(float mat, uint32_t vec) { + if (vec == 0u) return 0.0f; else return mat / vec; diff --git a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh index f302b3d4a0..7b26ccfb42 100644 --- a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh +++ b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh @@ -302,47 +302,6 @@ inline void normalize_rows(IdxT n_rows, IdxT n_cols, float* a, rmm::cuda_stream_ normalize_rows_kernel<<>>(n_rows, n_cols, a); } -template -__global__ void map_along_rows_kernel( - IdxT n_rows, uint32_t n_cols, float* a, const uint32_t* d, Lambda map) -{ - IdxT gid = threadIdx.x + blockDim.x * static_cast(blockIdx.x); - IdxT i = gid / n_cols; - if (i >= n_rows) return; - float& x = a[gid]; - x = map(x, d[i]); -} - -/** - * @brief Map a binary function over a matrix and a vector element-wise, broadcasting the vector - * values along rows: `m[i, j] = op(m[i,j], v[i])` - * - * @todo(lsugy): replace with matrix_vector_op - * - * NB: device-only function - * - * @tparam IdxT index type - * @tparam Lambda - * - * @param n_rows - * @param n_cols - * @param[inout] m device pointer to a row-major matrix [n_rows, n_cols] - * @param[in] v device pointer to a vector [n_rows] - * @param op the binary operation to apply on every element of matrix rows and of the vector - */ -template -inline void map_along_rows(IdxT n_rows, - uint32_t n_cols, - float* m, - const uint32_t* v, - Lambda op, - rmm::cuda_stream_view stream) -{ - dim3 threads(128, 1, 1); - dim3 blocks(ceildiv(n_rows * n_cols, threads.x), 1, 1); - map_along_rows_kernel<<>>(n_rows, n_cols, m, v, op); -} - template __global__ void outer_add_kernel(const T* a, IdxT len_a, const T* b, IdxT len_b, T* c) { From 0a98482936f2153918df98c7740fedfc91e28a0d Mon Sep 17 00:00:00 2001 From: Louis Sugy Date: Tue, 18 Oct 2022 14:56:37 +0200 Subject: [PATCH 04/25] Start adding support for arbitrary types in linewiseOp --- .../raft/matrix/detail/linewise_op.cuh | 26 ++++++++++++++----- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/cpp/include/raft/matrix/detail/linewise_op.cuh b/cpp/include/raft/matrix/detail/linewise_op.cuh index 1be5dbc91c..9b9a7d2967 100644 --- a/cpp/include/raft/matrix/detail/linewise_op.cuh +++ b/cpp/include/raft/matrix/detail/linewise_op.cuh @@ -284,6 +284,15 @@ __global__ void __launch_bounds__(MaxOffset, 2) vecs...); } +/** Helper function to get the largest type from a variadic list of types */ +template +constexpr size_t maxSizeOf() +{ + size_t maxSize = 0; + ((maxSize = std::max(maxSize, sizeof(Types))), ...); + return maxSize; +} + /** * This kernel prepares the inputs for the `vectorRows` function where the most of the * work happens; see `vectorRows` for details. 
@@ -316,10 +325,12 @@ __global__ void __launch_bounds__(BlockSize) const Vecs*... vecs) { typedef Linewise L; - constexpr uint workSize = L::VecElems * BlockSize; - uint workOffset = workSize; - __shared__ __align__(sizeof(Type) * L::VecElems) - Type shm[workSize * ((sizeof...(Vecs)) > 1 ? 2 : 1)]; + constexpr uint workSize = L::VecElems * BlockSize; + constexpr size_t maxVecItemSize = maxSizeOf(); + uint workOffset = workSize * maxVecItemSize; + __shared__ __align__( + maxVecItemSize * + L::VecElems) char shm[workSize * maxVecItemSize * ((sizeof...(Vecs)) > 1 ? 2 : 1)]; const IdxType blockOffset = (arrOffset + BlockSize * L::VecElems * blockIdx.x) % rowLen; return L::vectorRows( reinterpret_cast(out), @@ -358,9 +369,10 @@ __global__ void __launch_bounds__(MaxOffset, 2) const Vecs*... vecs) { // Note, L::VecElems == 1 - constexpr uint workSize = MaxOffset; - uint workOffset = workSize; - __shared__ Type shm[workSize * ((sizeof...(Vecs)) > 1 ? 2 : 1)]; + constexpr uint workSize = MaxOffset; + constexpr size_t maxVecItemSize = maxSizeOf(); + uint workOffset = workSize * maxVecItemSize; + __shared__ char shm[workSize * maxVecItemSize * ((sizeof...(Vecs)) > 1 ? 2 : 1)]; typedef Linewise L; if (blockIdx.x == 0) { // first block: offset = 0, length = arrOffset From eafa61e542cd3af97de8a10eb56bca0c55f1c0fa Mon Sep 17 00:00:00 2001 From: Louis Sugy Date: Tue, 18 Oct 2022 14:57:33 +0200 Subject: [PATCH 05/25] Allow different types for output, matrix and vector(s) in mdspan-based API for matrixVectorOp --- cpp/include/raft/linalg/matrix_vector_op.cuh | 24 ++++++++++++-------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/cpp/include/raft/linalg/matrix_vector_op.cuh b/cpp/include/raft/linalg/matrix_vector_op.cuh index f9e9f370ab..41e00b568b 100644 --- a/cpp/include/raft/linalg/matrix_vector_op.cuh +++ b/cpp/include/raft/linalg/matrix_vector_op.cuh @@ -133,7 +133,8 @@ void matrixVectorOp(OutT* out, * Note : the function will also check that the size of the window of accesses * is a multiple of the number of elements processed by a thread in order to * enable faster processing - * @tparam InValueType the data-type of the input matrices and vectors + * @tparam MatValueType the data-type of the input matrix + * @tparam VecValueType the data-type of the input vector * @tparam LayoutPolicy the layout of input and output (raft::row_major or raft::col_major) * @tparam Lambda a device function which represents a binary operator * @tparam OutElementType the data-type of the output raft::matrix_view @@ -147,15 +148,16 @@ void matrixVectorOp(OutT* out, * the rows of the matrix or columns using enum class raft::linalg::Apply * @param[in] op the mathematical operation */ -template void matrix_vector_op(const raft::handle_t& handle, - raft::device_matrix_view matrix, - raft::device_vector_view vec, + raft::device_matrix_view matrix, + raft::device_vector_view vec, raft::device_matrix_view out, Apply apply, Lambda op) @@ -194,7 +196,9 @@ void matrix_vector_op(const raft::handle_t& handle, * Note : the function will also check that the size of the window of accesses * is a multiple of the number of elements processed by a thread in order to * enable faster processing - * @tparam InValueType the data-type of the input matrices and vectors + * @tparam MatValueType the data-type of the input matrix + * @tparam Vec1ValueType the data-type of the first input vector + * @tparam Vec2ValueType the data-type of the second input vector * @tparam LayoutPolicy the layout of input and output (raft::row_major or 
raft::col_major) * @tparam Lambda a device function which represents a binary operator * @tparam OutElementType the data-type of the output raft::matrix_view @@ -209,16 +213,18 @@ void matrix_vector_op(const raft::handle_t& handle, * the rows of the matrix or columns using enum class raft::linalg::Apply * @param op the mathematical operation */ -template void matrix_vector_op(const raft::handle_t& handle, - raft::device_matrix_view matrix, - raft::device_vector_view vec1, - raft::device_vector_view vec2, + raft::device_matrix_view matrix, + raft::device_vector_view vec1, + raft::device_vector_view vec2, raft::device_matrix_view out, Apply apply, Lambda op) From 74309ed9702020adbc21b3506230ee620e156b1b Mon Sep 17 00:00:00 2001 From: Louis Sugy Date: Tue, 18 Oct 2022 14:58:35 +0200 Subject: [PATCH 06/25] Call cub histogram with signed type to avoid a warning breaking compilation --- cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh index 8167827dfb..b429d2a052 100644 --- a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh +++ b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh @@ -293,7 +293,7 @@ void calc_centers_and_sizes(const handle_t& handle, // Compute weight of each cluster raft::cluster::detail::countLabels( - handle, labels, temp_sizes, (uint32_t)n_rows, (uint32_t)n_clusters, workspace); + handle, labels, temp_sizes, (int64_t)n_rows, (int64_t)n_clusters, workspace); // Add previous sizes if necessary and cast to float // todo(lsugy): replace with add wrapped in if From df7cfb5599ac433c5408957c2dab21101e26559d Mon Sep 17 00:00:00 2001 From: Louis Sugy Date: Tue, 18 Oct 2022 14:59:39 +0200 Subject: [PATCH 07/25] Start adding support for different output/matrix/vector types in MatVecOpTest + remove redundant inputs in LinewiseTest --- cpp/test/linalg/matrix_vector_op.cu | 243 +++++++++++++--------------- cpp/test/matrix/linewise_op.cu | 1 - 2 files changed, 110 insertions(+), 134 deletions(-) diff --git a/cpp/test/linalg/matrix_vector_op.cu b/cpp/test/linalg/matrix_vector_op.cu index 2023ce4121..9fb4647eb9 100644 --- a/cpp/test/linalg/matrix_vector_op.cu +++ b/cpp/test/linalg/matrix_vector_op.cu @@ -27,7 +27,7 @@ template struct MatVecOpInputs { T tolerance; IdxType rows, cols; - bool rowMajor, bcastAlongRows, useTwoVectors; + bool rowMajor, bcastAlongRows; unsigned long long int seed; }; @@ -40,68 +40,58 @@ template // Or else, we get the following compilation error // for an extended __device__ lambda cannot have private or protected access // within its class -template +template void matrixVectorOpLaunch(const raft::handle_t& handle, - T* out, - const T* in, - const T* vec1, - const T* vec2, + OutT* out, + const MatT* in, + const Vec1T* vec1, + const Vec2T* vec2, IdxType D, IdxType N, bool rowMajor, - bool bcastAlongRows, - bool useTwoVectors) + bool bcastAlongRows) { - auto out_row_major = raft::make_device_matrix_view(out, N, D); - auto in_row_major = raft::make_device_matrix_view(in, N, D); + auto out_row_major = raft::make_device_matrix_view(out, N, D); + auto in_row_major = raft::make_device_matrix_view(in, N, D); - auto out_col_major = raft::make_device_matrix_view(out, N, D); - auto in_col_major = raft::make_device_matrix_view(in, N, D); + auto out_col_major = raft::make_device_matrix_view(out, N, D); + auto in_col_major = raft::make_device_matrix_view(in, N, 
D); auto apply = bcastAlongRows ? Apply::ALONG_ROWS : Apply::ALONG_COLUMNS; auto len = bcastAlongRows ? D : N; - auto vec1_view = raft::make_device_vector_view(vec1, len); - auto vec2_view = raft::make_device_vector_view(vec2, len); - - if (useTwoVectors) { + auto vec1_view = raft::make_device_vector_view(vec1, len); + + if constexpr (OpT::useTwoVectors) { + auto vec2_view = raft::make_device_vector_view(vec2, len); if (rowMajor) { - matrix_vector_op(handle, - in_row_major, - vec1_view, - vec2_view, - out_row_major, - apply, - [] __device__(T a, T b, T c) { return a + b + c; }); + matrix_vector_op(handle, in_row_major, vec1_view, vec2_view, out_row_major, apply, OpT{}); } else { - matrix_vector_op(handle, - in_col_major, - vec1_view, - vec2_view, - out_col_major, - - apply, - [] __device__(T a, T b, T c) { return a + b + c; }); + matrix_vector_op(handle, in_col_major, vec1_view, vec2_view, out_col_major, apply, OpT{}); } } else { if (rowMajor) { - matrix_vector_op( - handle, in_row_major, vec1_view, out_row_major, apply, [] __device__(T a, T b) { - return a + b; - }); + matrix_vector_op(handle, in_row_major, vec1_view, out_row_major, apply, OpT{}); } else { - matrix_vector_op( - handle, in_col_major, vec1_view, out_col_major, apply, [] __device__(T a, T b) { - return a + b; - }); + matrix_vector_op(handle, in_col_major, vec1_view, out_col_major, apply, OpT{}); } } } -template -class MatVecOpTest : public ::testing::TestWithParam> { +template +class MatVecOpTest : public ::testing::TestWithParam> { public: MatVecOpTest() - : params(::testing::TestWithParam>::GetParam()), + : params(::testing::TestWithParam>::GetParam()), stream(handle.get_stream()), in(params.rows * params.cols, stream), out_ref(params.rows * params.cols, stream), @@ -118,10 +108,10 @@ class MatVecOpTest : public ::testing::TestWithParam> IdxType N = params.rows, D = params.cols; IdxType len = N * D; IdxType vecLen = params.bcastAlongRows ? 
D : N; - uniform(handle, r, in.data(), len, (T)-1.0, (T)1.0); - uniform(handle, r, vec1.data(), vecLen, (T)-1.0, (T)1.0); - uniform(handle, r, vec2.data(), vecLen, (T)-1.0, (T)1.0); - if (params.useTwoVectors) { + uniform(handle, r, in.data(), len, (MatT)-1.0, (MatT)1.0); + uniform(handle, r, vec1.data(), vecLen, (Vec1T)-1.0, (Vec1T)1.0); + uniform(handle, r, vec2.data(), vecLen, (Vec2T)-1.0, (Vec2T)1.0); + if constexpr (OpT::useTwoVectors) { naiveMatVec(out_ref.data(), in.data(), vec1.data(), @@ -130,7 +120,7 @@ class MatVecOpTest : public ::testing::TestWithParam> N, params.rowMajor, params.bcastAlongRows, - (T)1.0, + (OutT)1.0, stream); } else { naiveMatVec(out_ref.data(), @@ -140,19 +130,18 @@ class MatVecOpTest : public ::testing::TestWithParam> N, params.rowMajor, params.bcastAlongRows, - (T)1.0, + (OutT)1.0, stream); } - matrixVectorOpLaunch(handle, - out.data(), - in.data(), - vec1.data(), - vec2.data(), - D, - N, - params.rowMajor, - params.bcastAlongRows, - params.useTwoVectors); + matrixVectorOpLaunch(handle, + out.data(), + in.data(), + vec1.data(), + vec2.data(), + D, + N, + params.rowMajor, + params.bcastAlongRows); handle.sync_stream(); } @@ -160,87 +149,75 @@ class MatVecOpTest : public ::testing::TestWithParam> raft::handle_t handle; cudaStream_t stream; - MatVecOpInputs params; - rmm::device_uvector in, out, out_ref, vec1, vec2; + MatVecOpInputs params; + rmm::device_uvector in; + rmm::device_uvector out; + rmm::device_uvector out_ref; + rmm::device_uvector vec1; + rmm::device_uvector vec2; }; -const std::vector> inputsf_i32 = { - {0.00001f, 1024, 32, true, true, false, 1234ULL}, - {0.00001f, 1024, 64, true, true, false, 1234ULL}, - {0.00001f, 1024, 32, true, false, false, 1234ULL}, - {0.00001f, 1024, 64, true, false, false, 1234ULL}, - {0.00001f, 1024, 32, false, true, false, 1234ULL}, - {0.00001f, 1024, 64, false, true, false, 1234ULL}, - {0.00001f, 1024, 32, false, false, false, 1234ULL}, - {0.00001f, 1024, 64, false, false, false, 1234ULL}, - - {0.00001f, 1024, 32, true, true, true, 1234ULL}, - {0.00001f, 1024, 64, true, true, true, 1234ULL}, - {0.00001f, 1024, 32, true, false, true, 1234ULL}, - {0.00001f, 1024, 64, true, false, true, 1234ULL}, - {0.00001f, 1024, 32, false, true, true, 1234ULL}, - {0.00001f, 1024, 64, false, true, true, 1234ULL}, - {0.00001f, 1024, 32, false, false, true, 1234ULL}, - {0.00001f, 1024, 64, false, false, true, 1234ULL}}; -typedef MatVecOpTest MatVecOpTestF_i32; -TEST_P(MatVecOpTestF_i32, Result) -{ - ASSERT_TRUE(devArrMatch( - out_ref.data(), out.data(), params.rows * params.cols, CompareApprox(params.tolerance))); -} -INSTANTIATE_TEST_SUITE_P(MatVecOpTests, MatVecOpTestF_i32, ::testing::ValuesIn(inputsf_i32)); +#define MVTEST(TestClass, inputs) \ + TEST_P(TestClass, Result) \ + { \ + ASSERT_TRUE(devArrMatch(out_ref.data(), \ + out.data(), \ + params.rows* params.cols, \ + CompareApprox(params.tolerance))); \ + } \ + INSTANTIATE_TEST_SUITE_P(MatVecOpTests, TestClass, ::testing::ValuesIn(inputs)) +const std::vector> inputsf_i32 = { + {0.00001f, 1024, 32, true, true, 1234ULL}, + {0.00001f, 1024, 64, true, true, 1234ULL}, + {0.00001f, 1024, 32, true, false, 1234ULL}, + {0.00001f, 1024, 64, true, false, 1234ULL}, + {0.00001f, 1024, 32, false, true, 1234ULL}, + {0.00001f, 1024, 64, false, true, 1234ULL}, + {0.00001f, 1024, 32, false, false, 1234ULL}, + {0.00001f, 1024, 64, false, false, 1234ULL}}; const std::vector> inputsf_i64 = { - {0.00001f, 2500, 250, false, false, false, 1234ULL}, - {0.00001f, 2500, 250, false, false, true, 1234ULL}}; 
-typedef MatVecOpTest MatVecOpTestF_i64; -TEST_P(MatVecOpTestF_i64, Result) -{ - ASSERT_TRUE(devArrMatch( - out_ref.data(), out.data(), params.rows * params.cols, CompareApprox(params.tolerance))); -} -INSTANTIATE_TEST_SUITE_P(MatVecOpTests, MatVecOpTestF_i64, ::testing::ValuesIn(inputsf_i64)); - + {0.00001f, 2500, 250, false, false, 1234ULL}, {0.00001f, 2500, 250, false, false, 1234ULL}}; const std::vector> inputsd_i32 = { - {0.0000001, 1024, 32, true, true, false, 1234ULL}, - {0.0000001, 1024, 64, true, true, false, 1234ULL}, - {0.0000001, 1024, 32, true, false, false, 1234ULL}, - {0.0000001, 1024, 64, true, false, false, 1234ULL}, - {0.0000001, 1024, 32, false, true, false, 1234ULL}, - {0.0000001, 1024, 64, false, true, false, 1234ULL}, - {0.0000001, 1024, 32, false, false, false, 1234ULL}, - {0.0000001, 1024, 64, false, false, false, 1234ULL}, - - {0.0000001, 1024, 32, true, true, true, 1234ULL}, - {0.0000001, 1024, 64, true, true, true, 1234ULL}, - {0.0000001, 1024, 32, true, false, true, 1234ULL}, - {0.0000001, 1024, 64, true, false, true, 1234ULL}, - {0.0000001, 1024, 32, false, true, true, 1234ULL}, - {0.0000001, 1024, 64, false, true, true, 1234ULL}, - {0.0000001, 1024, 32, false, false, true, 1234ULL}, - {0.0000001, 1024, 64, false, false, true, 1234ULL}}; -typedef MatVecOpTest MatVecOpTestD_i32; -TEST_P(MatVecOpTestD_i32, Result) -{ - ASSERT_TRUE(devArrMatch(out_ref.data(), - out.data(), - params.rows * params.cols, - CompareApprox(params.tolerance))); -} -INSTANTIATE_TEST_SUITE_P(MatVecOpTests, MatVecOpTestD_i32, ::testing::ValuesIn(inputsd_i32)); - + {0.0000001, 1024, 32, true, true, 1234ULL}, + {0.0000001, 1024, 64, true, true, 1234ULL}, + {0.0000001, 1024, 32, true, false, 1234ULL}, + {0.0000001, 1024, 64, true, false, 1234ULL}, + {0.0000001, 1024, 32, false, true, 1234ULL}, + {0.0000001, 1024, 64, false, true, 1234ULL}, + {0.0000001, 1024, 32, false, false, 1234ULL}, + {0.0000001, 1024, 64, false, false, 1234ULL}}; const std::vector> inputsd_i64 = { - {0.0000001, 2500, 250, false, false, false, 1234ULL}, - {0.0000001, 2500, 250, false, false, true, 1234ULL}}; -typedef MatVecOpTest MatVecOpTestD_i64; -TEST_P(MatVecOpTestD_i64, Result) -{ - ASSERT_TRUE(devArrMatch(out_ref.data(), - out.data(), - params.rows * params.cols, - CompareApprox(params.tolerance))); -} -INSTANTIATE_TEST_SUITE_P(MatVecOpTests, MatVecOpTestD_i64, ::testing::ValuesIn(inputsd_i64)); + {0.0000001, 2500, 250, false, false, 1234ULL}, {0.0000001, 2500, 250, false, false, 1234ULL}}; + +template +struct Add1Vec { + static constexpr bool useTwoVectors = false; + HDI float operator()(T a, T b) const { return a + b; }; +}; +template +struct Add2Vec { + static constexpr bool useTwoVectors = true; + HDI float operator()(T a, T b, T c) const { return a + b + c; }; +}; + +typedef MatVecOpTest, float, int> MatVecOpTestF_i32_add1vec; +typedef MatVecOpTest, float, int> MatVecOpTestF_i32_add2vec; +typedef MatVecOpTest, float, size_t> MatVecOpTestF_i64_add1vec; +typedef MatVecOpTest, float, size_t> MatVecOpTestF_i64_add2vec; +typedef MatVecOpTest, double, int> MatVecOpTestD_i32_add1vec; +typedef MatVecOpTest, double, int> MatVecOpTestD_i32_add2vec; +typedef MatVecOpTest, double, size_t> MatVecOpTestD_i64_add1vec; +typedef MatVecOpTest, double, size_t> MatVecOpTestD_i64_add2vec; + +MVTEST(MatVecOpTestF_i32_add1vec, inputsf_i32); +MVTEST(MatVecOpTestF_i32_add2vec, inputsf_i32); +MVTEST(MatVecOpTestF_i64_add1vec, inputsf_i64); +MVTEST(MatVecOpTestF_i64_add2vec, inputsf_i64); +MVTEST(MatVecOpTestD_i32_add1vec, inputsd_i32); 
+MVTEST(MatVecOpTestD_i32_add2vec, inputsd_i32); +MVTEST(MatVecOpTestD_i64_add1vec, inputsd_i64); +MVTEST(MatVecOpTestD_i64_add2vec, inputsd_i64); } // end namespace linalg } // end namespace raft diff --git a/cpp/test/matrix/linewise_op.cu b/cpp/test/matrix/linewise_op.cu index 9d3d5af51e..8043dd05e7 100644 --- a/cpp/test/matrix/linewise_op.cu +++ b/cpp/test/matrix/linewise_op.cu @@ -216,7 +216,6 @@ struct LinewiseTest : public ::testing::TestWithParam> dims; for (auto m : sizes) { for (auto n : sizes) { - dims.push_back(std::make_tuple(n, m)); dims.push_back(std::make_tuple(m, n)); } } From 826446265c5617cbe6ad091e0554ad0e74f8c932 Mon Sep 17 00:00:00 2001 From: Louis Sugy Date: Tue, 18 Oct 2022 15:41:48 +0200 Subject: [PATCH 08/25] Fix shared mem buffering offset in linewise kernels --- .../raft/matrix/detail/linewise_op.cuh | 31 ++++++++++--------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/cpp/include/raft/matrix/detail/linewise_op.cuh b/cpp/include/raft/matrix/detail/linewise_op.cuh index 9b9a7d2967..707764fbe8 100644 --- a/cpp/include/raft/matrix/detail/linewise_op.cuh +++ b/cpp/include/raft/matrix/detail/linewise_op.cuh @@ -161,6 +161,7 @@ struct Linewise { static __device__ __forceinline__ raft::TxN_t loadVec( VecT* shm, const VecT* p, const IdxType blockOffset, const IdxType rowLen) noexcept { + // todo(lsugy): remove this check static_assert( sizeof(Type) == sizeof(VecT), "linewiseOp currently requires that the size of the matrix and vector types be the same"); @@ -324,6 +325,7 @@ __global__ void __launch_bounds__(BlockSize) Lambda op, const Vecs*... vecs) { + // todo(lsugy): use appropriate Linewise type per vec typedef Linewise L; constexpr uint workSize = L::VecElems * BlockSize; constexpr size_t maxVecItemSize = maxSizeOf(); @@ -332,12 +334,12 @@ __global__ void __launch_bounds__(BlockSize) maxVecItemSize * L::VecElems) char shm[workSize * maxVecItemSize * ((sizeof...(Vecs)) > 1 ? 2 : 1)]; const IdxType blockOffset = (arrOffset + BlockSize * L::VecElems * blockIdx.x) % rowLen; - return L::vectorRows( - reinterpret_cast(out), - reinterpret_cast(in), - L::AlignElems::div(len), - op, - (workOffset ^= workSize, L::loadVec((Vecs*)(shm + workOffset), vecs, blockOffset, rowLen))...); + return L::vectorRows(reinterpret_cast(out), + reinterpret_cast(in), + L::AlignElems::div(len), + op, + (workOffset ^= workSize * maxVecItemSize, + L::loadVec((Vecs*)(shm + workOffset), vecs, blockOffset, rowLen))...); } /** @@ -371,17 +373,18 @@ __global__ void __launch_bounds__(MaxOffset, 2) // Note, L::VecElems == 1 constexpr uint workSize = MaxOffset; constexpr size_t maxVecItemSize = maxSizeOf(); - uint workOffset = workSize * maxVecItemSize; + uint workOffset = workSize * maxVecItemSize; __shared__ char shm[workSize * maxVecItemSize * ((sizeof...(Vecs)) > 1 ? 
2 : 1)]; + // todo(lsugy): use appropriate Linewise type per vec typedef Linewise L; if (blockIdx.x == 0) { // first block: offset = 0, length = arrOffset - L::vectorRows( - reinterpret_cast(out), - reinterpret_cast(in), - arrOffset, - op, - (workOffset ^= workSize, L::loadVec((Vecs*)(shm + workOffset), vecs, 0, rowLen))...); + L::vectorRows(reinterpret_cast(out), + reinterpret_cast(in), + arrOffset, + op, + (workOffset ^= workSize * maxVecItemSize, + L::loadVec((Vecs*)(shm + workOffset), vecs, 0, rowLen))...); } else { // second block: offset = arrTail, length = len - arrTail // NB: I substract MaxOffset (= blockDim.x) to get the correct indexing for block 1 @@ -389,7 +392,7 @@ __global__ void __launch_bounds__(MaxOffset, 2) reinterpret_cast(in + arrTail - MaxOffset), len - arrTail + MaxOffset, op, - (workOffset ^= workSize, + (workOffset ^= workSize * maxVecItemSize, L::loadVec((Vecs*)(shm + workOffset), vecs, arrTail % rowLen, rowLen))...); } } From e4a8b661adf110534f5e7b951b320b5fa6502a4c Mon Sep 17 00:00:00 2001 From: Louis Sugy Date: Tue, 18 Oct 2022 16:51:57 +0200 Subject: [PATCH 09/25] Pass custom op to naiveMat --- cpp/test/linalg/matrix_vector_op.cu | 6 +-- cpp/test/linalg/matrix_vector_op.cuh | 70 +++++++++++++++++++++++----- 2 files changed, 62 insertions(+), 14 deletions(-) diff --git a/cpp/test/linalg/matrix_vector_op.cu b/cpp/test/linalg/matrix_vector_op.cu index 9fb4647eb9..ae132dea8c 100644 --- a/cpp/test/linalg/matrix_vector_op.cu +++ b/cpp/test/linalg/matrix_vector_op.cu @@ -65,7 +65,7 @@ void matrixVectorOpLaunch(const raft::handle_t& handle, auto apply = bcastAlongRows ? Apply::ALONG_ROWS : Apply::ALONG_COLUMNS; auto len = bcastAlongRows ? D : N; auto vec1_view = raft::make_device_vector_view(vec1, len); - + if constexpr (OpT::useTwoVectors) { auto vec2_view = raft::make_device_vector_view(vec2, len); if (rowMajor) { @@ -120,7 +120,7 @@ class MatVecOpTest : public ::testing::TestWithParam(handle, diff --git a/cpp/test/linalg/matrix_vector_op.cuh b/cpp/test/linalg/matrix_vector_op.cuh index f46d70eaa3..38c0eda984 100644 --- a/cpp/test/linalg/matrix_vector_op.cuh +++ b/cpp/test/linalg/matrix_vector_op.cuh @@ -21,7 +21,7 @@ namespace raft { namespace linalg { -template +template __global__ void naiveMatVecKernel(Type* out, const Type* mat, const Type* vec, @@ -29,7 +29,7 @@ __global__ void naiveMatVecKernel(Type* out, IdxType N, bool rowMajor, bool bcastAlongRows, - Type scalar) + Lambda op) { IdxType idx = threadIdx.x + blockIdx.x * blockDim.x; IdxType len = N * D; @@ -43,10 +43,10 @@ __global__ void naiveMatVecKernel(Type* out, } else { col = idx / N; } - if (idx < len) { out[idx] = mat[idx] + scalar * vec[col]; } + if (idx < len) { out[idx] = op(mat[idx], vec[col]); } } -template +template void naiveMatVec(Type* out, const Type* mat, const Type* vec, @@ -54,18 +54,41 @@ void naiveMatVec(Type* out, IdxType N, bool rowMajor, bool bcastAlongRows, - Type scalar, + Lambda op, cudaStream_t stream) { static const IdxType TPB = 64; IdxType len = N * D; IdxType nblks = raft::ceildiv(len, TPB); naiveMatVecKernel - <<>>(out, mat, vec, D, N, rowMajor, bcastAlongRows, scalar); + <<>>(out, mat, vec, D, N, rowMajor, bcastAlongRows, op); RAFT_CUDA_TRY(cudaPeekAtLastError()); } template +void naiveMatVec(Type* out, + const Type* mat, + const Type* vec1, + IdxType D, + IdxType N, + bool rowMajor, + bool bcastAlongRows, + Type scalar, + cudaStream_t stream) +{ + naiveMatVec( + out, + mat, + vec1, + D, + N, + rowMajor, + bcastAlongRows, + [scalar] __device__(Type a, Type b) { return a + 
scalar * b; }, + stream); +} + +template __global__ void naiveMatVecKernel(Type* out, const Type* mat, const Type* vec1, @@ -74,7 +97,7 @@ __global__ void naiveMatVecKernel(Type* out, IdxType N, bool rowMajor, bool bcastAlongRows, - Type scalar) + Lambda op) { IdxType idx = threadIdx.x + blockIdx.x * blockDim.x; IdxType len = N * D; @@ -88,10 +111,10 @@ __global__ void naiveMatVecKernel(Type* out, } else { col = idx / N; } - if (idx < len) { out[idx] = mat[idx] + scalar * vec1[col] + vec2[col]; } + if (idx < len) { out[idx] = op(mat[idx], vec1[col], vec2[col]); } } -template +template void naiveMatVec(Type* out, const Type* mat, const Type* vec1, @@ -100,16 +123,41 @@ void naiveMatVec(Type* out, IdxType N, bool rowMajor, bool bcastAlongRows, - Type scalar, + Lambda op, cudaStream_t stream) { static const IdxType TPB = 64; IdxType len = N * D; IdxType nblks = raft::ceildiv(len, TPB); naiveMatVecKernel - <<>>(out, mat, vec1, vec2, D, N, rowMajor, bcastAlongRows, scalar); + <<>>(out, mat, vec1, vec2, D, N, rowMajor, bcastAlongRows, op); RAFT_CUDA_TRY(cudaPeekAtLastError()); } +template +void naiveMatVec(Type* out, + const Type* mat, + const Type* vec1, + const Type* vec2, + IdxType D, + IdxType N, + bool rowMajor, + bool bcastAlongRows, + Type scalar, + cudaStream_t stream) +{ + naiveMatVec( + out, + mat, + vec1, + vec2, + D, + N, + rowMajor, + bcastAlongRows, + [scalar] __device__(Type a, Type b, Type c) { return a + scalar * b + c; }, + stream); +} + } // end namespace linalg } // end namespace raft From 938b1801215446e230cf0c4662b72c741855e02d Mon Sep 17 00:00:00 2001 From: Louis Sugy Date: Tue, 18 Oct 2022 17:04:19 +0200 Subject: [PATCH 10/25] Support different output/matrix/vector(s) types in naiveMatVec --- cpp/test/linalg/matrix_vector_op.cuh | 81 +++++++++++++++------------- 1 file changed, 45 insertions(+), 36 deletions(-) diff --git a/cpp/test/linalg/matrix_vector_op.cuh b/cpp/test/linalg/matrix_vector_op.cuh index 38c0eda984..602d05d153 100644 --- a/cpp/test/linalg/matrix_vector_op.cuh +++ b/cpp/test/linalg/matrix_vector_op.cuh @@ -21,10 +21,10 @@ namespace raft { namespace linalg { -template -__global__ void naiveMatVecKernel(Type* out, - const Type* mat, - const Type* vec, +template +__global__ void naiveMatVecKernel(OutT* out, + const MatT* mat, + const VecT* vec, IdxType D, IdxType N, bool rowMajor, @@ -46,10 +46,10 @@ __global__ void naiveMatVecKernel(Type* out, if (idx < len) { out[idx] = op(mat[idx], vec[col]); } } -template -void naiveMatVec(Type* out, - const Type* mat, - const Type* vec, +template +void naiveMatVec(OutT* out, + const MatT* mat, + const VecT* vec, IdxType D, IdxType N, bool rowMajor, @@ -60,39 +60,43 @@ void naiveMatVec(Type* out, static const IdxType TPB = 64; IdxType len = N * D; IdxType nblks = raft::ceildiv(len, TPB); - naiveMatVecKernel - <<>>(out, mat, vec, D, N, rowMajor, bcastAlongRows, op); + naiveMatVecKernel<<>>(out, mat, vec, D, N, rowMajor, bcastAlongRows, op); RAFT_CUDA_TRY(cudaPeekAtLastError()); } -template -void naiveMatVec(Type* out, - const Type* mat, - const Type* vec1, +template +void naiveMatVec(OutT* out, + const MatT* mat, + const VecT* vec, IdxType D, IdxType N, bool rowMajor, bool bcastAlongRows, - Type scalar, + OutT scalar, cudaStream_t stream) { naiveMatVec( out, mat, - vec1, + vec, D, N, rowMajor, bcastAlongRows, - [scalar] __device__(Type a, Type b) { return a + scalar * b; }, + [scalar] __device__(MatT a, VecT b) { return (OutT)(a + scalar * b); }, stream); } -template -__global__ void naiveMatVecKernel(Type* out, - const 
Type* mat, - const Type* vec1, - const Type* vec2, +template +__global__ void naiveMatVecKernel(OutT* out, + const MatT* mat, + const Vec1T* vec1, + const Vec2T* vec2, IdxType D, IdxType N, bool rowMajor, @@ -114,11 +118,16 @@ __global__ void naiveMatVecKernel(Type* out, if (idx < len) { out[idx] = op(mat[idx], vec1[col], vec2[col]); } } -template -void naiveMatVec(Type* out, - const Type* mat, - const Type* vec1, - const Type* vec2, +template +void naiveMatVec(OutT* out, + const MatT* mat, + const Vec1T* vec1, + const Vec2T* vec2, IdxType D, IdxType N, bool rowMajor, @@ -129,21 +138,21 @@ void naiveMatVec(Type* out, static const IdxType TPB = 64; IdxType len = N * D; IdxType nblks = raft::ceildiv(len, TPB); - naiveMatVecKernel - <<>>(out, mat, vec1, vec2, D, N, rowMajor, bcastAlongRows, op); + naiveMatVecKernel<<>>( + out, mat, vec1, vec2, D, N, rowMajor, bcastAlongRows, op); RAFT_CUDA_TRY(cudaPeekAtLastError()); } -template -void naiveMatVec(Type* out, - const Type* mat, - const Type* vec1, - const Type* vec2, +template +void naiveMatVec(OutT* out, + const MatT* mat, + const Vec1T* vec1, + const Vec2T* vec2, IdxType D, IdxType N, bool rowMajor, bool bcastAlongRows, - Type scalar, + OutT scalar, cudaStream_t stream) { naiveMatVec( @@ -155,7 +164,7 @@ void naiveMatVec(Type* out, N, rowMajor, bcastAlongRows, - [scalar] __device__(Type a, Type b, Type c) { return a + scalar * b + c; }, + [scalar] __device__(MatT a, Vec1T b, Vec2T c) { return (OutT)(a + scalar * b + c); }, stream); } From c85ee9bc3b29a5c0212299d57d2d458b3d7cc3a2 Mon Sep 17 00:00:00 2001 From: Louis Sugy Date: Tue, 18 Oct 2022 18:15:23 +0200 Subject: [PATCH 11/25] Test matrix-vector op with different matrix / vector types --- cpp/include/raft/linalg/matrix_vector_op.cuh | 2 + .../raft/matrix/detail/linewise_op.cuh | 11 +- cpp/test/linalg/matrix_vector_op.cu | 136 +++++++++++------- 3 files changed, 92 insertions(+), 57 deletions(-) diff --git a/cpp/include/raft/linalg/matrix_vector_op.cuh b/cpp/include/raft/linalg/matrix_vector_op.cuh index 41e00b568b..464591f593 100644 --- a/cpp/include/raft/linalg/matrix_vector_op.cuh +++ b/cpp/include/raft/linalg/matrix_vector_op.cuh @@ -68,6 +68,7 @@ void matrixVectorOp(OutT* out, Lambda op, cudaStream_t stream) { + // todo(lsugy): at the moment only OutT == MatT supported! detail::matrixVectorOp(out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream); } @@ -117,6 +118,7 @@ void matrixVectorOp(OutT* out, Lambda op, cudaStream_t stream) { + // todo(lsugy): at the moment only OutT == MatT supported! detail::matrixVectorOp(out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream); } diff --git a/cpp/include/raft/matrix/detail/linewise_op.cuh b/cpp/include/raft/matrix/detail/linewise_op.cuh index 707764fbe8..f807b64f99 100644 --- a/cpp/include/raft/matrix/detail/linewise_op.cuh +++ b/cpp/include/raft/matrix/detail/linewise_op.cuh @@ -325,7 +325,6 @@ __global__ void __launch_bounds__(BlockSize) Lambda op, const Vecs*... 
vecs) { - // todo(lsugy): use appropriate Linewise type per vec typedef Linewise L; constexpr uint workSize = L::VecElems * BlockSize; constexpr size_t maxVecItemSize = maxSizeOf(); @@ -339,7 +338,8 @@ __global__ void __launch_bounds__(BlockSize) L::AlignElems::div(len), op, (workOffset ^= workSize * maxVecItemSize, - L::loadVec((Vecs*)(shm + workOffset), vecs, blockOffset, rowLen))...); + Linewise::loadVec( + (Vecs*)(shm + workOffset), vecs, blockOffset, rowLen))...); } /** @@ -375,7 +375,6 @@ __global__ void __launch_bounds__(MaxOffset, 2) constexpr size_t maxVecItemSize = maxSizeOf(); uint workOffset = workSize * maxVecItemSize; __shared__ char shm[workSize * maxVecItemSize * ((sizeof...(Vecs)) > 1 ? 2 : 1)]; - // todo(lsugy): use appropriate Linewise type per vec typedef Linewise L; if (blockIdx.x == 0) { // first block: offset = 0, length = arrOffset @@ -384,7 +383,7 @@ __global__ void __launch_bounds__(MaxOffset, 2) arrOffset, op, (workOffset ^= workSize * maxVecItemSize, - L::loadVec((Vecs*)(shm + workOffset), vecs, 0, rowLen))...); + Linewise::loadVec((Vecs*)(shm + workOffset), vecs, 0, rowLen))...); } else { // second block: offset = arrTail, length = len - arrTail // NB: I substract MaxOffset (= blockDim.x) to get the correct indexing for block 1 @@ -393,7 +392,7 @@ __global__ void __launch_bounds__(MaxOffset, 2) len - arrTail + MaxOffset, op, (workOffset ^= workSize * maxVecItemSize, - L::loadVec((Vecs*)(shm + workOffset), vecs, arrTail % rowLen, rowLen))...); + Linewise::loadVec((Vecs*)(shm + workOffset), vecs, arrTail % rowLen, rowLen))...); } } @@ -549,7 +548,7 @@ struct MatrixLinewiseOp { cudaStream_t stream, const Vecs*... vecs) { - // todo(lsugy): add test with different vector types + // todo(lsugy): vectors alignment? Where is it checked/enforced? 
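A note on the `workOffset ^= workSize * maxVecItemSize` expressions recurring in these kernels: the comma-operator pack expansion stages each vector argument into one of two halves of a single shared `char` buffer, and XOR-ing the offset against the byte size of one half is what alternates between them. The earlier "Fix shared mem buffering offset" commit scales the toggle by `maxVecItemSize` precisely so that the second half starts past the first now that the buffer is byte-addressed. The toggle in isolation, as a host-side sketch with hypothetical sizes:

#include <cstdio>

int main()
{
  const unsigned workSize       = 256;  // elements staged per vector chunk (assumed)
  const unsigned maxVecItemSize = 8;    // largest sizeof() among vector types (assumed)
  unsigned workOffset = workSize * maxVecItemSize;  // kernels start at the second half
  for (int vecIdx = 0; vecIdx < 4; vecIdx++) {
    // the same update the kernels apply once per vector in the pack expansion:
    workOffset ^= workSize * maxVecItemSize;
    std::printf("vector %d staged at byte offset %u\n", vecIdx, workOffset);
  }
  return 0;  // prints offsets 0, 2048, 0, 2048
}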
if constexpr (VecBytes > sizeof(Type)) { if (!raft::Pow2::areSameAlignOffsets(in, out)) return MatrixLinewiseOp> 1), sizeof(Type)), BlockSize>::run( diff --git a/cpp/test/linalg/matrix_vector_op.cu b/cpp/test/linalg/matrix_vector_op.cu index ae132dea8c..4bf29c016f 100644 --- a/cpp/test/linalg/matrix_vector_op.cu +++ b/cpp/test/linalg/matrix_vector_op.cu @@ -23,20 +23,29 @@ namespace raft { namespace linalg { -template +template struct MatVecOpInputs { - T tolerance; IdxType rows, cols; bool rowMajor, bcastAlongRows; unsigned long long int seed; }; -template -::std::ostream& operator<<(::std::ostream& os, const MatVecOpInputs& dims) +template +::std::ostream& operator<<(::std::ostream& os, const MatVecOpInputs& dims) { return os; } +template +inline void gen_uniform(const raft::handle_t& handle, raft::random::RngState& rng, T* ptr, LenT len) +{ + if constexpr (std::is_integral_v) { + raft::random::uniformInt(handle, rng, ptr, len, (T)0, (T)100); + } else { + raft::random::uniform(handle, rng, ptr, len, (T)-10.0, (T)10.0); + } +} + // Or else, we get the following compilation error // for an extended __device__ lambda cannot have private or protected access // within its class @@ -88,10 +97,10 @@ template -class MatVecOpTest : public ::testing::TestWithParam> { +class MatVecOpTest : public ::testing::TestWithParam> { public: MatVecOpTest() - : params(::testing::TestWithParam>::GetParam()), + : params(::testing::TestWithParam>::GetParam()), stream(handle.get_stream()), in(params.rows * params.cols, stream), out_ref(params.rows * params.cols, stream), @@ -108,9 +117,9 @@ class MatVecOpTest : public ::testing::TestWithParam(handle, r, in.data(), len); + gen_uniform(handle, r, vec1.data(), vecLen); + gen_uniform(handle, r, vec2.data(), vecLen); if constexpr (OpT::useTwoVectors) { naiveMatVec(out_ref.data(), in.data(), @@ -149,56 +158,49 @@ class MatVecOpTest : public ::testing::TestWithParam params; + MatVecOpInputs params; rmm::device_uvector in; rmm::device_uvector out; - rmm::device_uvector out_ref; + rmm::device_uvector out_ref; rmm::device_uvector vec1; rmm::device_uvector vec2; }; -#define MVTEST(TestClass, inputs) \ - TEST_P(TestClass, Result) \ - { \ - ASSERT_TRUE(devArrMatch(out_ref.data(), \ - out.data(), \ - params.rows* params.cols, \ - CompareApprox(params.tolerance))); \ - } \ +#define MVTEST(TestClass, inputs, tolerance) \ + TEST_P(TestClass, Result) \ + { \ + ASSERT_TRUE(devArrMatch( \ + out_ref.data(), out.data(), params.rows* params.cols, CompareApprox(tolerance))); \ + } \ INSTANTIATE_TEST_SUITE_P(MatVecOpTests, TestClass, ::testing::ValuesIn(inputs)) -const std::vector> inputsf_i32 = { - {0.00001f, 1024, 32, true, true, 1234ULL}, - {0.00001f, 1024, 64, true, true, 1234ULL}, - {0.00001f, 1024, 32, true, false, 1234ULL}, - {0.00001f, 1024, 64, true, false, 1234ULL}, - {0.00001f, 1024, 32, false, true, 1234ULL}, - {0.00001f, 1024, 64, false, true, 1234ULL}, - {0.00001f, 1024, 32, false, false, 1234ULL}, - {0.00001f, 1024, 64, false, false, 1234ULL}}; -const std::vector> inputsf_i64 = { - {0.00001f, 2500, 250, false, false, 1234ULL}, {0.00001f, 2500, 250, false, false, 1234ULL}}; -const std::vector> inputsd_i32 = { - {0.0000001, 1024, 32, true, true, 1234ULL}, - {0.0000001, 1024, 64, true, true, 1234ULL}, - {0.0000001, 1024, 32, true, false, 1234ULL}, - {0.0000001, 1024, 64, true, false, 1234ULL}, - {0.0000001, 1024, 32, false, true, 1234ULL}, - {0.0000001, 1024, 64, false, true, 1234ULL}, - {0.0000001, 1024, 32, false, false, 1234ULL}, - {0.0000001, 1024, 64, false, false, 
1234ULL}}; -const std::vector> inputsd_i64 = { - {0.0000001, 2500, 250, false, false, 1234ULL}, {0.0000001, 2500, 250, false, false, 1234ULL}}; +#define MV_EPS_F 0.00001f +#define MV_EPS_D 0.0000001 + +/* + * This set of tests covers cases where all the types are the same. + */ + +const std::vector> inputs_i32 = {{1024, 32, true, true, 1234ULL}, + {1024, 64, true, true, 1234ULL}, + {1024, 32, true, false, 1234ULL}, + {1024, 64, true, false, 1234ULL}, + {1024, 32, false, true, 1234ULL}, + {1024, 64, false, true, 1234ULL}, + {1024, 32, false, false, 1234ULL}, + {1024, 64, false, false, 1234ULL}}; +const std::vector> inputs_i64 = {{2500, 250, false, false, 1234ULL}, + {2500, 250, false, false, 1234ULL}}; template struct Add1Vec { static constexpr bool useTwoVectors = false; - HDI float operator()(T a, T b) const { return a + b; }; + HDI T operator()(T a, T b) const { return a + b; }; }; template struct Add2Vec { static constexpr bool useTwoVectors = true; - HDI float operator()(T a, T b, T c) const { return a + b + c; }; + HDI T operator()(T a, T b, T c) const { return a + b + c; }; }; typedef MatVecOpTest, float, int> MatVecOpTestF_i32_add1vec; @@ -210,14 +212,46 @@ typedef MatVecOpTest, double, int> MatVecOpTestD_i32_add2vec; typedef MatVecOpTest, double, size_t> MatVecOpTestD_i64_add1vec; typedef MatVecOpTest, double, size_t> MatVecOpTestD_i64_add2vec; -MVTEST(MatVecOpTestF_i32_add1vec, inputsf_i32); -MVTEST(MatVecOpTestF_i32_add2vec, inputsf_i32); -MVTEST(MatVecOpTestF_i64_add1vec, inputsf_i64); -MVTEST(MatVecOpTestF_i64_add2vec, inputsf_i64); -MVTEST(MatVecOpTestD_i32_add1vec, inputsd_i32); -MVTEST(MatVecOpTestD_i32_add2vec, inputsd_i32); -MVTEST(MatVecOpTestD_i64_add1vec, inputsd_i64); -MVTEST(MatVecOpTestD_i64_add2vec, inputsd_i64); +MVTEST(MatVecOpTestF_i32_add1vec, inputs_i32, MV_EPS_F); +MVTEST(MatVecOpTestF_i32_add2vec, inputs_i32, MV_EPS_F); +MVTEST(MatVecOpTestF_i64_add1vec, inputs_i64, MV_EPS_F); +MVTEST(MatVecOpTestF_i64_add2vec, inputs_i64, MV_EPS_F); +MVTEST(MatVecOpTestD_i32_add1vec, inputs_i32, MV_EPS_D); +MVTEST(MatVecOpTestD_i32_add2vec, inputs_i32, MV_EPS_D); +MVTEST(MatVecOpTestD_i64_add1vec, inputs_i64, MV_EPS_D); +MVTEST(MatVecOpTestD_i64_add2vec, inputs_i64, MV_EPS_D); + +/* + * This set of tests covers cases with different types. 
+ */ + +template +struct DivAndAdd { + static constexpr bool useTwoVectors = true; + HDI OutT operator()(MatT a, Vec1T b, Vec2T c) const { return a / b + c; }; +}; + +typedef MatVecOpTest, float, int, float, int32_t, float> + MatVecOpTestF_i32_DivAndAdd_f_i32_f; +// typedef MatVecOpTest, float, int, double, int32_t, +// float> +// MatVecOpTestF_i32_DivAndAdd_d_i32_f; +// typedef MatVecOpTest, double, int, float, int32_t, +// float> +// MatVecOpTestD_i32_DivAndAdd_f_i32_f; +typedef MatVecOpTest, float, int, float, int32_t, double> + MatVecOpTestF_i32_DivAndAdd_f_i32_d; +typedef MatVecOpTest, float, int, float, int64_t, float> + MatVecOpTestF_i32_DivAndAdd_f_i64_f; +typedef MatVecOpTest, double, int, double, int32_t, float> + MatVecOpTestD_i32_DivAndAdd_d_i32_f; + +MVTEST(MatVecOpTestF_i32_DivAndAdd_f_i32_f, inputs_i32, MV_EPS_F); +// MVTEST(MatVecOpTestF_i32_DivAndAdd_d_i32_f, inputs_i32, MV_EPS_F); +// MVTEST(MatVecOpTestD_i32_DivAndAdd_f_i32_f, inputs_i32, (double)MV_EPS_F); +MVTEST(MatVecOpTestF_i32_DivAndAdd_f_i32_d, inputs_i32, MV_EPS_F); +MVTEST(MatVecOpTestF_i32_DivAndAdd_f_i64_f, inputs_i32, MV_EPS_F); +MVTEST(MatVecOpTestD_i32_DivAndAdd_d_i32_f, inputs_i32, (double)MV_EPS_F); } // end namespace linalg } // end namespace raft From be142c6c453259f7aef682f381d4557a20fab5c7 Mon Sep 17 00:00:00 2001 From: Louis Sugy Date: Wed, 19 Oct 2022 17:10:59 +0200 Subject: [PATCH 12/25] Fix linewiseOp VecRows kernels --- .../raft/matrix/detail/linewise_op.cuh | 35 +++++++++-------- cpp/test/linalg/matrix_vector_op.cu | 38 ++++++++----------- 2 files changed, 34 insertions(+), 39 deletions(-) diff --git a/cpp/include/raft/matrix/detail/linewise_op.cuh b/cpp/include/raft/matrix/detail/linewise_op.cuh index f807b64f99..5a7e106951 100644 --- a/cpp/include/raft/matrix/detail/linewise_op.cuh +++ b/cpp/include/raft/matrix/detail/linewise_op.cuh @@ -26,6 +26,11 @@ namespace raft { namespace matrix { namespace detail { +template +struct VecArg { + Type val[VecElems]; +}; + template struct Linewise { static constexpr IdxType VecElems = VecBytes / sizeof(Type); @@ -79,6 +84,7 @@ struct Linewise { const Vecs*... vecs) noexcept { constexpr IdxType warpPad = (AlignWarp::Value - 1) * VecElems; + // todo(lsugy): don't use Type! Type args[sizeof...(Vecs)]; Vec v, w; bool update = true; @@ -141,7 +147,7 @@ struct Linewise { *v.vectorized_data() = __ldcv(in + i); #pragma unroll VecElems for (int k = 0; k < VecElems; k++) - v.val.data[k] = op(v.val.data[k], args.val.data[k]...); + v.val.data[k] = op(v.val.data[k], args.val[k]...); __stwt(out + i, *v.vectorized_data()); } } @@ -158,14 +164,11 @@ struct Linewise { * @return a contiguous chunk of a vector, suitable for `vectorRows`. 
*/ template - static __device__ __forceinline__ raft::TxN_t loadVec( - VecT* shm, const VecT* p, const IdxType blockOffset, const IdxType rowLen) noexcept + static __device__ __forceinline__ VecArg loadVec(VecT* shm, + const VecT* p, + const IdxType blockOffset, + const IdxType rowLen) noexcept { - // todo(lsugy): remove this check - static_assert( - sizeof(Type) == sizeof(VecT), - "linewiseOp currently requires that the size of the matrix and vector types be the same"); - IdxType j = blockOffset + threadIdx.x; #pragma unroll VecElems for (int k = threadIdx.x; k < VecElems * BlockSize; k += BlockSize, j += BlockSize) { @@ -175,9 +178,10 @@ struct Linewise { } __syncthreads(); { - raft::TxN_t out; - *out.vectorized_data() = - reinterpret_cast::io_t*>(shm)[threadIdx.x]; + VecArg out; +#pragma unroll VecElems + for (int i = 0; i < VecElems; i++) + out.val[i] = shm[threadIdx.x * VecElems + i]; return out; } } @@ -338,8 +342,7 @@ __global__ void __launch_bounds__(BlockSize) L::AlignElems::div(len), op, (workOffset ^= workSize * maxVecItemSize, - Linewise::loadVec( - (Vecs*)(shm + workOffset), vecs, blockOffset, rowLen))...); + L::loadVec((Vecs*)(shm + workOffset), vecs, blockOffset, rowLen))...); } /** @@ -383,7 +386,7 @@ __global__ void __launch_bounds__(MaxOffset, 2) arrOffset, op, (workOffset ^= workSize * maxVecItemSize, - Linewise::loadVec((Vecs*)(shm + workOffset), vecs, 0, rowLen))...); + L::loadVec((Vecs*)(shm + workOffset), vecs, 0, rowLen))...); } else { // second block: offset = arrTail, length = len - arrTail // NB: I substract MaxOffset (= blockDim.x) to get the correct indexing for block 1 @@ -392,7 +395,7 @@ __global__ void __launch_bounds__(MaxOffset, 2) len - arrTail + MaxOffset, op, (workOffset ^= workSize * maxVecItemSize, - Linewise::loadVec((Vecs*)(shm + workOffset), vecs, arrTail % rowLen, rowLen))...); + L::loadVec((Vecs*)(shm + workOffset), vecs, arrTail % rowLen, rowLen))...); } } @@ -503,6 +506,7 @@ void matrixLinewiseVecRows(Type* out, const uint expected_grid_size = rowLen / raft::gcd(block_work_size, uint(rowLen)); // Minimum size of the grid to make the device well occupied const uint occupy = getOptimalGridSize(); + // todo(lsugy): alignedLen instead of totalLen? const dim3 gs(std::min( // does not make sense to have more blocks than this raft::ceildiv(uint(totalLen), block_work_size), @@ -548,7 +552,6 @@ struct MatrixLinewiseOp { cudaStream_t stream, const Vecs*... vecs) { - // todo(lsugy): vectors alignment? Where is it checked/enforced? 
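The staging pattern that `loadVec` implements is worth spelling out, since the hunks above only show it as a diff. A simplified device-side sketch under assumed template parameters (`BlockSize` threads per block, `VecElems` values per thread; not a drop-in replacement for the real code, which returns the `VecArg` by value):

// A block cooperatively copies a wrapped, thread-striped chunk of the vector
// into shared memory, syncs, then each thread reads back its own contiguous
// VecElems-wide slice.
template <typename VecT, int BlockSize, int VecElems>
__device__ void stage_vector_chunk(
  VecT* shm, const VecT* vec, int blockOffset, int rowLen, VecT (&out)[VecElems])
{
  int j = blockOffset + threadIdx.x;
  for (int k = threadIdx.x; k < VecElems * BlockSize; k += BlockSize, j += BlockSize) {
    while (j >= rowLen) j -= rowLen;  // wrap at the row boundary
    shm[k] = vec[j];
  }
  __syncthreads();
  for (int i = 0; i < VecElems; i++) {
    out[i] = shm[threadIdx.x * VecElems + i];  // contiguous per thread after staging
  }
}

Returning a plain `VecArg` array instead of the old `raft::TxN_t` is what removes the `sizeof(Type) == sizeof(VecT)` restriction: the staged vector values no longer have to be reinterpreted through the matrix type's vectorized-load machinery.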
if constexpr (VecBytes > sizeof(Type)) { if (!raft::Pow2::areSameAlignOffsets(in, out)) return MatrixLinewiseOp> 1), sizeof(Type)), BlockSize>::run( diff --git a/cpp/test/linalg/matrix_vector_op.cu b/cpp/test/linalg/matrix_vector_op.cu index 4bf29c016f..ccce54390b 100644 --- a/cpp/test/linalg/matrix_vector_op.cu +++ b/cpp/test/linalg/matrix_vector_op.cu @@ -226,32 +226,24 @@ MVTEST(MatVecOpTestD_i64_add2vec, inputs_i64, MV_EPS_D); */ template -struct DivAndAdd { +struct MulAndAdd { static constexpr bool useTwoVectors = true; - HDI OutT operator()(MatT a, Vec1T b, Vec2T c) const { return a / b + c; }; + HDI OutT operator()(MatT a, Vec1T b, Vec2T c) const { return a * b + c; }; }; -typedef MatVecOpTest, float, int, float, int32_t, float> - MatVecOpTestF_i32_DivAndAdd_f_i32_f; -// typedef MatVecOpTest, float, int, double, int32_t, -// float> -// MatVecOpTestF_i32_DivAndAdd_d_i32_f; -// typedef MatVecOpTest, double, int, float, int32_t, -// float> -// MatVecOpTestD_i32_DivAndAdd_f_i32_f; -typedef MatVecOpTest, float, int, float, int32_t, double> - MatVecOpTestF_i32_DivAndAdd_f_i32_d; -typedef MatVecOpTest, float, int, float, int64_t, float> - MatVecOpTestF_i32_DivAndAdd_f_i64_f; -typedef MatVecOpTest, double, int, double, int32_t, float> - MatVecOpTestD_i32_DivAndAdd_d_i32_f; - -MVTEST(MatVecOpTestF_i32_DivAndAdd_f_i32_f, inputs_i32, MV_EPS_F); -// MVTEST(MatVecOpTestF_i32_DivAndAdd_d_i32_f, inputs_i32, MV_EPS_F); -// MVTEST(MatVecOpTestD_i32_DivAndAdd_f_i32_f, inputs_i32, (double)MV_EPS_F); -MVTEST(MatVecOpTestF_i32_DivAndAdd_f_i32_d, inputs_i32, MV_EPS_F); -MVTEST(MatVecOpTestF_i32_DivAndAdd_f_i64_f, inputs_i32, MV_EPS_F); -MVTEST(MatVecOpTestD_i32_DivAndAdd_d_i32_f, inputs_i32, (double)MV_EPS_F); +typedef MatVecOpTest, float, int, float, int32_t, float> + MatVecOpTestF_i32_MulAndAdd_f_i32_f; +typedef MatVecOpTest, float, int, float, int32_t, double> + MatVecOpTestF_i32_MulAndAdd_f_i32_d; +typedef MatVecOpTest, float, int, float, int64_t, float> + MatVecOpTestF_i32_MulAndAdd_f_i64_f; +typedef MatVecOpTest, double, int, double, int32_t, float> + MatVecOpTestD_i32_MulAndAdd_d_i32_f; + +MVTEST(MatVecOpTestF_i32_MulAndAdd_f_i32_f, inputs_i32, MV_EPS_F); +MVTEST(MatVecOpTestF_i32_MulAndAdd_f_i32_d, inputs_i32, MV_EPS_F); +MVTEST(MatVecOpTestF_i32_MulAndAdd_f_i64_f, inputs_i32, MV_EPS_F); +MVTEST(MatVecOpTestD_i32_MulAndAdd_d_i32_f, inputs_i32, (double)MV_EPS_F); } // end namespace linalg } // end namespace raft From a850d8525725df086384b9ab4d4a056236c5fd0d Mon Sep 17 00:00:00 2001 From: Louis Sugy Date: Thu, 20 Oct 2022 15:08:16 +0200 Subject: [PATCH 13/25] Fix linewiseOp VecCols kernels --- .../raft/matrix/detail/linewise_op.cuh | 27 +++++++++++++------ 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/cpp/include/raft/matrix/detail/linewise_op.cuh b/cpp/include/raft/matrix/detail/linewise_op.cuh index 5a7e106951..9b40108289 100644 --- a/cpp/include/raft/matrix/detail/linewise_op.cuh +++ b/cpp/include/raft/matrix/detail/linewise_op.cuh @@ -21,16 +21,28 @@ #include #include +#include namespace raft { namespace matrix { namespace detail { +/** This type simplifies returning arrays and passing them as arguments */ template struct VecArg { Type val[VecElems]; }; +/** Executes the operation with the given matrix element and an arbitrary number of vector elements + * contained in the given tuple. The index_sequence is used here for compile-time indexing of the + * tuple in the fold expression. 
*/ +template +__device__ __forceinline__ MatT +RunMatVecOp(Lambda op, MatT mat, Tuple&& args, std::index_sequence) +{ + return op(mat, (thrust::get(args))...); +} + template struct Linewise { static constexpr IdxType VecElems = VecBytes / sizeof(Type); @@ -84,8 +96,10 @@ struct Linewise { const Vecs*... vecs) noexcept { constexpr IdxType warpPad = (AlignWarp::Value - 1) * VecElems; - // todo(lsugy): don't use Type! - Type args[sizeof...(Vecs)]; + constexpr auto index = std::index_sequence_for(); + // todo(lsugy): switch to cuda::std::tuple from libcudacxx if we add it as a required + // dependency. Note that thrust::tuple is limited to 10 elements. + thrust::tuple args; Vec v, w; bool update = true; for (; in < in_end; in += AlignWarp::Value, out += AlignWarp::Value, rowMod += warpPad) { @@ -96,8 +110,7 @@ struct Linewise { update = true; } if (update) { - int l = 0; - ((args[l] = vecs[rowDiv], l++), ...); + args = thrust::make_tuple((vecs[rowDiv])...); update = false; } #pragma unroll VecElems @@ -105,11 +118,9 @@ struct Linewise { if (rowMod == rowLen) { rowMod = 0; rowDiv++; - int l = 0; - ((args[l] = vecs[rowDiv], l++), ...); + args = thrust::make_tuple((vecs[rowDiv])...); } - int l = 0; - w.val.data[k] = op(v.val.data[k], (std::ignore = vecs, args[l++])...); + w.val.data[k] = RunMatVecOp(op, v.val.data[k], args, index); } *out = *w.vectorized_data(); } From e44bcc4348b92d9780bb0abf2499ecd1f787262f Mon Sep 17 00:00:00 2001 From: Louis Sugy Date: Thu, 20 Oct 2022 15:28:53 +0200 Subject: [PATCH 14/25] Merge OutT=MatT because linewiseOp only supports one input/output matrix type + cleanup old matrixVectorOp implementation --- .../raft/linalg/detail/matrix_vector_op.cuh | 81 +------------------ cpp/include/raft/linalg/matrix_vector_op.cuh | 33 +++----- cpp/test/linalg/matrix_vector_op.cu | 50 +++++------- 3 files changed, 35 insertions(+), 129 deletions(-) diff --git a/cpp/include/raft/linalg/detail/matrix_vector_op.cuh b/cpp/include/raft/linalg/detail/matrix_vector_op.cuh index d00014e56f..807bacc470 100644 --- a/cpp/include/raft/linalg/detail/matrix_vector_op.cuh +++ b/cpp/include/raft/linalg/detail/matrix_vector_op.cuh @@ -22,84 +22,12 @@ namespace raft { namespace linalg { namespace detail { -namespace { -template -struct AlignedAccess { - template - static inline bool test(const T* matrix, size_t strideBytes) - { - return Pow2::isAligned(matrix) && Pow2::isAligned(strideBytes) && - Pow2::isAligned(VecBytes); - } -}; -}; // namespace - -template -__global__ void matrixVectorOpKernel(Type* out, - const Type* matrix, - const Type* vector, - IdxType D, - IdxType N, - bool rowMajor, - bool bcastAlongRows, - Lambda op) -{ - typedef TxN_t VecType; - IdxType len = N * D; - IdxType idx = threadIdx.x; - idx += (IdxType)blockIdx.x * (IdxType)blockDim.x; - idx *= VecType::Ratio; - if (idx >= len) return; - IdxType vIdx; - VecType mat, vec; - ///@todo: yikes! use fast-int-div here. - ///@todo: shared mem for vector could help with perf - if (rowMajor && bcastAlongRows) { - vIdx = idx % D; - vec.load(vector, vIdx); - } else if (!rowMajor && !bcastAlongRows) { - vIdx = idx % N; - vec.load(vector, vIdx); - } else if (rowMajor && !bcastAlongRows) { - vIdx = idx / D; - vec.fill(vector[vIdx]); - } else { - vIdx = idx / N; - vec.fill(vector[vIdx]); - } - mat.load(matrix, idx); -#pragma unroll - for (int i = 0; i < VecType::Ratio; ++i) - mat.val.data[i] = op(mat.val.data[i], vec.val.data[i]); - mat.store(out, idx); -} - -// todo(lsugy): remove this function? It isn't used. 
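The `RunMatVecOp` helper introduced above is the standard index_sequence trick for calling an n-ary operator with elements pulled out of a tuple. A host-side sketch of the same mechanism, with `std::tuple` standing in for `thrust::tuple` so it runs off-device:

#include <cstdio>
#include <tuple>
#include <utility>

// The pack expansion over Is... unpacks one cached element per vector from the
// tuple into the operator call, all resolved at compile time.
template <typename Lambda, typename MatT, typename Tuple, std::size_t... Is>
MatT run_mat_vec_op(Lambda op, MatT mat, Tuple&& args, std::index_sequence<Is...>)
{
  return op(mat, std::get<Is>(args)...);
}

int main()
{
  // One matrix element plus cached elements of two vectors of distinct types.
  auto op   = [](float m, float v1, int v2) { return m * v1 + v2; };
  auto args = std::make_tuple(2.0f, 3);
  float r   = run_mat_vec_op(op, 10.0f, args, std::index_sequence_for<float, int>{});
  std::printf("%g\n", r);  // 10 * 2 + 3 = 23
  return 0;
}

The commit's own comment explains the tuple choice: `thrust::tuple` is used pending `cuda::std::tuple` from libcudacxx, at the cost of a 10-element cap.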
-template -void matrixVectorOpImpl(Type* out, - const Type* matrix, - const Type* vec, - IdxType D, - IdxType N, - bool rowMajor, - bool bcastAlongRows, - Lambda op, - cudaStream_t stream) -{ - IdxType len = N * D; - IdxType nblks = raft::ceildiv(veclen_ ? len / veclen_ : veclen_, (IdxType)TPB); - matrixVectorOpKernel - <<>>(out, matrix, vec, D, N, rowMajor, bcastAlongRows, op); - RAFT_CUDA_TRY(cudaPeekAtLastError()); -} - -template -void matrixVectorOp(OutT* out, +void matrixVectorOp(MatT* out, const MatT* matrix, const VecT* vec, IdxType D, @@ -115,14 +43,13 @@ void matrixVectorOp(OutT* out, out, matrix, stride, nLines, rowMajor == bcastAlongRows, op, stream, vec); } -template -void matrixVectorOp(OutT* out, +void matrixVectorOp(MatT* out, const MatT* matrix, const Vec1T* vec1, const Vec2T* vec2, diff --git a/cpp/include/raft/linalg/matrix_vector_op.cuh b/cpp/include/raft/linalg/matrix_vector_op.cuh index 464591f593..d673769dc3 100644 --- a/cpp/include/raft/linalg/matrix_vector_op.cuh +++ b/cpp/include/raft/linalg/matrix_vector_op.cuh @@ -35,9 +35,8 @@ namespace linalg { * Note : the function will also check that the size of the window of accesses * is a multiple of the number of elements processed by a thread in order to * enable faster processing - * @tparam OutT the output type + * @tparam MatT the matrix type * @tparam Lambda a device function which represents a binary operator - * @tparam MatT the input matrix type * @tparam VecT the input vector type * @tparam IdxType Integer type used to for addressing * @tparam TPB threads per block of the cuda kernel launched @@ -52,13 +51,8 @@ namespace linalg { * @param op the mathematical operation * @param stream cuda stream where to launch work */ -template -void matrixVectorOp(OutT* out, +template +void matrixVectorOp(MatT* out, const MatT* matrix, const VecT* vec, IdxType D, @@ -68,7 +62,6 @@ void matrixVectorOp(OutT* out, Lambda op, cudaStream_t stream) { - // todo(lsugy): at the moment only OutT == MatT supported! detail::matrixVectorOp(out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream); } @@ -80,12 +73,10 @@ void matrixVectorOp(OutT* out, * Note : the function will also check that the size of the window of accesses * is a multiple of the number of elements processed by a thread in order to * enable faster processing - * @tparam OutT the output type + * @tparam MatT the matrix type * @tparam Lambda a device function which represents a binary operator - * @tparam MatT the input matrix type * @tparam Vec1T the first input vector type * @tparam Vec2T the second input vector type - * @tparam Lambda a device function which represents a binary operator * @tparam IdxType Integer type used to for addressing * @tparam TPB threads per block of the cuda kernel launched * @param out the output matrix (passing out = matrix makes it in-place) @@ -100,14 +91,13 @@ void matrixVectorOp(OutT* out, * @param op the mathematical operation * @param stream cuda stream where to launch work */ -template -void matrixVectorOp(OutT* out, +void matrixVectorOp(MatT* out, const MatT* matrix, const Vec1T* vec1, const Vec2T* vec2, @@ -118,7 +108,6 @@ void matrixVectorOp(OutT* out, Lambda op, cudaStream_t stream) { - // todo(lsugy): at the moment only OutT == MatT supported! 
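With the old implementation gone, the consolidated prim's contract is: the output shares the matrix type (`MatT`), while the vector type(s) may differ. A hedged usage sketch assuming the RAFT headers are available and nvcc extended lambdas are enabled (the per-row division mirrors the k-means centers/cluster-sizes call a few commits below; `normalize_rows` is an illustrative name, not part of the patch):

#include <raft/linalg/matrix_vector_op.cuh>

// Divide each row of a row-major [n_rows, n_cols] float matrix by a per-row
// int32_t count; MatT (float) doubles as the output type, VecT is int32_t.
void normalize_rows(
  float* out, const float* mat, const int32_t* counts, int n_rows, int n_cols, cudaStream_t stream)
{
  raft::linalg::matrixVectorOp(
    out,
    mat,
    counts,
    n_cols,  // D
    n_rows,  // N
    true,    // rowMajor
    false,   // bcastAlongRows = false: one vector element per row
    [] __device__(float m, int32_t c) { return c > 0 ? m / c : m; },
    stream);
}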
detail::matrixVectorOp(out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream); } @@ -139,7 +128,6 @@ void matrixVectorOp(OutT* out, * @tparam VecValueType the data-type of the input vector * @tparam LayoutPolicy the layout of input and output (raft::row_major or raft::col_major) * @tparam Lambda a device function which represents a binary operator - * @tparam OutElementType the data-type of the output raft::matrix_view * @tparam IndexType Integer used for addressing * @tparam TPB threads per block of the cuda kernel launched * @param[in] handle raft::handle_t @@ -154,13 +142,12 @@ template void matrix_vector_op(const raft::handle_t& handle, raft::device_matrix_view matrix, raft::device_vector_view vec, - raft::device_matrix_view out, + raft::device_matrix_view out, Apply apply, Lambda op) { @@ -198,12 +185,11 @@ void matrix_vector_op(const raft::handle_t& handle, * Note : the function will also check that the size of the window of accesses * is a multiple of the number of elements processed by a thread in order to * enable faster processing - * @tparam MatValueType the data-type of the input matrix + * @tparam MatValueType the data-type of the input and output matrices * @tparam Vec1ValueType the data-type of the first input vector * @tparam Vec2ValueType the data-type of the second input vector * @tparam LayoutPolicy the layout of input and output (raft::row_major or raft::col_major) * @tparam Lambda a device function which represents a binary operator - * @tparam OutElementType the data-type of the output raft::matrix_view * @tparam IndexType Integer used for addressing * @tparam TPB threads per block of the cuda kernel launched * @param handle raft::handle_t @@ -220,14 +206,13 @@ template void matrix_vector_op(const raft::handle_t& handle, raft::device_matrix_view matrix, raft::device_vector_view vec1, raft::device_vector_view vec2, - raft::device_matrix_view out, + raft::device_matrix_view out, Apply apply, Lambda op) { diff --git a/cpp/test/linalg/matrix_vector_op.cu b/cpp/test/linalg/matrix_vector_op.cu index ccce54390b..0b2c54de88 100644 --- a/cpp/test/linalg/matrix_vector_op.cu +++ b/cpp/test/linalg/matrix_vector_op.cu @@ -49,14 +49,9 @@ inline void gen_uniform(const raft::handle_t& handle, raft::random::RngState& rn // Or else, we get the following compilation error // for an extended __device__ lambda cannot have private or protected access // within its class -template +template void matrixVectorOpLaunch(const raft::handle_t& handle, - OutT* out, + MatT* out, const MatT* in, const Vec1T* vec1, const Vec2T* vec2, @@ -65,10 +60,10 @@ void matrixVectorOpLaunch(const raft::handle_t& handle, bool rowMajor, bool bcastAlongRows) { - auto out_row_major = raft::make_device_matrix_view(out, N, D); + auto out_row_major = raft::make_device_matrix_view(out, N, D); auto in_row_major = raft::make_device_matrix_view(in, N, D); - auto out_col_major = raft::make_device_matrix_view(out, N, D); + auto out_col_major = raft::make_device_matrix_view(out, N, D); auto in_col_major = raft::make_device_matrix_view(in, N, D); auto apply = bcastAlongRows ? 
Apply::ALONG_ROWS : Apply::ALONG_COLUMNS; @@ -92,9 +87,8 @@ void matrixVectorOpLaunch(const raft::handle_t& handle, } template class MatVecOpTest : public ::testing::TestWithParam> { @@ -160,8 +154,8 @@ class MatVecOpTest : public ::testing::TestWithParam> { MatVecOpInputs params; rmm::device_uvector in; - rmm::device_uvector out; - rmm::device_uvector out_ref; + rmm::device_uvector out; + rmm::device_uvector out_ref; rmm::device_uvector vec1; rmm::device_uvector vec2; }; @@ -225,25 +219,25 @@ MVTEST(MatVecOpTestD_i64_add2vec, inputs_i64, MV_EPS_D); * This set of tests covers cases with different types. */ -template +template struct MulAndAdd { static constexpr bool useTwoVectors = true; - HDI OutT operator()(MatT a, Vec1T b, Vec2T c) const { return a * b + c; }; + HDI MatT operator()(MatT a, Vec1T b, Vec2T c) const { return a * b + c; }; }; -typedef MatVecOpTest, float, int, float, int32_t, float> - MatVecOpTestF_i32_MulAndAdd_f_i32_f; -typedef MatVecOpTest, float, int, float, int32_t, double> - MatVecOpTestF_i32_MulAndAdd_f_i32_d; -typedef MatVecOpTest, float, int, float, int64_t, float> - MatVecOpTestF_i32_MulAndAdd_f_i64_f; -typedef MatVecOpTest, double, int, double, int32_t, float> - MatVecOpTestD_i32_MulAndAdd_d_i32_f; - -MVTEST(MatVecOpTestF_i32_MulAndAdd_f_i32_f, inputs_i32, MV_EPS_F); -MVTEST(MatVecOpTestF_i32_MulAndAdd_f_i32_d, inputs_i32, MV_EPS_F); -MVTEST(MatVecOpTestF_i32_MulAndAdd_f_i64_f, inputs_i32, MV_EPS_F); -MVTEST(MatVecOpTestD_i32_MulAndAdd_d_i32_f, inputs_i32, (double)MV_EPS_F); +typedef MatVecOpTest, float, int, int32_t, float> + MatVecOpTestF_i32_MulAndAdd_i32_f; +typedef MatVecOpTest, float, int, int32_t, double> + MatVecOpTestF_i32_MulAndAdd_i32_d; +typedef MatVecOpTest, float, int, int64_t, float> + MatVecOpTestF_i32_MulAndAdd_i64_f; +typedef MatVecOpTest, double, int, int32_t, float> + MatVecOpTestD_i32_MulAndAdd_i32_f; + +MVTEST(MatVecOpTestF_i32_MulAndAdd_i32_f, inputs_i32, MV_EPS_F); +MVTEST(MatVecOpTestF_i32_MulAndAdd_i32_d, inputs_i32, MV_EPS_F); +MVTEST(MatVecOpTestF_i32_MulAndAdd_i64_f, inputs_i32, MV_EPS_F); +MVTEST(MatVecOpTestD_i32_MulAndAdd_i32_f, inputs_i32, (double)MV_EPS_F); } // end namespace linalg } // end namespace raft From dcfd96277632457763e00e8fdb3abca282d34023 Mon Sep 17 00:00:00 2001 From: Louis Sugy Date: Thu, 20 Oct 2022 15:56:25 +0200 Subject: [PATCH 15/25] Replace for_each with linalg::add + fix syntax error --- .../knn/detail/ann_kmeans_balanced.cuh | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh index df822ef009..fd009b30af 100644 --- a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh +++ b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh @@ -292,24 +292,17 @@ void calc_centers_and_sizes(const handle_t& handle, static_cast(n_clusters), workspace); - // Add previous sizes if necessary and cast to float - // todo(lsugy): add wrapped in if - auto counting = thrust::make_counting_iterator(0); - thrust::for_each( - handle.get_thrust_policy(), counting, counting + n_clusters, [=] __device__(int idx) { - uint32_t temp_size = temp_sizes[idx]; - if (!reset_counters) { - temp_size += cluster_sizes[idx]; - cluster_sizes[idx] = temp_size; - } - }); + // Add previous sizes if necessary + if (!reset_counters) { + raft::linalg::add(cluster_sizes, cluster_sizes, temp_sizes, n_clusters, stream); + } raft::linalg::matrixVectorOp( centers, centers, cluster_sizes, - 
static_castdim, - static_castn_clusters, + static_cast(dim), + static_cast(n_clusters), true, false, [=] __device__(float mat, uint32_t vec) { From 4718e5bc73b0235c34581ed7f6f9c320d7e7b347 Mon Sep 17 00:00:00 2001 From: Louis Sugy Date: Thu, 20 Oct 2022 16:49:10 +0200 Subject: [PATCH 16/25] Clang-format fix --- cpp/include/raft/linalg/detail/matrix_vector_op.cuh | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/cpp/include/raft/linalg/detail/matrix_vector_op.cuh b/cpp/include/raft/linalg/detail/matrix_vector_op.cuh index 807bacc470..62ec9bb7a4 100644 --- a/cpp/include/raft/linalg/detail/matrix_vector_op.cuh +++ b/cpp/include/raft/linalg/detail/matrix_vector_op.cuh @@ -22,11 +22,7 @@ namespace raft { namespace linalg { namespace detail { -template +template void matrixVectorOp(MatT* out, const MatT* matrix, const VecT* vec, From fc55921b4a6ba44367c1e5185636dec5530e1a55 Mon Sep 17 00:00:00 2001 From: Louis Sugy Date: Fri, 21 Oct 2022 14:58:43 +0200 Subject: [PATCH 17/25] used alignedLen instead of totalLen in max block number calculation --- cpp/include/raft/matrix/detail/linewise_op.cuh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/include/raft/matrix/detail/linewise_op.cuh b/cpp/include/raft/matrix/detail/linewise_op.cuh index 9b40108289..67177d7d05 100644 --- a/cpp/include/raft/matrix/detail/linewise_op.cuh +++ b/cpp/include/raft/matrix/detail/linewise_op.cuh @@ -517,10 +517,9 @@ void matrixLinewiseVecRows(Type* out, const uint expected_grid_size = rowLen / raft::gcd(block_work_size, uint(rowLen)); // Minimum size of the grid to make the device well occupied const uint occupy = getOptimalGridSize(); - // todo(lsugy): alignedLen instead of totalLen? const dim3 gs(std::min( // does not make sense to have more blocks than this - raft::ceildiv(uint(totalLen), block_work_size), + raft::ceildiv(uint(alignedLen), block_work_size), // increase the grid size to be not less than `occupy` while // still being the multiple of `expected_grid_size` raft::ceildiv(occupy, expected_grid_size) * expected_grid_size), From 5421dda583e4f5246467dcba43746e3db3e58df7 Mon Sep 17 00:00:00 2001 From: Louis Sugy Date: Wed, 26 Oct 2022 12:28:39 +0200 Subject: [PATCH 18/25] Add misalignments to matrix-vector-op test --- cpp/include/raft/util/itertools.hpp | 54 +++++++++++++++++ cpp/test/linalg/matrix_vector_op.cu | 93 +++++++++++++++-------------- 2 files changed, 102 insertions(+), 45 deletions(-) create mode 100644 cpp/include/raft/util/itertools.hpp diff --git a/cpp/include/raft/util/itertools.hpp b/cpp/include/raft/util/itertools.hpp new file mode 100644 index 0000000000..025778659b --- /dev/null +++ b/cpp/include/raft/util/itertools.hpp @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include + +/** + * Helpers inspired by the Python itertools library + * + */ + +namespace raft { +namespace itertools { + +template +inline std::vector product(std::index_sequence index, const std::vector&... vecs) +{ + size_t len = 1; + ((len *= vecs.size()), ...); + std::vector out; + out.reserve(len); + for (size_t i = 0; i < len; i++) { + std::tuple tup; + size_t mod = len, new_mod; + ((new_mod = mod / vecs.size(), std::get(tup) = vecs[(i % mod) / new_mod], mod = new_mod), + ...); + out.push_back({std::get(tup)...}); + } + return out; +} + +template +std::vector product(std::initializer_list... lists) +{ + return product(std::index_sequence_for(), (std::vector(lists))...); +} + +}; // namespace itertools +}; // namespace raft diff --git a/cpp/test/linalg/matrix_vector_op.cu b/cpp/test/linalg/matrix_vector_op.cu index 0b2c54de88..5a6320435f 100644 --- a/cpp/test/linalg/matrix_vector_op.cu +++ b/cpp/test/linalg/matrix_vector_op.cu @@ -19,6 +19,7 @@ #include #include #include +#include namespace raft { namespace linalg { @@ -27,6 +28,7 @@ template struct MatVecOpInputs { IdxType rows, cols; bool rowMajor, bcastAlongRows; + IdxType inAlignOffset, outAlignOffset; unsigned long long int seed; }; @@ -94,55 +96,58 @@ template > { public: MatVecOpTest() - : params(::testing::TestWithParam>::GetParam()), - stream(handle.get_stream()), - in(params.rows * params.cols, stream), - out_ref(params.rows * params.cols, stream), - out(params.rows * params.cols, stream), - vec1(params.bcastAlongRows ? params.cols : params.rows, stream), - vec2(params.bcastAlongRows ? params.cols : params.rows, stream) + : stream(handle.get_stream()), + params(::testing::TestWithParam>::GetParam()), + vec_size(params.bcastAlongRows ? params.cols : params.rows), + in(params.rows * params.cols + params.inAlignOffset, stream), + out_ref(params.rows * params.cols + params.outAlignOffset, stream), + out(params.rows * params.cols + params.outAlignOffset, stream), + vec1(vec_size, stream), + vec2(vec_size, stream) { } protected: void SetUp() override { + MatT* in_ptr = in.data() + params.inAlignOffset; + MatT* out_ptr = out.data() + params.outAlignOffset; + MatT* out_ref_ptr = out_ref.data() + params.outAlignOffset; + raft::random::RngState r(params.seed); - IdxType N = params.rows, D = params.cols; - IdxType len = N * D; - IdxType vecLen = params.bcastAlongRows ? 
D : N; - gen_uniform(handle, r, in.data(), len); - gen_uniform(handle, r, vec1.data(), vecLen); - gen_uniform(handle, r, vec2.data(), vecLen); + IdxType len = params.rows * params.cols; + gen_uniform(handle, r, in_ptr, len); + gen_uniform(handle, r, vec1.data(), vec_size); + gen_uniform(handle, r, vec2.data(), vec_size); if constexpr (OpT::useTwoVectors) { - naiveMatVec(out_ref.data(), - in.data(), + naiveMatVec(out_ref_ptr, + in_ptr, vec1.data(), vec2.data(), - D, - N, + params.cols, + params.rows, params.rowMajor, params.bcastAlongRows, OpT{}, stream); } else { - naiveMatVec(out_ref.data(), - in.data(), + naiveMatVec(out_ref_ptr, + in_ptr, vec1.data(), - D, - N, + params.cols, + params.rows, params.rowMajor, params.bcastAlongRows, OpT{}, stream); } matrixVectorOpLaunch(handle, - out.data(), - in.data(), + out_ptr, + in_ptr, vec1.data(), vec2.data(), - D, - N, + params.cols, + params.rows, params.rowMajor, params.bcastAlongRows); handle.sync_stream(); @@ -153,6 +158,7 @@ class MatVecOpTest : public ::testing::TestWithParam> { cudaStream_t stream; MatVecOpInputs params; + IdxType vec_size; rmm::device_uvector in; rmm::device_uvector out; rmm::device_uvector out_ref; @@ -160,12 +166,14 @@ class MatVecOpTest : public ::testing::TestWithParam> { rmm::device_uvector vec2; }; -#define MVTEST(TestClass, inputs, tolerance) \ - TEST_P(TestClass, Result) \ - { \ - ASSERT_TRUE(devArrMatch( \ - out_ref.data(), out.data(), params.rows* params.cols, CompareApprox(tolerance))); \ - } \ +#define MVTEST(TestClass, inputs, tolerance) \ + TEST_P(TestClass, Result) \ + { \ + ASSERT_TRUE(devArrMatch(out_ref.data() + params.outAlignOffset, \ + out.data() + params.outAlignOffset, \ + params.rows * params.cols, \ + CompareApprox(tolerance))); \ + } \ INSTANTIATE_TEST_SUITE_P(MatVecOpTests, TestClass, ::testing::ValuesIn(inputs)) #define MV_EPS_F 0.00001f @@ -175,16 +183,11 @@ class MatVecOpTest : public ::testing::TestWithParam> { * This set of tests covers cases where all the types are the same. 
 */
 
-const std::vector<MatVecOpInputs<int>> inputs_i32 = {{1024, 32, true, true, 1234ULL},
-                                                     {1024, 64, true, true, 1234ULL},
-                                                     {1024, 32, true, false, 1234ULL},
-                                                     {1024, 64, true, false, 1234ULL},
-                                                     {1024, 32, false, true, 1234ULL},
-                                                     {1024, 64, false, true, 1234ULL},
-                                                     {1024, 32, false, false, 1234ULL},
-                                                     {1024, 64, false, false, 1234ULL}};
-const std::vector<MatVecOpInputs<size_t>> inputs_i64 = {{2500, 250, false, false, 1234ULL},
-                                                        {2500, 250, false, false, 1234ULL}};
+const std::vector<MatVecOpInputs<int>> inputs_i32 = raft::itertools::product<MatVecOpInputs<int>>(
+  {1024}, {32, 64}, {true, false}, {true, false}, {0, 1, 2}, {0, 1, 2}, {1234ULL});
+const std::vector<MatVecOpInputs<int64_t>> inputs_i64 =
+  raft::itertools::product<MatVecOpInputs<int64_t>>(
+    {2500}, {250}, {false}, {false}, {0, 1}, {0, 1}, {1234ULL});
 
 template <typename T>
 struct Add1Vec {
@@ -199,12 +202,12 @@ struct Add2Vec {
 typedef MatVecOpTest<Add1Vec<float>, float, int> MatVecOpTestF_i32_add1vec;
 typedef MatVecOpTest<Add2Vec<float>, float, int> MatVecOpTestF_i32_add2vec;
-typedef MatVecOpTest<Add1Vec<float>, float, size_t> MatVecOpTestF_i64_add1vec;
-typedef MatVecOpTest<Add2Vec<float>, float, size_t> MatVecOpTestF_i64_add2vec;
+typedef MatVecOpTest<Add1Vec<float>, float, int64_t> MatVecOpTestF_i64_add1vec;
+typedef MatVecOpTest<Add2Vec<float>, float, int64_t> MatVecOpTestF_i64_add2vec;
 typedef MatVecOpTest<Add1Vec<double>, double, int> MatVecOpTestD_i32_add1vec;
 typedef MatVecOpTest<Add2Vec<double>, double, int> MatVecOpTestD_i32_add2vec;
-typedef MatVecOpTest<Add1Vec<double>, double, size_t> MatVecOpTestD_i64_add1vec;
-typedef MatVecOpTest<Add2Vec<double>, double, size_t> MatVecOpTestD_i64_add2vec;
+typedef MatVecOpTest<Add1Vec<double>, double, int64_t> MatVecOpTestD_i64_add1vec;
+typedef MatVecOpTest<Add2Vec<double>, double, int64_t> MatVecOpTestD_i64_add2vec;
 
 MVTEST(MatVecOpTestF_i32_add1vec, inputs_i32, MV_EPS_F);
 MVTEST(MatVecOpTestF_i32_add2vec, inputs_i32, MV_EPS_F);

From ccdc961d6a671f8f4a4384efb6b94db8b42656ec Mon Sep 17 00:00:00 2001
From: Louis Sugy
Date: Wed, 26 Oct 2022 17:06:18 +0200
Subject: [PATCH 19/25] Extend matrix-vector op benchmark

---
 cpp/bench/linalg/matrix_vector_op.cu | 140 +++++++++++++++++++++------
 1 file changed, 108 insertions(+), 32 deletions(-)

diff --git a/cpp/bench/linalg/matrix_vector_op.cu b/cpp/bench/linalg/matrix_vector_op.cu
index aa8f2667ed..aa388955da 100644
--- a/cpp/bench/linalg/matrix_vector_op.cu
+++ b/cpp/bench/linalg/matrix_vector_op.cu
@@ -20,61 +20,137 @@
 
 namespace raft::bench::linalg {
 
+template <typename IdxT>
 struct mat_vec_op_inputs {
-  int rows, cols;
+  IdxT rows, cols;
   bool rowMajor, bcastAlongRows;
+  IdxT inAlignOffset, outAlignOffset;
 };  // struct mat_vec_op_inputs
 
+template <typename IdxT>
+inline auto operator<<(std::ostream& os, const mat_vec_op_inputs<IdxT>& p) -> std::ostream&
+{
+  os << p.rows << "#" << p.cols << "#" << p.rowMajor << "#" << p.bcastAlongRows << "#"
+     << p.inAlignOffset << "#" << p.outAlignOffset;
+  return os;
+}
+
-template <typename T>
+template <typename OpT, typename T, typename IdxT>
 struct mat_vec_op : public fixture {
-  mat_vec_op(const mat_vec_op_inputs& p)
+  mat_vec_op(const mat_vec_op_inputs<IdxT>& p)
     : params(p),
-      out(p.rows * p.cols, stream),
-      in(p.rows * p.cols, stream),
-      vec(p.bcastAlongRows ? p.cols : p.rows, stream)
+      out(p.rows * p.cols + params.outAlignOffset, stream),
+      in(p.rows * p.cols + params.inAlignOffset, stream),
+      vec1(p.bcastAlongRows ? p.cols : p.rows, stream),
+      vec2(p.bcastAlongRows ? p.cols : p.rows, stream)
   {
   }
 
   void run_benchmark(::benchmark::State& state) override
   {
+    std::ostringstream label_stream;
+    label_stream << params;
+    state.SetLabel(label_stream.str());
+
     loop_on_state(state, [this]() {
-      raft::linalg::matrixVectorOp(out.data(),
-                                   in.data(),
-                                   vec.data(),
-                                   params.cols,
-                                   params.rows,
-                                   params.rowMajor,
-                                   params.bcastAlongRows,
-                                   raft::Sum<T>(),
-                                   stream);
+      if constexpr (OpT::useTwoVectors) {
+        raft::linalg::matrixVectorOp(out.data() + params.outAlignOffset,
+                                     in.data() + params.inAlignOffset,
+                                     vec1.data(),
+                                     vec2.data(),
+                                     params.cols,
+                                     params.rows,
+                                     params.rowMajor,
+                                     params.bcastAlongRows,
+                                     OpT{},
+                                     stream);
+      } else {
+        raft::linalg::matrixVectorOp(out.data() + params.outAlignOffset,
+                                     in.data() + params.inAlignOffset,
+                                     vec1.data(),
+                                     params.cols,
+                                     params.rows,
+                                     params.rowMajor,
+                                     params.bcastAlongRows,
+                                     OpT{},
+                                     stream);
+      }
     });
   }
 
  private:
-  mat_vec_op_inputs params;
-  rmm::device_uvector<T> out, in, vec;
+  mat_vec_op_inputs<IdxT> params;
+  rmm::device_uvector<T> out, in, vec1, vec2;
 };  // struct MatVecOp
 
-const std::vector<mat_vec_op_inputs> mat_vec_op_input_vecs{
-  {1024, 128, true, true},       {1024 * 1024, 128, true, true},
-  {1024, 128 + 2, true, true},   {1024 * 1024, 128 + 2, true, true},
-  {1024, 128 + 1, true, true},   {1024 * 1024, 128 + 1, true, true},
+template <typename IdxT>
+std::vector<mat_vec_op_inputs<IdxT>> get_mv_inputs()
+{
+  std::vector<mat_vec_op_inputs<IdxT>> out;
 
-  {1024, 128, true, false},      {1024 * 1024, 128, true, false},
-  {1024, 128 + 2, true, false},  {1024 * 1024, 128 + 2, true, false},
-  {1024, 128 + 1, true, false},  {1024 * 1024, 128 + 1, true, false},
+  // Scalability benchmark with round dimensions
+  std::vector<IdxT> rows = {1000, 100000, 1000000};
+  std::vector<IdxT> cols = {8, 64, 256, 1024};
+  for (bool rowMajor : {true, false}) {
+    for (bool alongRows : {true, false}) {
+      for (IdxT rows_ : rows) {
+        for (IdxT cols_ : cols) {
+          out.push_back({rows_, cols_, rowMajor, alongRows, 0, 0});
+        }
+      }
+    }
+  }
 
-  {1024, 128, false, false},     {1024 * 1024, 128, false, false},
-  {1024, 128 + 2, false, false}, {1024 * 1024, 128 + 2, false, false},
-  {1024, 128 + 1, false, false}, {1024 * 1024, 128 + 1, false, false},
+  // Odd dimensions, misalignment
+  std::vector<std::tuple<IdxT, IdxT>> rowcols = {
+    {44739207, 7},
+    {44739207, 15},
+    {44739207, 16},
+    {44739207, 17},
+    {2611236, 256},
+    {2611236, 257},
+    {2611236, 263},
+  };
+  for (bool rowMajor : {true, false}) {
+    for (bool alongRows : {true, false}) {
+      for (auto rc : rowcols) {
+        for (IdxT inAlignOffset : {0, 1}) {
+          for (IdxT outAlignOffset : {0, 1}) {
+            out.push_back({std::get<0>(rc),
+                           std::get<1>(rc),
+                           rowMajor,
+                           alongRows,
+                           inAlignOffset,
+                           outAlignOffset});
+          }
+        }
+      }
+    }
+  }
  return out;
+}
 
-  {1024, 128, false, true},      {1024 * 1024, 128, false, true},
-  {1024, 128 + 2, false, true},  {1024 * 1024, 128 + 2, false, true},
-  {1024, 128 + 1, false, true},  {1024 * 1024, 128 + 1, false, true},
+const std::vector<mat_vec_op_inputs<int>> mv_input_i32 = get_mv_inputs<int>();
+const std::vector<mat_vec_op_inputs<int64_t>> mv_input_i64 = get_mv_inputs<int64_t>();
 
+template <typename T>
+struct Add1Vec {
+  static constexpr bool useTwoVectors = false;
+  HDI T operator()(T a, T b) const { return a + b; };
+};
+template <typename T>
+struct Add2Vec {
+  static constexpr bool useTwoVectors = true;
+  HDI T operator()(T a, T b, T c) const { return a + b + c; };
 };
 
-RAFT_BENCH_REGISTER(mat_vec_op<float>, "", mat_vec_op_input_vecs);
-RAFT_BENCH_REGISTER(mat_vec_op<double>, "", mat_vec_op_input_vecs);
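+// Register the one- and two-vector add ops for float/double, with both 32- and
+// 64-bit indexing, against the corresponding input grids.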
+RAFT_BENCH_REGISTER((mat_vec_op<Add1Vec<float>, float, int>), "", mv_input_i32);
+RAFT_BENCH_REGISTER((mat_vec_op<Add1Vec<double>, double, int>), "", mv_input_i32);
+RAFT_BENCH_REGISTER((mat_vec_op<Add2Vec<float>, float, int>), "", mv_input_i32);
+RAFT_BENCH_REGISTER((mat_vec_op<Add2Vec<double>, double, int>), "", mv_input_i32);
+RAFT_BENCH_REGISTER((mat_vec_op<Add1Vec<float>, float, int64_t>), "", mv_input_i64);
+RAFT_BENCH_REGISTER((mat_vec_op<Add1Vec<double>, double, int64_t>), "", mv_input_i64);
+RAFT_BENCH_REGISTER((mat_vec_op<Add2Vec<float>, float, int64_t>), "", mv_input_i64);
+RAFT_BENCH_REGISTER((mat_vec_op<Add2Vec<double>, double, int64_t>), "", mv_input_i64);
 
 }  // namespace raft::bench::linalg

From f2f67dbece5f2d797d84b5a9449293c3e9dbc4cf Mon Sep 17 00:00:00 2001
From: Louis Sugy
Date: Fri, 28 Oct 2022 19:09:08 +0200
Subject: [PATCH 20/25] Apply changes to new padded kernel (note: the test is
 still failing, but it also fails upstream)

---
 .../raft/matrix/detail/linewise_op.cuh | 44 +++++++++++--------
 1 file changed, 26 insertions(+), 18 deletions(-)

diff --git a/cpp/include/raft/matrix/detail/linewise_op.cuh b/cpp/include/raft/matrix/detail/linewise_op.cuh
index 482571e2ea..37198684ee 100644
--- a/cpp/include/raft/matrix/detail/linewise_op.cuh
+++ b/cpp/include/raft/matrix/detail/linewise_op.cuh
@@ -170,6 +170,7 @@ struct Linewise {
    * of a vector. Most of the time this is not aligned, so we load it thread-striped
    * within a block and then use the shared memory to get a contiguous chunk.
    *
+   * @tparam VecT Type of the vector to load
    * @param [in] shm a shared memory region for rearranging the data among threads
    * @param [in] p pointer to a vector
    * @param [in] blockOffset the offset of the current block into a vector.
@@ -202,6 +203,7 @@ struct Linewise {
   /**
    * @brief Same as loadVec, but pads the data with ones
    *
+   * @tparam VecT Type of the vector to load
    * @param shm a shared memory region for rearranging the data among threads
    * @param p pointer to a vector
    * @param blockOffset the offset of the current block into a vector
   * @param rowLen the length of the vector (without padding)
   * @param rowLenPadded the length of the vector including padding
    * @return a contiguous chunk of a vector, suitable for `vectorRows`.
    */
-  static __device__ __forceinline__ Vec loadVecPadded(Type* shm,
-                                                      const Type* p,
-                                                      const IdxType blockOffset,
-                                                      const IdxType rowLen,
-                                                      const IdxType rowLenPadded) noexcept
+  template <typename VecT>
+  static __device__ __forceinline__ VecArg<VecT> loadVecPadded(
+    VecT* shm,
+    const VecT* p,
+    const IdxType blockOffset,
+    const IdxType rowLen,
+    const IdxType rowLenPadded) noexcept
   {
     IdxType j = blockOffset + threadIdx.x;
 #pragma unroll VecElems
     for (int k = threadIdx.x; k < VecElems * BlockSize; k += BlockSize, j += BlockSize) {
       while (j >= rowLenPadded) j -= rowLenPadded;
-      shm[k] = j < rowLen ? p[j] : Type(1);
+      shm[k] = j < rowLen ? p[j] : VecT(1);
     }
     __syncthreads();
     {
-      Vec out;
-      *out.vectorized_data() = reinterpret_cast<typename Vec::io_t*>(shm)[threadIdx.x];
+      VecArg<VecT> out;
+#pragma unroll VecElems
+      for (int i = 0; i < VecElems; i++)
+        out.val[i] = shm[threadIdx.x * VecElems + i];
       return out;
     }
   }
@@ -414,21 +420,23 @@ __global__ void __launch_bounds__(BlockSize)
                        const IdxType rowLenPadded,
                        const IdxType lenPadded,
                        Lambda op,
-                       Vecs... vecs)
+                       const Vecs*... vecs)
 {
   typedef Linewise<Type, IdxType, VecBytes, BlockSize> L;
-  constexpr uint workSize = L::VecElems * BlockSize;
-  uint workOffset         = workSize;
-  __shared__ __align__(sizeof(Type) * L::VecElems)
-    Type shm[workSize * ((sizeof...(Vecs)) > 1 ? 2 : 1)];
+  constexpr uint workSize         = L::VecElems * BlockSize;
+  constexpr size_t maxVecItemSize = maxSizeOf<Vecs...>();
+  uint workOffset                 = workSize * maxVecItemSize;
+  __shared__ __align__(
+    maxVecItemSize *
+    L::VecElems) char shm[workSize * maxVecItemSize * ((sizeof...(Vecs)) > 1 ? 2 : 1)];
   const IdxType blockOffset = (BlockSize * L::VecElems * blockIdx.x) % rowLenPadded;
   return L::vectorRows(
     reinterpret_cast<typename L::Vec::io_t*>(out),
     reinterpret_cast<typename L::Vec::io_t*>(in),
     L::AlignElems::div(lenPadded),
     op,
-    (workOffset ^= workSize,
-     L::loadVecPadded(shm + workOffset, vecs, blockOffset, rowLen, rowLenPadded))...);
+    (workOffset ^= workSize * maxVecItemSize,
+     L::loadVecPadded((Vecs*)(shm + workOffset), vecs, blockOffset, rowLen, rowLenPadded))...);
 }
 
 /**
@@ -570,7 +578,7 @@ void matrixLinewiseVecColsSpan(
   const IdxType nRows,
   Lambda op,
   cudaStream_t stream,
-  Vecs... vecs)
+  const Vecs*... vecs)
 {
   typedef raft::Pow2<VecBytes> AlignBytes;
   constexpr std::size_t VecElems = VecBytes / sizeof(Type);
@@ -688,7 +696,7 @@ void matrixLinewiseVecRowsSpan(
   const IdxType nRows,
   Lambda op,
   cudaStream_t stream,
-  Vecs... vecs)
+  const Vecs*... vecs)
 {
   constexpr std::size_t VecElems = VecBytes / sizeof(Type);
   typedef raft::Pow2<VecBytes> AlignBytes;
@@ -779,7 +787,7 @@ struct MatrixLinewiseOp {
     const bool alongLines,
     Lambda op,
     cudaStream_t stream,
-    Vecs... vecs)
+    const Vecs*... vecs)
   {
     constexpr auto is_rowmajor = std::is_same_v<LayoutPolicy, raft::row_major>;
     constexpr auto is_colmajor = std::is_same_v<LayoutPolicy, raft::col_major>;

From 1450baedc5ace3df4a749bb2f1a8a0e28b26557d Mon Sep 17 00:00:00 2001
From: Louis Sugy
Date: Fri, 28 Oct 2022 19:14:51 +0200
Subject: [PATCH 21/25] Put itertools in util namespace

---
 cpp/include/raft/util/itertools.hpp | 6 ++----
 cpp/test/linalg/matrix_vector_op.cu | 7 ++++---
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/cpp/include/raft/util/itertools.hpp b/cpp/include/raft/util/itertools.hpp
index 025778659b..e6bb66e7ef 100644
--- a/cpp/include/raft/util/itertools.hpp
+++ b/cpp/include/raft/util/itertools.hpp
@@ -24,8 +24,7 @@
  *
  */
 
-namespace raft {
-namespace itertools {
+namespace raft::util::itertools {
 
 template <typename S, size_t... Is, typename... Args>
 inline std::vector<S> product(std::index_sequence<Is...> index, const std::vector<Args>&... vecs)
@@ -50,5 +49,4 @@ std::vector<S> product(std::initializer_list<Args>... lists)
   return product<S>(std::index_sequence_for<Args...>(), (std::vector<Args>(lists))...);
 }
 
-};  // namespace itertools
-};  // namespace raft
+}  // namespace raft::util::itertools

diff --git a/cpp/test/linalg/matrix_vector_op.cu b/cpp/test/linalg/matrix_vector_op.cu
index 3757e19274..c0355b0385 100644
--- a/cpp/test/linalg/matrix_vector_op.cu
+++ b/cpp/test/linalg/matrix_vector_op.cu
@@ -184,10 +184,11 @@ class MatVecOpTest : public ::testing::TestWithParam<MatVecOpInputs<IdxType>> {
  * This set of tests covers cases where all the types are the same.
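 * The input grids are built with the itertools product helper; e.g. the i32 grid
 * expands to 2 * 2 * 2 * 3 * 3 = 72 cases, where the two {0, 1, 2} lists are
 * input/output pointer offsets used to also cover misaligned accesses.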
 */
 
-const std::vector<MatVecOpInputs<int>> inputs_i32 = raft::itertools::product<MatVecOpInputs<int>>(
-  {1024}, {32, 64}, {true, false}, {true, false}, {0, 1, 2}, {0, 1, 2}, {1234ULL});
+const std::vector<MatVecOpInputs<int>> inputs_i32 =
+  raft::utils::itertools::product<MatVecOpInputs<int>>(
+    {1024}, {32, 64}, {true, false}, {true, false}, {0, 1, 2}, {0, 1, 2}, {1234ULL});
 const std::vector<MatVecOpInputs<int64_t>> inputs_i64 =
-  raft::itertools::product<MatVecOpInputs<int64_t>>(
+  raft::utils::itertools::product<MatVecOpInputs<int64_t>>(
     {2500}, {250}, {false}, {false}, {0, 1}, {0, 1}, {1234ULL});
 
 template <typename T>

From 7bcbbe2ebc7507b222825f1e9243a4903b36efee Mon Sep 17 00:00:00 2001
From: Louis Sugy
Date: Fri, 28 Oct 2022 19:16:32 +0200
Subject: [PATCH 22/25] Remove TPB from public API (it wasn't even forwarded
 to the actual implementation)

---
 cpp/include/raft/linalg/matrix_vector_op.cuh | 19 ++++---------------
 1 file changed, 4 insertions(+), 15 deletions(-)

diff --git a/cpp/include/raft/linalg/matrix_vector_op.cuh b/cpp/include/raft/linalg/matrix_vector_op.cuh
index d673769dc3..d68be838b0 100644
--- a/cpp/include/raft/linalg/matrix_vector_op.cuh
+++ b/cpp/include/raft/linalg/matrix_vector_op.cuh
@@ -39,7 +39,6 @@ namespace linalg {
  * @tparam Lambda a device function which represents a binary operator
  * @tparam VecT the input vector type
  * @tparam IdxType Integer type used for addressing
- * @tparam TPB threads per block of the cuda kernel launched
  * @param out the output matrix (passing out = matrix makes it in-place)
  * @param matrix the input matrix
  * @param vec the vector
@@ -51,7 +50,7 @@ namespace linalg {
  * @param op the mathematical operation
 * @param stream cuda stream where to launch work
  */
-template <typename MatT, typename Lambda, typename VecT, typename IdxType = int, int TPB = 256>
+template <typename MatT, typename Lambda, typename VecT, typename IdxType = int>
 void matrixVectorOp(MatT* out,
                     const MatT* matrix,
                     const VecT* vec,
@@ -78,7 +77,6 @@ void matrixVectorOp(MatT* out,
  * @tparam Vec1T the first input vector type
  * @tparam Vec2T the second input vector type
  * @tparam IdxType Integer type used for addressing
- * @tparam TPB threads per block of the cuda kernel launched
  * @param out the output matrix (passing out = matrix makes it in-place)
  * @param matrix the input matrix
  * @param vec1 the first vector
@@ -91,12 +89,7 @@ void matrixVectorOp(MatT* out,
  * @param op the mathematical operation
  * @param stream cuda stream where to launch work
  */
-template <typename MatT,
-          typename Lambda,
-          typename Vec1T,
-          typename Vec2T,
-          typename IdxType = int,
-          int TPB = 256>
+template <typename MatT, typename Lambda, typename Vec1T, typename Vec2T, typename IdxType = int>
 void matrixVectorOp(MatT* out,
                     const MatT* matrix,
                     const Vec1T* vec1,
@@ -129,7 +122,6 @@ void matrixVectorOp(MatT* out,
  * @tparam LayoutPolicy the layout of input and output (raft::row_major or raft::col_major)
  * @tparam Lambda a device function which represents a binary operator
  * @tparam IndexType Integer used for addressing
- * @tparam TPB threads per block of the cuda kernel launched
 * @param[in] handle raft::handle_t
 * @param[in] matrix input raft::matrix_view
 * @param[in] vec vector raft::vector_view
@@ -142,8 +134,7 @@
 template <typename InputValueType,
           typename OutputValueType,
           typename LayoutPolicy,
           typename Lambda,
-          typename IndexType,
-          int TPB>
+          typename IndexType>
 void matrix_vector_op(const raft::handle_t& handle,
                       raft::device_matrix_view<const InputValueType, IndexType, LayoutPolicy> matrix,
                       raft::device_vector_view<const InputValueType, IndexType> vec,
@@ -191,7 +182,6 @@ void matrix_vector_op(const raft::handle_t& handle,
  * @tparam LayoutPolicy the layout of input and output (raft::row_major or raft::col_major)
  * @tparam Lambda a device function which represents a binary operator
  * @tparam IndexType Integer used for addressing
- * @tparam TPB threads per block of the cuda kernel launched
 * @param handle raft::handle_t
 * @param matrix input raft::matrix_view
 * @param vec1 the first vector raft::vector_view
@@ -206,8 +196,7 @@
 template <typename InputValueType,
           typename OutputValueType,
           typename LayoutPolicy,
           typename Lambda,
-          typename IndexType,
-          int TPB>
+          typename IndexType>
 void matrix_vector_op(const raft::handle_t& handle,
                       raft::device_matrix_view<const InputValueType, IndexType, LayoutPolicy> matrix,
                       raft::device_vector_view<const InputValueType, IndexType> vec1,
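
For illustration, a call to the simplified raw-pointer API might look like this (a
sketch under assumed setup: `handle` is a raft::handle_t and `out`, `matrix`, `vec` are
pre-allocated device buffers; the names and dimensions here are illustrative):

  // Broadcast-add vec (length D) along the rows of the N x D row-major matrix.
  raft::linalg::matrixVectorOp(
    out, matrix, vec, D, N, true, true, raft::Sum<float>(), handle.get_stream());
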
From c7942b00f38bb60b523fa10e47201853583de80a Mon Sep 17 00:00:00 2001
From: Louis Sugy
Date: Fri, 28 Oct 2022 19:58:49 +0200
Subject: [PATCH 23/25] Fix utils -> util

---
 cpp/test/linalg/matrix_vector_op.cu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/test/linalg/matrix_vector_op.cu b/cpp/test/linalg/matrix_vector_op.cu
index c0355b0385..67bdfb74fc 100644
--- a/cpp/test/linalg/matrix_vector_op.cu
+++ b/cpp/test/linalg/matrix_vector_op.cu
@@ -185,10 +185,10 @@ class MatVecOpTest : public ::testing::TestWithParam<MatVecOpInputs<IdxType>> {
  */
 
 const std::vector<MatVecOpInputs<int>> inputs_i32 =
-  raft::utils::itertools::product<MatVecOpInputs<int>>(
+  raft::util::itertools::product<MatVecOpInputs<int>>(
     {1024}, {32, 64}, {true, false}, {true, false}, {0, 1, 2}, {0, 1, 2}, {1234ULL});
 const std::vector<MatVecOpInputs<int64_t>> inputs_i64 =
-  raft::utils::itertools::product<MatVecOpInputs<int64_t>>(
+  raft::util::itertools::product<MatVecOpInputs<int64_t>>(
     {2500}, {250}, {false}, {false}, {0, 1}, {0, 1}, {1234ULL});
 
 template <typename T>

From 68fd977d70b5762da065cfe61925ecfd7f99f3a1 Mon Sep 17 00:00:00 2001
From: Louis Sugy
Date: Tue, 8 Nov 2022 18:43:13 +0100
Subject: [PATCH 24/25] Move product auxiliary function to itertools::detail

---
 cpp/include/raft/util/detail/itertools.hpp | 41 ++++++++++++++++++++++
 cpp/include/raft/util/itertools.hpp        | 35 ++++++++----------
 2 files changed, 56 insertions(+), 20 deletions(-)
 create mode 100644 cpp/include/raft/util/detail/itertools.hpp

diff --git a/cpp/include/raft/util/detail/itertools.hpp b/cpp/include/raft/util/detail/itertools.hpp
new file mode 100644
index 0000000000..1908d90b95
--- /dev/null
+++ b/cpp/include/raft/util/detail/itertools.hpp
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <tuple>
+#include <vector>
+
+namespace raft::util::itertools::detail {
+
+// Element i of the result is built by decoding i as a mixed-radix number whose digits
+// index the input vectors, walking from the most significant digit (first vector) to
+// the least significant one (last vector) in a fold expression.
+template <typename S, size_t... Is, typename... Args>
+inline std::vector<S> product(std::index_sequence<Is...> index, const std::vector<Args>&... vecs)
+{
+  size_t len = 1;
+  ((len *= vecs.size()), ...);
+  std::vector<S> out;
+  out.reserve(len);
+  for (size_t i = 0; i < len; i++) {
+    std::tuple<Args...> tup;
+    size_t mod = len, new_mod;
+    ((new_mod = mod / vecs.size(), std::get<Is>(tup) = vecs[(i % mod) / new_mod], mod = new_mod),
+     ...);
+    out.push_back({std::get<Is>(tup)...});
+  }
+  return out;
+}
+
+}  // namespace raft::util::itertools::detail

diff --git a/cpp/include/raft/util/itertools.hpp b/cpp/include/raft/util/itertools.hpp
index e6bb66e7ef..493ac9befe 100644
--- a/cpp/include/raft/util/itertools.hpp
+++ b/cpp/include/raft/util/itertools.hpp
@@ -16,8 +16,7 @@
 
 #pragma once
 
-#include <tuple>
-#include <vector>
+#include <raft/util/detail/itertools.hpp>
 
 /**
  * Helpers inspired by the Python itertools library
  *
  */
 
 namespace raft::util::itertools {
 
-template <typename S, size_t... Is, typename... Args>
-inline std::vector<S> product(std::index_sequence<Is...> index, const std::vector<Args>&... vecs)
-{
-  size_t len = 1;
-  ((len *= vecs.size()), ...);
-  std::vector<S> out;
-  out.reserve(len);
-  for (size_t i = 0; i < len; i++) {
-    std::tuple<Args...> tup;
-    size_t mod = len, new_mod;
-    ((new_mod = mod / vecs.size(), std::get<Is>(tup) = vecs[(i % mod) / new_mod], mod = new_mod),
-     ...);
-    out.push_back({std::get<Is>(tup)...});
-  }
-  return out;
-}
-
+/**
+ * @brief Cartesian product of the given initializer lists.
+ *
+ * This helper can be used to easily define input parameters in tests/benchmarks.
+ * Note that it's not optimized for use with large lists / many lists in
+ * performance-critical code!
+ *
+ * @tparam S Type of the output structures.
+ * @tparam Args Types of the elements of the initializer lists, matching the types of
+ *              the first fields of the structure (if the structure has more fields,
+ *              some might be initialized with their default value).
+ * @param lists One or more initializer lists.
+ * @return std::vector<S> A vector of structures containing the cartesian product.
+ */
 template <typename S, typename... Args>
 std::vector<S> product(std::initializer_list<Args>... lists)
 {
-  return product<S>(std::index_sequence_for<Args...>(), (std::vector<Args>(lists))...);
+  return detail::product<S>(std::index_sequence_for<Args...>(), (std::vector<Args>(lists))...);
 }
 
 }  // namespace raft::util::itertools

From b2be0c1195ca749f353a30c860e03f34edf0a2db Mon Sep 17 00:00:00 2001
From: Louis Sugy
Date: Tue, 8 Nov 2022 19:16:14 +0100
Subject: [PATCH 25/25] Add test case for int8_t matrix with float vectors

---
 cpp/test/linalg/matrix_vector_op.cu | 60 +++++++++++++++++++----------
 1 file changed, 40 insertions(+), 20 deletions(-)

diff --git a/cpp/test/linalg/matrix_vector_op.cu b/cpp/test/linalg/matrix_vector_op.cu
index 67bdfb74fc..1c96c3fc74 100644
--- a/cpp/test/linalg/matrix_vector_op.cu
+++ b/cpp/test/linalg/matrix_vector_op.cu
@@ -21,6 +21,7 @@
 #include
 #include
 #include
+#include <type_traits>
 
 namespace raft {
 namespace linalg {
@@ -167,14 +168,21 @@ class MatVecOpTest : public ::testing::TestWithParam<MatVecOpInputs<IdxType>> {
   rmm::device_uvector<Vec2T> vec2;
 };
 
-#define MVTEST(TestClass, inputs, tolerance)                           \
-  TEST_P(TestClass, Result)                                            \
-  {                                                                    \
-    ASSERT_TRUE(devArrMatch(out_ref.data() + params.outAlignOffset,    \
-                            out.data() + params.outAlignOffset,        \
-                            params.rows * params.cols,                 \
-                            CompareApprox<MatT>(tolerance)));          \
-  }                                                                    \
+#define MVTEST(TestClass, OutType, inputs, tolerance)                   \
+  TEST_P(TestClass, Result)                                             \
+  {                                                                     \
+    if constexpr (std::is_floating_point_v<OutType>) {                  \
+      ASSERT_TRUE(devArrMatch(out_ref.data() + params.outAlignOffset,   \
+                              out.data() + params.outAlignOffset,       \
+                              params.rows * params.cols,                \
+                              CompareApprox<OutType>(tolerance)));      \
+    } else {                                                            \
+      ASSERT_TRUE(devArrMatch(out_ref.data() + params.outAlignOffset,   \
+                              out.data() + params.outAlignOffset,       \
+                              params.rows * params.cols,                \
+                              Compare<OutType>()));                     \
+    }                                                                   \
+  }                                                                     \
   INSTANTIATE_TEST_SUITE_P(MatVecOpTests, TestClass, ::testing::ValuesIn(inputs))
 
 #define MV_EPS_F 0.00001f
@@ -211,14 +219,14 @@
 typedef MatVecOpTest<Add2Vec<double>, double, int> MatVecOpTestD_i32_add2vec;
 typedef MatVecOpTest<Add1Vec<double>, double, int64_t> MatVecOpTestD_i64_add1vec;
 typedef MatVecOpTest<Add2Vec<double>, double, int64_t> MatVecOpTestD_i64_add2vec;
 
-MVTEST(MatVecOpTestF_i32_add1vec, inputs_i32, MV_EPS_F);
-MVTEST(MatVecOpTestF_i32_add2vec, inputs_i32, MV_EPS_F);
-MVTEST(MatVecOpTestF_i64_add1vec, inputs_i64, MV_EPS_F);
-MVTEST(MatVecOpTestF_i64_add2vec, inputs_i64, MV_EPS_F);
-MVTEST(MatVecOpTestD_i32_add1vec, inputs_i32, MV_EPS_D);
-MVTEST(MatVecOpTestD_i32_add2vec, inputs_i32, MV_EPS_D);
-MVTEST(MatVecOpTestD_i64_add1vec, inputs_i64, MV_EPS_D);
-MVTEST(MatVecOpTestD_i64_add2vec, inputs_i64, MV_EPS_D);
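+// The new OutType argument selects exact comparison for integer outputs and
+// approximate comparison (within `tolerance`) for floating-point outputs.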
+MVTEST(MatVecOpTestF_i32_add1vec, float, inputs_i32, MV_EPS_F);
+MVTEST(MatVecOpTestF_i32_add2vec, float, inputs_i32, MV_EPS_F);
+MVTEST(MatVecOpTestF_i64_add1vec, float, inputs_i64, MV_EPS_F);
+MVTEST(MatVecOpTestF_i64_add2vec, float, inputs_i64, MV_EPS_F);
+MVTEST(MatVecOpTestD_i32_add1vec, double, inputs_i32, MV_EPS_D);
+MVTEST(MatVecOpTestD_i32_add2vec, double, inputs_i32, MV_EPS_D);
+MVTEST(MatVecOpTestD_i64_add1vec, double, inputs_i64, MV_EPS_D);
+MVTEST(MatVecOpTestD_i64_add2vec, double, inputs_i64, MV_EPS_D);
 
 /*
  * This set of tests covers cases with different types.
@@ -239,10 +247,22 @@ typedef MatVecOpTest<MulAndAdd<float, int64_t, float>, float, int, int64_t, floa
 typedef MatVecOpTest<MulAndAdd<double, int32_t, float>, double, int, int32_t, float>
   MatVecOpTestD_i32_MulAndAdd_i32_f;
 
-MVTEST(MatVecOpTestF_i32_MulAndAdd_i32_f, inputs_i32, MV_EPS_F);
-MVTEST(MatVecOpTestF_i32_MulAndAdd_i32_d, inputs_i32, MV_EPS_F);
-MVTEST(MatVecOpTestF_i32_MulAndAdd_i64_f, inputs_i32, MV_EPS_F);
-MVTEST(MatVecOpTestD_i32_MulAndAdd_i32_f, inputs_i32, (double)MV_EPS_F);
+MVTEST(MatVecOpTestF_i32_MulAndAdd_i32_f, float, inputs_i32, MV_EPS_F);
+MVTEST(MatVecOpTestF_i32_MulAndAdd_i32_d, float, inputs_i32, MV_EPS_F);
+MVTEST(MatVecOpTestF_i32_MulAndAdd_i64_f, float, inputs_i32, MV_EPS_F);
+MVTEST(MatVecOpTestD_i32_MulAndAdd_i32_f, double, inputs_i32, (double)MV_EPS_F);
+
+// Simulates dequantization, computation in float, and requantization back to int8_t:
+// e.g. a = 50 (i.e. 0.5) and b + c = 10.0 give 0.5 * 10.0 / 20.0 = 0.25, stored as 25.
+struct DQMultiply {
+  static constexpr bool useTwoVectors = true;
+  HDI int8_t operator()(int8_t a, float b, float c) const
+  {
+    return static_cast<int8_t>((static_cast<float>(a) / 100.0f * (b + c) / 20.0f) * 100.0f);
+  };
+};
+
+typedef MatVecOpTest<DQMultiply, int8_t, int, float, float> MatVecOpTestI8_i32_DQMultiply_f_f;
+
+MVTEST(MatVecOpTestI8_i32_DQMultiply_f_f, int8_t, inputs_i32, 0);
 }  // end namespace linalg
 }  // end namespace raft
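
---

Usage illustration for the itertools helper above (a standalone sketch; the
`BenchParams` struct and the values are hypothetical, only the product() call reflects
the header introduced in this series):

#include <cstdio>
#include <vector>

#include <raft/util/itertools.hpp>

struct BenchParams {
  int rows, cols;
  bool rowMajor;
};

int main()
{
  // Cartesian product of three lists: 2 * 2 * 2 = 8 structs, aggregate-initialized
  // in field order, with the first list varying slowest.
  std::vector<BenchParams> grid =
    raft::util::itertools::product<BenchParams>({100, 1000}, {8, 16}, {true, false});
  for (const BenchParams& p : grid) {
    std::printf("%d x %d, rowMajor=%d\n", p.rows, p.cols, static_cast<int>(p.rowMajor));
  }
  return 0;
}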