
Replace map_along_rows with matrixVectorOp #911

Merged: 30 commits, merged Nov 8, 2022

Changes from 22 commits

Commits (30):
7f11225  Integrate accumulate_into_selected into raft prims (Nyrio, Oct 7, 2022)
529931d  Remove accumulate_into_selected (Nyrio, Oct 10, 2022)
0f284b0  Merge remote-tracking branch 'origin/branch-22.12' into enh-ann-accum… (Nyrio, Oct 10, 2022)
6a61537  Replace map_along_rows with matrixVectorOp (Nyrio, Oct 10, 2022)
e184ebf  Merge remote-tracking branch 'origin/branch-22.12' into enh-map-along… (Nyrio, Oct 17, 2022)
0a98482  Start adding support for arbitrary types in linewiseOp (Nyrio, Oct 18, 2022)
eafa61e  Allow different types for output, matrix and vector(s) in mdspan-base… (Nyrio, Oct 18, 2022)
74309ed  Call cub histogram with signed type to avoid a warning breaking compi… (Nyrio, Oct 18, 2022)
df7cfb5  Start adding support for different output/matrix/vector types in MatV… (Nyrio, Oct 18, 2022)
8264462  Fix shared mem buffering offset in linewise kernels (Nyrio, Oct 18, 2022)
e4a8b66  Pass custom op to naiveMat (Nyrio, Oct 18, 2022)
938b180  Support different output/matrix/vector(s) types in naiveMatVec (Nyrio, Oct 18, 2022)
c85ee9b  Test matrix-vector op with different matrix / vector types (Nyrio, Oct 18, 2022)
be142c6  Fix linewiseOp VecRows kernels (Nyrio, Oct 19, 2022)
a850d85  Fix linewiseOp VecCols kernels (Nyrio, Oct 20, 2022)
e44bcc4  Merge OutT=MatT because linewiseOp only supports one input/output mat… (Nyrio, Oct 20, 2022)
351fddc  Merge remote-tracking branch 'origin/branch-22.12' into enh-map-along… (Nyrio, Oct 20, 2022)
dcfd962  Replace for_each with linalg::add + fix syntax error (Nyrio, Oct 20, 2022)
4718e5b  Clang-format fix (Nyrio, Oct 20, 2022)
fc55921  Used alignedLen instead of totalLen in max block number calculation (Nyrio, Oct 21, 2022)
5421dda  Add misalignments to matrix-vector-op test (Nyrio, Oct 26, 2022)
ccdc961  Extend matrix-vector op benchmark (Nyrio, Oct 26, 2022)
b5c4d01  Merge remote-tracking branch 'origin/branch-22.12' into enh-map-along… (Nyrio, Oct 28, 2022)
f2f67db  Apply changes to new padded kernel (note: test is still failing but a… (Nyrio, Oct 28, 2022)
1450bae  Put itertools in util namespace (Nyrio, Oct 28, 2022)
7bcbbe2  Remove TPB from public API (it wasn't even forwarded to the actual im… (Nyrio, Oct 28, 2022)
c7942b0  Fix utils -> util (Nyrio, Oct 28, 2022)
d28a3c5  Merge remote-tracking branch 'origin/branch-22.12' into enh-map-along… (Nyrio, Nov 8, 2022)
68fd977  Move product auxiliary function to itertools::detail (Nyrio, Nov 8, 2022)
b2be0c1  Add test case for int8_t matrix with float vectors (Nyrio, Nov 8, 2022)
cpp/bench/linalg/matrix_vector_op.cu (140 changes: 108 additions & 32 deletions)
@@ -20,61 +20,137 @@

namespace raft::bench::linalg {

template <typename IdxT>
struct mat_vec_op_inputs {
int rows, cols;
IdxT rows, cols;
bool rowMajor, bcastAlongRows;
IdxT inAlignOffset, outAlignOffset;
}; // struct mat_vec_op_inputs

template <typename T>
template <typename IdxT>
inline auto operator<<(std::ostream& os, const mat_vec_op_inputs<IdxT>& p) -> std::ostream&
{
os << p.rows << "#" << p.cols << "#" << p.rowMajor << "#" << p.bcastAlongRows << "#"
<< p.inAlignOffset << "#" << p.outAlignOffset;
return os;
}

template <typename OpT, typename T, typename IdxT>
struct mat_vec_op : public fixture {
mat_vec_op(const mat_vec_op_inputs& p)
mat_vec_op(const mat_vec_op_inputs<IdxT>& p)
: params(p),
out(p.rows * p.cols, stream),
in(p.rows * p.cols, stream),
vec(p.bcastAlongRows ? p.cols : p.rows, stream)
out(p.rows * p.cols + params.outAlignOffset, stream),
in(p.rows * p.cols + params.inAlignOffset, stream),
vec1(p.bcastAlongRows ? p.cols : p.rows, stream),
vec2(p.bcastAlongRows ? p.cols : p.rows, stream)
{
}

void run_benchmark(::benchmark::State& state) override
{
std::ostringstream label_stream;
label_stream << params;
state.SetLabel(label_stream.str());

loop_on_state(state, [this]() {
raft::linalg::matrixVectorOp(out.data(),
in.data(),
vec.data(),
params.cols,
params.rows,
params.rowMajor,
params.bcastAlongRows,
raft::Sum<T>(),
stream);
if constexpr (OpT::useTwoVectors) {
raft::linalg::matrixVectorOp(out.data() + params.outAlignOffset,
in.data() + params.inAlignOffset,
vec1.data(),
vec2.data(),
params.cols,
params.rows,
params.rowMajor,
params.bcastAlongRows,
OpT{},
stream);
} else {
raft::linalg::matrixVectorOp(out.data() + params.outAlignOffset,
in.data() + params.inAlignOffset,
vec1.data(),
params.cols,
params.rows,
params.rowMajor,
params.bcastAlongRows,
OpT{},
stream);
}
});
}

private:
mat_vec_op_inputs params;
rmm::device_uvector<T> out, in, vec;
mat_vec_op_inputs<IdxT> params;
rmm::device_uvector<T> out, in, vec1, vec2;
}; // struct MatVecOp

const std::vector<mat_vec_op_inputs> mat_vec_op_input_vecs{
{1024, 128, true, true}, {1024 * 1024, 128, true, true},
{1024, 128 + 2, true, true}, {1024 * 1024, 128 + 2, true, true},
{1024, 128 + 1, true, true}, {1024 * 1024, 128 + 1, true, true},
template <typename IdxT>
std::vector<mat_vec_op_inputs<IdxT>> get_mv_inputs()
{
std::vector<mat_vec_op_inputs<IdxT>> out;

{1024, 128, true, false}, {1024 * 1024, 128, true, false},
{1024, 128 + 2, true, false}, {1024 * 1024, 128 + 2, true, false},
{1024, 128 + 1, true, false}, {1024 * 1024, 128 + 1, true, false},
// Scalability benchmark with round dimensions
std::vector<IdxT> rows = {1000, 100000, 1000000};
std::vector<IdxT> cols = {8, 64, 256, 1024};
for (bool rowMajor : {true, false}) {
for (bool alongRows : {true, false}) {
for (IdxT rows_ : rows) {
for (IdxT cols_ : cols) {
out.push_back({rows_, cols_, rowMajor, alongRows, 0, 0});
}
}
}
}

{1024, 128, false, false}, {1024 * 1024, 128, false, false},
{1024, 128 + 2, false, false}, {1024 * 1024, 128 + 2, false, false},
{1024, 128 + 1, false, false}, {1024 * 1024, 128 + 1, false, false},
// Odd dimensions, misalignment
std::vector<std::tuple<IdxT, IdxT>> rowcols = {
{44739207, 7},
{44739207, 15},
{44739207, 16},
{44739207, 17},
{2611236, 256},
{2611236, 257},
{2611236, 263},
};
for (bool rowMajor : {true, false}) {
for (bool alongRows : {true, false}) {
for (auto rc : rowcols) {
for (IdxT inAlignOffset : {0, 1}) {
for (IdxT outAlignOffset : {0, 1}) {
out.push_back({std::get<0>(rc),
std::get<1>(rc),
rowMajor,
alongRows,
inAlignOffset,
outAlignOffset});
}
}
}
}
}
return out;
}

{1024, 128, false, true}, {1024 * 1024, 128, false, true},
{1024, 128 + 2, false, true}, {1024 * 1024, 128 + 2, false, true},
{1024, 128 + 1, false, true}, {1024 * 1024, 128 + 1, false, true},
const std::vector<mat_vec_op_inputs<int>> mv_input_i32 = get_mv_inputs<int>();
const std::vector<mat_vec_op_inputs<int64_t>> mv_input_i64 = get_mv_inputs<int64_t>();

template <typename T>
struct Add1Vec {
static constexpr bool useTwoVectors = false;
HDI T operator()(T a, T b) const { return a + b; };
};
template <typename T>
struct Add2Vec {
static constexpr bool useTwoVectors = true;
HDI T operator()(T a, T b, T c) const { return a + b + c; };
};

RAFT_BENCH_REGISTER(mat_vec_op<float>, "", mat_vec_op_input_vecs);
RAFT_BENCH_REGISTER(mat_vec_op<double>, "", mat_vec_op_input_vecs);
RAFT_BENCH_REGISTER((mat_vec_op<Add1Vec<float>, float, int>), "", mv_input_i32);
RAFT_BENCH_REGISTER((mat_vec_op<Add1Vec<double>, double, int>), "", mv_input_i32);
RAFT_BENCH_REGISTER((mat_vec_op<Add2Vec<float>, float, int>), "", mv_input_i32);
RAFT_BENCH_REGISTER((mat_vec_op<Add2Vec<double>, double, int>), "", mv_input_i32);
RAFT_BENCH_REGISTER((mat_vec_op<Add1Vec<float>, float, int64_t>), "", mv_input_i64);
RAFT_BENCH_REGISTER((mat_vec_op<Add1Vec<double>, double, int64_t>), "", mv_input_i64);
RAFT_BENCH_REGISTER((mat_vec_op<Add2Vec<float>, float, int64_t>), "", mv_input_i64);
RAFT_BENCH_REGISTER((mat_vec_op<Add2Vec<double>, double, int64_t>), "", mv_input_i64);

} // namespace raft::bench::linalg
cpp/include/raft/linalg/detail/matrix_vector_op.cuh (93 changes: 14 additions & 79 deletions)
@@ -22,80 +22,10 @@ namespace raft {
namespace linalg {
namespace detail {

namespace {
Review comment (Member):
Are any of the changes in this file going to break users downstream (such as cuml)?

Reply (@Nyrio, Contributor Author, Oct 28, 2022):
It won't break cuML because all template types are inferred in calls to matrixVectorOp.
It could in theory break other projects if they provide the template list explicitly, but I'm not aware of any.
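A rough illustration of this point (the snippet below is not part of the PR; the caller, the public header path, and the concrete types are assumptions made for the example):

#include <raft/linalg/matrix_vector_op.cuh>

// Plain element-wise add, standing in for whatever op a downstream caller uses.
struct AddOp {
  __host__ __device__ float operator()(float mat_val, float vec_val) const
  {
    return mat_val + vec_val;
  }
};

// Every template parameter (MatT, Lambda, VecT, IdxType) is deduced from the
// arguments, so widening the signature with a separate vector type leaves a
// call like this untouched.
void downstream_call(float* out, const float* matrix, const float* vec,
                     int n_cols, int n_rows, cudaStream_t stream)
{
  raft::linalg::matrixVectorOp(
    out, matrix, vec, n_cols, n_rows, true, true, AddOp{}, stream);
}

// Only a caller that spelled out the old template list explicitly, e.g.
// matrixVectorOp<float, AddOp, int, 256>(...), would have to be adjusted.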

template <size_t VecBytes>
struct AlignedAccess {
template <typename T>
static inline bool test(const T* matrix, size_t strideBytes)
{
return Pow2<VecBytes>::isAligned(matrix) && Pow2<VecBytes>::isAligned(strideBytes) &&
Pow2<sizeof(T)>::isAligned(VecBytes);
}
};
}; // namespace

template <typename Type, int veclen_, typename Lambda, typename IdxType>
__global__ void matrixVectorOpKernel(Type* out,
const Type* matrix,
const Type* vector,
IdxType D,
IdxType N,
bool rowMajor,
bool bcastAlongRows,
Lambda op)
{
typedef TxN_t<Type, veclen_> VecType;
IdxType len = N * D;
IdxType idx = threadIdx.x;
idx += (IdxType)blockIdx.x * (IdxType)blockDim.x;
idx *= VecType::Ratio;
if (idx >= len) return;
IdxType vIdx;
VecType mat, vec;
///@todo: yikes! use fast-int-div here.
///@todo: shared mem for vector could help with perf
if (rowMajor && bcastAlongRows) {
vIdx = idx % D;
vec.load(vector, vIdx);
} else if (!rowMajor && !bcastAlongRows) {
vIdx = idx % N;
vec.load(vector, vIdx);
} else if (rowMajor && !bcastAlongRows) {
vIdx = idx / D;
vec.fill(vector[vIdx]);
} else {
vIdx = idx / N;
vec.fill(vector[vIdx]);
}
mat.load(matrix, idx);
#pragma unroll
for (int i = 0; i < VecType::Ratio; ++i)
mat.val.data[i] = op(mat.val.data[i], vec.val.data[i]);
mat.store(out, idx);
}

template <typename Type, int veclen_, typename Lambda, typename IdxType, int TPB>
void matrixVectorOpImpl(Type* out,
const Type* matrix,
const Type* vec,
IdxType D,
IdxType N,
bool rowMajor,
bool bcastAlongRows,
Lambda op,
cudaStream_t stream)
{
IdxType len = N * D;
IdxType nblks = raft::ceildiv(veclen_ ? len / veclen_ : veclen_, (IdxType)TPB);
matrixVectorOpKernel<Type, veclen_, Lambda, IdxType>
<<<nblks, TPB, 0, stream>>>(out, matrix, vec, D, N, rowMajor, bcastAlongRows, op);
RAFT_CUDA_TRY(cudaPeekAtLastError());
}

template <typename Type, typename Lambda, typename IdxType = int, int TPB = 256>
void matrixVectorOp(Type* out,
const Type* matrix,
const Type* vec,
template <typename MatT, typename Lambda, typename VecT, typename IdxType = int, int TPB = 256>
void matrixVectorOp(MatT* out,
const MatT* matrix,
const VecT* vec,
IdxType D,
IdxType N,
bool rowMajor,
@@ -109,11 +39,16 @@ void matrixVectorOp(Type* out,
out, matrix, stride, nLines, rowMajor == bcastAlongRows, op, stream, vec);
}

template <typename Type, typename Lambda, typename IdxType = int, int TPB = 256>
void matrixVectorOp(Type* out,
const Type* matrix,
const Type* vec1,
const Type* vec2,
template <typename MatT,
typename Lambda,
typename Vec1T,
typename Vec2T,
typename IdxType = int,
int TPB = 256>
void matrixVectorOp(MatT* out,
const MatT* matrix,
const Vec1T* vec1,
const Vec2T* vec2,
IdxType D,
IdxType N,
bool rowMajor,
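For context, a minimal usage sketch of the widened raw-pointer interface (not taken from the PR; the public header path, the function, and the types are assumptions for the example, loosely echoing the new test case that pairs an int8_t matrix with float vectors):

#include <raft/linalg/matrix_vector_op.cuh>

#include <cstdint>

// Sketch only: multiply element (i, j) of a row-major int8_t matrix by
// scales[j] and write int8_t output. MatT = int8_t, VecT = float and
// IdxType = int are all deduced from the arguments; `scales` has length n_cols.
struct ScaleOp {
  __host__ __device__ std::int8_t operator()(std::int8_t mat_val, float scale) const
  {
    return static_cast<std::int8_t>(mat_val * scale);
  }
};

void scale_columns(std::int8_t* out, const std::int8_t* in, const float* scales,
                   int n_rows, int n_cols, cudaStream_t stream)
{
  raft::linalg::matrixVectorOp(out,
                               in,
                               scales,
                               n_cols,
                               n_rows,
                               /*rowMajor=*/true,
                               /*bcastAlongRows=*/true,
                               ScaleOp{},
                               stream);
}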