Faster matrix-vector-ops (#401)
Introduce `matrixLinewiseOp` for applying row- or column-wise operations to a matrix with a (templated) fixed number of vectors. This is a rewrite of `matrixVectorOp`.

The new primitive is on average 2x faster across a range of column/row counts. In the general case, it improves performance by reusing vector values across multiple matrix rows/columns (loading each vector value once, or at least keeping it cached). In edge cases, it still uses vectorized load/store operations on the input/output matrices even when the pointers are not properly aligned or the vector length is not a multiple of the alignment.
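As a usage illustration (not part of this commit), the public `matrixVectorOp` entry point keeps its argument order, so existing callers are unchanged. A minimal sketch of adding a bias vector to every row of a row-major matrix, assuming the default index-type and block-size template parameters and nvcc's extended device lambdas (the names `d_out`, `d_mat`, `d_bias` are hypothetical):

#include <raft/linalg/matrix_vector_op.cuh>

// Hypothetical call site: add a length-D bias vector to each of the N rows
// of a row-major N x D float matrix. The lambda receives one matrix element
// and the corresponding vector element.
void add_bias(float* d_out, const float* d_mat, const float* d_bias,
              int D, int N, cudaStream_t stream)
{
  raft::linalg::matrixVectorOp(
    d_out, d_mat, d_bias, D, N,
    true,  // rowMajor
    true,  // bcastAlongRows: the vector spans a row (length D)
    [] __device__(float mat_el, float vec_el) { return mat_el + vec_el; },
    stream);
}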

Authors:
  - Artem M. Chirkin (https://github.com/achirkin)

Approvers:
  - Tamas Bela Feher (https://github.com/tfeher)
  - Corey J. Nolet (https://github.com/cjnolet)

URL: #401
achirkin authored Jan 12, 2022
1 parent c958dc0 commit 15fd1d3
Showing 6 changed files with 904 additions and 201 deletions.
189 changes: 9 additions & 180 deletions cpp/include/raft/linalg/matrix_vector_op.cuh
@@ -16,83 +16,11 @@

#pragma once

#include <raft/cuda_utils.cuh>
#include <raft/pow2_utils.cuh>
#include <raft/vectorized.cuh>
#include <raft/matrix/matrix.hpp>

namespace raft {
namespace linalg {

namespace {
template <size_t VecBytes>
struct AlignedAccess {
  template <typename T>
  static inline bool test(const T* matrix, size_t strideBytes)
  {
    return Pow2<VecBytes>::isAligned(matrix) && Pow2<VecBytes>::isAligned(strideBytes) &&
           Pow2<sizeof(T)>::isAligned(VecBytes);
  }
};
}; // namespace
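For intuition on the `AlignedAccess` test that the old code used to pick a vectorized access width, here is a standalone, host-side sketch (not part of the diff; `aligned_access` is a hypothetical mirror of the device-side check):

#include <cstddef>
#include <cstdint>
#include <cstdio>

// Hypothetical host-side mirror of AlignedAccess<VecBytes>::test: a VecBytes-wide
// access is usable only if the matrix base pointer and the row stride (in bytes)
// are both multiples of VecBytes, and VecBytes is a multiple of the element size.
static bool aligned_access(const void* ptr, std::size_t stride_bytes,
                           std::size_t vec_bytes, std::size_t elem_bytes)
{
  return reinterpret_cast<std::uintptr_t>(ptr) % vec_bytes == 0 &&
         stride_bytes % vec_bytes == 0 && vec_bytes % elem_bytes == 0;
}

int main()
{
  alignas(16) static float mat[8 * 4];  // 16-byte-aligned row-major storage
  // D = 8 floats -> 32-byte rows: float4 (16-byte) accesses are possible.
  std::printf("%d\n", aligned_access(mat, 8 * sizeof(float), 16, sizeof(float)));
  // D = 7 floats -> 28-byte rows: 16-byte accesses would straddle row boundaries.
  std::printf("%d\n", aligned_access(mat, 7 * sizeof(float), 16, sizeof(float)));
}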

template <typename Type, int veclen_, typename Lambda, typename IdxType>
__global__ void matrixVectorOpKernel(Type* out,
                                     const Type* matrix,
                                     const Type* vector,
                                     IdxType D,
                                     IdxType N,
                                     bool rowMajor,
                                     bool bcastAlongRows,
                                     Lambda op)
{
  typedef TxN_t<Type, veclen_> VecType;
  IdxType len = N * D;
  IdxType idx = threadIdx.x;
  idx += (IdxType)blockIdx.x * (IdxType)blockDim.x;
  idx *= VecType::Ratio;
  if (idx >= len) return;
  IdxType vIdx;
  VecType mat, vec;
  ///@todo: yikes! use fast-int-div here.
  ///@todo: shared mem for vector could help with perf
  if (rowMajor && bcastAlongRows) {
    vIdx = idx % D;
    vec.load(vector, vIdx);
  } else if (!rowMajor && !bcastAlongRows) {
    vIdx = idx % N;
    vec.load(vector, vIdx);
  } else if (rowMajor && !bcastAlongRows) {
    vIdx = idx / D;
    vec.fill(vector[vIdx]);
  } else {
    vIdx = idx / N;
    vec.fill(vector[vIdx]);
  }
  mat.load(matrix, idx);
#pragma unroll
  for (int i = 0; i < VecType::Ratio; ++i)
    mat.val.data[i] = op(mat.val.data[i], vec.val.data[i]);
  mat.store(out, idx);
}
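The branch on `rowMajor`/`bcastAlongRows` above decides whether the vector is read with one vectorized load (consecutive vector elements match consecutive matrix elements) or broadcast from a single scalar. A small worked example of that mapping, with hypothetical sizes:

// Hypothetical example: row-major matrix, N = 4 rows, D = 8 columns, veclen_ = 4.
// A thread handling elements idx = 12..15 (row 1, columns 4..7):
//   - bcastAlongRows == true : the vector has length D, so vIdx = idx % D = 4 and
//     vec.load(vector, 4) reads vector[4..7] - one vectorized load per thread.
//   - bcastAlongRows == false: the vector has length N; all 4 elements belong to
//     row idx / D = 1, so vec.fill(vector[1]) broadcasts a single scalar.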

template <typename Type, int veclen_, typename Lambda, typename IdxType, int TPB>
void matrixVectorOpImpl(Type* out,
                        const Type* matrix,
                        const Type* vec,
                        IdxType D,
                        IdxType N,
                        bool rowMajor,
                        bool bcastAlongRows,
                        Lambda op,
                        cudaStream_t stream)
{
  IdxType len   = N * D;
  IdxType nblks = raft::ceildiv(veclen_ ? len / veclen_ : veclen_, (IdxType)TPB);
  matrixVectorOpKernel<Type, veclen_, Lambda, IdxType>
    <<<nblks, TPB, 0, stream>>>(out, matrix, vec, D, N, rowMajor, bcastAlongRows, op);
  RAFT_CUDA_TRY(cudaPeekAtLastError());
}
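A worked example of the launch-size arithmetic above, with hypothetical sizes:

// Hypothetical sizes: N = 1024 rows, D = 32 columns of float, veclen_ = 4
// (float4 accesses), TPB = 256 threads per block.
//   len              = N * D          = 32768 elements
//   vectorized items = len / veclen_  = 8192 float4 items
//   nblks            = ceildiv(8192, 256) = 32 blocks
// Each thread therefore loads, transforms and stores exactly one float4.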

/**
 * @brief Operations for all the columns or rows with a given vector.
 * Caution : Threads process multiple elements to speed up processing. These
@@ -127,91 +55,10 @@ void matrixVectorOp(Type* out,
                    Lambda op,
                    cudaStream_t stream)
{
  IdxType stride      = rowMajor ? D : N;
  size_t stride_bytes = stride * sizeof(Type);

  if (AlignedAccess<16>::test(matrix, stride_bytes)) {
    matrixVectorOpImpl<Type, 16 / sizeof(Type), Lambda, IdxType, TPB>(
      out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream);
  } else if (AlignedAccess<8>::test(matrix, stride_bytes)) {
    matrixVectorOpImpl<Type, 8 / sizeof(Type), Lambda, IdxType, TPB>(
      out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream);
  } else if (AlignedAccess<4>::test(matrix, stride_bytes)) {
    matrixVectorOpImpl<Type, 4 / sizeof(Type), Lambda, IdxType, TPB>(
      out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream);
  } else if (AlignedAccess<2>::test(matrix, stride_bytes)) {
    matrixVectorOpImpl<Type, 2 / sizeof(Type), Lambda, IdxType, TPB>(
      out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream);
  } else if (AlignedAccess<1>::test(matrix, stride_bytes)) {
    matrixVectorOpImpl<Type, 1 / sizeof(Type), Lambda, IdxType, TPB>(
      out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream);
  } else {
    matrixVectorOpImpl<Type, 1, Lambda, IdxType, TPB>(
      out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream);
  }
}
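For concreteness, a hypothetical walk through the cascade above:

// Hypothetical case: column-major double matrix with N = 3 rows.
//   stride_bytes = 3 * sizeof(double) = 24
//   AlignedAccess<16>: 24 % 16 != 0                  -> fails
//   AlignedAccess< 8>: 24 % 8 == 0 (pointer aligned) -> passes
//   => matrixVectorOpImpl<double, 8 / sizeof(double) = 1, ...>, i.e. scalar accesses.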

///@todo: come up with a cleaner interface to support these cases in future!

template <typename Type, int veclen_, typename Lambda, typename IdxType>
__global__ void matrixVectorOpKernel(Type* out,
                                     const Type* matrix,
                                     const Type* vector1,
                                     const Type* vector2,
                                     IdxType D,
                                     IdxType N,
                                     bool rowMajor,
                                     bool bcastAlongRows,
                                     Lambda op)
{
  typedef TxN_t<Type, veclen_> VecType;
  IdxType len = N * D;
  IdxType idx = (threadIdx.x + (blockIdx.x * blockDim.x)) * VecType::Ratio;
  if (idx >= len) return;
  IdxType vIdx;
  VecType mat, vec1, vec2;
  ///@todo: yikes! use fast-int-div here.
  ///@todo: shared mem for vector could help with perf
  if (rowMajor && bcastAlongRows) {
    vIdx = idx % D;
    vec1.load(vector1, vIdx);
    vec2.load(vector2, vIdx);
  } else if (!rowMajor && !bcastAlongRows) {
    vIdx = idx % N;
    vec1.load(vector1, vIdx);
    vec2.load(vector2, vIdx);
  } else if (rowMajor && !bcastAlongRows) {
    vIdx = idx / D;
    vec1.fill(vector1[vIdx]);
    vec2.fill(vector2[vIdx]);
  } else {
    vIdx = idx / N;
    vec1.fill(vector1[vIdx]);
    vec2.fill(vector2[vIdx]);
  }
  mat.load(matrix, idx);
#pragma unroll
  for (int i = 0; i < VecType::Ratio; ++i)
    mat.val.data[i] = op(mat.val.data[i], vec1.val.data[i], vec2.val.data[i]);
  mat.store(out, idx);
}

template <typename Type, int veclen_, typename Lambda, typename IdxType, int TPB>
void matrixVectorOpImpl(Type* out,
                        const Type* matrix,
                        const Type* vec1,
                        const Type* vec2,
                        IdxType D,
                        IdxType N,
                        bool rowMajor,
                        bool bcastAlongRows,
                        Lambda op,
                        cudaStream_t stream)
{
  IdxType nblks = raft::ceildiv(N * D, (IdxType)TPB);
  matrixVectorOpKernel<Type, veclen_, Lambda, IdxType>
    <<<nblks, TPB, 0, stream>>>(out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op);
  RAFT_CUDA_TRY(cudaPeekAtLastError());
  IdxType stride = rowMajor ? D : N;
  IdxType nLines = rowMajor ? N : D;
  return matrix::linewiseOp(
    out, matrix, stride, nLines, rowMajor == bcastAlongRows, op, stream, vec);
}
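The replacement body above maps the legacy arguments onto `matrix::linewiseOp`: the line length is the matrix stride (`D` for row-major, `N` for column-major), `nLines` is the other dimension, and the along-lines flag is true exactly when the broadcast direction follows the storage order. A hedged sketch of calling the new primitive directly, following the argument order shown in the call above (function name `scale_rows` and the sizes are hypothetical):

#include <raft/matrix/matrix.hpp>

// Hypothetical direct call to the new primitive: scale each row of a row-major
// N x D float matrix by a length-D vector (along-lines == true because the
// vector runs along each contiguous line of storage).
void scale_rows(float* d_out, const float* d_mat, const float* d_scale,
                int D, int N, cudaStream_t stream)
{
  raft::matrix::linewiseOp(
    d_out, d_mat,
    D,     // length of one contiguous line (a row, for row-major storage)
    N,     // number of lines
    true,  // the vector is applied along each line
    [] __device__(float m, float s) { return m * s; },
    stream, d_scale);
}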

/**
@@ -250,28 +97,10 @@ void matrixVectorOp(Type* out,
                    Lambda op,
                    cudaStream_t stream)
{
  IdxType stride      = rowMajor ? D : N;
  size_t stride_bytes = stride * sizeof(Type);

  if (AlignedAccess<16>::test(matrix, stride_bytes)) {
    matrixVectorOpImpl<Type, 16 / sizeof(Type), Lambda, IdxType, TPB>(
      out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream);
  } else if (AlignedAccess<8>::test(matrix, stride_bytes)) {
    matrixVectorOpImpl<Type, 8 / sizeof(Type), Lambda, IdxType, TPB>(
      out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream);
  } else if (AlignedAccess<4>::test(matrix, stride_bytes)) {
    matrixVectorOpImpl<Type, 4 / sizeof(Type), Lambda, IdxType, TPB>(
      out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream);
  } else if (AlignedAccess<2>::test(matrix, stride_bytes)) {
    matrixVectorOpImpl<Type, 2 / sizeof(Type), Lambda, IdxType, TPB>(
      out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream);
  } else if (AlignedAccess<1>::test(matrix, stride_bytes)) {
    matrixVectorOpImpl<Type, 1 / sizeof(Type), Lambda, IdxType, TPB>(
      out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream);
  } else {
    matrixVectorOpImpl<Type, 1, Lambda, IdxType, TPB>(
      out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream);
  }
  IdxType stride = rowMajor ? D : N;
  IdxType nLines = rowMajor ? N : D;
  return matrix::linewiseOp(
    out, matrix, stride, nLines, rowMajor == bcastAlongRows, op, stream, vec1, vec2);
}
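The two-vector overload follows the same pattern. A typical use is standardizing columns with precomputed statistics; a hedged sketch with hypothetical names (`standardize`, `d_mu`, `d_sigma`), assuming the same call conventions as the single-vector overload:

#include <raft/linalg/matrix_vector_op.cuh>

// Hypothetical call site: standardize each column of a row-major N x D float
// matrix using per-column means and standard deviations (both of length D,
// so they broadcast along the rows).
void standardize(float* d_out, const float* d_mat,
                 const float* d_mu, const float* d_sigma,
                 int D, int N, cudaStream_t stream)
{
  raft::linalg::matrixVectorOp(
    d_out, d_mat, d_mu, d_sigma, D, N,
    true,  // rowMajor
    true,  // bcastAlongRows
    [] __device__(float x, float mu, float sigma) { return (x - mu) / sigma; },
    stream);
}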

}; // end namespace linalg
