From d4a1e69ea42775d7217acd90e69e17817c0b9c1a Mon Sep 17 00:00:00 2001
From: achirkin
Date: Thu, 25 Nov 2021 15:30:57 +0100
Subject: [PATCH] Replace matrixVectorOp implementation with matrixLinewiseOp

---
 cpp/include/raft/linalg/matrix_vector_op.cuh | 181 +------------------
 1 file changed, 6 insertions(+), 175 deletions(-)

diff --git a/cpp/include/raft/linalg/matrix_vector_op.cuh b/cpp/include/raft/linalg/matrix_vector_op.cuh
index ea6e2c02ac..3fcf65389f 100644
--- a/cpp/include/raft/linalg/matrix_vector_op.cuh
+++ b/cpp/include/raft/linalg/matrix_vector_op.cuh
@@ -16,75 +16,11 @@
 
 #pragma once
 
-#include <raft/cuda_utils.cuh>
 #include <raft/linalg/matrix_linewise_op.cuh>
-#include <raft/pow2_utils.cuh>
-#include <raft/vectorized.cuh>
 
 namespace raft {
 namespace linalg {
 
-namespace {
-template <size_t VecBytes>
-struct AlignedAccess {
-  template <typename T>
-  static inline bool test(const T *matrix, size_t strideBytes) {
-    return Pow2<VecBytes>::isAligned(matrix) &&
-           Pow2<VecBytes>::isAligned(strideBytes) &&
-           Pow2<sizeof(T)>::isAligned(VecBytes);
-  }
-};
-};  // namespace
-
-template <typename Type, int veclen_, typename Lambda, typename IdxType,
-          int TPB>
-__global__ void matrixVectorOpKernel(Type *out, const Type *matrix,
-                                     const Type *vector, IdxType D, IdxType N,
-                                     bool rowMajor, bool bcastAlongRows,
-                                     Lambda op) {
-  typedef TxN_t<Type, veclen_> VecType;
-  IdxType len = N * D;
-  IdxType idx = threadIdx.x;
-  idx += (IdxType)blockIdx.x * (IdxType)blockDim.x;
-  idx *= VecType::Ratio;
-  if (idx >= len) return;
-  IdxType vIdx;
-  VecType mat, vec;
-  ///@todo: yikes! use fast-int-div here.
-  ///@todo: shared mem for vector could help with perf
-  if (rowMajor && bcastAlongRows) {
-    vIdx = idx % D;
-    vec.load(vector, vIdx);
-  } else if (!rowMajor && !bcastAlongRows) {
-    vIdx = idx % N;
-    vec.load(vector, vIdx);
-  } else if (rowMajor && !bcastAlongRows) {
-    vIdx = idx / D;
-    vec.fill(vector[vIdx]);
-  } else {
-    vIdx = idx / N;
-    vec.fill(vector[vIdx]);
-  }
-  mat.load(matrix, idx);
-#pragma unroll
-  for (int i = 0; i < VecType::Ratio; ++i)
-    mat.val.data[i] = op(mat.val.data[i], vec.val.data[i]);
-  mat.store(out, idx);
-}
-
-template <typename Type, int veclen_, typename Lambda, typename IdxType,
-          int TPB>
-void matrixVectorOpImpl(Type *out, const Type *matrix, const Type *vec,
-                        IdxType D, IdxType N, bool rowMajor,
-                        bool bcastAlongRows, Lambda op, cudaStream_t stream) {
-  IdxType len = N * D;
-  IdxType nblks =
-    raft::ceildiv(veclen_ ? len / veclen_ : veclen_, (IdxType)TPB);
-  matrixVectorOpKernel<Type, veclen_, Lambda, IdxType, TPB>
-    <<<nblks, TPB, 0, stream>>>(out, matrix, vec, D, N, rowMajor,
-                                bcastAlongRows, op);
-  CUDA_CHECK(cudaPeekAtLastError());
-}
-
 /**
  * @brief Operations for all the columns or rows with a given vector.
  * Caution : Threads process multiple elements to speed up processing. These
@@ -113,87 +49,9 @@ void matrixVectorOp(Type *out, const Type *matrix, const Type *vec, IdxType D,
                     IdxType N, bool rowMajor, bool bcastAlongRows, Lambda op,
                     cudaStream_t stream) {
   IdxType stride = rowMajor ? D : N;
-  // IdxType nLines = rowMajor ? N : D;
-  // return matrixLinewiseOp(out, matrix, stride, nLines,
-  //                         rowMajor == bcastAlongRows, op, stream, vec);
-
-  size_t stride_bytes = stride * sizeof(Type);
-
-  if (AlignedAccess<16>::test(matrix, stride_bytes) &&
-      AlignedAccess<16>::test(out, stride_bytes)) {
-    matrixVectorOpImpl<Type, 16 / sizeof(Type), Lambda, IdxType, TPB>(
-      out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream);
-  } else if (AlignedAccess<8>::test(matrix, stride_bytes) &&
-             AlignedAccess<8>::test(out, stride_bytes)) {
-    matrixVectorOpImpl<Type, 8 / sizeof(Type), Lambda, IdxType, TPB>(
-      out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream);
-  } else if (AlignedAccess<4>::test(matrix, stride_bytes) &&
-             AlignedAccess<4>::test(out, stride_bytes)) {
-    matrixVectorOpImpl<Type, 4 / sizeof(Type), Lambda, IdxType, TPB>(
-      out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream);
-  } else if (AlignedAccess<2>::test(matrix, stride_bytes) &&
-             AlignedAccess<2>::test(out, stride_bytes)) {
-    matrixVectorOpImpl<Type, 2 / sizeof(Type), Lambda, IdxType, TPB>(
-      out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream);
-  } else if (AlignedAccess<1>::test(matrix, stride_bytes) &&
-             AlignedAccess<1>::test(out, stride_bytes)) {
-    matrixVectorOpImpl<Type, 1 / sizeof(Type), Lambda, IdxType, TPB>(
-      out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream);
-  } else {
-    matrixVectorOpImpl<Type, 1, Lambda, IdxType, TPB>(
-      out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream);
-  }
-}
-
-///@todo: come up with a cleaner interface to support these cases in future!
-
-template <typename Type, int veclen_, typename Lambda, typename IdxType,
-          int TPB>
-__global__ void matrixVectorOpKernel(Type *out, const Type *matrix,
-                                     const Type *vector1, const Type *vector2,
-                                     IdxType D, IdxType N, bool rowMajor,
-                                     bool bcastAlongRows, Lambda op) {
-  typedef TxN_t<Type, veclen_> VecType;
-  IdxType len = N * D;
-  IdxType idx = (threadIdx.x + (blockIdx.x * blockDim.x)) * VecType::Ratio;
-  if (idx >= len) return;
-  IdxType vIdx;
-  VecType mat, vec1, vec2;
-  ///@todo: yikes! use fast-int-div here.
-  ///@todo: shared mem for vector could help with perf
-  if (rowMajor && bcastAlongRows) {
-    vIdx = idx % D;
-    vec1.load(vector1, vIdx);
-    vec2.load(vector2, vIdx);
-  } else if (!rowMajor && !bcastAlongRows) {
-    vIdx = idx % N;
-    vec1.load(vector1, vIdx);
-    vec2.load(vector2, vIdx);
-  } else if (rowMajor && !bcastAlongRows) {
-    vIdx = idx / D;
-    vec1.fill(vector1[vIdx]);
-    vec2.fill(vector2[vIdx]);
-  } else {
-    vIdx = idx / N;
-    vec1.fill(vector1[vIdx]);
-    vec2.fill(vector2[vIdx]);
-  }
-  mat.load(matrix, idx);
-#pragma unroll
-  for (int i = 0; i < VecType::Ratio; ++i)
-    mat.val.data[i] = op(mat.val.data[i], vec1.val.data[i], vec2.val.data[i]);
-  mat.store(out, idx);
-}
-
-template <typename Type, int veclen_, typename Lambda, typename IdxType,
-          int TPB>
-void matrixVectorOpImpl(Type *out, const Type *matrix, const Type *vec1,
-                        const Type *vec2, IdxType D, IdxType N, bool rowMajor,
-                        bool bcastAlongRows, Lambda op, cudaStream_t stream) {
-  IdxType nblks = raft::ceildiv(N * D, (IdxType)TPB);
-  matrixVectorOpKernel<Type, veclen_, Lambda, IdxType, TPB>
-    <<<nblks, TPB, 0, stream>>>(out, matrix, vec1, vec2, D, N, rowMajor,
-                                bcastAlongRows, op);
-  CUDA_CHECK(cudaPeekAtLastError());
+  IdxType nLines = rowMajor ? N : D;
+  return matrixLinewiseOp(out, matrix, stride, nLines,
+                          rowMajor == bcastAlongRows, op, stream, vec);
 }
 
 /**
@@ -225,36 +83,9 @@ void matrixVectorOp(Type *out, const Type *matrix, const Type *vec1,
                     const Type *vec2, IdxType D, IdxType N, bool rowMajor,
                     bool bcastAlongRows, Lambda op, cudaStream_t stream) {
   IdxType stride = rowMajor ? D : N;
-  // IdxType nLines = rowMajor ? N : D;
-  // return matrixLinewiseOp(out, matrix, stride, nLines,
-  //                         rowMajor == bcastAlongRows, op, stream, vec1, vec2);
-
-  size_t stride_bytes = stride * sizeof(Type);
-
-  if (AlignedAccess<16>::test(matrix, stride_bytes) &&
-      AlignedAccess<16>::test(out, stride_bytes)) {
-    matrixVectorOpImpl<Type, 16 / sizeof(Type), Lambda, IdxType, TPB>(
-      out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream);
-  } else if (AlignedAccess<8>::test(matrix, stride_bytes) &&
-             AlignedAccess<8>::test(out, stride_bytes)) {
-    matrixVectorOpImpl<Type, 8 / sizeof(Type), Lambda, IdxType, TPB>(
-      out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream);
-  } else if (AlignedAccess<4>::test(matrix, stride_bytes) &&
-             AlignedAccess<4>::test(out, stride_bytes)) {
-    matrixVectorOpImpl<Type, 4 / sizeof(Type), Lambda, IdxType, TPB>(
-      out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream);
-  } else if (AlignedAccess<2>::test(matrix, stride_bytes) &&
-             AlignedAccess<2>::test(out, stride_bytes)) {
-    matrixVectorOpImpl<Type, 2 / sizeof(Type), Lambda, IdxType, TPB>(
-      out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream);
-  } else if (AlignedAccess<1>::test(matrix, stride_bytes) &&
-             AlignedAccess<1>::test(out, stride_bytes)) {
-    matrixVectorOpImpl<Type, 1 / sizeof(Type), Lambda, IdxType, TPB>(
-      out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream);
-  } else {
-    matrixVectorOpImpl<Type, 1, Lambda, IdxType, TPB>(
-      out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream);
-  }
+  IdxType nLines = rowMajor ? N : D;
+  return matrixLinewiseOp(out, matrix, stride, nLines,
+                          rowMajor == bcastAlongRows, op, stream, vec1, vec2);
 }
 
 };  // end namespace linalg
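
Note (reviewer context, not part of the patch): the public matrixVectorOp signature is unchanged, so existing callers are unaffected; the alignment-based dispatch that used to live in these wrappers is removed in favour of a single call to matrixLinewiseOp. Below is a minimal usage sketch of the single-vector overload touched above. It assumes only the signature shown in the hunks (out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream); the example sizes, values, and build line are illustrative, and passing a __device__ lambda from host code requires nvcc's --extended-lambda flag.

// example.cu -- adds a length-D vector to every row of a row-major N x D
// matrix, i.e. the rowMajor == bcastAlongRows ("along lines") case above.
// Illustrative build: nvcc --extended-lambda -I<path-to-raft>/cpp/include example.cu
#include <raft/linalg/matrix_vector_op.cuh>

#include <cuda_runtime.h>

#include <cstdio>
#include <vector>

int main() {
  using T = float;
  const int N = 3;  // number of rows
  const int D = 4;  // number of columns (row-major)

  cudaStream_t stream;
  cudaStreamCreate(&stream);

  // Host data: a 3x4 matrix of ones and a length-4 vector to broadcast
  // along each row.
  std::vector<T> h_mat(N * D, 1.0f);
  std::vector<T> h_vec{10.f, 20.f, 30.f, 40.f};

  T *d_mat, *d_vec, *d_out;
  cudaMalloc(&d_mat, N * D * sizeof(T));
  cudaMalloc(&d_vec, D * sizeof(T));
  cudaMalloc(&d_out, N * D * sizeof(T));
  cudaMemcpyAsync(d_mat, h_mat.data(), N * D * sizeof(T),
                  cudaMemcpyHostToDevice, stream);
  cudaMemcpyAsync(d_vec, h_vec.data(), D * sizeof(T),
                  cudaMemcpyHostToDevice, stream);

  // out[i, j] = mat[i, j] + vec[j], broadcast along every row.
  raft::linalg::matrixVectorOp(
    d_out, d_mat, d_vec, D, N, /* rowMajor = */ true,
    /* bcastAlongRows = */ true,
    [] __device__(T mat_el, T vec_el) { return mat_el + vec_el; }, stream);

  std::vector<T> h_out(N * D);
  cudaMemcpyAsync(h_out.data(), d_out, N * D * sizeof(T),
                  cudaMemcpyDeviceToHost, stream);
  cudaStreamSynchronize(stream);
  std::printf("out[0, 0..3] = %g %g %g %g\n", h_out[0], h_out[1], h_out[2],
              h_out[3]);  // expected: 11 21 31 41

  cudaFree(d_mat);
  cudaFree(d_vec);
  cudaFree(d_out);
  cudaStreamDestroy(stream);
  return 0;
}

With rowMajor = true and bcastAlongRows = false the vector would instead have length N and be broadcast down each column (one value per row, cf. the vIdx = idx / D branch of the removed kernel), which corresponds to the rowMajor == bcastAlongRows argument evaluating to false in the new matrixLinewiseOp call.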