diff --git a/cpp/include/raft/linalg/matrix_vector_op.cuh b/cpp/include/raft/linalg/matrix_vector_op.cuh index bd80cf5d02..750eca0742 100644 --- a/cpp/include/raft/linalg/matrix_vector_op.cuh +++ b/cpp/include/raft/linalg/matrix_vector_op.cuh @@ -16,83 +16,11 @@ #pragma once -#include -#include -#include +#include namespace raft { namespace linalg { -namespace { -template -struct AlignedAccess { - template - static inline bool test(const T* matrix, size_t strideBytes) - { - return Pow2::isAligned(matrix) && Pow2::isAligned(strideBytes) && - Pow2::isAligned(VecBytes); - } -}; -}; // namespace - -template -__global__ void matrixVectorOpKernel(Type* out, - const Type* matrix, - const Type* vector, - IdxType D, - IdxType N, - bool rowMajor, - bool bcastAlongRows, - Lambda op) -{ - typedef TxN_t VecType; - IdxType len = N * D; - IdxType idx = threadIdx.x; - idx += (IdxType)blockIdx.x * (IdxType)blockDim.x; - idx *= VecType::Ratio; - if (idx >= len) return; - IdxType vIdx; - VecType mat, vec; - ///@todo: yikes! use fast-int-div here. - ///@todo: shared mem for vector could help with perf - if (rowMajor && bcastAlongRows) { - vIdx = idx % D; - vec.load(vector, vIdx); - } else if (!rowMajor && !bcastAlongRows) { - vIdx = idx % N; - vec.load(vector, vIdx); - } else if (rowMajor && !bcastAlongRows) { - vIdx = idx / D; - vec.fill(vector[vIdx]); - } else { - vIdx = idx / N; - vec.fill(vector[vIdx]); - } - mat.load(matrix, idx); -#pragma unroll - for (int i = 0; i < VecType::Ratio; ++i) - mat.val.data[i] = op(mat.val.data[i], vec.val.data[i]); - mat.store(out, idx); -} - -template -void matrixVectorOpImpl(Type* out, - const Type* matrix, - const Type* vec, - IdxType D, - IdxType N, - bool rowMajor, - bool bcastAlongRows, - Lambda op, - cudaStream_t stream) -{ - IdxType len = N * D; - IdxType nblks = raft::ceildiv(veclen_ ? len / veclen_ : veclen_, (IdxType)TPB); - matrixVectorOpKernel - <<>>(out, matrix, vec, D, N, rowMajor, bcastAlongRows, op); - RAFT_CUDA_TRY(cudaPeekAtLastError()); -} - /** * @brief Operations for all the columns or rows with a given vector. * Caution : Threads process multiple elements to speed up processing. These @@ -127,91 +55,10 @@ void matrixVectorOp(Type* out, Lambda op, cudaStream_t stream) { - IdxType stride = rowMajor ? D : N; - size_t stride_bytes = stride * sizeof(Type); - - if (AlignedAccess<16>::test(matrix, stride_bytes)) { - matrixVectorOpImpl( - out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream); - } else if (AlignedAccess<8>::test(matrix, stride_bytes)) { - matrixVectorOpImpl( - out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream); - } else if (AlignedAccess<4>::test(matrix, stride_bytes)) { - matrixVectorOpImpl( - out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream); - } else if (AlignedAccess<2>::test(matrix, stride_bytes)) { - matrixVectorOpImpl( - out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream); - } else if (AlignedAccess<1>::test(matrix, stride_bytes)) { - matrixVectorOpImpl( - out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream); - } else { - matrixVectorOpImpl( - out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream); - } -} - -///@todo: come up with a cleaner interface to support these cases in future! 
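// Editor's illustration (not part of the patch): the public matrixVectorOp entry point keeps its
// signature through this refactor; only its body (further below) now forwards to
// matrix::linewiseOp. A minimal caller-side sketch with hypothetical device pointers `out`,
// `matrix`, `vec` and a valid cudaStream_t; D = number of columns, N = number of rows of a
// row-major matrix. Must live in a .cu file compiled with extended lambdas.
#include <raft/linalg/matrix_vector_op.cuh>

inline void add_vec_to_each_row(
  float* out, const float* matrix, const float* vec, int D, int N, cudaStream_t stream)
{
  raft::linalg::matrixVectorOp(
    out, matrix, vec, D, N,
    /* rowMajor = */ true, /* bcastAlongRows = */ true,
    [] __device__(float m, float v) { return m + v; },  // op(matrix element, vector element)
    stream);
}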
- -template -__global__ void matrixVectorOpKernel(Type* out, - const Type* matrix, - const Type* vector1, - const Type* vector2, - IdxType D, - IdxType N, - bool rowMajor, - bool bcastAlongRows, - Lambda op) -{ - typedef TxN_t VecType; - IdxType len = N * D; - IdxType idx = (threadIdx.x + (blockIdx.x * blockDim.x)) * VecType::Ratio; - if (idx >= len) return; - IdxType vIdx; - VecType mat, vec1, vec2; - ///@todo: yikes! use fast-int-div here. - ///@todo: shared mem for vector could help with perf - if (rowMajor && bcastAlongRows) { - vIdx = idx % D; - vec1.load(vector1, vIdx); - vec2.load(vector2, vIdx); - } else if (!rowMajor && !bcastAlongRows) { - vIdx = idx % N; - vec1.load(vector1, vIdx); - vec2.load(vector2, vIdx); - } else if (rowMajor && !bcastAlongRows) { - vIdx = idx / D; - vec1.fill(vector1[vIdx]); - vec2.fill(vector2[vIdx]); - } else { - vIdx = idx / N; - vec1.fill(vector1[vIdx]); - vec2.fill(vector2[vIdx]); - } - mat.load(matrix, idx); -#pragma unroll - for (int i = 0; i < VecType::Ratio; ++i) - mat.val.data[i] = op(mat.val.data[i], vec1.val.data[i], vec2.val.data[i]); - mat.store(out, idx); -} - -template -void matrixVectorOpImpl(Type* out, - const Type* matrix, - const Type* vec1, - const Type* vec2, - IdxType D, - IdxType N, - bool rowMajor, - bool bcastAlongRows, - Lambda op, - cudaStream_t stream) -{ - IdxType nblks = raft::ceildiv(N * D, (IdxType)TPB); - matrixVectorOpKernel - <<>>(out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op); - RAFT_CUDA_TRY(cudaPeekAtLastError()); + IdxType stride = rowMajor ? D : N; + IdxType nLines = rowMajor ? N : D; + return matrix::linewiseOp( + out, matrix, stride, nLines, rowMajor == bcastAlongRows, op, stream, vec); } /** @@ -250,28 +97,10 @@ void matrixVectorOp(Type* out, Lambda op, cudaStream_t stream) { - IdxType stride = rowMajor ? D : N; - size_t stride_bytes = stride * sizeof(Type); - - if (AlignedAccess<16>::test(matrix, stride_bytes)) { - matrixVectorOpImpl( - out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream); - } else if (AlignedAccess<8>::test(matrix, stride_bytes)) { - matrixVectorOpImpl( - out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream); - } else if (AlignedAccess<4>::test(matrix, stride_bytes)) { - matrixVectorOpImpl( - out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream); - } else if (AlignedAccess<2>::test(matrix, stride_bytes)) { - matrixVectorOpImpl( - out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream); - } else if (AlignedAccess<1>::test(matrix, stride_bytes)) { - matrixVectorOpImpl( - out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream); - } else { - matrixVectorOpImpl( - out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream); - } + IdxType stride = rowMajor ? D : N; + IdxType nLines = rowMajor ? N : D; + return matrix::linewiseOp( + out, matrix, stride, nLines, rowMajor == bcastAlongRows, op, stream, vec1, vec2); } }; // end namespace linalg diff --git a/cpp/include/raft/matrix/detail/linewise_op.cuh b/cpp/include/raft/matrix/detail/linewise_op.cuh new file mode 100644 index 0000000000..63fa872f9d --- /dev/null +++ b/cpp/include/raft/matrix/detail/linewise_op.cuh @@ -0,0 +1,546 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +namespace raft { +namespace matrix { +namespace detail { + +template +struct Linewise { + static constexpr IdxType VecElems = VecBytes / sizeof(Type); + + typedef raft::TxN_t Vec; + typedef raft::Pow2 AlignBytes; + typedef raft::Pow2 AlignElems; + typedef raft::Pow2 AlignWarp; + + /** + * Compute op(matrix_in, vec_1, vec_2, ...) where vectors are applied across the + * matrix rows (one vector element per matrix row). + * + * It's assumed that `in` and `out` are aligned to the cuda-vector-size, + * and their length is multiple of that. + * + * Block work arrangement: blocked; + * one warp works on a contiguous chunk of a matrix. Since the matrix is represented + * as a flat array, such an arangement minimizes the number of times when a single + * thread needs to reload the vector value at an index corresponding to the current + * matrix row. Ideally, a thread would load a value from a vector only once, but that + * is not possible if the vector size (= number of matrix rows) is too small or not + * aligned with the cuda-vector-size. + * + * Note about rowDiv/rowMod: + * these two represent the row/column indices in the original input matrices, before + * it was converted to (Vec::io_t*) type (which possibly involves shifting a pointer + * a bit to align to the cuda-vector-size). Thus, they are used to track the index for + * the argument vectors only (the vector pointers are not altered in any way). + * + * + * @tparam Vecs a pack of pointers to vectors (Type*) + * @param [out] out (aligned part of) the output matrix + * @param [in] in (aligned part of) the input matrix + * @param [in] in_end end of the (aligned part of the) input matrix + * @param [in] rowLen number of elements in a row (NOT the vector size) + * @param [in] rowDiv the index in the vectors (= row num in the original unaligned input matrix) + * @param [in] rowMod the index within a row in the original unaligned input matrix. + * @param [in] op the function to apply + * @param [in] vecs pointers to the argument vectors. + * + */ + template + static __device__ __forceinline__ void vectorCols(typename Vec::io_t* out, + const typename Vec::io_t* in, + const typename Vec::io_t* in_end, + const IdxType rowLen, + IdxType rowDiv, + IdxType rowMod, + Lambda op, + Vecs... vecs) noexcept + { + constexpr IdxType warpPad = (AlignWarp::Value - 1) * VecElems; + Type args[sizeof...(Vecs)]; + Vec v, w; + bool update = true; + for (; in < in_end; in += AlignWarp::Value, out += AlignWarp::Value, rowMod += warpPad) { + v.val.internal = __ldcv(in); + while (rowMod >= rowLen) { + rowMod -= rowLen; + rowDiv++; + update = true; + } + if (update) { + int l = 0; + ((args[l] = vecs[rowDiv], l++), ...); + update = false; + } +#pragma unroll VecElems + for (int k = 0; k < VecElems; k++, rowMod++) { + if (rowMod == rowLen) { + rowMod = 0; + rowDiv++; + int l = 0; + ((args[l] = vecs[rowDiv], l++), ...); + } + int l = 0; + w.val.data[k] = op(v.val.data[k], (std::ignore = vecs, args[l++])...); + } + *out = w.val.internal; + } + } + + /** + * Compute op(matrix_in, vec_1, vec_2, ...) 
where vectors are applied along + * matrix rows (vector and matrix indices are 1-1). + * + * It's assumed that `in` and `out` are aligned to the cuda-vector-size, + * and their length is multiple of that. + * + * Block work arrangement: striped; + * the grid size is chosen in such a way, that one thread always processes + * the same vector elements. That's why there is no need to read the + * vector arguments multiple times. + * + * @tparam Args a pack of raft::TxN_t + * @param [out] out (aligned part of) the output matrix + * @param [in] in (aligned part of) the input matrix + * @param [in] len total length of (the aligned part of) the input/output matrices + * @param [in] op the function to apply + * @param [in] args the cuda-vector-sized chunks on input vectors (raft::TxN_t) + */ + template + static __device__ __forceinline__ void vectorRows(typename Vec::io_t* out, + const typename Vec::io_t* in, + const IdxType len, + Lambda op, + Args... args) noexcept + { + Vec v; + const IdxType d = BlockSize * gridDim.x; + for (IdxType i = threadIdx.x + blockIdx.x * BlockSize; i < len; i += d) { + v.val.internal = __ldcv(in + i); +#pragma unroll VecElems + for (int k = 0; k < VecElems; k++) + v.val.data[k] = op(v.val.data[k], args.val.data[k]...); + __stwt(out + i, v.val.internal); + } + } + + /** + * The helper for `vectorRows`. Loads the `raft::TxN_t` chunk + * of a vector. Most of the time this is not aligned, so we load it thread-striped + * within a block and then use the shared memory to get a contiguous chunk. + * + * @param [in] shm a shared memory region for rearranging the data among threads + * @param [in] p pointer to a vector + * @param [in] blockOffset the offset of the current block into a vector. + * @param [in] rowLen the length of a vector. + * @return a contiguous chunk of a vector, suitable for `vectorRows`. + */ + static __device__ __forceinline__ Vec loadVec(Type* shm, + const Type* p, + const IdxType blockOffset, + const IdxType rowLen) noexcept + { + IdxType j = blockOffset + threadIdx.x; +#pragma unroll VecElems + for (int k = threadIdx.x; k < VecElems * BlockSize; k += BlockSize, j += BlockSize) { + while (j >= rowLen) + j -= rowLen; + shm[k] = p[j]; + } + __syncthreads(); + { + Vec out; + out.val.internal = reinterpret_cast(shm)[threadIdx.x]; + return out; + } + } +}; + +/** + * This kernel prepares the inputs for the `vectorCols` function where the most of the + * work happens; see `vectorCols` for details. + * + * The work arrangement is blocked; a single block works on a contiguous chunk of flattened + * matrix data and does not care about the gridDim. + * + * @param [out] out the output matrix + * @param [in] in the input matrix + * @param [in] arrOffset such an offset into the matrices that makes them aligned to the + * cuda-vector-size + * @param [in] rowLen number of elements in a row (NOT the vector size) + * @param [in] len the total length of the aligned part of the matrices + * @param [in] elemsPerThread how many elements are processed by a single thread in total + * @param [in] op the function to apply + * @param [in] vecs pointers to the argument vectors + */ +template +__global__ void __launch_bounds__(BlockSize) + matrixLinewiseVecColsMainKernel(Type* out, + const Type* in, + const IdxType arrOffset, + const IdxType rowLen, + const IdxType len, + const IdxType elemsPerThread, + Lambda op, + Vecs... 
vecs) +{ + typedef Linewise L; + + IdxType t = L::AlignWarp::mod(threadIdx.x); + t = arrOffset + elemsPerThread * (blockIdx.x * BlockSize + threadIdx.x - t) + t * L::VecElems; + + return L::vectorCols(reinterpret_cast(out + t), + reinterpret_cast(in + t), + reinterpret_cast( + in + min(t + elemsPerThread * L::AlignWarp::Value, len)), + rowLen, + t / rowLen, + t % rowLen, + op, + vecs...); +} + +/** + * This kernel is similar to `matrixLinewiseVecColsMainKernel`, but processes only the unaligned + * head and tail parts of the matrix. + * This kernel is always launched in just two blocks; the first block processes the head of the + * matrix, the second block processes the tail. It uses the same `vectorCols` function, but + * sets `VecElems = 1` + * + * @param [out] out the output matrix + * @param [in] in the input matrix + * @param [in] arrOffset the length of the unaligned head - such an offset into the matrices that + * makes them aligned to the `VecBytes` + * @param [in] arrTail the offset to the unaligned tail + * @param [in] rowLen number of elements in a row (NOT the vector size) + * @param [in] len the total length of the matrices (rowLen * nRows) + * @param [in] op the function to apply + * @param [in] vecs pointers to the argument vectors + */ +template +__global__ void __launch_bounds__(MaxOffset, 2) + matrixLinewiseVecColsTailKernel(Type* out, + const Type* in, + const IdxType arrOffset, + const IdxType arrTail, + const IdxType rowLen, + const IdxType len, + Lambda op, + Vecs... vecs) +{ + // Note, L::VecElems == 1 + typedef Linewise L; + IdxType threadOffset, elemsPerWarp; + if (blockIdx.x == 0) { + // first block: offset = 0, length = arrOffset + threadOffset = threadIdx.x; + elemsPerWarp = threadOffset < arrOffset; + } else { + // second block: offset = arrTail, length = len - arrTail + threadOffset = arrTail + threadIdx.x; + elemsPerWarp = threadOffset < len; + } + const IdxType rowDiv = threadOffset / rowLen; + const IdxType rowMod = threadOffset % rowLen; + return L::vectorCols( + reinterpret_cast(out + threadOffset), + reinterpret_cast(in + threadOffset), + reinterpret_cast(in + threadOffset + elemsPerWarp), + rowLen, + rowDiv, + rowMod, + op, + vecs...); +} + +/** + * This kernel prepares the inputs for the `vectorRows` function where the most of the + * work happens; see `vectorRows` for details. + * + * The work arrangement is striped; the gridDim should be selected in such a way, that + * on each iteration a thread processes the same indices along rows: + * `(gridDim.x * BlockSize * VecElems) % rowLen == 0`. + * + * @param [out] out the start of the *aligned* part of the output matrix + * @param [in] in the start of the *aligned* part of the input matrix + * @param [in] arrOffset such an offset into the matrices that makes them aligned to `VecBytes` + * @param [in] rowLen number of elements in a row (= the vector size) + * @param [in] len the total length of the aligned part of the matrices + * @param [in] op the function to apply + * @param [in] vecs pointers to the argument vectors + */ +template +__global__ void __launch_bounds__(BlockSize) + matrixLinewiseVecRowsMainKernel(Type* out, + const Type* in, + const IdxType arrOffset, + const IdxType rowLen, + const IdxType len, + Lambda op, + Vecs... vecs) +{ + typedef Linewise L; + constexpr uint workSize = L::VecElems * BlockSize; + uint workOffset = workSize; + __shared__ alignas(sizeof(Type) * L::VecElems) + Type shm[workSize * ((sizeof...(Vecs)) > 1 ? 
2 : 1)]; + const IdxType blockOffset = (arrOffset + BlockSize * L::VecElems * blockIdx.x) % rowLen; + return L::vectorRows( + reinterpret_cast(out), + reinterpret_cast(in), + L::AlignElems::div(len), + op, + (workOffset ^= workSize, L::loadVec(shm + workOffset, vecs, blockOffset, rowLen))...); +} + +/** + * This kernel is similar to `matrixLinewiseVecRowsMainKernel`, but processes only the unaligned + * head and tail parts of the matrix. + * This kernel is always launched in just two blocks; the first block processes the head of the + * matrix, the second block processes the tail. It uses the same `vectorRows` function, but + * sets `VecElems = 1` + * + * @param [out] out the output matrix + * @param [in] in the input matrix + * @param [in] arrOffset the length of the unaligned head - such an offset into the matrices that + * makes them aligned to the `VecBytes` + * @param [in] arrTail the offset to the unaligned tail + * @param [in] rowLen number of elements in a row (= the vector size) + * @param [in] len the total length of the matrices (rowLen * nRows) + * @param [in] op the function to apply + * @param [in] vecs pointers to the argument vectors + */ +template +__global__ void __launch_bounds__(MaxOffset, 2) + matrixLinewiseVecRowsTailKernel(Type* out, + const Type* in, + const IdxType arrOffset, + const IdxType arrTail, + const IdxType rowLen, + const IdxType len, + Lambda op, + Vecs... vecs) +{ + // Note, L::VecElems == 1 + constexpr uint workSize = MaxOffset; + uint workOffset = workSize; + __shared__ Type shm[workSize * ((sizeof...(Vecs)) > 1 ? 2 : 1)]; + typedef Linewise L; + if (blockIdx.x == 0) { + // first block: offset = 0, length = arrOffset + L::vectorRows(reinterpret_cast(out), + reinterpret_cast(in), + arrOffset, + op, + (workOffset ^= workSize, L::loadVec(shm + workOffset, vecs, 0, rowLen))...); + } else { + // second block: offset = arrTail, length = len - arrTail + // NB: I substract MaxOffset (= blockDim.x) to get the correct indexing for block 1 + L::vectorRows( + reinterpret_cast(out + arrTail - MaxOffset), + reinterpret_cast(in + arrTail - MaxOffset), + len - arrTail + MaxOffset, + op, + (workOffset ^= workSize, L::loadVec(shm + workOffset, vecs, arrTail % rowLen, rowLen))...); + } +} + +/** Fully occupy GPU this many times for better work balancing. */ +static inline constexpr uint OptimalSmOccupancy = 16; + +/** + * Calculate the grid size to be `OptimalSmOccupancy * FullyOccupiedGPU`, where `FullyOccupiedGPU` + * is the maximum number of blocks fitting in all available SMs. + * + * @tparam BlockSize blockDim of the kernel. + * @return OptimalSmOccupancy * FullyOccupiedGPU + */ +template +inline uint getOptimalGridSize() +{ + int devId, smCount, maxBlockSize; + RAFT_CUDA_TRY(cudaGetDevice(&devId)); + RAFT_CUDA_TRY(cudaDeviceGetAttribute(&smCount, cudaDevAttrMultiProcessorCount, devId)); + RAFT_CUDA_TRY(cudaDeviceGetAttribute(&maxBlockSize, cudaDevAttrMaxThreadsPerBlock, devId)); + return OptimalSmOccupancy * static_cast(smCount * maxBlockSize / BlockSize); +} + +template +void matrixLinewiseVecCols(Type* out, + const Type* in, + const IdxType rowLen, + const IdxType nRows, + Lambda op, + cudaStream_t stream, + Vecs... 
vecs) +{ + typedef raft::Pow2 AlignBytes; + constexpr std::size_t VecElems = VecBytes / sizeof(Type); + const IdxType totalLen = rowLen * nRows; + const Type* alignedStart = AlignBytes::roundUp(in); + const IdxType alignedOff = IdxType(alignedStart - in); + const IdxType alignedEnd = IdxType(AlignBytes::roundDown(in + totalLen) - in); + const IdxType alignedLen = alignedEnd - alignedOff; + if (alignedLen > 0) { + constexpr dim3 bs(BlockSize, 1, 1); + // Minimum size of the grid to make the device well occupied + const uint occupy = getOptimalGridSize(); + // does not make sense to have more blocks than this + const uint maxBlocks = raft::ceildiv(uint(alignedLen), bs.x * VecElems); + const dim3 gs(min(maxBlocks, occupy), 1, 1); + // The work arrangement is blocked on the block and warp levels; + // see more details at Linewise::vectorCols. + // The value below determines how many scalar elements are processed by on thread in total. + const IdxType elemsPerThread = + raft::ceildiv(alignedLen, gs.x * VecElems * BlockSize) * VecElems; + matrixLinewiseVecColsMainKernel + <<>>(out, in, alignedOff, rowLen, alignedLen, elemsPerThread, op, vecs...); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + } + if (alignedLen < totalLen) { + // should be not smaller than the warp size for better branching + constexpr std::size_t MaxOffset = std::max(std::size_t(raft::WarpSize), VecBytes); + matrixLinewiseVecColsTailKernel + <<>>( + out, in, alignedOff, alignedEnd, rowLen, totalLen, op, vecs...); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + } +} + +template +void matrixLinewiseVecRows(Type* out, + const Type* in, + const IdxType rowLen, + const IdxType nRows, + Lambda op, + cudaStream_t stream, + Vecs... vecs) +{ + typedef raft::Pow2 AlignBytes; + constexpr std::size_t VecElems = VecBytes / sizeof(Type); + const IdxType totalLen = rowLen * nRows; + const Type* alignedStart = AlignBytes::roundUp(in); + const IdxType alignedOff = IdxType(alignedStart - in); + const IdxType alignedEnd = IdxType(AlignBytes::roundDown(in + totalLen) - in); + const IdxType alignedLen = alignedEnd - alignedOff; + if (alignedLen > 0) { + constexpr dim3 bs(BlockSize, 1, 1); + // The work arrangement is striped; + // see more details at Linewise::vectorRows. + // Below is the work amount performed by one block in one iteration. + constexpr uint block_work_size = bs.x * uint(VecElems); + /* Here I would define `grid_work_size = lcm(block_work_size, rowLen)` (Least Common Multiple) + This way, the grid spans a set of one or more rows each iteration, and, most importantly, + on every iteration each row processes the same set of indices within a row (= the same set + of vector indices). + This means, each block needs to load the values from the vector arguments only once. + Sadly, sometimes `grid_work_size > rowLen*nRows`, and sometimes grid_work_size > UINT_MAX. + That's why I don't declare it here explicitly. 
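     (Editor's illustration with made-up numbers, not part of the original comment: for
      block_work_size = 256 * 4 = 1024 and rowLen = 1536, lcm(1024, 1536) = 3072, so the grid
      re-aligns with row boundaries only every 3 blocks = 2 full rows; accordingly the value
      computed below is expected_grid_size = 1536 / gcd(1024, 1536) = 1536 / 512 = 3.)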
+ Instead, I straightaway compute the + expected_grid_size = lcm(block_work_size, rowLen) / block_work_size + */ + const uint expected_grid_size = rowLen / raft::gcd(block_work_size, uint(rowLen)); + // Minimum size of the grid to make the device well occupied + const uint occupy = getOptimalGridSize(); + const dim3 gs(min( + // does not make sense to have more blocks than this + raft::ceildiv(uint(totalLen), block_work_size), + // increase the grid size to be not less than `occupy` while + // still being the multiple of `expected_grid_size` + raft::ceildiv(occupy, expected_grid_size) * expected_grid_size), + 1, + 1); + + matrixLinewiseVecRowsMainKernel + <<>>( + out + alignedOff, alignedStart, alignedOff, rowLen, alignedLen, op, vecs...); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + } + if (alignedLen < totalLen) { + // should be not smaller than the warp size for better branching + constexpr std::size_t MaxOffset = std::max(std::size_t(raft::WarpSize), VecBytes); + matrixLinewiseVecRowsTailKernel + <<>>( + out, in, alignedOff, alignedEnd, rowLen, totalLen, op, vecs...); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + } +} + +/** + * Select one of the implementations: + * a. vectors applied along/across lines + * b. recursively try different VecBytes, such that alignments of `in` and `out` + * are the same. + * + * @tparam VecBytes - size of the load/store ops in bytes. + * @tparam BlockSize - is fixed and should not affect the performance. + */ +template +struct MatrixLinewiseOp { + template + static void run(Type* out, + const Type* in, + const IdxType lineLen, + const IdxType nLines, + const bool alongLines, + Lambda op, + cudaStream_t stream, + Vecs... vecs) + { + if constexpr (VecBytes > sizeof(Type)) { + if (!raft::Pow2::areSameAlignOffsets(in, out)) + return MatrixLinewiseOp> 1), sizeof(Type)), BlockSize>::run( + out, in, lineLen, nLines, alongLines, op, stream, vecs...); + } + if (alongLines) + return matrixLinewiseVecRows( + out, in, lineLen, nLines, op, stream, vecs...); + else + return matrixLinewiseVecCols( + out, in, lineLen, nLines, op, stream, vecs...); + } +}; + +} // end namespace detail +} // end namespace matrix +} // end namespace raft diff --git a/cpp/include/raft/matrix/matrix.hpp b/cpp/include/raft/matrix/matrix.hpp index d3d98cb872..e3e2f88d14 100644 --- a/cpp/include/raft/matrix/matrix.hpp +++ b/cpp/include/raft/matrix/matrix.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021, NVIDIA CORPORATION. + * Copyright (c) 2018-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,8 +16,11 @@ #pragma once +#include "detail/linewise_op.cuh" #include "detail/matrix.cuh" +#include + namespace raft { namespace matrix { @@ -223,5 +226,48 @@ m_t getL2Norm(const raft::handle_t& handle, m_t* in, idx_t size, cudaStream_t st return detail::getL2Norm(handle, in, size, stream); } +/** + * Run a function over matrix lines (rows or columns) with a variable number + * row-vectors or column-vectors. + * The term `line` here signifies that the lines can be either columns or rows, + * depending on the matrix layout. + * What matters is if the vectors are applied along lines (indices of vectors correspond to + * indices within lines), or across lines (indices of vectors correspond to line numbers). + * + * @param [out] out result of the operation; can be same as `in`; should be aligned the same + * as `in` to allow faster vectorized memory transfers. 
+ * @param [in] in input matrix consisting of `nLines` lines, each `lineLen`-long. + * @param [in] lineLen length of matrix line in elements (`=nCols` in row-major or `=nRows` in + * col-major) + * @param [in] nLines number of matrix lines (`=nRows` in row-major or `=nCols` in col-major) + * @param [in] alongLines whether vectors are indices along or across lines. + * @param [in] op the operation applied on each line: + * for i in [0..lineLen) and j in [0..nLines): + * out[i, j] = op(in[i, j], vec1[i], vec2[i], ... veck[i]) if alongLines = true + * out[i, j] = op(in[i, j], vec1[j], vec2[j], ... veck[j]) if alongLines = false + * where matrix indexing is row-major ([i, j] = [i + lineLen * j]). + * @param [in] stream a cuda stream for the kernels + * @param [in] vecs zero or more vectors to be passed as arguments, + * size of each vector is `alongLines ? lineLen : nLines`. + */ +template +void linewiseOp(m_t* out, + const m_t* in, + const idx_t lineLen, + const idx_t nLines, + const bool alongLines, + Lambda op, + cudaStream_t stream, + Vecs... vecs) +{ + common::nvtx::range fun_scope("linewiseOp-%c-%zu (%zu, %zu)", + alongLines ? 'l' : 'x', + sizeof...(Vecs), + size_t(lineLen), + size_t(nLines)); + detail::MatrixLinewiseOp<16, 256>::run( + out, in, lineLen, nLines, alongLines, op, stream, vecs...); +} + }; // end namespace matrix }; // end namespace raft diff --git a/cpp/include/raft/pow2_utils.cuh b/cpp/include/raft/pow2_utils.cuh index 56a3192f9f..93f81db1ac 100644 --- a/cpp/include/raft/pow2_utils.cuh +++ b/cpp/include/raft/pow2_utils.cuh @@ -35,7 +35,9 @@ struct Pow2 { static_assert(std::is_integral::value, "Value must be integral."); static_assert(Value && !(Value & Mask), "Value must be power of two."); -#define Pow2_IsRepresentableAs(I) (std::is_integral::value && Type(I(Value)) == Value) +#define Pow2_FUNC_QUALIFIER static constexpr __host__ __device__ __forceinline__ +#define Pow2_WHEN_INTEGRAL(I) std::enable_if_t +#define Pow2_IS_REPRESENTABLE_AS(I) (std::is_integral::value && Type(I(Value)) == Value) /** * Integer division by Value truncated toward zero @@ -44,7 +46,7 @@ struct Pow2 { * Invariant: `x = Value * quot(x) + rem(x)` */ template - static constexpr HDI std::enable_if_t quot(I x) noexcept + Pow2_FUNC_QUALIFIER Pow2_WHEN_INTEGRAL(I) quot(I x) noexcept { if constexpr (std::is_signed::value) return (x >> I(Log2)) + (x < 0 && (x & I(Mask))); if constexpr (std::is_unsigned::value) return x >> I(Log2); @@ -57,7 +59,7 @@ struct Pow2 { * Invariant: `x = Value * quot(x) + rem(x)`. */ template - static constexpr HDI std::enable_if_t rem(I x) noexcept + Pow2_FUNC_QUALIFIER Pow2_WHEN_INTEGRAL(I) rem(I x) noexcept { if constexpr (std::is_signed::value) return x < 0 ? -((-x) & I(Mask)) : (x & I(Mask)); if constexpr (std::is_unsigned::value) return x & I(Mask); @@ -74,7 +76,7 @@ struct Pow2 { * compared to normal C++ operators `/` and `%`. */ template - static constexpr HDI std::enable_if_t div(I x) noexcept + Pow2_FUNC_QUALIFIER Pow2_WHEN_INTEGRAL(I) div(I x) noexcept { return x >> I(Log2); } @@ -91,7 +93,7 @@ struct Pow2 { * compared to normal C++ operators `/` and `%`. */ template - static constexpr HDI std::enable_if_t mod(I x) noexcept + Pow2_FUNC_QUALIFIER Pow2_WHEN_INTEGRAL(I) mod(I x) noexcept { return x & I(Mask); } @@ -105,25 +107,25 @@ struct Pow2 { * NB: for pointers, the alignment is checked in bytes, not in elements. 
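   * (Editor's illustration, not part of the original comment: with Value = 16,
   *  Pow2<16>::isAligned(p) for a float* p is true iff the byte address of p is a multiple of 16,
   *  i.e. iff p sits on a 128-bit / 4-float boundary; for an integral, e.g.
   *  Pow2<16>::isAligned(std::size_t(20)), the value 20 itself is tested, giving false.)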
*/ template - static constexpr HDI bool isAligned(PtrT p) noexcept + Pow2_FUNC_QUALIFIER bool isAligned(PtrT p) noexcept { Pow2_CHECK_TYPE(PtrT); - if constexpr (Pow2_IsRepresentableAs(PtrT)) return mod(p) == 0; - if constexpr (!Pow2_IsRepresentableAs(PtrT)) return mod(reinterpret_cast(p)) == 0; + if constexpr (Pow2_IS_REPRESENTABLE_AS(PtrT)) return mod(p) == 0; + if constexpr (!Pow2_IS_REPRESENTABLE_AS(PtrT)) return mod(reinterpret_cast(p)) == 0; } /** Tell whether two pointers have the same address modulo Value. */ template - static constexpr HDI bool areSameAlignOffsets(PtrT a, PtrS b) noexcept + Pow2_FUNC_QUALIFIER bool areSameAlignOffsets(PtrT a, PtrS b) noexcept { Pow2_CHECK_TYPE(PtrT); Pow2_CHECK_TYPE(PtrS); Type x, y; - if constexpr (Pow2_IsRepresentableAs(PtrT)) + if constexpr (Pow2_IS_REPRESENTABLE_AS(PtrT)) x = Type(mod(a)); else x = mod(reinterpret_cast(a)); - if constexpr (Pow2_IsRepresentableAs(PtrS)) + if constexpr (Pow2_IS_REPRESENTABLE_AS(PtrS)) y = Type(mod(b)); else y = mod(reinterpret_cast(b)); @@ -132,29 +134,31 @@ struct Pow2 { /** Get this or next Value-aligned address (in bytes) or integral. */ template - static constexpr HDI PtrT roundUp(PtrT p) noexcept + Pow2_FUNC_QUALIFIER PtrT roundUp(PtrT p) noexcept { Pow2_CHECK_TYPE(PtrT); - if constexpr (Pow2_IsRepresentableAs(PtrT)) return p + PtrT(Mask) - mod(p + PtrT(Mask)); - if constexpr (!Pow2_IsRepresentableAs(PtrT)) { + if constexpr (Pow2_IS_REPRESENTABLE_AS(PtrT)) return (p + PtrT(Mask)) & PtrT(~Mask); + if constexpr (!Pow2_IS_REPRESENTABLE_AS(PtrT)) { auto x = reinterpret_cast(p); - return reinterpret_cast(x + Mask - mod(x + Mask)); + return reinterpret_cast((x + Mask) & (~Mask)); } } /** Get this or previous Value-aligned address (in bytes) or integral. */ template - static constexpr HDI PtrT roundDown(PtrT p) noexcept + Pow2_FUNC_QUALIFIER PtrT roundDown(PtrT p) noexcept { Pow2_CHECK_TYPE(PtrT); - if constexpr (Pow2_IsRepresentableAs(PtrT)) return p - mod(p); - if constexpr (!Pow2_IsRepresentableAs(PtrT)) { + if constexpr (Pow2_IS_REPRESENTABLE_AS(PtrT)) return p & PtrT(~Mask); + if constexpr (!Pow2_IS_REPRESENTABLE_AS(PtrT)) { auto x = reinterpret_cast(p); - return reinterpret_cast(x - mod(x)); + return reinterpret_cast(x & (~Mask)); } } #undef Pow2_CHECK_TYPE -#undef Pow2_IsRepresentableAs +#undef Pow2_IS_REPRESENTABLE_AS +#undef Pow2_FUNC_QUALIFIER +#undef Pow2_WHEN_INTEGRAL }; }; // namespace raft diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index b37c671525..5bf836e8d5 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -63,6 +63,7 @@ add_executable(test_raft test/linalg/unary_op.cu test/matrix/math.cu test/matrix/matrix.cu + test/matrix/linewise_op.cu test/mr/device/buffer.cpp test/mr/host/buffer.cpp test/mst.cu diff --git a/cpp/test/matrix/linewise_op.cu b/cpp/test/matrix/linewise_op.cu new file mode 100644 index 0000000000..1cd00b8adc --- /dev/null +++ b/cpp/test/matrix/linewise_op.cu @@ -0,0 +1,277 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../linalg/matrix_vector_op.cuh" +#include "../test_utils.h" +#include +#include +#include +#include +#include +#include +#include +#include + +namespace raft { +namespace matrix { + +constexpr std::size_t PTR_PADDING = 128; + +struct LinewiseTestParams { + double tolerance; + std::size_t workSizeBytes; + uint64_t seed; + bool checkCorrectness; + int inAlignOffset; + int outAlignOffset; +}; + +template +struct LinewiseTest : public ::testing::TestWithParam { + const LinewiseTestParams params; + const raft::handle_t handle; + rmm::cuda_stream_view stream; + + LinewiseTest() + : testing::TestWithParam(), + params( + ParamsReader::read(::testing::TestWithParam::GetParam())), + handle(), + stream(handle.get_stream()) + { + } + + void runLinewiseSum( + T* out, const T* in, const I lineLen, const I nLines, const bool alongLines, const T* vec) + { + auto f = [] __device__(T a, T b) -> T { return a + b; }; + matrix::linewiseOp(out, in, lineLen, nLines, alongLines, f, stream, vec); + } + + void runLinewiseSum(T* out, + const T* in, + const I lineLen, + const I nLines, + const bool alongLines, + const T* vec1, + const T* vec2) + { + auto f = [] __device__(T a, T b, T c) -> T { return a + b + c; }; + matrix::linewiseOp(out, in, lineLen, nLines, alongLines, f, stream, vec1, vec2); + } + + rmm::device_uvector genData(size_t workSizeBytes) + { + raft::random::Rng r(params.seed); + const std::size_t workSizeElems = workSizeBytes / sizeof(T); + rmm::device_uvector blob(workSizeElems, stream); + r.uniform(blob.data(), workSizeElems, T(-1.0), T(1.0), stream); + return blob; + } + + /** + * Suggest multiple versions of matrix dimensions (n, m), such that + * + * (2 * n * m + numVectors * m + minUnused) * sizeof(T) <= workSize. + * + * This way I know I can create two matrices and numVectors vectors of size m, + * such that they fit into the allocated workSet. 
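   * (Editor's illustration with concrete numbers, not part of the original comment: for T = float,
   *  workSizeBytes = 1 MiB and numVectors = 2, workSizeElems = 262144 and the padding term
   *  2 * PTR_PADDING * (2 + 2) = 1024 elements, so s = 261120; the "square" solution below is
   *  n = m = floor(0.25 * (sqrt(8 * 261120 + 4) - 2)) = 360, and indeed
   *  2 * 360 * 360 + 2 * 360 = 259920 <= 261120.)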
+ */ + std::vector> suggestDimensions(I numVectors) + { + const std::size_t workSizeElems = params.workSizeBytes / sizeof(T); + std::vector> out; + const double b = double(numVectors); + const double s = double(workSizeElems) - double(PTR_PADDING * 2 * (2 + b)); + double squareN = 0.25 * (sqrt(8.0 * s + b * b) - b); + + auto solveForN = [s, b](I m) -> double { return (s - b * double(m)) / double(2 * m); }; + auto solveForM = [s, b](I n) -> double { return s / double(2 * n + b); }; + auto addIfMakesSense = [&out](double x, double y) { + if (x <= 0 || y <= 0) return; + I n = I(floor(x)); + I m = I(floor(y)); + if (n > 0 && m > 0) out.push_back(std::make_tuple(n, m)); + }; + std::vector sizes = {15, 16, 17, 256, 257, 263, 1024}; + addIfMakesSense(squareN, squareN); + for (I k : sizes) { + addIfMakesSense(solveForN(k), k); + addIfMakesSense(k, solveForM(k)); + } + + return out; + } + + std::tuple assignSafePtrs(rmm::device_uvector& blob, + I n, + I m) + { + typedef raft::Pow2 Align; + T* out = Align::roundUp(blob.data()) + params.outAlignOffset; + const T* in = + const_cast(Align::roundUp(out + n * m + PTR_PADDING)) + params.inAlignOffset; + const T* vec1 = Align::roundUp(in + n * m + PTR_PADDING); + const T* vec2 = Align::roundUp(vec1 + m + PTR_PADDING); + ASSERT(blob.data() + blob.size() >= vec2 + PTR_PADDING, + "Failed to allocate pointers: the workset is not big enough."); + return std::make_tuple(out, in, vec1, vec2); + } + + testing::AssertionResult run(std::vector>&& dims, rmm::device_uvector&& blob) + { + rmm::device_uvector blob_val(params.checkCorrectness ? blob.size() / 2 : 0, stream); + + stream.synchronize(); + cudaProfilerStart(); + testing::AssertionResult r = testing::AssertionSuccess(); + for (auto [n, m] : dims) { + if (!r) break; + auto [out, in, vec1, vec2] = assignSafePtrs(blob, n, m); + common::nvtx::range dims_scope("Dims-%zu-%zu", std::size_t(n), std::size_t(m)); + for (auto alongRows : ::testing::Bool()) { + common::nvtx::range dir_scope(alongRows ? "alongRows" : "acrossRows"); + auto lineLen = alongRows ? m : n; + auto nLines = alongRows ? n : m; + { + { + common::nvtx::range vecs_scope("one vec"); + runLinewiseSum(out, in, lineLen, nLines, alongRows, vec1); + } + if (params.checkCorrectness) { + linalg::naiveMatVec( + blob_val.data(), in, vec1, lineLen, nLines, true, alongRows, T(1), stream); + r = devArrMatch(blob_val.data(), out, n * m, CompareApprox(params.tolerance)) + << " " << (alongRows ? "alongRows" : "acrossRows") + << " with one vec; lineLen: " << lineLen << "; nLines " << nLines; + if (!r) break; + } + { + common::nvtx::range vecs_scope("two vecs"); + runLinewiseSum(out, in, lineLen, nLines, alongRows, vec1, vec2); + } + if (params.checkCorrectness) { + linalg::naiveMatVec( + blob_val.data(), in, vec1, vec2, lineLen, nLines, true, alongRows, T(1), stream); + r = devArrMatch(blob_val.data(), out, n * m, CompareApprox(params.tolerance)) + << " " << (alongRows ? 
"alongRows" : "acrossRows") + << " with two vecs; lineLen: " << lineLen << "; nLines " << nLines; + if (!r) break; + } + } + } + } + cudaProfilerStop(); + + return r; + } + + testing::AssertionResult run() + { + return run(suggestDimensions(2), genData(params.workSizeBytes)); + } + + testing::AssertionResult runEdgeCases() + { + std::vector sizes = {1, 2, 3, 4, 7, 16}; + std::vector> dims; + for (auto m : sizes) { + for (auto n : sizes) { + dims.push_back(std::make_tuple(n, m)); + dims.push_back(std::make_tuple(m, n)); + } + } + + return run(std::move(dims), genData(1024 * 1024)); + } +}; + +#define TEST_IT(fun, TestClass, ElemType, IndexType) \ + typedef LinewiseTest TestClass##_##ElemType##_##IndexType; \ + TEST_P(TestClass##_##ElemType##_##IndexType, fun) { ASSERT_TRUE(fun()); } \ + INSTANTIATE_TEST_SUITE_P(LinewiseOp, TestClass##_##ElemType##_##IndexType, TestClass##Params) + +auto TinyParams = ::testing::Combine(::testing::Values(0, 1, 2, 4), ::testing::Values(0, 1, 2, 3)); + +struct Tiny { + typedef std::tuple Params; + static LinewiseTestParams read(Params ps) + { + return {/** .tolerance */ 0.00001, + /** .workSizeBytes */ 0 /* not used anyway */, + /** .seed */ 42ULL, + /** .checkCorrectness */ true, + /** .inAlignOffset */ std::get<0>(ps), + /** .outAlignOffset */ std::get<1>(ps)}; + } +}; + +auto MegabyteParams = TinyParams; + +struct Megabyte { + typedef std::tuple Params; + static LinewiseTestParams read(Params ps) + { + return {/** .tolerance */ 0.00001, + /** .workSizeBytes */ 1024 * 1024, + /** .seed */ 42ULL, + /** .checkCorrectness */ true, + /** .inAlignOffset */ std::get<0>(ps), + /** .outAlignOffset */ std::get<1>(ps)}; + } +}; + +auto GigabyteParams = ::testing::Combine(::testing::Values(0, 1, 2), ::testing::Values(0, 1, 2)); + +struct Gigabyte { + typedef std::tuple Params; + static LinewiseTestParams read(Params ps) + { + return {/** .tolerance */ 0.00001, + /** .workSizeBytes */ 1024 * 1024 * 1024, + /** .seed */ 42ULL, + /** .checkCorrectness */ false, + /** .inAlignOffset */ std::get<0>(ps), + /** .outAlignOffset */ std::get<1>(ps)}; + } +}; + +auto TenGigsParams = GigabyteParams; + +struct TenGigs { + typedef std::tuple Params; + static LinewiseTestParams read(Params ps) + { + return {/** .tolerance */ 0.00001, + /** .workSizeBytes */ 10ULL * 1024ULL * 1024ULL * 1024ULL, + /** .seed */ 42ULL, + /** .checkCorrectness */ false, + /** .inAlignOffset */ std::get<0>(ps), + /** .outAlignOffset */ std::get<1>(ps)}; + } +}; + +TEST_IT(runEdgeCases, Tiny, float, int); +TEST_IT(runEdgeCases, Tiny, double, int); +TEST_IT(run, Megabyte, float, int); +TEST_IT(run, Megabyte, double, int); +TEST_IT(run, Gigabyte, float, int); +TEST_IT(run, Gigabyte, double, int); +TEST_IT(run, TenGigs, float, uint64_t); +TEST_IT(run, TenGigs, double, uint64_t); + +} // namespace matrix +} // end namespace raft