diff --git a/cpp/include/raft/linalg/matrix_linewise_op.cuh b/cpp/include/raft/linalg/matrix_linewise_op.cuh
index d3789be504..af7dd41971 100644
--- a/cpp/include/raft/linalg/matrix_linewise_op.cuh
+++ b/cpp/include/raft/linalg/matrix_linewise_op.cuh
@@ -190,7 +190,7 @@ __global__ void __launch_bounds__(MaxOffset, 2)
       L::loadVec(vecs, arrTail % rowLen, rowLen)...);
 }
 
-template <typename Type, typename IdxType, std::size_t VecBytes,
+template <typename Type, typename IdxType, std::size_t VecBytes, int BlockSize,
           typename Lambda, typename... Vecs>
 void matrixLinewiseVecCols(Type* out, const Type* in, const IdxType rowLen,
                            const IdxType nRows, Lambda op, cudaStream_t stream,
@@ -202,11 +202,9 @@ void matrixLinewiseVecCols(Type* out, const Type* in, const IdxType rowLen,
   const IdxType alignedOff = IdxType(alignedStart - in);
   const IdxType alignedEnd = IdxType(AlignBytes::roundDown(in + totalLen) - in);
   const IdxType alignedLen = alignedEnd - alignedOff;
-  // blockSize
-  constexpr int BlockSize = 256;
   constexpr dim3 bs(BlockSize, 1, 1);
-  // Minimum size of the grid to make device well occupied
-  const uint occupy = raft::getMultiProcessorCount() * 64;
+  // Minimum size of the grid to make the device well occupied
+  const uint occupy = raft::getMultiProcessorCount() * (16384 / BlockSize);
   // does not make sense to have more blocks than this
   const uint maxBlocks = raft::ceildiv<uint>(uint(alignedLen), bs.x * VecElems);
   const dim3 gs(min(maxBlocks, occupy), 1, 1);
@@ -227,7 +225,7 @@ void matrixLinewiseVecCols(Type* out, const Type* in, const IdxType rowLen,
   }
 }
 
-template <typename Type, typename IdxType, std::size_t VecBytes,
+template <typename Type, typename IdxType, std::size_t VecBytes, int BlockSize,
           typename Lambda, typename... Vecs>
 void matrixLinewiseVecRows(Type* out, const Type* in, const IdxType rowLen,
                            const IdxType nRows, Lambda op, cudaStream_t stream,
@@ -236,14 +234,13 @@ void matrixLinewiseVecRows(Type* out, const Type* in, const IdxType rowLen,
   constexpr std::size_t VecElems = VecBytes / sizeof(Type);
   const IdxType totalLen = rowLen * nRows;
   // blockSize
-  constexpr int BlockSize = 256;
   constexpr dim3 bs(BlockSize, 1, 1);
   // if we have `stride` number of blocks, then each block processes always the same
   // indices along dimension rowLen; this means a block needs to index `vecs` only once!
   const uint stride =
     (rowLen / raft::gcd(bs.x * uint(VecElems), uint(rowLen))) * VecElems;
-  // Minimum size of the grid to make device well occupied
-  const uint occupy = raft::getMultiProcessorCount() * 64;
+  // Minimum size of the grid to make the device well occupied
+  const uint occupy = raft::getMultiProcessorCount() * (16384 / BlockSize);
   const dim3 gs(min(
                   // does not make sense to have more blocks than this
                   raft::ceildiv<uint>(uint(totalLen), bs.x * VecElems),
@@ -270,7 +267,16 @@ void matrixLinewiseVecRows(Type* out, const Type* in, const IdxType rowLen,
   }
 }
 
-template <std::size_t VecBytes = 16>
+/**
+ * Select one of the implementations:
+ *   a. vectors applied along/across lines
+ *   b. recursively try different VecBytes, such that alignments of `in` and `out`
+ *      are the same.
+ *
+ * @tparam VecBytes - size of the load/store ops in bytes.
+ * @tparam BlockSize - is fixed and should not affect the performance.
+ */
+template <std::size_t VecBytes = 16, int BlockSize = 256>
 struct MatrixLinewiseOp {
   template <typename Type, typename IdxType, typename Lambda, typename... Vecs>
   static void run(Type* out, const Type* in, const IdxType lineLen,
@@ -278,15 +284,19 @@ struct MatrixLinewiseOp {
                   cudaStream_t stream, Vecs... vecs) {
     if constexpr (VecBytes > sizeof(Type)) {
       if (!raft::Pow2<VecBytes>::areSameAlignOffsets(in, out))
-        return MatrixLinewiseOp<std::max((VecBytes >> 1), sizeof(Type))>::run(
-          out, in, lineLen, nLines, alongLines, op, stream, vecs...);
+        return MatrixLinewiseOp<std::max((VecBytes >> 1), sizeof(Type)),
+                                BlockSize>::run(out, in, lineLen, nLines,
+                                                alongLines, op, stream,
+                                                vecs...);
     }
     if (alongLines)
-      return matrixLinewiseVecRows<Type, IdxType, VecBytes, Lambda, Vecs...>(
-        out, in, lineLen, nLines, op, stream, vecs...);
+      return matrixLinewiseVecRows<Type, IdxType, VecBytes, BlockSize, Lambda,
+                                   Vecs...>(out, in, lineLen, nLines, op,
+                                            stream, vecs...);
     else
-      return matrixLinewiseVecCols<Type, IdxType, VecBytes, Lambda, Vecs...>(
-        out, in, lineLen, nLines, op, stream, vecs...);
+      return matrixLinewiseVecCols<Type, IdxType, VecBytes, BlockSize, Lambda,
+                                   Vecs...>(out, in, lineLen, nLines, op,
+                                            stream, vecs...);
   }
 };
 
@@ -297,29 +307,29 @@ struct MatrixLinewiseOp {
  * row-vectors or column-vectors.
  * The term `line` here signifies that the lines can be either columns or rows,
  * depending on the matrix layout.
- * What matters is if vectors are applied along lines (indices of vectors correspond
- * indices within lines), or across lines (indices of vectors correspond to line indices).
+ * What matters is if the vectors are applied along lines (indices of vectors correspond to
+ * indices within lines), or across lines (indices of vectors correspond to line numbers).
  *
- * @param out result of the operation; can be same as `in`; should be aligned the same as `in`
- *        to allow faster vectorized memory transfers.
- * @param in input matrix consisting of `nLines` lines, each `lineLen`-long.
- * @param lineLen length of matrix line in elements (`=nCols` in row-major or `=nRows` in col-major)
- * @param nLines number of matrix lines (`=nRows` in row-major or `=nCols` in col-major)
- * @param alongLines whether vectors are indices along or across lines.
- * @param op the operation applied on each line:
+ * @param [out] out result of the operation; can be same as `in`; should be aligned the same
+ *        as `in` to allow faster vectorized memory transfers.
+ * @param [in] in input matrix consisting of `nLines` lines, each `lineLen`-long.
+ * @param [in] lineLen length of matrix line in elements (`=nCols` in row-major or `=nRows` in col-major)
+ * @param [in] nLines number of matrix lines (`=nRows` in row-major or `=nCols` in col-major)
+ * @param [in] alongLines whether vectors are indices along or across lines.
+ * @param [in] op the operation applied on each line:
  *    for i in [0..lineLen) and j in [0..nLines):
  *      out[i, j] = op(in[i, j], vec1[i], vec2[i], ... veck[i])   if alongLines = true
  *      out[i, j] = op(in[i, j], vec1[j], vec2[j], ... veck[j])   if alongLines = false
  *    where matrix indexing is row-major ([i, j] = [i + lineLen * j]).
- * @param stream a cuda stream for the kernels
- * @param vecs zero or more vectors to be passed as arguments,
+ * @param [in] stream a cuda stream for the kernels
+ * @param [in] vecs zero or more vectors to be passed as arguments,
  *    size of each vector is `alongLines ? lineLen : nLines`.
  */
 template <typename Type, typename IdxType, typename Lambda, typename... Vecs>
 void matrixLinewiseOp(Type* out, const Type* in, const IdxType lineLen,
                       const IdxType nLines, const bool alongLines, Lambda op,
                       cudaStream_t stream, Vecs... vecs) {
-  linewise_impl::MatrixLinewiseOp<16>::run<Type, IdxType, Lambda, Vecs...>(
+  linewise_impl::MatrixLinewiseOp<16, 256>::run<Type, IdxType, Lambda, Vecs...>(
     out, in, lineLen, nLines, alongLines, op, stream, vecs...);
 }
 
diff --git a/cpp/include/raft/linalg/matrix_vector_op.cuh b/cpp/include/raft/linalg/matrix_vector_op.cuh
index 81c1919b2e..3082d92d9f 100644
--- a/cpp/include/raft/linalg/matrix_vector_op.cuh
+++ b/cpp/include/raft/linalg/matrix_vector_op.cuh
@@ -17,6 +17,7 @@
 #pragma once
 
 #include <raft/cuda_utils.cuh>
+#include <raft/linalg/matrix_linewise_op.cuh>
 #include <raft/pow2_utils.cuh>
 #include <raft/vectorized.cuh>
 
@@ -127,22 +128,30 @@ void matrixVectorOp(Type* out,
                     Lambda op,
                     cudaStream_t stream)
 {
-  IdxType stride      = rowMajor ? D : N;
+  IdxType stride = rowMajor ? D : N;
+  // IdxType nLines = rowMajor ? N : D;
+  // return matrixLinewiseOp(out, matrix, stride, nLines,
+  //                         rowMajor == bcastAlongRows, op, stream, vec);
+
   size_t stride_bytes = stride * sizeof(Type);
 
-  if (AlignedAccess<16>::test(matrix, stride_bytes)) {
+  if (AlignedAccess<16>::test(matrix, stride_bytes) && AlignedAccess<16>::test(out, stride_bytes)) {
     matrixVectorOpImpl<Type, 16 / sizeof(Type), Lambda, IdxType, TPB>(
       out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream);
-  } else if (AlignedAccess<8>::test(matrix, stride_bytes)) {
+  } else if (AlignedAccess<8>::test(matrix, stride_bytes) &&
+             AlignedAccess<8>::test(out, stride_bytes)) {
     matrixVectorOpImpl<Type, 8 / sizeof(Type), Lambda, IdxType, TPB>(
       out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream);
-  } else if (AlignedAccess<4>::test(matrix, stride_bytes)) {
+  } else if (AlignedAccess<4>::test(matrix, stride_bytes) &&
+             AlignedAccess<4>::test(out, stride_bytes)) {
     matrixVectorOpImpl<Type, 4 / sizeof(Type), Lambda, IdxType, TPB>(
       out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream);
-  } else if (AlignedAccess<2>::test(matrix, stride_bytes)) {
+  } else if (AlignedAccess<2>::test(matrix, stride_bytes) &&
+             AlignedAccess<2>::test(out, stride_bytes)) {
     matrixVectorOpImpl<Type, 2 / sizeof(Type), Lambda, IdxType, TPB>(
       out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream);
-  } else if (AlignedAccess<1>::test(matrix, stride_bytes)) {
+  } else if (AlignedAccess<1>::test(matrix, stride_bytes) &&
+             AlignedAccess<1>::test(out, stride_bytes)) {
     matrixVectorOpImpl<Type, 1 / sizeof(Type), Lambda, IdxType, TPB>(
       out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream);
   } else {
@@ -250,22 +259,30 @@ void matrixVectorOp(Type* out,
                     Lambda op,
                     cudaStream_t stream)
 {
-  IdxType stride      = rowMajor ? D : N;
+  IdxType stride = rowMajor ? D : N;
+  // IdxType nLines = rowMajor ? N : D;
+  // return matrixLinewiseOp(out, matrix, stride, nLines,
+  //                         rowMajor == bcastAlongRows, op, stream, vec1, vec2);
+
   size_t stride_bytes = stride * sizeof(Type);
 
-  if (AlignedAccess<16>::test(matrix, stride_bytes)) {
+  if (AlignedAccess<16>::test(matrix, stride_bytes) && AlignedAccess<16>::test(out, stride_bytes)) {
     matrixVectorOpImpl<Type, 16 / sizeof(Type), Lambda, IdxType, TPB>(
       out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream);
-  } else if (AlignedAccess<8>::test(matrix, stride_bytes)) {
+  } else if (AlignedAccess<8>::test(matrix, stride_bytes) &&
+             AlignedAccess<8>::test(out, stride_bytes)) {
     matrixVectorOpImpl<Type, 8 / sizeof(Type), Lambda, IdxType, TPB>(
       out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream);
-  } else if (AlignedAccess<4>::test(matrix, stride_bytes)) {
+  } else if (AlignedAccess<4>::test(matrix, stride_bytes) &&
+             AlignedAccess<4>::test(out, stride_bytes)) {
     matrixVectorOpImpl<Type, 4 / sizeof(Type), Lambda, IdxType, TPB>(
       out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream);
-  } else if (AlignedAccess<2>::test(matrix, stride_bytes)) {
+  } else if (AlignedAccess<2>::test(matrix, stride_bytes) &&
+             AlignedAccess<2>::test(out, stride_bytes)) {
     matrixVectorOpImpl<Type, 2 / sizeof(Type), Lambda, IdxType, TPB>(
       out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream);
-  } else if (AlignedAccess<1>::test(matrix, stride_bytes)) {
+  } else if (AlignedAccess<1>::test(matrix, stride_bytes) &&
+             AlignedAccess<1>::test(out, stride_bytes)) {
     matrixVectorOpImpl<Type, 1 / sizeof(Type), Lambda, IdxType, TPB>(
       out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream);
   } else {
diff --git a/cpp/test/linalg/matrix_linewise_op.cu b/cpp/test/linalg/matrix_linewise_op.cu
index c2f70d4525..4a77fe57aa 100644
--- a/cpp/test/linalg/matrix_linewise_op.cu
+++ b/cpp/test/linalg/matrix_linewise_op.cu
@@ -23,6 +23,7 @@
 #include <raft/random/rng.hpp>
 #include <rmm/device_uvector.hpp>
 #include "../test_utils.h"
+#include "matrix_vector_op.cuh"
 
 namespace raft {
 namespace linalg {
@@ -54,6 +55,9 @@ struct LinewiseTestParams {
   std::size_t workSizeBytes;
   uint64_t seed;
   bool useVanillaMatrixVectorOp;
+  bool checkCorrectness;
+  int inAlignOffset;
+  int outAlignOffset;
 };
 
 template <typename T, typename I, typename ParamsReader>
@@ -124,7 +128,7 @@ struct LinewiseTest
       I m = I(floor(y));
       if (n > 0 && m > 0) out.push_back(std::make_tuple(n, m));
     };
-    std::vector<double> sizes = {15, 16, 17, 256, 257, 263};
+    std::vector<double> sizes = {15, 16, 17, 256, 257, 263, 1024};
     addIfMakesSense(squareN, squareN);
     for (I k : sizes) {
       addIfMakesSense(solveForN(k), k);
@@ -137,9 +141,10 @@ struct LinewiseTest
   std::tuple<T*, const T*, const T*, const T*> assignSafePtrs(
     rmm::device_uvector<T>& blob, I n, I m) {
     typedef raft::Pow2<PTR_PADDING> Align;
-    T* out = Align::roundUp(blob.data());
+    T* out = Align::roundUp(blob.data()) + params.outAlignOffset;
     const T* in =
-      const_cast<const T*>(Align::roundUp(out + n * m + PTR_PADDING));
+      const_cast<const T*>(Align::roundUp(out + n * m + PTR_PADDING)) +
+      params.inAlignOffset;
     const T* vec1 = Align::roundUp(in + n * m + PTR_PADDING);
     const T* vec2 = Align::roundUp(vec1 + m + PTR_PADDING);
     ASSERT(blob.data() + blob.size() >= vec2 + PTR_PADDING,
@@ -149,6 +154,8 @@ struct LinewiseTest
 
   testing::AssertionResult run() {
     rmm::device_uvector<T> blob = genData();
+    rmm::device_uvector<T> blob_val(
+      params.checkCorrectness ? blob.size() / 2 : 0, stream);
 
     auto dims = suggestDimensions(2);
 
@@ -167,9 +174,25 @@ struct LinewiseTest
           PUSH_RANGE(stream, "one vec");
           runLinewiseSum(out, in, lineLen, nLines, alongRows, vec1);
           POP_RANGE(stream);
+          if (params.checkCorrectness) {
+            naiveMatVec(blob_val.data(), in, vec1, lineLen, nLines, true,
+                        alongRows, T(1));
+            EXPECT_NO_FATAL_FAILURE(
+              devArrMatch(blob_val.data(), out, n * m,
+                          CompareApprox<float>(params.tolerance)))
+              << "with one vec";
+          }
           PUSH_RANGE(stream, "two vecs");
           runLinewiseSum(out, in, lineLen, nLines, alongRows, vec1, vec2);
           POP_RANGE(stream);
+          if (params.checkCorrectness) {
+            naiveMatVec(blob_val.data(), in, vec1, vec2, lineLen, nLines, true,
+                        alongRows, T(1));
+            EXPECT_NO_FATAL_FAILURE(
+              devArrMatch(blob_val.data(), out, n * m,
+                          CompareApprox<float>(params.tolerance)))
+              << "with two vecs";
+          }
         }
         POP_RANGE(stream);
       }
@@ -189,39 +212,51 @@ struct LinewiseTest
   INSTANTIATE_TEST_SUITE_P(LinewiseOp, TestClass##_##ElemType##_##IndexType, \
                            TestClass##Params)
 
-auto MegabyteParams = ::testing::Bool();
+auto MegabyteParams =
+  ::testing::Combine(::testing::Bool(), ::testing::Values(0, 1, 2, 4),
+                     ::testing::Values(0, 1, 2, 3));
 
 struct Megabyte {
-  typedef bool Params;
+  typedef std::tuple<bool, int, int> Params;
   static LinewiseTestParams read(Params ps) {
     return {/** .tolerance */ 0.00001,
             /** .workSizeBytes */ 1024 * 1024,
             /** .seed */ 42ULL,
-            /** .useVanillaMatrixVectorOp */ ps};
+            /** .useVanillaMatrixVectorOp */ std::get<0>(ps),
+            /** .checkCorrectness */ true,
+            /** .inAlignOffset */ std::get<1>(ps),
+            /** .outAlignOffset */ std::get<2>(ps)};
   }
 };
 
-auto GigabyteParams = ::testing::Bool();
+auto GigabyteParams = ::testing::Combine(
+  ::testing::Bool(), ::testing::Values(0, 1, 2), ::testing::Values(0, 1, 2));
 
 struct Gigabyte {
-  typedef bool Params;
+  typedef std::tuple<bool, int, int> Params;
   static LinewiseTestParams read(Params ps) {
     return {/** .tolerance */ 0.00001,
             /** .workSizeBytes */ 1024 * 1024 * 1024,
             /** .seed */ 42ULL,
-            /** .useVanillaMatrixVectorOp */ ps};
+            /** .useVanillaMatrixVectorOp */ std::get<0>(ps),
+            /** .checkCorrectness */ false,
+            /** .inAlignOffset */ std::get<1>(ps),
+            /** .outAlignOffset */ std::get<2>(ps)};
   }
 };
 
-auto TenGigsParams = ::testing::Bool();
+auto TenGigsParams = GigabyteParams;
 
 struct TenGigs {
-  typedef bool Params;
+  typedef std::tuple<bool, int, int> Params;
   static LinewiseTestParams read(Params ps) {
     return {/** .tolerance */ 0.00001,
             /** .workSizeBytes */ 10ULL * 1024ULL * 1024ULL * 1024ULL,
             /** .seed */ 42ULL,
-            /** .useVanillaMatrixVectorOp */ ps};
+            /** .useVanillaMatrixVectorOp */ std::get<0>(ps),
+            /** .checkCorrectness */ false,
+            /** .inAlignOffset */ std::get<1>(ps),
+            /** .outAlignOffset */ std::get<2>(ps)};
   }
 };