rapidsai · rapids-bot · Sep 24, 2021 · Sep 9, 2021 · Sep 9, 2021 · Sep 24, 2021
diff --git a/cpp/include/raft/linalg/matrix_vector_op.cuh b/cpp/include/raft/linalg/matrix_vector_op.cuh
@@ -93,17 +93,23 @@ void matrixVectorOp(Type *out, const Type *matrix, const Type *vec, IdxType D,
                     IdxType N, bool rowMajor, bool bcastAlongRows, Lambda op,
                     cudaStream_t stream) {
   IdxType stride = rowMajor ? D : N;
-  size_t bytes = stride * sizeof(Type);
-  if (16 / sizeof(Type) && bytes % 16 == 0) {
+  size_t stride_bytes = stride * sizeof(Type);
+  // n_bytes-wide access is chosen only if Type fits and both stride and matrix pointer are n_bytes-aligned.
+  auto test_aligned_access = [stride_bytes, matrix](const int n_bytes) {
+    return n_bytes / sizeof(Type) && stride_bytes % n_bytes == 0 &&
+           reinterpret_cast<uintptr_t>(matrix) % n_bytes == 0;
+  };
+
+  if (test_aligned_access(16)) {
     matrixVectorOpImpl<Type, 16 / sizeof(Type), Lambda, IdxType, TPB>(
       out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream);
-  } else if (8 / sizeof(Type) && bytes % 8 == 0) {
+  } else if (test_aligned_access(8)) {
     matrixVectorOpImpl<Type, 8 / sizeof(Type), Lambda, IdxType, TPB>(
       out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream);
-  } else if (4 / sizeof(Type) && bytes % 4 == 0) {
+  } else if (test_aligned_access(4)) {
     matrixVectorOpImpl<Type, 4 / sizeof(Type), Lambda, IdxType, TPB>(
       out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream);
-  } else if (2 / sizeof(Type) && bytes % 2 == 0) {
+  } else if (test_aligned_access(2)) {
     matrixVectorOpImpl<Type, 2 / sizeof(Type), Lambda, IdxType, TPB>(
       out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream);
   } else if (1 / sizeof(Type)) {
@@ -189,17 +195,23 @@ void matrixVectorOp(Type *out, const Type *matrix, const Type *vec1,
                     const Type *vec2, IdxType D, IdxType N, bool rowMajor,
                     bool bcastAlongRows, Lambda op, cudaStream_t stream) {
   IdxType stride = rowMajor ? D : N;
-  size_t bytes = stride * sizeof(Type);
-  if (16 / sizeof(Type) && bytes % 16 == 0) {
+  size_t stride_bytes = stride * sizeof(Type);
+  // n_bytes-wide access is chosen only if Type fits and both stride and matrix pointer are n_bytes-aligned.
+  auto test_aligned_access = [stride_bytes, matrix](const int n_bytes) {
+    return n_bytes / sizeof(Type) && stride_bytes % n_bytes == 0 &&
+           reinterpret_cast<uintptr_t>(matrix) % n_bytes == 0;
+  };
+
+  if (test_aligned_access(16)) {
     matrixVectorOpImpl<Type, 16 / sizeof(Type), Lambda, IdxType, TPB>(
       out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream);
-  } else if (8 / sizeof(Type) && bytes % 8 == 0) {
+  } else if (test_aligned_access(8)) {
     matrixVectorOpImpl<Type, 8 / sizeof(Type), Lambda, IdxType, TPB>(
       out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream);
-  } else if (4 / sizeof(Type) && bytes % 4 == 0) {
+  } else if (test_aligned_access(4)) {
     matrixVectorOpImpl<Type, 4 / sizeof(Type), Lambda, IdxType, TPB>(
       out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream);
-  } else if (2 / sizeof(Type) && bytes % 2 == 0) {
+  } else if (test_aligned_access(2)) {
     matrixVectorOpImpl<Type, 2 / sizeof(Type), Lambda, IdxType, TPB>(
       out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream);
   } else if (1 / sizeof(Type)) {