From 4666c76899057e5f42b1df5ba287f84f72836069 Mon Sep 17 00:00:00 2001 From: viclafargue Date: Thu, 9 Sep 2021 15:46:02 +0200 Subject: [PATCH 1/3] Fix matrixVectorOp --- cpp/include/raft/linalg/matrix_vector_op.cuh | 34 ++++++++++++++------ 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/cpp/include/raft/linalg/matrix_vector_op.cuh b/cpp/include/raft/linalg/matrix_vector_op.cuh index 902816418f..b2ae53a931 100644 --- a/cpp/include/raft/linalg/matrix_vector_op.cuh +++ b/cpp/include/raft/linalg/matrix_vector_op.cuh @@ -93,17 +93,24 @@ void matrixVectorOp(Type *out, const Type *matrix, const Type *vec, IdxType D, IdxType N, bool rowMajor, bool bcastAlongRows, Lambda op, cudaStream_t stream) { IdxType stride = rowMajor ? D : N; - size_t bytes = stride * sizeof(Type); - if (16 / sizeof(Type) && bytes % 16 == 0) { + size_t stride_bytes = stride * sizeof(Type); + + auto test_aligned_access = [stride_bytes, matrix](const int n_bytes) { + return n_bytes / sizeof(Type) && + stride_bytes % n_bytes == 0 && + reinterpret_cast<uintptr_t>(matrix) % n_bytes == 0; + }; + + if (test_aligned_access(16)) { matrixVectorOpImpl<Type, 16 / sizeof(Type), Lambda, IdxType, TPB>( out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream); - } else if (8 / sizeof(Type) && bytes % 8 == 0) { + } else if (test_aligned_access(8)) { matrixVectorOpImpl<Type, 8 / sizeof(Type), Lambda, IdxType, TPB>( out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream); - } else if (4 / sizeof(Type) && bytes % 4 == 0) { + } else if (test_aligned_access(4)) { matrixVectorOpImpl<Type, 4 / sizeof(Type), Lambda, IdxType, TPB>( out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream); - } else if (2 / sizeof(Type) && bytes % 2 == 0) { + } else if (test_aligned_access(2)) { matrixVectorOpImpl<Type, 2 / sizeof(Type), Lambda, IdxType, TPB>( out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream); } else if (1 / sizeof(Type)) { @@ -189,17 +196,24 @@ void matrixVectorOp(Type *out, const Type *matrix, const Type *vec1, const Type *vec2, IdxType D, IdxType N, bool rowMajor, bool bcastAlongRows, Lambda op, cudaStream_t stream) { IdxType stride = rowMajor ?
D : N; - size_t bytes = stride * sizeof(Type); - if (16 / sizeof(Type) && bytes % 16 == 0) { + size_t stride_bytes = stride * sizeof(Type); + + auto test_aligned_access = [stride_bytes, matrix](const int n_bytes) { + return n_bytes / sizeof(Type) && + stride_bytes % n_bytes == 0 && + reinterpret_cast<uintptr_t>(matrix) % n_bytes == 0; + }; + + if (test_aligned_access(16)) { matrixVectorOpImpl<Type, 16 / sizeof(Type), Lambda, IdxType, TPB>( out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream); - } else if (8 / sizeof(Type) && bytes % 8 == 0) { + } else if (test_aligned_access(8)) { matrixVectorOpImpl<Type, 8 / sizeof(Type), Lambda, IdxType, TPB>( out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream); - } else if (4 / sizeof(Type) && bytes % 4 == 0) { + } else if (test_aligned_access(4)) { matrixVectorOpImpl<Type, 4 / sizeof(Type), Lambda, IdxType, TPB>( out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream); - } else if (2 / sizeof(Type) && bytes % 2 == 0) { + } else if (test_aligned_access(2)) { matrixVectorOpImpl<Type, 2 / sizeof(Type), Lambda, IdxType, TPB>( out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream); } else if (1 / sizeof(Type)) { From 11d08869b0b226125d79481facd8f20ccad5eec8 Mon Sep 17 00:00:00 2001 From: viclafargue Date: Thu, 9 Sep 2021 15:49:57 +0200 Subject: [PATCH 2/3] Clang formatting --- cpp/include/raft/linalg/matrix_vector_op.cuh | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/cpp/include/raft/linalg/matrix_vector_op.cuh b/cpp/include/raft/linalg/matrix_vector_op.cuh index b2ae53a931..9018304a1c 100644 --- a/cpp/include/raft/linalg/matrix_vector_op.cuh +++ b/cpp/include/raft/linalg/matrix_vector_op.cuh @@ -96,9 +96,8 @@ void matrixVectorOp(Type *out, const Type *matrix, const Type *vec, IdxType D, size_t stride_bytes = stride * sizeof(Type); auto test_aligned_access = [stride_bytes, matrix](const int n_bytes) { - return n_bytes / sizeof(Type) && - stride_bytes % n_bytes == 0 && - reinterpret_cast<uintptr_t>(matrix) % n_bytes == 0; + return n_bytes / sizeof(Type) && stride_bytes % n_bytes == 0 && + reinterpret_cast<uintptr_t>(matrix) % n_bytes == 0; }; if
(test_aligned_access(16)) { @@ -199,9 +198,8 @@ void matrixVectorOp(Type *out, const Type *matrix, const Type *vec1, size_t stride_bytes = stride * sizeof(Type); auto test_aligned_access = [stride_bytes, matrix](const int n_bytes) { - return n_bytes / sizeof(Type) && - stride_bytes % n_bytes == 0 && - reinterpret_cast<uintptr_t>(matrix) % n_bytes == 0; + return n_bytes / sizeof(Type) && stride_bytes % n_bytes == 0 && + reinterpret_cast<uintptr_t>(matrix) % n_bytes == 0; }; if (test_aligned_access(16)) { From 3778e4a7debd74afd5d14d1b270a887bc8aa81a0 Mon Sep 17 00:00:00 2001 From: viclafargue Date: Fri, 24 Sep 2021 11:43:17 +0200 Subject: [PATCH 3/3] Completing documentation --- cpp/include/raft/linalg/matrix_vector_op.cuh | 12 ++++++++++++ cpp/include/raft/vectorized.cuh | 10 +++++++++- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/cpp/include/raft/linalg/matrix_vector_op.cuh b/cpp/include/raft/linalg/matrix_vector_op.cuh index 9018304a1c..e948c3e673 100644 --- a/cpp/include/raft/linalg/matrix_vector_op.cuh +++ b/cpp/include/raft/linalg/matrix_vector_op.cuh @@ -73,6 +73,12 @@ void matrixVectorOpImpl(Type *out, const Type *matrix, const Type *vec, /** * @brief Operations for all the columns or rows with a given vector. + * Caution : Threads process multiple elements to speed up processing. These + * are loaded in a single read thanks to type promotion. Faster processing + * would thus only be enabled when addresses are optimally aligned for it. + * Note : the function will also check that the size of the window of accesses + * is a multiple of the number of elements processed by a thread in order to + * enable faster processing + * @tparam Type the matrix/vector type * @tparam Lambda a device function which represents a binary operator * @tparam IdxType Integer type used to for addressing @@ -174,6 +180,12 @@ void matrixVectorOpImpl(Type *out, const Type *matrix, const Type *vec1, /** * @brief Operations for all the columns or rows with the given vectors.
+ * Caution : Threads process multiple elements to speed up processing. These + * are loaded in a single read thanks to type promotion. Faster processing + * would thus only be enabled when addresses are optimally aligned for it. + * Note : the function will also check that the size of the window of accesses + * is a multiple of the number of elements processed by a thread in order to + * enable faster processing + * @tparam Type the matrix/vector type * @tparam Lambda a device function which represents a binary operator * @tparam IdxType Integer type used to for addressing diff --git a/cpp/include/raft/vectorized.cuh b/cpp/include/raft/vectorized.cuh index 1829fc0351..ceffbcca78 100644 --- a/cpp/include/raft/vectorized.cuh +++ b/cpp/include/raft/vectorized.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, NVIDIA CORPORATION. + * Copyright (c) 2018-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -227,6 +227,14 @@ struct IOType { * reasons if one is unable to issue such vectorized operations, one can always * fallback to using POD types. * + * Concept of vectorized accesses : Threads process multiple elements + * to speed up processing. These are loaded in a single read thanks + * to type promotion. The data is then reinterpreted as vector elements + * to perform the kernel's work. + * + * Caution : vectorized accesses require input addresses to be memory aligned + * according not to the input type but to the promoted type used for reading. + * * Example demonstrating the use of load operations, performing math on such * loaded data and finally storing it back. * @code{.cu}