From 4666c76899057e5f42b1df5ba287f84f72836069 Mon Sep 17 00:00:00 2001 From: viclafargue Date: Thu, 9 Sep 2021 15:46:02 +0200 Subject: [PATCH 1/3] Fix matrixVectorOp --- cpp/include/raft/linalg/matrix_vector_op.cuh | 34 ++++++++++++++------ 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/cpp/include/raft/linalg/matrix_vector_op.cuh b/cpp/include/raft/linalg/matrix_vector_op.cuh index 902816418f..b2ae53a931 100644 --- a/cpp/include/raft/linalg/matrix_vector_op.cuh +++ b/cpp/include/raft/linalg/matrix_vector_op.cuh @@ -93,17 +93,24 @@ void matrixVectorOp(Type *out, const Type *matrix, const Type *vec, IdxType D, IdxType N, bool rowMajor, bool bcastAlongRows, Lambda op, cudaStream_t stream) { IdxType stride = rowMajor ? D : N; - size_t bytes = stride * sizeof(Type); - if (16 / sizeof(Type) && bytes % 16 == 0) { + size_t stride_bytes = stride * sizeof(Type); + + auto test_aligned_access = [stride_bytes, matrix](const int n_bytes) { + return n_bytes / sizeof(Type) && + stride_bytes % n_bytes == 0 && + reinterpret_cast<uintptr_t>(matrix) % n_bytes == 0; + }; + + if (test_aligned_access(16)) { matrixVectorOpImpl<Type, 16 / sizeof(Type), Lambda, IdxType, TPB>( out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream); - } else if (8 / sizeof(Type) && bytes % 8 == 0) { + } else if (test_aligned_access(8)) { matrixVectorOpImpl<Type, 8 / sizeof(Type), Lambda, IdxType, TPB>( out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream); - } else if (4 / sizeof(Type) && bytes % 4 == 0) { + } else if (test_aligned_access(4)) { matrixVectorOpImpl<Type, 4 / sizeof(Type), Lambda, IdxType, TPB>( out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream); - } else if (2 / sizeof(Type) && bytes % 2 == 0) { + } else if (test_aligned_access(2)) { matrixVectorOpImpl<Type, 2 / sizeof(Type), Lambda, IdxType, TPB>( out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream); } else if (1 / sizeof(Type)) { @@ -189,17 +196,24 @@ void matrixVectorOp(Type *out, const Type *matrix, const Type *vec1, const Type *vec2, IdxType D, IdxType N, bool rowMajor, bool bcastAlongRows, Lambda op, cudaStream_t stream) { IdxType stride = rowMajor ?
D : N; - size_t bytes = stride * sizeof(Type); - if (16 / sizeof(Type) && bytes % 16 == 0) { + size_t stride_bytes = stride * sizeof(Type); + + auto test_aligned_access = [stride_bytes, matrix](const int n_bytes) { + return n_bytes / sizeof(Type) && + stride_bytes % n_bytes == 0 && + reinterpret_cast<uintptr_t>(matrix) % n_bytes == 0; + }; + + if (test_aligned_access(16)) { matrixVectorOpImpl<Type, 16 / sizeof(Type), Lambda, IdxType, TPB>( out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream); - } else if (8 / sizeof(Type) && bytes % 8 == 0) { + } else if (test_aligned_access(8)) { matrixVectorOpImpl<Type, 8 / sizeof(Type), Lambda, IdxType, TPB>( out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream); - } else if (4 / sizeof(Type) && bytes % 4 == 0) { + } else if (test_aligned_access(4)) { matrixVectorOpImpl<Type, 4 / sizeof(Type), Lambda, IdxType, TPB>( out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream); - } else if (2 / sizeof(Type) && bytes % 2 == 0) { + } else if (test_aligned_access(2)) { matrixVectorOpImpl<Type, 2 / sizeof(Type), Lambda, IdxType, TPB>( out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream); } else if (1 / sizeof(Type)) { From 11d08869b0b226125d79481facd8f20ccad5eec8 Mon Sep 17 00:00:00 2001 From: viclafargue Date: Thu, 9 Sep 2021 15:49:57 +0200 Subject: [PATCH 2/3] Clang formatting --- cpp/include/raft/linalg/matrix_vector_op.cuh | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/cpp/include/raft/linalg/matrix_vector_op.cuh b/cpp/include/raft/linalg/matrix_vector_op.cuh index b2ae53a931..9018304a1c 100644 --- a/cpp/include/raft/linalg/matrix_vector_op.cuh +++ b/cpp/include/raft/linalg/matrix_vector_op.cuh @@ -96,9 +96,8 @@ void matrixVectorOp(Type *out, const Type *matrix, const Type *vec, IdxType D, size_t stride_bytes = stride * sizeof(Type); auto test_aligned_access = [stride_bytes, matrix](const int n_bytes) { - return n_bytes / sizeof(Type) && - stride_bytes % n_bytes == 0 && - reinterpret_cast<uintptr_t>(matrix) % n_bytes == 0; + return n_bytes / sizeof(Type) && stride_bytes % n_bytes == 0 && + reinterpret_cast<uintptr_t>(matrix) % n_bytes == 0; }; if
(test_aligned_access(16)) { @@ -199,9 +198,8 @@ void matrixVectorOp(Type *out, const Type *matrix, const Type *vec1, size_t stride_bytes = stride * sizeof(Type); auto test_aligned_access = [stride_bytes, matrix](const int n_bytes) { - return n_bytes / sizeof(Type) && - stride_bytes % n_bytes == 0 && - reinterpret_cast<uintptr_t>(matrix) % n_bytes == 0; + return n_bytes / sizeof(Type) && stride_bytes % n_bytes == 0 && + reinterpret_cast<uintptr_t>(matrix) % n_bytes == 0; }; if (test_aligned_access(16)) { From 3778e4a7debd74afd5d14d1b270a887bc8aa81a0 Mon Sep 17 00:00:00 2001 From: viclafargue Date: Fri, 24 Sep 2021 11:43:17 +0200 Subject: [PATCH 3/3] Completing documentation --- cpp/include/raft/linalg/matrix_vector_op.cuh | 12 ++++++++++++ cpp/include/raft/vectorized.cuh | 10 +++++++++- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/cpp/include/raft/linalg/matrix_vector_op.cuh b/cpp/include/raft/linalg/matrix_vector_op.cuh index 9018304a1c..e948c3e673 100644 --- a/cpp/include/raft/linalg/matrix_vector_op.cuh +++ b/cpp/include/raft/linalg/matrix_vector_op.cuh @@ -73,6 +73,12 @@ void matrixVectorOpImpl(Type *out, const Type *matrix, const Type *vec, /** * @brief Operations for all the columns or rows with a given vector. + * Caution : Threads process multiple elements to speed up processing. These + * are loaded in a single read thanks to type promotion. Faster processing + * would thus only be enabled when addresses are optimally aligned for it. + * Note : the function will also check that the size of the window of accesses + * is a multiple of the number of elements processed by a thread in order to + * enable faster processing + * @tparam Type the matrix/vector type * @tparam Lambda a device function which represents a binary operator * @tparam IdxType Integer type used to for addressing @@ -174,6 +180,12 @@ void matrixVectorOpImpl(Type *out, const Type *matrix, const Type *vec1, /** * @brief Operations for all the columns or rows with the given vectors.
+ * Caution : Threads process multiple elements to speed up processing. These + * are loaded in a single read thanks to type promotion. Faster processing + * would thus only be enabled when addresses are optimally aligned for it. + * Note : the function will also check that the size of the window of accesses + * is a multiple of the number of elements processed by a thread in order to + * enable faster processing + * @tparam Type the matrix/vector type * @tparam Lambda a device function which represents a binary operator * @tparam IdxType Integer type used to for addressing diff --git a/cpp/include/raft/vectorized.cuh b/cpp/include/raft/vectorized.cuh index 1829fc0351..ceffbcca78 100644 --- a/cpp/include/raft/vectorized.cuh +++ b/cpp/include/raft/vectorized.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, NVIDIA CORPORATION. + * Copyright (c) 2018-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -227,6 +227,14 @@ struct IOType { * reasons if one is unable to issue such vectorized operations, one can always * fallback to using POD types. * + * Concept of vectorized accesses : Threads process multiple elements + * to speed up processing. These are loaded in a single read thanks + * to type promotion. The data is then reinterpreted as vector elements + * to perform the kernel's work. + * + * Caution : vectorized accesses require input addresses to be memory aligned + * according not to the input type but to the promoted type used for reading. + * * Example demonstrating the use of load operations, performing math on such * loaded data and finally storing it back. * @code{.cu}