diff --git a/cpp/include/raft/linalg/matrix_vector_op.cuh b/cpp/include/raft/linalg/matrix_vector_op.cuh
index 902816418f..e948c3e673 100644
--- a/cpp/include/raft/linalg/matrix_vector_op.cuh
+++ b/cpp/include/raft/linalg/matrix_vector_op.cuh
@@ -73,6 +73,12 @@ void matrixVectorOpImpl(Type *out, const Type *matrix, const Type *vec,

 /**
  * @brief Operations for all the columns or rows with a given vector.
+ * Caution: threads process multiple elements to speed up processing. These
+ * are loaded in a single read thanks to type promotion. The faster path is
+ * therefore only enabled when the input addresses are aligned for it.
+ * Note: the function also checks that the stride between rows (in bytes) is
+ * a multiple of the size of a vectorized read, so that every row starts at
+ * an aligned address.
  * @tparam Type the matrix/vector type
  * @tparam Lambda a device function which represents a binary operator
  * @tparam IdxType Integer type used to for addressing
@@ -93,17 +99,23 @@ void matrixVectorOp(Type *out, const Type *matrix, const Type *vec,
                     IdxType D, IdxType N, bool rowMajor, bool bcastAlongRows,
                     Lambda op, cudaStream_t stream) {
   IdxType stride = rowMajor ? D : N;
-  size_t bytes = stride * sizeof(Type);
-  if (16 / sizeof(Type) && bytes % 16 == 0) {
+  size_t stride_bytes = stride * sizeof(Type);
+
+  auto test_aligned_access = [stride_bytes, matrix](const int n_bytes) {
+    return n_bytes / sizeof(Type) && stride_bytes % n_bytes == 0 &&
+           reinterpret_cast<uintptr_t>(matrix) % n_bytes == 0;
+  };
+
+  if (test_aligned_access(16)) {
     matrixVectorOpImpl<Type, 16 / sizeof(Type), Lambda, IdxType, TPB>(
       out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream);
-  } else if (8 / sizeof(Type) && bytes % 8 == 0) {
+  } else if (test_aligned_access(8)) {
     matrixVectorOpImpl<Type, 8 / sizeof(Type), Lambda, IdxType, TPB>(
       out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream);
-  } else if (4 / sizeof(Type) && bytes % 4 == 0) {
+  } else if (test_aligned_access(4)) {
     matrixVectorOpImpl<Type, 4 / sizeof(Type), Lambda, IdxType, TPB>(
       out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream);
-  } else if (2 / sizeof(Type) && bytes % 2 == 0) {
+  } else if (test_aligned_access(2)) {
     matrixVectorOpImpl<Type, 2 / sizeof(Type), Lambda, IdxType, TPB>(
       out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream);
   } else if (1 / sizeof(Type)) {
@@ -168,6 +180,12 @@ void matrixVectorOpImpl(Type *out, const Type *matrix, const Type *vec1,

 /**
  * @brief Operations for all the columns or rows with the given vectors.
+ * Caution: threads process multiple elements to speed up processing. These
+ * are loaded in a single read thanks to type promotion. The faster path is
+ * therefore only enabled when the input addresses are aligned for it.
+ * Note: the function also checks that the stride between rows (in bytes) is
+ * a multiple of the size of a vectorized read, so that every row starts at
+ * an aligned address.
  * @tparam Type the matrix/vector type
  * @tparam Lambda a device function which represents a binary operator
  * @tparam IdxType Integer type used to for addressing
@@ -189,17 +207,23 @@ void matrixVectorOp(Type *out, const Type *matrix, const Type *vec1,
                     const Type *vec2, IdxType D, IdxType N, bool rowMajor,
                     bool bcastAlongRows, Lambda op, cudaStream_t stream) {
   IdxType stride = rowMajor ? D : N;
-  size_t bytes = stride * sizeof(Type);
-  if (16 / sizeof(Type) && bytes % 16 == 0) {
+  size_t stride_bytes = stride * sizeof(Type);
+
+  auto test_aligned_access = [stride_bytes, matrix](const int n_bytes) {
+    return n_bytes / sizeof(Type) && stride_bytes % n_bytes == 0 &&
+           reinterpret_cast<uintptr_t>(matrix) % n_bytes == 0;
+  };
+
+  if (test_aligned_access(16)) {
     matrixVectorOpImpl<Type, 16 / sizeof(Type), Lambda, IdxType, TPB>(
       out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream);
-  } else if (8 / sizeof(Type) && bytes % 8 == 0) {
+  } else if (test_aligned_access(8)) {
     matrixVectorOpImpl<Type, 8 / sizeof(Type), Lambda, IdxType, TPB>(
       out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream);
-  } else if (4 / sizeof(Type) && bytes % 4 == 0) {
+  } else if (test_aligned_access(4)) {
     matrixVectorOpImpl<Type, 4 / sizeof(Type), Lambda, IdxType, TPB>(
       out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream);
-  } else if (2 / sizeof(Type) && bytes % 2 == 0) {
+  } else if (test_aligned_access(2)) {
     matrixVectorOpImpl<Type, 2 / sizeof(Type), Lambda, IdxType, TPB>(
       out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream);
   } else if (1 / sizeof(Type)) {
diff --git a/cpp/include/raft/vectorized.cuh b/cpp/include/raft/vectorized.cuh
index 1829fc0351..ceffbcca78 100644
--- a/cpp/include/raft/vectorized.cuh
+++ b/cpp/include/raft/vectorized.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -227,6 +227,14 @@ struct IOType {
  * reasons if one is unable to issue such vectorized operations, one can always
  * fallback to using POD types.
  *
+ * Concept of vectorized accesses: threads process multiple elements
+ * to speed up processing. These are loaded in a single read thanks
+ * to type promotion, then reinterpreted as a vector of elements to
+ * perform the kernel's work.
+ *
+ * Caution: vectorized accesses require input addresses to be memory aligned
+ * not to the input type but to the promoted type used for reading.
+ *
  * Example demonstrating the use of load operations, performing math on such
  * loaded data and finally storing it back.
  * @code{.cu}
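
For reviewers who want to sanity-check the dispatch rule outside the kernels, here is a small host-side sketch (not part of this PR; the helper `max_safe_load_bytes` and its names are illustrative only). It mirrors the `test_aligned_access` logic above: a vectorized load width is usable only when at least one element fits in it, the row stride in bytes is a multiple of it, and the base address itself is aligned to it.

```cpp
#include <cstdint>
#include <cstdio>
#include <initializer_list>

// Illustrative helper (not from the PR): returns the widest load width in
// bytes (16, 8, 4, 2, or sizeof(T)) that keeps every row start aligned,
// mirroring the checks performed by test_aligned_access() above.
template <typename T>
size_t max_safe_load_bytes(const T* matrix, size_t stride_elems) {
  const size_t stride_bytes = stride_elems * sizeof(T);
  const auto addr = reinterpret_cast<uintptr_t>(matrix);
  for (size_t n_bytes : {16u, 8u, 4u, 2u}) {
    // A width is usable only if a full element fits in it, the row stride is
    // a multiple of it (so every row keeps row 0's alignment), and the base
    // pointer itself is aligned to it.
    if (n_bytes / sizeof(T) && stride_bytes % n_bytes == 0 &&
        addr % n_bytes == 0) {
      return n_bytes;
    }
  }
  return sizeof(T);  // fall back to scalar accesses
}

int main() {
  alignas(16) float buf[64];
  // 16-byte aligned base, stride of 8 floats (32 bytes): float4 loads are safe.
  std::printf("%zu\n", max_safe_load_bytes(buf, 8));      // prints 16
  // Shift the base by one float: only 4-byte (scalar float) accesses are safe.
  std::printf("%zu\n", max_safe_load_bytes(buf + 1, 8));  // prints 4
}
```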