Skip to content

Commit

Permalink
Merge pull request #337 from rapidsai/branch-21.10
Browse files Browse the repository at this point in the history
[gpuCI] Forward-merge branch-21.10 to branch-21.12 [skip gpuci]
  • Loading branch information
GPUtester authored Sep 24, 2021
2 parents 61411b9 + 6d7f897 commit c2e5a09
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 11 deletions.
44 changes: 34 additions & 10 deletions cpp/include/raft/linalg/matrix_vector_op.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,12 @@ void matrixVectorOpImpl(Type *out, const Type *matrix, const Type *vec,

/**
* @brief Operations for all the columns or rows with a given vector.
* Caution : Threads process multiple elements to speed up processing. These
* are loaded in a single read thanks to type promotion. Faster processing
* is thus only enabled when addresses are optimally aligned for it.
* Note : the function also checks that the size of the window of accesses
* is a multiple of the number of elements processed by a thread in order to
* enable faster processing.
* @tparam Type the matrix/vector type
* @tparam Lambda a device function which represents a binary operator
* @tparam IdxType Integer type used for addressing
Expand All @@ -93,17 +99,23 @@ void matrixVectorOp(Type *out, const Type *matrix, const Type *vec, IdxType D,
IdxType N, bool rowMajor, bool bcastAlongRows, Lambda op,
cudaStream_t stream) {
IdxType stride = rowMajor ? D : N;
size_t bytes = stride * sizeof(Type);
if (16 / sizeof(Type) && bytes % 16 == 0) {
size_t stride_bytes = stride * sizeof(Type);

// Decides whether the implementation reading `n_bytes` per access may be used.
// All three conditions must hold:
//   * at least one element of `Type` fits in `n_bytes` (n_bytes / sizeof(Type)
//     is non-zero),
//   * the stride in bytes is a multiple of `n_bytes`,
//   * the base address of `matrix` is itself a multiple of `n_bytes`, i.e. it
//     is aligned to the promoted access width and not merely to `Type`.
// NOTE(review): `out` is forwarded to the same implementation but its
// alignment is not tested here - confirm whether it needs the same check.
auto test_aligned_access = [stride_bytes, matrix](const int n_bytes) {
  const bool holds_element = (n_bytes / sizeof(Type)) != 0;
  const bool stride_is_multiple = (stride_bytes % n_bytes) == 0;
  // The address must be compared against the access width; the previous
  // `% sizeof(Type)` test was true only for pointers misaligned to `Type`.
  const bool base_is_aligned =
    (reinterpret_cast<uintptr_t>(matrix) % n_bytes) == 0;
  return holds_element && stride_is_multiple && base_is_aligned;
};

if (test_aligned_access(16)) {
matrixVectorOpImpl<Type, 16 / sizeof(Type), Lambda, IdxType, TPB>(
out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream);
} else if (8 / sizeof(Type) && bytes % 8 == 0) {
} else if (test_aligned_access(8)) {
matrixVectorOpImpl<Type, 8 / sizeof(Type), Lambda, IdxType, TPB>(
out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream);
} else if (4 / sizeof(Type) && bytes % 4 == 0) {
} else if (test_aligned_access(4)) {
matrixVectorOpImpl<Type, 4 / sizeof(Type), Lambda, IdxType, TPB>(
out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream);
} else if (2 / sizeof(Type) && bytes % 2 == 0) {
} else if (test_aligned_access(2)) {
matrixVectorOpImpl<Type, 2 / sizeof(Type), Lambda, IdxType, TPB>(
out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream);
} else if (1 / sizeof(Type)) {
Expand Down Expand Up @@ -168,6 +180,12 @@ void matrixVectorOpImpl(Type *out, const Type *matrix, const Type *vec1,

/**
* @brief Operations for all the columns or rows with the given vectors.
* Caution : Threads process multiple elements to speed up processing. These
* are loaded in a single read thanks to type promotion. Faster processing
* is thus only enabled when addresses are optimally aligned for it.
* Note : the function also checks that the size of the window of accesses
* is a multiple of the number of elements processed by a thread in order to
* enable faster processing.
* @tparam Type the matrix/vector type
* @tparam Lambda a device function which represents a binary operator
* @tparam IdxType Integer type used for addressing
Expand All @@ -189,17 +207,23 @@ void matrixVectorOp(Type *out, const Type *matrix, const Type *vec1,
const Type *vec2, IdxType D, IdxType N, bool rowMajor,
bool bcastAlongRows, Lambda op, cudaStream_t stream) {
IdxType stride = rowMajor ? D : N;
size_t bytes = stride * sizeof(Type);
if (16 / sizeof(Type) && bytes % 16 == 0) {
size_t stride_bytes = stride * sizeof(Type);

// Decides whether the implementation reading `n_bytes` per access may be used.
// All three conditions must hold:
//   * at least one element of `Type` fits in `n_bytes` (n_bytes / sizeof(Type)
//     is non-zero),
//   * the stride in bytes is a multiple of `n_bytes`,
//   * the base address of `matrix` is itself a multiple of `n_bytes`, i.e. it
//     is aligned to the promoted access width and not merely to `Type`.
// NOTE(review): `out` is forwarded to the same implementation but its
// alignment is not tested here - confirm whether it needs the same check.
auto test_aligned_access = [stride_bytes, matrix](const int n_bytes) {
  const bool holds_element = (n_bytes / sizeof(Type)) != 0;
  const bool stride_is_multiple = (stride_bytes % n_bytes) == 0;
  // The address must be compared against the access width; the previous
  // `% sizeof(Type)` test was true only for pointers misaligned to `Type`.
  const bool base_is_aligned =
    (reinterpret_cast<uintptr_t>(matrix) % n_bytes) == 0;
  return holds_element && stride_is_multiple && base_is_aligned;
};

if (test_aligned_access(16)) {
matrixVectorOpImpl<Type, 16 / sizeof(Type), Lambda, IdxType, TPB>(
out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream);
} else if (8 / sizeof(Type) && bytes % 8 == 0) {
} else if (test_aligned_access(8)) {
matrixVectorOpImpl<Type, 8 / sizeof(Type), Lambda, IdxType, TPB>(
out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream);
} else if (4 / sizeof(Type) && bytes % 4 == 0) {
} else if (test_aligned_access(4)) {
matrixVectorOpImpl<Type, 4 / sizeof(Type), Lambda, IdxType, TPB>(
out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream);
} else if (2 / sizeof(Type) && bytes % 2 == 0) {
} else if (test_aligned_access(2)) {
matrixVectorOpImpl<Type, 2 / sizeof(Type), Lambda, IdxType, TPB>(
out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream);
} else if (1 / sizeof(Type)) {
Expand Down
10 changes: 9 additions & 1 deletion cpp/include/raft/vectorized.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2018, NVIDIA CORPORATION.
* Copyright (c) 2018-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -227,6 +227,14 @@ struct IOType<double, 2> {
* reasons if one is unable to issue such vectorized operations, one can always
* fallback to using POD types.
*
* Concept of vectorized accesses : Threads process multiple elements
* to speed up processing. These are loaded in a single read thanks
* to type promotion. The loaded value is then reinterpreted as a vector
* of elements to perform the kernel's work.
*
* Caution : vectorized accesses require input addresses to be memory aligned
* not to the input type but to the promoted type used for reading.
*
* Example demonstrating the use of load operations, performing math on such
* loaded data and finally storing it back.
* @code{.cu}
Expand Down

0 comments on commit c2e5a09

Please sign in to comment.