Faster matrix-vector-ops #401

Merged: 25 commits from enh-faster-linewise-ops into branch-22.02, Jan 12, 2022
Commits (25 total; the diff below reflects changes from 11 commits)
635c79b - Version 1 (acrossRows sometimes slower) - achirkin, Nov 22, 2021
ed27367 - Version 2 (all faster, but not fully tested) - achirkin, Nov 25, 2021
bb30615 - Cosmetics and tests - achirkin, Nov 25, 2021
5c63642 - Replace matrixVectorOp implementation with matrixLinewiseOp - achirkin, Nov 25, 2021
6a26187 - Update to the new styles - achirkin, Nov 25, 2021
1d9378b - Add NVTX flag - achirkin, Nov 26, 2021
bb4c8e5 - Merge branch 'branch-22.02' into enh-faster-linewise-ops - achirkin, Dec 6, 2021
cfaa82e - Fix incorrect behaviour on tiny matrices - achirkin, Dec 7, 2021
9a5ae4c - Merge branch 'branch-22.02' into enh-faster-linewise-ops - achirkin, Dec 8, 2021
6f1b92d - Hide implementation details - achirkin, Dec 8, 2021
c8d917b - Move the tests as well - achirkin, Dec 8, 2021
40e1a7e - Merge branch 'branch-22.02' into enh-faster-linewise-ops - achirkin, Dec 10, 2021
92390d7 - use NVTX helpers from future - achirkin, Dec 10, 2021
b747a0e - Add more docstrings and comments - achirkin, Dec 16, 2021
3555bec - Add even more docstrings and comments - achirkin, Dec 16, 2021
98f4711 - Merge branch 'branch-22.02' into enh-faster-linewise-ops - achirkin, Dec 16, 2021
2d818da - Adapt to the new .style - achirkin, Dec 16, 2021
f4099a2 - Use a double-buffered-style shared memory in loadVec - achirkin, Dec 17, 2021
41f6475 - Adapt to changes in raft api - achirkin, Dec 17, 2021
ce53072 - Merge branch 'branch-22.02' into enh-faster-linewise-ops - achirkin, Dec 17, 2021
eec616a - Tested NVTX ranges working fine - achirkin, Dec 17, 2021
56fa890 - Merge branch 'branch-22.02' into enh-faster-linewise-ops - achirkin, Dec 21, 2021
f3ed247 - Removed/explained some magic constants and refactored a bit - achirkin, Dec 21, 2021
d971a8a - Merge branch 'branch-22.02' into enh-faster-linewise-ops - achirkin, Jan 11, 2022
ccd806e - Merge branch 'branch-22.02' into enh-faster-linewise-ops - achirkin, Jan 12, 2022
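
For orientation, here is a minimal usage sketch of the public raft::linalg::matrixVectorOp API whose implementation this PR reroutes onto matrix::linewiseOp. The call signature follows the diff below; the wrapper function, buffer setup, nvcc extended-lambda support, and the default threads-per-block template parameter are assumptions for illustration, not part of the PR.

#include <raft/linalg/matrix_vector_op.cuh>

// Illustrative wrapper (not part of this PR): multiplies every row of a
// row-major N x D matrix elementwise by a length-D vector of per-column factors.
void scale_columns(float* out,           // device pointer, N * D elements
                   const float* matrix,  // device pointer, N * D, row-major
                   const float* factors, // device pointer, D elements
                   int D,
                   int N,
                   cudaStream_t stream)
{
  // rowMajor = true, bcastAlongRows = true: the length-D vector is indexed by
  // column, i.e. every row sees the same per-column factor.
  raft::linalg::matrixVectorOp(
    out, matrix, factors, D, N, true, true,
    [] __device__(float mat_el, float vec_el) { return mat_el * vec_el; },
    stream);
}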
cpp/include/raft/linalg/matrix_vector_op.cuh (9 additions, 180 deletions)
@@ -16,83 +16,11 @@

#pragma once

#include <raft/cuda_utils.cuh>
#include <raft/pow2_utils.cuh>
#include <raft/vectorized.cuh>
#include <raft/matrix/matrix.hpp>

namespace raft {
namespace linalg {

namespace {
template <size_t VecBytes>
struct AlignedAccess {
template <typename T>
static inline bool test(const T* matrix, size_t strideBytes)
{
return Pow2<VecBytes>::isAligned(matrix) && Pow2<VecBytes>::isAligned(strideBytes) &&
Pow2<sizeof(T)>::isAligned(VecBytes);
}
};
}; // namespace

template <typename Type, int veclen_, typename Lambda, typename IdxType>
__global__ void matrixVectorOpKernel(Type* out,
const Type* matrix,
const Type* vector,
IdxType D,
IdxType N,
bool rowMajor,
bool bcastAlongRows,
Lambda op)
{
typedef TxN_t<Type, veclen_> VecType;
IdxType len = N * D;
IdxType idx = threadIdx.x;
idx += (IdxType)blockIdx.x * (IdxType)blockDim.x;
idx *= VecType::Ratio;
if (idx >= len) return;
IdxType vIdx;
VecType mat, vec;
///@todo: yikes! use fast-int-div here.
///@todo: shared mem for vector could help with perf
if (rowMajor && bcastAlongRows) {
vIdx = idx % D;
vec.load(vector, vIdx);
} else if (!rowMajor && !bcastAlongRows) {
vIdx = idx % N;
vec.load(vector, vIdx);
} else if (rowMajor && !bcastAlongRows) {
vIdx = idx / D;
vec.fill(vector[vIdx]);
} else {
vIdx = idx / N;
vec.fill(vector[vIdx]);
}
mat.load(matrix, idx);
#pragma unroll
for (int i = 0; i < VecType::Ratio; ++i)
mat.val.data[i] = op(mat.val.data[i], vec.val.data[i]);
mat.store(out, idx);
}

template <typename Type, int veclen_, typename Lambda, typename IdxType, int TPB>
void matrixVectorOpImpl(Type* out,
const Type* matrix,
const Type* vec,
IdxType D,
IdxType N,
bool rowMajor,
bool bcastAlongRows,
Lambda op,
cudaStream_t stream)
{
IdxType len = N * D;
IdxType nblks = raft::ceildiv(veclen_ ? len / veclen_ : veclen_, (IdxType)TPB);
matrixVectorOpKernel<Type, veclen_, Lambda, IdxType>
<<<nblks, TPB, 0, stream>>>(out, matrix, vec, D, N, rowMajor, bcastAlongRows, op);
CUDA_CHECK(cudaPeekAtLastError());
}

/**
* @brief Operations for all the columns or rows with a given vector.
* Caution : Threads process multiple elements to speed up processing. These
@@ -127,91 +55,10 @@ void matrixVectorOp(Type* out,
Lambda op,
cudaStream_t stream)
{
IdxType stride = rowMajor ? D : N;
size_t stride_bytes = stride * sizeof(Type);

if (AlignedAccess<16>::test(matrix, stride_bytes)) {
matrixVectorOpImpl<Type, 16 / sizeof(Type), Lambda, IdxType, TPB>(
out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream);
} else if (AlignedAccess<8>::test(matrix, stride_bytes)) {
matrixVectorOpImpl<Type, 8 / sizeof(Type), Lambda, IdxType, TPB>(
out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream);
} else if (AlignedAccess<4>::test(matrix, stride_bytes)) {
matrixVectorOpImpl<Type, 4 / sizeof(Type), Lambda, IdxType, TPB>(
out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream);
} else if (AlignedAccess<2>::test(matrix, stride_bytes)) {
matrixVectorOpImpl<Type, 2 / sizeof(Type), Lambda, IdxType, TPB>(
out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream);
} else if (AlignedAccess<1>::test(matrix, stride_bytes)) {
matrixVectorOpImpl<Type, 1 / sizeof(Type), Lambda, IdxType, TPB>(
out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream);
} else {
matrixVectorOpImpl<Type, 1, Lambda, IdxType, TPB>(
out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream);
}
}

///@todo: come up with a cleaner interface to support these cases in future!

template <typename Type, int veclen_, typename Lambda, typename IdxType>
__global__ void matrixVectorOpKernel(Type* out,
const Type* matrix,
const Type* vector1,
const Type* vector2,
IdxType D,
IdxType N,
bool rowMajor,
bool bcastAlongRows,
Lambda op)
{
typedef TxN_t<Type, veclen_> VecType;
IdxType len = N * D;
IdxType idx = (threadIdx.x + (blockIdx.x * blockDim.x)) * VecType::Ratio;
if (idx >= len) return;
IdxType vIdx;
VecType mat, vec1, vec2;
///@todo: yikes! use fast-int-div here.
///@todo: shared mem for vector could help with perf
if (rowMajor && bcastAlongRows) {
vIdx = idx % D;
vec1.load(vector1, vIdx);
vec2.load(vector2, vIdx);
} else if (!rowMajor && !bcastAlongRows) {
vIdx = idx % N;
vec1.load(vector1, vIdx);
vec2.load(vector2, vIdx);
} else if (rowMajor && !bcastAlongRows) {
vIdx = idx / D;
vec1.fill(vector1[vIdx]);
vec2.fill(vector2[vIdx]);
} else {
vIdx = idx / N;
vec1.fill(vector1[vIdx]);
vec2.fill(vector2[vIdx]);
}
mat.load(matrix, idx);
#pragma unroll
for (int i = 0; i < VecType::Ratio; ++i)
mat.val.data[i] = op(mat.val.data[i], vec1.val.data[i], vec2.val.data[i]);
mat.store(out, idx);
}

template <typename Type, int veclen_, typename Lambda, typename IdxType, int TPB>
void matrixVectorOpImpl(Type* out,
const Type* matrix,
const Type* vec1,
const Type* vec2,
IdxType D,
IdxType N,
bool rowMajor,
bool bcastAlongRows,
Lambda op,
cudaStream_t stream)
{
IdxType nblks = raft::ceildiv(N * D, (IdxType)TPB);
matrixVectorOpKernel<Type, veclen_, Lambda, IdxType>
<<<nblks, TPB, 0, stream>>>(out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op);
CUDA_CHECK(cudaPeekAtLastError());
IdxType stride = rowMajor ? D : N;
IdxType nLines = rowMajor ? N : D;
return matrix::linewiseOp(
out, matrix, stride, nLines, rowMajor == bcastAlongRows, op, stream, vec);
}

/**
@@ -250,28 +97,10 @@ void matrixVectorOp(Type* out,
Lambda op,
cudaStream_t stream)
{
IdxType stride = rowMajor ? D : N;
size_t stride_bytes = stride * sizeof(Type);

if (AlignedAccess<16>::test(matrix, stride_bytes)) {
matrixVectorOpImpl<Type, 16 / sizeof(Type), Lambda, IdxType, TPB>(
out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream);
} else if (AlignedAccess<8>::test(matrix, stride_bytes)) {
matrixVectorOpImpl<Type, 8 / sizeof(Type), Lambda, IdxType, TPB>(
out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream);
} else if (AlignedAccess<4>::test(matrix, stride_bytes)) {
matrixVectorOpImpl<Type, 4 / sizeof(Type), Lambda, IdxType, TPB>(
out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream);
} else if (AlignedAccess<2>::test(matrix, stride_bytes)) {
matrixVectorOpImpl<Type, 2 / sizeof(Type), Lambda, IdxType, TPB>(
out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream);
} else if (AlignedAccess<1>::test(matrix, stride_bytes)) {
matrixVectorOpImpl<Type, 1 / sizeof(Type), Lambda, IdxType, TPB>(
out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream);
} else {
matrixVectorOpImpl<Type, 1, Lambda, IdxType, TPB>(
out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream);
}
IdxType stride = rowMajor ? D : N;
IdxType nLines = rowMajor ? N : D;
return matrix::linewiseOp(
out, matrix, stride, nLines, rowMajor == bcastAlongRows, op, stream, vec1, vec2);
}

}; // end namespace linalg
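
The single flag passed to matrix::linewiseOp above, rowMajor == bcastAlongRows, collapses the four (rowMajor, bcastAlongRows) branches of the removed kernels: when it is true, the vector is indexed along the contiguous dimension of each line; otherwise one vector element is broadcast across a whole line. Below is a host-side reference sketch of that equivalence; it is illustrative only and not part of the PR.

#include <cstddef>

// Naive reference for matrixVectorOp's indexing after this PR. A "line" is the
// contiguous dimension of the matrix: a row if rowMajor, a column otherwise.
template <typename T, typename Op>
void matrix_vector_op_reference(T* out, const T* matrix, const T* vec,
                                std::size_t D, std::size_t N,
                                bool rowMajor, bool bcastAlongRows, Op op)
{
  const std::size_t stride = rowMajor ? D : N;  // elements per line
  const std::size_t nLines = rowMajor ? N : D;  // number of lines
  const bool alongLines    = (rowMajor == bcastAlongRows);
  for (std::size_t line = 0; line < nLines; ++line) {
    for (std::size_t i = 0; i < stride; ++i) {
      const std::size_t idx = line * stride + i;
      // alongLines: vec is indexed within the line (e.g. per column for a
      // row-major matrix); otherwise vec[line] is broadcast over the line.
      out[idx] = op(matrix[idx], vec[alongLines ? i : line]);
    }
  }
}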