Skip to content

Commit

Permalink
Cosmetics and tests
Browse files Browse the repository at this point in the history
  • Loading branch information
achirkin committed Nov 25, 2021
1 parent ed27367 commit bb30615
Show file tree
Hide file tree
Showing 3 changed files with 114 additions and 52 deletions.
66 changes: 38 additions & 28 deletions cpp/include/raft/linalg/matrix_linewise_op.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,7 @@ __global__ void __launch_bounds__(MaxOffset, 2)
L::loadVec(vecs, arrTail % rowLen, rowLen)...);
}

template <typename Type, typename IdxType, std::size_t VecBytes,
template <typename Type, typename IdxType, std::size_t VecBytes, int BlockSize,
typename Lambda, typename... Vecs>
void matrixLinewiseVecCols(Type* out, const Type* in, const IdxType rowLen,
const IdxType nRows, Lambda op, cudaStream_t stream,
Expand All @@ -202,11 +202,9 @@ void matrixLinewiseVecCols(Type* out, const Type* in, const IdxType rowLen,
const IdxType alignedOff = IdxType(alignedStart - in);
const IdxType alignedEnd = IdxType(AlignBytes::roundDown(in + totalLen) - in);
const IdxType alignedLen = alignedEnd - alignedOff;
// blockSize
constexpr int BlockSize = 256;
constexpr dim3 bs(BlockSize, 1, 1);
// Minimum size of the grid to make device well occupied
const uint occupy = raft::getMultiProcessorCount() * 64;
// Minimum size of the grid to make the device well occupied
const uint occupy = raft::getMultiProcessorCount() * (16384 / BlockSize);
// does not make sense to have more blocks than this
const uint maxBlocks = raft::ceildiv<uint>(uint(alignedLen), bs.x * VecElems);
const dim3 gs(min(maxBlocks, occupy), 1, 1);
Expand All @@ -227,7 +225,7 @@ void matrixLinewiseVecCols(Type* out, const Type* in, const IdxType rowLen,
}
}

template <typename Type, typename IdxType, std::size_t VecBytes,
template <typename Type, typename IdxType, std::size_t VecBytes, int BlockSize,
typename Lambda, typename... Vecs>
void matrixLinewiseVecRows(Type* out, const Type* in, const IdxType rowLen,
const IdxType nRows, Lambda op, cudaStream_t stream,
Expand All @@ -236,14 +234,13 @@ void matrixLinewiseVecRows(Type* out, const Type* in, const IdxType rowLen,
constexpr std::size_t VecElems = VecBytes / sizeof(Type);
const IdxType totalLen = rowLen * nRows;
// blockSize
constexpr int BlockSize = 256;
constexpr dim3 bs(BlockSize, 1, 1);
// if we have `stride` number of blocks, then each block processes always the same
// indices along dimension rowLen; this means a block needs to index `vecs` only once!
const uint stride =
(rowLen / raft::gcd(bs.x * uint(VecElems), uint(rowLen))) * VecElems;
// Minimum size of the grid to make device well occupied
const uint occupy = raft::getMultiProcessorCount() * 64;
// Minimum size of the grid to make the device well occupied
const uint occupy = raft::getMultiProcessorCount() * (16384 / BlockSize);
const dim3 gs(min(
// does not make sense to have more blocks than this
raft::ceildiv<uint>(uint(totalLen), bs.x * VecElems),
Expand All @@ -270,23 +267,36 @@ void matrixLinewiseVecRows(Type* out, const Type* in, const IdxType rowLen,
}
}

template <std::size_t VecBytes = 16>
/**
* Select one of the implementations:
* a. vectors applied along/across lines
* b. recursively halve VecBytes until the alignment offsets of `in` and `out`
*    match, so that the same vectorized load/store width is valid for both.
*
* @tparam VecBytes - size of the load/store ops in bytes.
* @tparam BlockSize - the CUDA thread block size; fixed, and should not affect performance.
*/
template <std::size_t VecBytes = 16, int BlockSize = 256>
struct MatrixLinewiseOp {
template <typename Type, typename IdxType, typename Lambda, typename... Vecs>
static void run(Type* out, const Type* in, const IdxType lineLen,
const IdxType nLines, const bool alongLines, Lambda op,
cudaStream_t stream, Vecs... vecs) {
if constexpr (VecBytes > sizeof(Type)) {
if (!raft::Pow2<VecBytes>::areSameAlignOffsets(in, out))
return MatrixLinewiseOp<std::max((VecBytes >> 1), sizeof(Type))>::run(
out, in, lineLen, nLines, alongLines, op, stream, vecs...);
return MatrixLinewiseOp<std::max((VecBytes >> 1), sizeof(Type)),
BlockSize>::run(out, in, lineLen, nLines,
alongLines, op, stream,
vecs...);
}
if (alongLines)
return matrixLinewiseVecRows<Type, IdxType, VecBytes, Lambda, Vecs...>(
out, in, lineLen, nLines, op, stream, vecs...);
return matrixLinewiseVecRows<Type, IdxType, VecBytes, BlockSize, Lambda,
Vecs...>(out, in, lineLen, nLines, op,
stream, vecs...);
else
return matrixLinewiseVecCols<Type, IdxType, VecBytes, Lambda, Vecs...>(
out, in, lineLen, nLines, op, stream, vecs...);
return matrixLinewiseVecCols<Type, IdxType, VecBytes, BlockSize, Lambda,
Vecs...>(out, in, lineLen, nLines, op,
stream, vecs...);
}
};

Expand All @@ -297,29 +307,29 @@ struct MatrixLinewiseOp {
* row-vectors or column-vectors.
* The term `line` here signifies that the lines can be either columns or rows,
* depending on the matrix layout.
* What matters is if vectors are applied along lines (indices of vectors correspond
* indices within lines), or across lines (indices of vectors correspond to line indices).
* What matters is if the vectors are applied along lines (indices of vectors correspond to
* indices within lines), or across lines (indices of vectors correspond to line numbers).
*
* @param out result of the operation; can be same as `in`; should be aligned the same as `in`
* to allow faster vectorized memory transfers.
* @param in input matrix consisting of `nLines` lines, each `lineLen`-long.
* @param lineLen length of matrix line in elements (`=nCols` in row-major or `=nRows` in col-major)
* @param nLines number of matrix lines (`=nRows` in row-major or `=nCols` in col-major)
* @param alongLines whether vectors are indices along or across lines.
* @param op the operation applied on each line:
* @param [out] out result of the operation; can be same as `in`; should be aligned the same
* as `in` to allow faster vectorized memory transfers.
* @param [in] in input matrix consisting of `nLines` lines, each `lineLen`-long.
* @param [in] lineLen length of matrix line in elements (`=nCols` in row-major or `=nRows` in col-major)
* @param [in] nLines number of matrix lines (`=nRows` in row-major or `=nCols` in col-major)
* @param [in] alongLines whether vectors are indices along or across lines.
* @param [in] op the operation applied on each line:
* for i in [0..lineLen) and j in [0..nLines):
* out[i, j] = op(in[i, j], vec1[i], vec2[i], ... veck[i]) if alongLines = true
* out[i, j] = op(in[i, j], vec1[j], vec2[j], ... veck[j]) if alongLines = false
* where matrix indexing is row-major ([i, j] = [i + lineLen * j]).
* @param stream a cuda stream for the kernels
* @param vecs zero or more vectors to be passed as arguments,
* @param [in] stream a cuda stream for the kernels
* @param [in] vecs zero or more vectors to be passed as arguments,
* size of each vector is `alongLines ? lineLen : nLines`.
*/
template <typename Type, typename IdxType, typename Lambda, typename... Vecs>
void matrixLinewiseOp(Type* out, const Type* in, const IdxType lineLen,
const IdxType nLines, const bool alongLines, Lambda op,
cudaStream_t stream, Vecs... vecs) {
linewise_impl::MatrixLinewiseOp<16>::run<Type, IdxType, Lambda, Vecs...>(
linewise_impl::MatrixLinewiseOp<16, 256>::run<Type, IdxType, Lambda, Vecs...>(
out, in, lineLen, nLines, alongLines, op, stream, vecs...);
}

Expand Down
41 changes: 29 additions & 12 deletions cpp/include/raft/linalg/matrix_vector_op.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#pragma once

#include <raft/cuda_utils.cuh>
#include <raft/linalg/matrix_linewise_op.cuh>
#include <raft/pow2_utils.cuh>
#include <raft/vectorized.cuh>

Expand Down Expand Up @@ -127,22 +128,30 @@ void matrixVectorOp(Type* out,
Lambda op,
cudaStream_t stream)
{
IdxType stride = rowMajor ? D : N;
IdxType stride = rowMajor ? D : N;
// IdxType nLines = rowMajor ? N : D;
// return matrixLinewiseOp(out, matrix, stride, nLines,
// rowMajor == bcastAlongRows, op, stream, vec);

size_t stride_bytes = stride * sizeof(Type);

if (AlignedAccess<16>::test(matrix, stride_bytes)) {
if (AlignedAccess<16>::test(matrix, stride_bytes) && AlignedAccess<16>::test(out, stride_bytes)) {
matrixVectorOpImpl<Type, 16 / sizeof(Type), Lambda, IdxType, TPB>(
out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream);
} else if (AlignedAccess<8>::test(matrix, stride_bytes)) {
} else if (AlignedAccess<8>::test(matrix, stride_bytes) &&
AlignedAccess<8>::test(out, stride_bytes)) {
matrixVectorOpImpl<Type, 8 / sizeof(Type), Lambda, IdxType, TPB>(
out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream);
} else if (AlignedAccess<4>::test(matrix, stride_bytes)) {
} else if (AlignedAccess<4>::test(matrix, stride_bytes) &&
AlignedAccess<4>::test(out, stride_bytes)) {
matrixVectorOpImpl<Type, 4 / sizeof(Type), Lambda, IdxType, TPB>(
out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream);
} else if (AlignedAccess<2>::test(matrix, stride_bytes)) {
} else if (AlignedAccess<2>::test(matrix, stride_bytes) &&
AlignedAccess<2>::test(out, stride_bytes)) {
matrixVectorOpImpl<Type, 2 / sizeof(Type), Lambda, IdxType, TPB>(
out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream);
} else if (AlignedAccess<1>::test(matrix, stride_bytes)) {
} else if (AlignedAccess<1>::test(matrix, stride_bytes) &&
AlignedAccess<1>::test(out, stride_bytes)) {
matrixVectorOpImpl<Type, 1 / sizeof(Type), Lambda, IdxType, TPB>(
out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream);
} else {
Expand Down Expand Up @@ -250,22 +259,30 @@ void matrixVectorOp(Type* out,
Lambda op,
cudaStream_t stream)
{
IdxType stride = rowMajor ? D : N;
IdxType stride = rowMajor ? D : N;
// IdxType nLines = rowMajor ? N : D;
// return matrixLinewiseOp(out, matrix, stride, nLines,
// rowMajor == bcastAlongRows, op, stream, vec1, vec2);

size_t stride_bytes = stride * sizeof(Type);

if (AlignedAccess<16>::test(matrix, stride_bytes)) {
if (AlignedAccess<16>::test(matrix, stride_bytes) && AlignedAccess<16>::test(out, stride_bytes)) {
matrixVectorOpImpl<Type, 16 / sizeof(Type), Lambda, IdxType, TPB>(
out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream);
} else if (AlignedAccess<8>::test(matrix, stride_bytes)) {
} else if (AlignedAccess<8>::test(matrix, stride_bytes) &&
AlignedAccess<8>::test(out, stride_bytes)) {
matrixVectorOpImpl<Type, 8 / sizeof(Type), Lambda, IdxType, TPB>(
out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream);
} else if (AlignedAccess<4>::test(matrix, stride_bytes)) {
} else if (AlignedAccess<4>::test(matrix, stride_bytes) &&
AlignedAccess<4>::test(out, stride_bytes)) {
matrixVectorOpImpl<Type, 4 / sizeof(Type), Lambda, IdxType, TPB>(
out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream);
} else if (AlignedAccess<2>::test(matrix, stride_bytes)) {
} else if (AlignedAccess<2>::test(matrix, stride_bytes) &&
AlignedAccess<2>::test(out, stride_bytes)) {
matrixVectorOpImpl<Type, 2 / sizeof(Type), Lambda, IdxType, TPB>(
out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream);
} else if (AlignedAccess<1>::test(matrix, stride_bytes)) {
} else if (AlignedAccess<1>::test(matrix, stride_bytes) &&
AlignedAccess<1>::test(out, stride_bytes)) {
matrixVectorOpImpl<Type, 1 / sizeof(Type), Lambda, IdxType, TPB>(
out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream);
} else {
Expand Down
59 changes: 47 additions & 12 deletions cpp/test/linalg/matrix_linewise_op.cu
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
#include <raft/random/rng.hpp>
#include <rmm/device_uvector.hpp>
#include "../test_utils.h"
#include "matrix_vector_op.cuh"

namespace raft {
namespace linalg {
Expand Down Expand Up @@ -54,6 +55,9 @@ struct LinewiseTestParams {
std::size_t workSizeBytes;
uint64_t seed;
bool useVanillaMatrixVectorOp;
bool checkCorrectness;
int inAlignOffset;
int outAlignOffset;
};

template <typename T, typename I, typename ParamsReader>
Expand Down Expand Up @@ -124,7 +128,7 @@ struct LinewiseTest
I m = I(floor(y));
if (n > 0 && m > 0) out.push_back(std::make_tuple(n, m));
};
std::vector<double> sizes = {15, 16, 17, 256, 257, 263};
std::vector<double> sizes = {15, 16, 17, 256, 257, 263, 1024};
addIfMakesSense(squareN, squareN);
for (I k : sizes) {
addIfMakesSense(solveForN(k), k);
Expand All @@ -137,9 +141,10 @@ struct LinewiseTest
std::tuple<T*, const T*, const T*, const T*> assignSafePtrs(
rmm::device_uvector<T>& blob, I n, I m) {
typedef raft::Pow2<PTR_PADDING> Align;
T* out = Align::roundUp(blob.data());
T* out = Align::roundUp(blob.data()) + params.outAlignOffset;
const T* in =
const_cast<const T*>(Align::roundUp(out + n * m + PTR_PADDING));
const_cast<const T*>(Align::roundUp(out + n * m + PTR_PADDING)) +
params.inAlignOffset;
const T* vec1 = Align::roundUp(in + n * m + PTR_PADDING);
const T* vec2 = Align::roundUp(vec1 + m + PTR_PADDING);
ASSERT(blob.data() + blob.size() >= vec2 + PTR_PADDING,
Expand All @@ -149,6 +154,8 @@ struct LinewiseTest

testing::AssertionResult run() {
rmm::device_uvector<T> blob = genData();
rmm::device_uvector<T> blob_val(
params.checkCorrectness ? blob.size() / 2 : 0, stream);

auto dims = suggestDimensions(2);

Expand All @@ -167,9 +174,25 @@ struct LinewiseTest
PUSH_RANGE(stream, "one vec");
runLinewiseSum(out, in, lineLen, nLines, alongRows, vec1);
POP_RANGE(stream);
if (params.checkCorrectness) {
naiveMatVec(blob_val.data(), in, vec1, lineLen, nLines, true,
alongRows, T(1));
EXPECT_NO_FATAL_FAILURE(
devArrMatch(blob_val.data(), out, n * m,
CompareApprox<float>(params.tolerance)))
<< "with one vec";
}
PUSH_RANGE(stream, "two vecs");
runLinewiseSum(out, in, lineLen, nLines, alongRows, vec1, vec2);
POP_RANGE(stream);
if (params.checkCorrectness) {
naiveMatVec(blob_val.data(), in, vec1, vec2, lineLen, nLines, true,
alongRows, T(1));
EXPECT_NO_FATAL_FAILURE(
devArrMatch(blob_val.data(), out, n * m,
CompareApprox<float>(params.tolerance)))
<< "with two vecs";
}
}
POP_RANGE(stream);
}
Expand All @@ -189,39 +212,51 @@ struct LinewiseTest
INSTANTIATE_TEST_SUITE_P(LinewiseOp, TestClass##_##ElemType##_##IndexType, \
TestClass##Params)

auto MegabyteParams = ::testing::Bool();
auto MegabyteParams =
::testing::Combine(::testing::Bool(), ::testing::Values(0, 1, 2, 4),
::testing::Values(0, 1, 2, 3));

struct Megabyte {
typedef bool Params;
typedef std::tuple<bool, int, int> Params;
static LinewiseTestParams read(Params ps) {
return {/** .tolerance */ 0.00001,
/** .workSizeBytes */ 1024 * 1024,
/** .seed */ 42ULL,
/** .useVanillaMatrixVectorOp */ ps};
/** .useVanillaMatrixVectorOp */ std::get<0>(ps),
/** .checkCorrectness */ true,
/** .inAlignOffset */ std::get<1>(ps),
/** .outAlignOffset */ std::get<2>(ps)};
}
};

auto GigabyteParams = ::testing::Bool();
auto GigabyteParams = ::testing::Combine(
::testing::Bool(), ::testing::Values(0, 1, 2), ::testing::Values(0, 1, 2));

struct Gigabyte {
typedef bool Params;
typedef std::tuple<bool, int, int> Params;
static LinewiseTestParams read(Params ps) {
return {/** .tolerance */ 0.00001,
/** .workSizeBytes */ 1024 * 1024 * 1024,
/** .seed */ 42ULL,
/** .useVanillaMatrixVectorOp */ ps};
/** .useVanillaMatrixVectorOp */ std::get<0>(ps),
/** .checkCorrectness */ false,
/** .inAlignOffset */ std::get<1>(ps),
/** .outAlignOffset */ std::get<2>(ps)};
}
};

auto TenGigsParams = ::testing::Bool();
auto TenGigsParams = GigabyteParams;

struct TenGigs {
typedef bool Params;
typedef std::tuple<bool, int, int> Params;
static LinewiseTestParams read(Params ps) {
return {/** .tolerance */ 0.00001,
/** .workSizeBytes */ 10ULL * 1024ULL * 1024ULL * 1024ULL,
/** .seed */ 42ULL,
/** .useVanillaMatrixVectorOp */ ps};
/** .useVanillaMatrixVectorOp */ std::get<0>(ps),
/** .checkCorrectness */ false,
/** .inAlignOffset */ std::get<1>(ps),
/** .outAlignOffset */ std::get<2>(ps)};
}
};

Expand Down

0 comments on commit bb30615

Please sign in to comment.