diff --git a/cpp/include/raft/linalg/matrix_linewise_op.cuh b/cpp/include/raft/linalg/matrix_linewise_op.cuh index d3789be504..af7dd41971 100644 --- a/cpp/include/raft/linalg/matrix_linewise_op.cuh +++ b/cpp/include/raft/linalg/matrix_linewise_op.cuh @@ -190,7 +190,7 @@ __global__ void __launch_bounds__(MaxOffset, 2) L::loadVec(vecs, arrTail % rowLen, rowLen)...); } -template void matrixLinewiseVecCols(Type* out, const Type* in, const IdxType rowLen, const IdxType nRows, Lambda op, cudaStream_t stream, @@ -202,11 +202,9 @@ void matrixLinewiseVecCols(Type* out, const Type* in, const IdxType rowLen, const IdxType alignedOff = IdxType(alignedStart - in); const IdxType alignedEnd = IdxType(AlignBytes::roundDown(in + totalLen) - in); const IdxType alignedLen = alignedEnd - alignedOff; - // blockSize - constexpr int BlockSize = 256; constexpr dim3 bs(BlockSize, 1, 1); - // Minimum size of the grid to make device well occupied - const uint occupy = raft::getMultiProcessorCount() * 64; + // Minimum size of the grid to make the device well occupied + const uint occupy = raft::getMultiProcessorCount() * (16384 / BlockSize); // does not make sense to have more blocks than this const uint maxBlocks = raft::ceildiv(uint(alignedLen), bs.x * VecElems); const dim3 gs(min(maxBlocks, occupy), 1, 1); @@ -227,7 +225,7 @@ void matrixLinewiseVecCols(Type* out, const Type* in, const IdxType rowLen, } } -template void matrixLinewiseVecRows(Type* out, const Type* in, const IdxType rowLen, const IdxType nRows, Lambda op, cudaStream_t stream, @@ -236,14 +234,13 @@ void matrixLinewiseVecRows(Type* out, const Type* in, const IdxType rowLen, constexpr std::size_t VecElems = VecBytes / sizeof(Type); const IdxType totalLen = rowLen * nRows; // blockSize - constexpr int BlockSize = 256; constexpr dim3 bs(BlockSize, 1, 1); // if we have `stride` number of blocks, then each block processes always the same // indices along dimension rowLen; this means a block needs to index `vecs` only once! const uint stride = (rowLen / raft::gcd(bs.x * uint(VecElems), uint(rowLen))) * VecElems; - // Minimum size of the grid to make device well occupied - const uint occupy = raft::getMultiProcessorCount() * 64; + // Minimum size of the grid to make the device well occupied + const uint occupy = raft::getMultiProcessorCount() * (16384 / BlockSize); const dim3 gs(min( // does not make sense to have more blocks than this raft::ceildiv(uint(totalLen), bs.x * VecElems), @@ -270,7 +267,16 @@ void matrixLinewiseVecRows(Type* out, const Type* in, const IdxType rowLen, } } -template +/** + * Select one of the implementations: + * a. vectors applied along/across lines + * b. recursively try different VecBytes, such that alignments of `in` and `out` + * are the same. + * + * @tparam VecBytes - size of the load/store ops in bytes. + * @tparam BlockSize - is fixed and should not affect the performance. + */ +template struct MatrixLinewiseOp { template static void run(Type* out, const Type* in, const IdxType lineLen, @@ -278,15 +284,19 @@ struct MatrixLinewiseOp { cudaStream_t stream, Vecs... vecs) { if constexpr (VecBytes > sizeof(Type)) { if (!raft::Pow2::areSameAlignOffsets(in, out)) - return MatrixLinewiseOp> 1), sizeof(Type))>::run( - out, in, lineLen, nLines, alongLines, op, stream, vecs...); + return MatrixLinewiseOp> 1), sizeof(Type)), + BlockSize>::run(out, in, lineLen, nLines, + alongLines, op, stream, + vecs...); } if (alongLines) - return matrixLinewiseVecRows( - out, in, lineLen, nLines, op, stream, vecs...); + return matrixLinewiseVecRows(out, in, lineLen, nLines, op, + stream, vecs...); else - return matrixLinewiseVecCols( - out, in, lineLen, nLines, op, stream, vecs...); + return matrixLinewiseVecCols(out, in, lineLen, nLines, op, + stream, vecs...); } }; @@ -297,29 +307,29 @@ struct MatrixLinewiseOp { * row-vectors or column-vectors. * The term `line` here signifies that the lines can be either columns or rows, * depending on the matrix layout. - * What matters is if vectors are applied along lines (indices of vectors correspond - * indices within lines), or across lines (indices of vectors correspond to line indices). + * What matters is if the vectors are applied along lines (indices of vectors correspond to + * indices within lines), or across lines (indices of vectors correspond to line numbers). * - * @param out result of the operation; can be same as `in`; should be aligned the same as `in` - * to allow faster vectorized memory transfers. - * @param in input matrix consisting of `nLines` lines, each `lineLen`-long. - * @param lineLen length of matrix line in elements (`=nCols` in row-major or `=nRows` in col-major) - * @param nLines number of matrix lines (`=nRows` in row-major or `=nCols` in col-major) - * @param alongLines whether vectors are indices along or across lines. - * @param op the operation applied on each line: + * @param [out] out result of the operation; can be same as `in`; should be aligned the same + * as `in` to allow faster vectorized memory transfers. + * @param [in] in input matrix consisting of `nLines` lines, each `lineLen`-long. + * @param [in] lineLen length of matrix line in elements (`=nCols` in row-major or `=nRows` in col-major) + * @param [in] nLines number of matrix lines (`=nRows` in row-major or `=nCols` in col-major) + * @param [in] alongLines whether vectors are indices along or across lines. + * @param [in] op the operation applied on each line: * for i in [0..lineLen) and j in [0..nLines): * out[i, j] = op(in[i, j], vec1[i], vec2[i], ... veck[i]) if alongLines = true * out[i, j] = op(in[i, j], vec1[j], vec2[j], ... veck[j]) if alongLines = false * where matrix indexing is row-major ([i, j] = [i + lineLen * j]). - * @param stream a cuda stream for the kernels - * @param vecs zero or more vectors to be passed as arguments, + * @param [in] stream a cuda stream for the kernels + * @param [in] vecs zero or more vectors to be passed as arguments, * size of each vector is `alongLines ? lineLen : nLines`. */ template void matrixLinewiseOp(Type* out, const Type* in, const IdxType lineLen, const IdxType nLines, const bool alongLines, Lambda op, cudaStream_t stream, Vecs... vecs) { - linewise_impl::MatrixLinewiseOp<16>::run( + linewise_impl::MatrixLinewiseOp<16, 256>::run( out, in, lineLen, nLines, alongLines, op, stream, vecs...); } diff --git a/cpp/include/raft/linalg/matrix_vector_op.cuh b/cpp/include/raft/linalg/matrix_vector_op.cuh index 81c1919b2e..3082d92d9f 100644 --- a/cpp/include/raft/linalg/matrix_vector_op.cuh +++ b/cpp/include/raft/linalg/matrix_vector_op.cuh @@ -17,6 +17,7 @@ #pragma once #include +#include #include #include @@ -127,22 +128,30 @@ void matrixVectorOp(Type* out, Lambda op, cudaStream_t stream) { - IdxType stride = rowMajor ? D : N; + IdxType stride = rowMajor ? D : N; + // IdxType nLines = rowMajor ? N : D; + // return matrixLinewiseOp(out, matrix, stride, nLines, + // rowMajor == bcastAlongRows, op, stream, vec); + size_t stride_bytes = stride * sizeof(Type); - if (AlignedAccess<16>::test(matrix, stride_bytes)) { + if (AlignedAccess<16>::test(matrix, stride_bytes) && AlignedAccess<16>::test(out, stride_bytes)) { matrixVectorOpImpl( out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream); - } else if (AlignedAccess<8>::test(matrix, stride_bytes)) { + } else if (AlignedAccess<8>::test(matrix, stride_bytes) && + AlignedAccess<8>::test(out, stride_bytes)) { matrixVectorOpImpl( out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream); - } else if (AlignedAccess<4>::test(matrix, stride_bytes)) { + } else if (AlignedAccess<4>::test(matrix, stride_bytes) && + AlignedAccess<4>::test(out, stride_bytes)) { matrixVectorOpImpl( out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream); - } else if (AlignedAccess<2>::test(matrix, stride_bytes)) { + } else if (AlignedAccess<2>::test(matrix, stride_bytes) && + AlignedAccess<2>::test(out, stride_bytes)) { matrixVectorOpImpl( out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream); - } else if (AlignedAccess<1>::test(matrix, stride_bytes)) { + } else if (AlignedAccess<1>::test(matrix, stride_bytes) && + AlignedAccess<1>::test(out, stride_bytes)) { matrixVectorOpImpl( out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream); } else { @@ -250,22 +259,30 @@ void matrixVectorOp(Type* out, Lambda op, cudaStream_t stream) { - IdxType stride = rowMajor ? D : N; + IdxType stride = rowMajor ? D : N; + // IdxType nLines = rowMajor ? N : D; + // return matrixLinewiseOp(out, matrix, stride, nLines, + // rowMajor == bcastAlongRows, op, stream, vec1, vec2); + size_t stride_bytes = stride * sizeof(Type); - if (AlignedAccess<16>::test(matrix, stride_bytes)) { + if (AlignedAccess<16>::test(matrix, stride_bytes) && AlignedAccess<16>::test(out, stride_bytes)) { matrixVectorOpImpl( out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream); - } else if (AlignedAccess<8>::test(matrix, stride_bytes)) { + } else if (AlignedAccess<8>::test(matrix, stride_bytes) && + AlignedAccess<8>::test(out, stride_bytes)) { matrixVectorOpImpl( out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream); - } else if (AlignedAccess<4>::test(matrix, stride_bytes)) { + } else if (AlignedAccess<4>::test(matrix, stride_bytes) && + AlignedAccess<4>::test(out, stride_bytes)) { matrixVectorOpImpl( out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream); - } else if (AlignedAccess<2>::test(matrix, stride_bytes)) { + } else if (AlignedAccess<2>::test(matrix, stride_bytes) && + AlignedAccess<2>::test(out, stride_bytes)) { matrixVectorOpImpl( out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream); - } else if (AlignedAccess<1>::test(matrix, stride_bytes)) { + } else if (AlignedAccess<1>::test(matrix, stride_bytes) && + AlignedAccess<1>::test(out, stride_bytes)) { matrixVectorOpImpl( out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream); } else { diff --git a/cpp/test/linalg/matrix_linewise_op.cu b/cpp/test/linalg/matrix_linewise_op.cu index c2f70d4525..4a77fe57aa 100644 --- a/cpp/test/linalg/matrix_linewise_op.cu +++ b/cpp/test/linalg/matrix_linewise_op.cu @@ -23,6 +23,7 @@ #include #include #include "../test_utils.h" +#include "matrix_vector_op.cuh" namespace raft { namespace linalg { @@ -54,6 +55,9 @@ struct LinewiseTestParams { std::size_t workSizeBytes; uint64_t seed; bool useVanillaMatrixVectorOp; + bool checkCorrectness; + int inAlignOffset; + int outAlignOffset; }; template @@ -124,7 +128,7 @@ struct LinewiseTest I m = I(floor(y)); if (n > 0 && m > 0) out.push_back(std::make_tuple(n, m)); }; - std::vector sizes = {15, 16, 17, 256, 257, 263}; + std::vector sizes = {15, 16, 17, 256, 257, 263, 1024}; addIfMakesSense(squareN, squareN); for (I k : sizes) { addIfMakesSense(solveForN(k), k); @@ -137,9 +141,10 @@ struct LinewiseTest std::tuple assignSafePtrs( rmm::device_uvector& blob, I n, I m) { typedef raft::Pow2 Align; - T* out = Align::roundUp(blob.data()); + T* out = Align::roundUp(blob.data()) + params.outAlignOffset; const T* in = - const_cast(Align::roundUp(out + n * m + PTR_PADDING)); + const_cast(Align::roundUp(out + n * m + PTR_PADDING)) + + params.inAlignOffset; const T* vec1 = Align::roundUp(in + n * m + PTR_PADDING); const T* vec2 = Align::roundUp(vec1 + m + PTR_PADDING); ASSERT(blob.data() + blob.size() >= vec2 + PTR_PADDING, @@ -149,6 +154,8 @@ struct LinewiseTest testing::AssertionResult run() { rmm::device_uvector blob = genData(); + rmm::device_uvector blob_val( + params.checkCorrectness ? blob.size() / 2 : 0, stream); auto dims = suggestDimensions(2); @@ -167,9 +174,25 @@ struct LinewiseTest PUSH_RANGE(stream, "one vec"); runLinewiseSum(out, in, lineLen, nLines, alongRows, vec1); POP_RANGE(stream); + if (params.checkCorrectness) { + naiveMatVec(blob_val.data(), in, vec1, lineLen, nLines, true, + alongRows, T(1)); + EXPECT_NO_FATAL_FAILURE( + devArrMatch(blob_val.data(), out, n * m, + CompareApprox(params.tolerance))) + << "with one vec"; + } PUSH_RANGE(stream, "two vecs"); runLinewiseSum(out, in, lineLen, nLines, alongRows, vec1, vec2); POP_RANGE(stream); + if (params.checkCorrectness) { + naiveMatVec(blob_val.data(), in, vec1, vec2, lineLen, nLines, true, + alongRows, T(1)); + EXPECT_NO_FATAL_FAILURE( + devArrMatch(blob_val.data(), out, n * m, + CompareApprox(params.tolerance))) + << "with two vecs"; + } } POP_RANGE(stream); } @@ -189,39 +212,51 @@ struct LinewiseTest INSTANTIATE_TEST_SUITE_P(LinewiseOp, TestClass##_##ElemType##_##IndexType, \ TestClass##Params) -auto MegabyteParams = ::testing::Bool(); +auto MegabyteParams = + ::testing::Combine(::testing::Bool(), ::testing::Values(0, 1, 2, 4), + ::testing::Values(0, 1, 2, 3)); struct Megabyte { - typedef bool Params; + typedef std::tuple Params; static LinewiseTestParams read(Params ps) { return {/** .tolerance */ 0.00001, /** .workSizeBytes */ 1024 * 1024, /** .seed */ 42ULL, - /** .useVanillaMatrixVectorOp */ ps}; + /** .useVanillaMatrixVectorOp */ std::get<0>(ps), + /** .checkCorrectness */ true, + /** .inAlignOffset */ std::get<1>(ps), + /** .outAlignOffset */ std::get<2>(ps)}; } }; -auto GigabyteParams = ::testing::Bool(); +auto GigabyteParams = ::testing::Combine( + ::testing::Bool(), ::testing::Values(0, 1, 2), ::testing::Values(0, 1, 2)); struct Gigabyte { - typedef bool Params; + typedef std::tuple Params; static LinewiseTestParams read(Params ps) { return {/** .tolerance */ 0.00001, /** .workSizeBytes */ 1024 * 1024 * 1024, /** .seed */ 42ULL, - /** .useVanillaMatrixVectorOp */ ps}; + /** .useVanillaMatrixVectorOp */ std::get<0>(ps), + /** .checkCorrectness */ false, + /** .inAlignOffset */ std::get<1>(ps), + /** .outAlignOffset */ std::get<2>(ps)}; } }; -auto TenGigsParams = ::testing::Bool(); +auto TenGigsParams = GigabyteParams; struct TenGigs { - typedef bool Params; + typedef std::tuple Params; static LinewiseTestParams read(Params ps) { return {/** .tolerance */ 0.00001, /** .workSizeBytes */ 10ULL * 1024ULL * 1024ULL * 1024ULL, /** .seed */ 42ULL, - /** .useVanillaMatrixVectorOp */ ps}; + /** .useVanillaMatrixVectorOp */ std::get<0>(ps), + /** .checkCorrectness */ false, + /** .inAlignOffset */ std::get<1>(ps), + /** .outAlignOffset */ std::get<2>(ps)}; } };