From 15fd1d3faf151480a8fd2d259a12b4487d00700d Mon Sep 17 00:00:00 2001 From: "Artem M. Chirkin" <9253178+achirkin@users.noreply.github.com> Date: Wed, 12 Jan 2022 15:26:50 +0100 Subject: [PATCH] Faster matrix-vector-ops (#401) Introduce `matrixLinewiseOp` for applying row- or column-wise operations on matrices with (templated) fixed number of vectors. This is a rewriting of `matrixVectorOp`. The new primitive is on average 2x faster for various numbers of columns/rows. In general case: it improves performance by reusing the vector values across multiple matrix rows/columns (trying to load vector value once or at least cache it). In edge case: it tries to use vectorized load/store operations on the input/output matrices even when the pointers are not properly aligned, or the vectors' length is not multiple of the alignment. Authors: - Artem M. Chirkin (https://github.com/achirkin) Approvers: - Tamas Bela Feher (https://github.com/tfeher) - Corey J. Nolet (https://github.com/cjnolet) URL: https://github.com/rapidsai/raft/pull/401 --- cpp/include/raft/linalg/matrix_vector_op.cuh | 189 +----- .../raft/matrix/detail/linewise_op.cuh | 546 ++++++++++++++++++ cpp/include/raft/matrix/matrix.hpp | 48 +- cpp/include/raft/pow2_utils.cuh | 44 +- cpp/test/CMakeLists.txt | 1 + cpp/test/matrix/linewise_op.cu | 277 +++++++++ 6 files changed, 904 insertions(+), 201 deletions(-) create mode 100644 cpp/include/raft/matrix/detail/linewise_op.cuh create mode 100644 cpp/test/matrix/linewise_op.cu diff --git a/cpp/include/raft/linalg/matrix_vector_op.cuh b/cpp/include/raft/linalg/matrix_vector_op.cuh index bd80cf5d02..750eca0742 100644 --- a/cpp/include/raft/linalg/matrix_vector_op.cuh +++ b/cpp/include/raft/linalg/matrix_vector_op.cuh @@ -16,83 +16,11 @@ #pragma once -#include -#include -#include +#include namespace raft { namespace linalg { -namespace { -template -struct AlignedAccess { - template - static inline bool test(const T* matrix, size_t strideBytes) - { - return Pow2::isAligned(matrix) && Pow2::isAligned(strideBytes) && - Pow2::isAligned(VecBytes); - } -}; -}; // namespace - -template -__global__ void matrixVectorOpKernel(Type* out, - const Type* matrix, - const Type* vector, - IdxType D, - IdxType N, - bool rowMajor, - bool bcastAlongRows, - Lambda op) -{ - typedef TxN_t VecType; - IdxType len = N * D; - IdxType idx = threadIdx.x; - idx += (IdxType)blockIdx.x * (IdxType)blockDim.x; - idx *= VecType::Ratio; - if (idx >= len) return; - IdxType vIdx; - VecType mat, vec; - ///@todo: yikes! use fast-int-div here. - ///@todo: shared mem for vector could help with perf - if (rowMajor && bcastAlongRows) { - vIdx = idx % D; - vec.load(vector, vIdx); - } else if (!rowMajor && !bcastAlongRows) { - vIdx = idx % N; - vec.load(vector, vIdx); - } else if (rowMajor && !bcastAlongRows) { - vIdx = idx / D; - vec.fill(vector[vIdx]); - } else { - vIdx = idx / N; - vec.fill(vector[vIdx]); - } - mat.load(matrix, idx); -#pragma unroll - for (int i = 0; i < VecType::Ratio; ++i) - mat.val.data[i] = op(mat.val.data[i], vec.val.data[i]); - mat.store(out, idx); -} - -template -void matrixVectorOpImpl(Type* out, - const Type* matrix, - const Type* vec, - IdxType D, - IdxType N, - bool rowMajor, - bool bcastAlongRows, - Lambda op, - cudaStream_t stream) -{ - IdxType len = N * D; - IdxType nblks = raft::ceildiv(veclen_ ? len / veclen_ : veclen_, (IdxType)TPB); - matrixVectorOpKernel - <<>>(out, matrix, vec, D, N, rowMajor, bcastAlongRows, op); - RAFT_CUDA_TRY(cudaPeekAtLastError()); -} - /** * @brief Operations for all the columns or rows with a given vector. * Caution : Threads process multiple elements to speed up processing. These @@ -127,91 +55,10 @@ void matrixVectorOp(Type* out, Lambda op, cudaStream_t stream) { - IdxType stride = rowMajor ? D : N; - size_t stride_bytes = stride * sizeof(Type); - - if (AlignedAccess<16>::test(matrix, stride_bytes)) { - matrixVectorOpImpl( - out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream); - } else if (AlignedAccess<8>::test(matrix, stride_bytes)) { - matrixVectorOpImpl( - out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream); - } else if (AlignedAccess<4>::test(matrix, stride_bytes)) { - matrixVectorOpImpl( - out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream); - } else if (AlignedAccess<2>::test(matrix, stride_bytes)) { - matrixVectorOpImpl( - out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream); - } else if (AlignedAccess<1>::test(matrix, stride_bytes)) { - matrixVectorOpImpl( - out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream); - } else { - matrixVectorOpImpl( - out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream); - } -} - -///@todo: come up with a cleaner interface to support these cases in future! - -template -__global__ void matrixVectorOpKernel(Type* out, - const Type* matrix, - const Type* vector1, - const Type* vector2, - IdxType D, - IdxType N, - bool rowMajor, - bool bcastAlongRows, - Lambda op) -{ - typedef TxN_t VecType; - IdxType len = N * D; - IdxType idx = (threadIdx.x + (blockIdx.x * blockDim.x)) * VecType::Ratio; - if (idx >= len) return; - IdxType vIdx; - VecType mat, vec1, vec2; - ///@todo: yikes! use fast-int-div here. - ///@todo: shared mem for vector could help with perf - if (rowMajor && bcastAlongRows) { - vIdx = idx % D; - vec1.load(vector1, vIdx); - vec2.load(vector2, vIdx); - } else if (!rowMajor && !bcastAlongRows) { - vIdx = idx % N; - vec1.load(vector1, vIdx); - vec2.load(vector2, vIdx); - } else if (rowMajor && !bcastAlongRows) { - vIdx = idx / D; - vec1.fill(vector1[vIdx]); - vec2.fill(vector2[vIdx]); - } else { - vIdx = idx / N; - vec1.fill(vector1[vIdx]); - vec2.fill(vector2[vIdx]); - } - mat.load(matrix, idx); -#pragma unroll - for (int i = 0; i < VecType::Ratio; ++i) - mat.val.data[i] = op(mat.val.data[i], vec1.val.data[i], vec2.val.data[i]); - mat.store(out, idx); -} - -template -void matrixVectorOpImpl(Type* out, - const Type* matrix, - const Type* vec1, - const Type* vec2, - IdxType D, - IdxType N, - bool rowMajor, - bool bcastAlongRows, - Lambda op, - cudaStream_t stream) -{ - IdxType nblks = raft::ceildiv(N * D, (IdxType)TPB); - matrixVectorOpKernel - <<>>(out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op); - RAFT_CUDA_TRY(cudaPeekAtLastError()); + IdxType stride = rowMajor ? D : N; + IdxType nLines = rowMajor ? N : D; + return matrix::linewiseOp( + out, matrix, stride, nLines, rowMajor == bcastAlongRows, op, stream, vec); } /** @@ -250,28 +97,10 @@ void matrixVectorOp(Type* out, Lambda op, cudaStream_t stream) { - IdxType stride = rowMajor ? D : N; - size_t stride_bytes = stride * sizeof(Type); - - if (AlignedAccess<16>::test(matrix, stride_bytes)) { - matrixVectorOpImpl( - out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream); - } else if (AlignedAccess<8>::test(matrix, stride_bytes)) { - matrixVectorOpImpl( - out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream); - } else if (AlignedAccess<4>::test(matrix, stride_bytes)) { - matrixVectorOpImpl( - out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream); - } else if (AlignedAccess<2>::test(matrix, stride_bytes)) { - matrixVectorOpImpl( - out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream); - } else if (AlignedAccess<1>::test(matrix, stride_bytes)) { - matrixVectorOpImpl( - out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream); - } else { - matrixVectorOpImpl( - out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream); - } + IdxType stride = rowMajor ? D : N; + IdxType nLines = rowMajor ? N : D; + return matrix::linewiseOp( + out, matrix, stride, nLines, rowMajor == bcastAlongRows, op, stream, vec1, vec2); } }; // end namespace linalg diff --git a/cpp/include/raft/matrix/detail/linewise_op.cuh b/cpp/include/raft/matrix/detail/linewise_op.cuh new file mode 100644 index 0000000000..63fa872f9d --- /dev/null +++ b/cpp/include/raft/matrix/detail/linewise_op.cuh @@ -0,0 +1,546 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +namespace raft { +namespace matrix { +namespace detail { + +template +struct Linewise { + static constexpr IdxType VecElems = VecBytes / sizeof(Type); + + typedef raft::TxN_t Vec; + typedef raft::Pow2 AlignBytes; + typedef raft::Pow2 AlignElems; + typedef raft::Pow2 AlignWarp; + + /** + * Compute op(matrix_in, vec_1, vec_2, ...) where vectors are applied across the + * matrix rows (one vector element per matrix row). + * + * It's assumed that `in` and `out` are aligned to the cuda-vector-size, + * and their length is multiple of that. + * + * Block work arrangement: blocked; + * one warp works on a contiguous chunk of a matrix. Since the matrix is represented + * as a flat array, such an arangement minimizes the number of times when a single + * thread needs to reload the vector value at an index corresponding to the current + * matrix row. Ideally, a thread would load a value from a vector only once, but that + * is not possible if the vector size (= number of matrix rows) is too small or not + * aligned with the cuda-vector-size. + * + * Note about rowDiv/rowMod: + * these two represent the row/column indices in the original input matrices, before + * it was converted to (Vec::io_t*) type (which possibly involves shifting a pointer + * a bit to align to the cuda-vector-size). Thus, they are used to track the index for + * the argument vectors only (the vector pointers are not altered in any way). + * + * + * @tparam Vecs a pack of pointers to vectors (Type*) + * @param [out] out (aligned part of) the output matrix + * @param [in] in (aligned part of) the input matrix + * @param [in] in_end end of the (aligned part of the) input matrix + * @param [in] rowLen number of elements in a row (NOT the vector size) + * @param [in] rowDiv the index in the vectors (= row num in the original unaligned input matrix) + * @param [in] rowMod the index within a row in the original unaligned input matrix. + * @param [in] op the function to apply + * @param [in] vecs pointers to the argument vectors. + * + */ + template + static __device__ __forceinline__ void vectorCols(typename Vec::io_t* out, + const typename Vec::io_t* in, + const typename Vec::io_t* in_end, + const IdxType rowLen, + IdxType rowDiv, + IdxType rowMod, + Lambda op, + Vecs... vecs) noexcept + { + constexpr IdxType warpPad = (AlignWarp::Value - 1) * VecElems; + Type args[sizeof...(Vecs)]; + Vec v, w; + bool update = true; + for (; in < in_end; in += AlignWarp::Value, out += AlignWarp::Value, rowMod += warpPad) { + v.val.internal = __ldcv(in); + while (rowMod >= rowLen) { + rowMod -= rowLen; + rowDiv++; + update = true; + } + if (update) { + int l = 0; + ((args[l] = vecs[rowDiv], l++), ...); + update = false; + } +#pragma unroll VecElems + for (int k = 0; k < VecElems; k++, rowMod++) { + if (rowMod == rowLen) { + rowMod = 0; + rowDiv++; + int l = 0; + ((args[l] = vecs[rowDiv], l++), ...); + } + int l = 0; + w.val.data[k] = op(v.val.data[k], (std::ignore = vecs, args[l++])...); + } + *out = w.val.internal; + } + } + + /** + * Compute op(matrix_in, vec_1, vec_2, ...) where vectors are applied along + * matrix rows (vector and matrix indices are 1-1). + * + * It's assumed that `in` and `out` are aligned to the cuda-vector-size, + * and their length is multiple of that. + * + * Block work arrangement: striped; + * the grid size is chosen in such a way, that one thread always processes + * the same vector elements. That's why there is no need to read the + * vector arguments multiple times. + * + * @tparam Args a pack of raft::TxN_t + * @param [out] out (aligned part of) the output matrix + * @param [in] in (aligned part of) the input matrix + * @param [in] len total length of (the aligned part of) the input/output matrices + * @param [in] op the function to apply + * @param [in] args the cuda-vector-sized chunks on input vectors (raft::TxN_t) + */ + template + static __device__ __forceinline__ void vectorRows(typename Vec::io_t* out, + const typename Vec::io_t* in, + const IdxType len, + Lambda op, + Args... args) noexcept + { + Vec v; + const IdxType d = BlockSize * gridDim.x; + for (IdxType i = threadIdx.x + blockIdx.x * BlockSize; i < len; i += d) { + v.val.internal = __ldcv(in + i); +#pragma unroll VecElems + for (int k = 0; k < VecElems; k++) + v.val.data[k] = op(v.val.data[k], args.val.data[k]...); + __stwt(out + i, v.val.internal); + } + } + + /** + * The helper for `vectorRows`. Loads the `raft::TxN_t` chunk + * of a vector. Most of the time this is not aligned, so we load it thread-striped + * within a block and then use the shared memory to get a contiguous chunk. + * + * @param [in] shm a shared memory region for rearranging the data among threads + * @param [in] p pointer to a vector + * @param [in] blockOffset the offset of the current block into a vector. + * @param [in] rowLen the length of a vector. + * @return a contiguous chunk of a vector, suitable for `vectorRows`. + */ + static __device__ __forceinline__ Vec loadVec(Type* shm, + const Type* p, + const IdxType blockOffset, + const IdxType rowLen) noexcept + { + IdxType j = blockOffset + threadIdx.x; +#pragma unroll VecElems + for (int k = threadIdx.x; k < VecElems * BlockSize; k += BlockSize, j += BlockSize) { + while (j >= rowLen) + j -= rowLen; + shm[k] = p[j]; + } + __syncthreads(); + { + Vec out; + out.val.internal = reinterpret_cast(shm)[threadIdx.x]; + return out; + } + } +}; + +/** + * This kernel prepares the inputs for the `vectorCols` function where the most of the + * work happens; see `vectorCols` for details. + * + * The work arrangement is blocked; a single block works on a contiguous chunk of flattened + * matrix data and does not care about the gridDim. + * + * @param [out] out the output matrix + * @param [in] in the input matrix + * @param [in] arrOffset such an offset into the matrices that makes them aligned to the + * cuda-vector-size + * @param [in] rowLen number of elements in a row (NOT the vector size) + * @param [in] len the total length of the aligned part of the matrices + * @param [in] elemsPerThread how many elements are processed by a single thread in total + * @param [in] op the function to apply + * @param [in] vecs pointers to the argument vectors + */ +template +__global__ void __launch_bounds__(BlockSize) + matrixLinewiseVecColsMainKernel(Type* out, + const Type* in, + const IdxType arrOffset, + const IdxType rowLen, + const IdxType len, + const IdxType elemsPerThread, + Lambda op, + Vecs... vecs) +{ + typedef Linewise L; + + IdxType t = L::AlignWarp::mod(threadIdx.x); + t = arrOffset + elemsPerThread * (blockIdx.x * BlockSize + threadIdx.x - t) + t * L::VecElems; + + return L::vectorCols(reinterpret_cast(out + t), + reinterpret_cast(in + t), + reinterpret_cast( + in + min(t + elemsPerThread * L::AlignWarp::Value, len)), + rowLen, + t / rowLen, + t % rowLen, + op, + vecs...); +} + +/** + * This kernel is similar to `matrixLinewiseVecColsMainKernel`, but processes only the unaligned + * head and tail parts of the matrix. + * This kernel is always launched in just two blocks; the first block processes the head of the + * matrix, the second block processes the tail. It uses the same `vectorCols` function, but + * sets `VecElems = 1` + * + * @param [out] out the output matrix + * @param [in] in the input matrix + * @param [in] arrOffset the length of the unaligned head - such an offset into the matrices that + * makes them aligned to the `VecBytes` + * @param [in] arrTail the offset to the unaligned tail + * @param [in] rowLen number of elements in a row (NOT the vector size) + * @param [in] len the total length of the matrices (rowLen * nRows) + * @param [in] op the function to apply + * @param [in] vecs pointers to the argument vectors + */ +template +__global__ void __launch_bounds__(MaxOffset, 2) + matrixLinewiseVecColsTailKernel(Type* out, + const Type* in, + const IdxType arrOffset, + const IdxType arrTail, + const IdxType rowLen, + const IdxType len, + Lambda op, + Vecs... vecs) +{ + // Note, L::VecElems == 1 + typedef Linewise L; + IdxType threadOffset, elemsPerWarp; + if (blockIdx.x == 0) { + // first block: offset = 0, length = arrOffset + threadOffset = threadIdx.x; + elemsPerWarp = threadOffset < arrOffset; + } else { + // second block: offset = arrTail, length = len - arrTail + threadOffset = arrTail + threadIdx.x; + elemsPerWarp = threadOffset < len; + } + const IdxType rowDiv = threadOffset / rowLen; + const IdxType rowMod = threadOffset % rowLen; + return L::vectorCols( + reinterpret_cast(out + threadOffset), + reinterpret_cast(in + threadOffset), + reinterpret_cast(in + threadOffset + elemsPerWarp), + rowLen, + rowDiv, + rowMod, + op, + vecs...); +} + +/** + * This kernel prepares the inputs for the `vectorRows` function where the most of the + * work happens; see `vectorRows` for details. + * + * The work arrangement is striped; the gridDim should be selected in such a way, that + * on each iteration a thread processes the same indices along rows: + * `(gridDim.x * BlockSize * VecElems) % rowLen == 0`. + * + * @param [out] out the start of the *aligned* part of the output matrix + * @param [in] in the start of the *aligned* part of the input matrix + * @param [in] arrOffset such an offset into the matrices that makes them aligned to `VecBytes` + * @param [in] rowLen number of elements in a row (= the vector size) + * @param [in] len the total length of the aligned part of the matrices + * @param [in] op the function to apply + * @param [in] vecs pointers to the argument vectors + */ +template +__global__ void __launch_bounds__(BlockSize) + matrixLinewiseVecRowsMainKernel(Type* out, + const Type* in, + const IdxType arrOffset, + const IdxType rowLen, + const IdxType len, + Lambda op, + Vecs... vecs) +{ + typedef Linewise L; + constexpr uint workSize = L::VecElems * BlockSize; + uint workOffset = workSize; + __shared__ alignas(sizeof(Type) * L::VecElems) + Type shm[workSize * ((sizeof...(Vecs)) > 1 ? 2 : 1)]; + const IdxType blockOffset = (arrOffset + BlockSize * L::VecElems * blockIdx.x) % rowLen; + return L::vectorRows( + reinterpret_cast(out), + reinterpret_cast(in), + L::AlignElems::div(len), + op, + (workOffset ^= workSize, L::loadVec(shm + workOffset, vecs, blockOffset, rowLen))...); +} + +/** + * This kernel is similar to `matrixLinewiseVecRowsMainKernel`, but processes only the unaligned + * head and tail parts of the matrix. + * This kernel is always launched in just two blocks; the first block processes the head of the + * matrix, the second block processes the tail. It uses the same `vectorRows` function, but + * sets `VecElems = 1` + * + * @param [out] out the output matrix + * @param [in] in the input matrix + * @param [in] arrOffset the length of the unaligned head - such an offset into the matrices that + * makes them aligned to the `VecBytes` + * @param [in] arrTail the offset to the unaligned tail + * @param [in] rowLen number of elements in a row (= the vector size) + * @param [in] len the total length of the matrices (rowLen * nRows) + * @param [in] op the function to apply + * @param [in] vecs pointers to the argument vectors + */ +template +__global__ void __launch_bounds__(MaxOffset, 2) + matrixLinewiseVecRowsTailKernel(Type* out, + const Type* in, + const IdxType arrOffset, + const IdxType arrTail, + const IdxType rowLen, + const IdxType len, + Lambda op, + Vecs... vecs) +{ + // Note, L::VecElems == 1 + constexpr uint workSize = MaxOffset; + uint workOffset = workSize; + __shared__ Type shm[workSize * ((sizeof...(Vecs)) > 1 ? 2 : 1)]; + typedef Linewise L; + if (blockIdx.x == 0) { + // first block: offset = 0, length = arrOffset + L::vectorRows(reinterpret_cast(out), + reinterpret_cast(in), + arrOffset, + op, + (workOffset ^= workSize, L::loadVec(shm + workOffset, vecs, 0, rowLen))...); + } else { + // second block: offset = arrTail, length = len - arrTail + // NB: I substract MaxOffset (= blockDim.x) to get the correct indexing for block 1 + L::vectorRows( + reinterpret_cast(out + arrTail - MaxOffset), + reinterpret_cast(in + arrTail - MaxOffset), + len - arrTail + MaxOffset, + op, + (workOffset ^= workSize, L::loadVec(shm + workOffset, vecs, arrTail % rowLen, rowLen))...); + } +} + +/** Fully occupy GPU this many times for better work balancing. */ +static inline constexpr uint OptimalSmOccupancy = 16; + +/** + * Calculate the grid size to be `OptimalSmOccupancy * FullyOccupiedGPU`, where `FullyOccupiedGPU` + * is the maximum number of blocks fitting in all available SMs. + * + * @tparam BlockSize blockDim of the kernel. + * @return OptimalSmOccupancy * FullyOccupiedGPU + */ +template +inline uint getOptimalGridSize() +{ + int devId, smCount, maxBlockSize; + RAFT_CUDA_TRY(cudaGetDevice(&devId)); + RAFT_CUDA_TRY(cudaDeviceGetAttribute(&smCount, cudaDevAttrMultiProcessorCount, devId)); + RAFT_CUDA_TRY(cudaDeviceGetAttribute(&maxBlockSize, cudaDevAttrMaxThreadsPerBlock, devId)); + return OptimalSmOccupancy * static_cast(smCount * maxBlockSize / BlockSize); +} + +template +void matrixLinewiseVecCols(Type* out, + const Type* in, + const IdxType rowLen, + const IdxType nRows, + Lambda op, + cudaStream_t stream, + Vecs... vecs) +{ + typedef raft::Pow2 AlignBytes; + constexpr std::size_t VecElems = VecBytes / sizeof(Type); + const IdxType totalLen = rowLen * nRows; + const Type* alignedStart = AlignBytes::roundUp(in); + const IdxType alignedOff = IdxType(alignedStart - in); + const IdxType alignedEnd = IdxType(AlignBytes::roundDown(in + totalLen) - in); + const IdxType alignedLen = alignedEnd - alignedOff; + if (alignedLen > 0) { + constexpr dim3 bs(BlockSize, 1, 1); + // Minimum size of the grid to make the device well occupied + const uint occupy = getOptimalGridSize(); + // does not make sense to have more blocks than this + const uint maxBlocks = raft::ceildiv(uint(alignedLen), bs.x * VecElems); + const dim3 gs(min(maxBlocks, occupy), 1, 1); + // The work arrangement is blocked on the block and warp levels; + // see more details at Linewise::vectorCols. + // The value below determines how many scalar elements are processed by on thread in total. + const IdxType elemsPerThread = + raft::ceildiv(alignedLen, gs.x * VecElems * BlockSize) * VecElems; + matrixLinewiseVecColsMainKernel + <<>>(out, in, alignedOff, rowLen, alignedLen, elemsPerThread, op, vecs...); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + } + if (alignedLen < totalLen) { + // should be not smaller than the warp size for better branching + constexpr std::size_t MaxOffset = std::max(std::size_t(raft::WarpSize), VecBytes); + matrixLinewiseVecColsTailKernel + <<>>( + out, in, alignedOff, alignedEnd, rowLen, totalLen, op, vecs...); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + } +} + +template +void matrixLinewiseVecRows(Type* out, + const Type* in, + const IdxType rowLen, + const IdxType nRows, + Lambda op, + cudaStream_t stream, + Vecs... vecs) +{ + typedef raft::Pow2 AlignBytes; + constexpr std::size_t VecElems = VecBytes / sizeof(Type); + const IdxType totalLen = rowLen * nRows; + const Type* alignedStart = AlignBytes::roundUp(in); + const IdxType alignedOff = IdxType(alignedStart - in); + const IdxType alignedEnd = IdxType(AlignBytes::roundDown(in + totalLen) - in); + const IdxType alignedLen = alignedEnd - alignedOff; + if (alignedLen > 0) { + constexpr dim3 bs(BlockSize, 1, 1); + // The work arrangement is striped; + // see more details at Linewise::vectorRows. + // Below is the work amount performed by one block in one iteration. + constexpr uint block_work_size = bs.x * uint(VecElems); + /* Here I would define `grid_work_size = lcm(block_work_size, rowLen)` (Least Common Multiple) + This way, the grid spans a set of one or more rows each iteration, and, most importantly, + on every iteration each row processes the same set of indices within a row (= the same set + of vector indices). + This means, each block needs to load the values from the vector arguments only once. + Sadly, sometimes `grid_work_size > rowLen*nRows`, and sometimes grid_work_size > UINT_MAX. + That's why I don't declare it here explicitly. + Instead, I straightaway compute the + expected_grid_size = lcm(block_work_size, rowLen) / block_work_size + */ + const uint expected_grid_size = rowLen / raft::gcd(block_work_size, uint(rowLen)); + // Minimum size of the grid to make the device well occupied + const uint occupy = getOptimalGridSize(); + const dim3 gs(min( + // does not make sense to have more blocks than this + raft::ceildiv(uint(totalLen), block_work_size), + // increase the grid size to be not less than `occupy` while + // still being the multiple of `expected_grid_size` + raft::ceildiv(occupy, expected_grid_size) * expected_grid_size), + 1, + 1); + + matrixLinewiseVecRowsMainKernel + <<>>( + out + alignedOff, alignedStart, alignedOff, rowLen, alignedLen, op, vecs...); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + } + if (alignedLen < totalLen) { + // should be not smaller than the warp size for better branching + constexpr std::size_t MaxOffset = std::max(std::size_t(raft::WarpSize), VecBytes); + matrixLinewiseVecRowsTailKernel + <<>>( + out, in, alignedOff, alignedEnd, rowLen, totalLen, op, vecs...); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + } +} + +/** + * Select one of the implementations: + * a. vectors applied along/across lines + * b. recursively try different VecBytes, such that alignments of `in` and `out` + * are the same. + * + * @tparam VecBytes - size of the load/store ops in bytes. + * @tparam BlockSize - is fixed and should not affect the performance. + */ +template +struct MatrixLinewiseOp { + template + static void run(Type* out, + const Type* in, + const IdxType lineLen, + const IdxType nLines, + const bool alongLines, + Lambda op, + cudaStream_t stream, + Vecs... vecs) + { + if constexpr (VecBytes > sizeof(Type)) { + if (!raft::Pow2::areSameAlignOffsets(in, out)) + return MatrixLinewiseOp> 1), sizeof(Type)), BlockSize>::run( + out, in, lineLen, nLines, alongLines, op, stream, vecs...); + } + if (alongLines) + return matrixLinewiseVecRows( + out, in, lineLen, nLines, op, stream, vecs...); + else + return matrixLinewiseVecCols( + out, in, lineLen, nLines, op, stream, vecs...); + } +}; + +} // end namespace detail +} // end namespace matrix +} // end namespace raft diff --git a/cpp/include/raft/matrix/matrix.hpp b/cpp/include/raft/matrix/matrix.hpp index d3d98cb872..e3e2f88d14 100644 --- a/cpp/include/raft/matrix/matrix.hpp +++ b/cpp/include/raft/matrix/matrix.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021, NVIDIA CORPORATION. + * Copyright (c) 2018-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,8 +16,11 @@ #pragma once +#include "detail/linewise_op.cuh" #include "detail/matrix.cuh" +#include + namespace raft { namespace matrix { @@ -223,5 +226,48 @@ m_t getL2Norm(const raft::handle_t& handle, m_t* in, idx_t size, cudaStream_t st return detail::getL2Norm(handle, in, size, stream); } +/** + * Run a function over matrix lines (rows or columns) with a variable number + * row-vectors or column-vectors. + * The term `line` here signifies that the lines can be either columns or rows, + * depending on the matrix layout. + * What matters is if the vectors are applied along lines (indices of vectors correspond to + * indices within lines), or across lines (indices of vectors correspond to line numbers). + * + * @param [out] out result of the operation; can be same as `in`; should be aligned the same + * as `in` to allow faster vectorized memory transfers. + * @param [in] in input matrix consisting of `nLines` lines, each `lineLen`-long. + * @param [in] lineLen length of matrix line in elements (`=nCols` in row-major or `=nRows` in + * col-major) + * @param [in] nLines number of matrix lines (`=nRows` in row-major or `=nCols` in col-major) + * @param [in] alongLines whether vectors are indices along or across lines. + * @param [in] op the operation applied on each line: + * for i in [0..lineLen) and j in [0..nLines): + * out[i, j] = op(in[i, j], vec1[i], vec2[i], ... veck[i]) if alongLines = true + * out[i, j] = op(in[i, j], vec1[j], vec2[j], ... veck[j]) if alongLines = false + * where matrix indexing is row-major ([i, j] = [i + lineLen * j]). + * @param [in] stream a cuda stream for the kernels + * @param [in] vecs zero or more vectors to be passed as arguments, + * size of each vector is `alongLines ? lineLen : nLines`. + */ +template +void linewiseOp(m_t* out, + const m_t* in, + const idx_t lineLen, + const idx_t nLines, + const bool alongLines, + Lambda op, + cudaStream_t stream, + Vecs... vecs) +{ + common::nvtx::range fun_scope("linewiseOp-%c-%zu (%zu, %zu)", + alongLines ? 'l' : 'x', + sizeof...(Vecs), + size_t(lineLen), + size_t(nLines)); + detail::MatrixLinewiseOp<16, 256>::run( + out, in, lineLen, nLines, alongLines, op, stream, vecs...); +} + }; // end namespace matrix }; // end namespace raft diff --git a/cpp/include/raft/pow2_utils.cuh b/cpp/include/raft/pow2_utils.cuh index 56a3192f9f..93f81db1ac 100644 --- a/cpp/include/raft/pow2_utils.cuh +++ b/cpp/include/raft/pow2_utils.cuh @@ -35,7 +35,9 @@ struct Pow2 { static_assert(std::is_integral::value, "Value must be integral."); static_assert(Value && !(Value & Mask), "Value must be power of two."); -#define Pow2_IsRepresentableAs(I) (std::is_integral::value && Type(I(Value)) == Value) +#define Pow2_FUNC_QUALIFIER static constexpr __host__ __device__ __forceinline__ +#define Pow2_WHEN_INTEGRAL(I) std::enable_if_t +#define Pow2_IS_REPRESENTABLE_AS(I) (std::is_integral::value && Type(I(Value)) == Value) /** * Integer division by Value truncated toward zero @@ -44,7 +46,7 @@ struct Pow2 { * Invariant: `x = Value * quot(x) + rem(x)` */ template - static constexpr HDI std::enable_if_t quot(I x) noexcept + Pow2_FUNC_QUALIFIER Pow2_WHEN_INTEGRAL(I) quot(I x) noexcept { if constexpr (std::is_signed::value) return (x >> I(Log2)) + (x < 0 && (x & I(Mask))); if constexpr (std::is_unsigned::value) return x >> I(Log2); @@ -57,7 +59,7 @@ struct Pow2 { * Invariant: `x = Value * quot(x) + rem(x)`. */ template - static constexpr HDI std::enable_if_t rem(I x) noexcept + Pow2_FUNC_QUALIFIER Pow2_WHEN_INTEGRAL(I) rem(I x) noexcept { if constexpr (std::is_signed::value) return x < 0 ? -((-x) & I(Mask)) : (x & I(Mask)); if constexpr (std::is_unsigned::value) return x & I(Mask); @@ -74,7 +76,7 @@ struct Pow2 { * compared to normal C++ operators `/` and `%`. */ template - static constexpr HDI std::enable_if_t div(I x) noexcept + Pow2_FUNC_QUALIFIER Pow2_WHEN_INTEGRAL(I) div(I x) noexcept { return x >> I(Log2); } @@ -91,7 +93,7 @@ struct Pow2 { * compared to normal C++ operators `/` and `%`. */ template - static constexpr HDI std::enable_if_t mod(I x) noexcept + Pow2_FUNC_QUALIFIER Pow2_WHEN_INTEGRAL(I) mod(I x) noexcept { return x & I(Mask); } @@ -105,25 +107,25 @@ struct Pow2 { * NB: for pointers, the alignment is checked in bytes, not in elements. */ template - static constexpr HDI bool isAligned(PtrT p) noexcept + Pow2_FUNC_QUALIFIER bool isAligned(PtrT p) noexcept { Pow2_CHECK_TYPE(PtrT); - if constexpr (Pow2_IsRepresentableAs(PtrT)) return mod(p) == 0; - if constexpr (!Pow2_IsRepresentableAs(PtrT)) return mod(reinterpret_cast(p)) == 0; + if constexpr (Pow2_IS_REPRESENTABLE_AS(PtrT)) return mod(p) == 0; + if constexpr (!Pow2_IS_REPRESENTABLE_AS(PtrT)) return mod(reinterpret_cast(p)) == 0; } /** Tell whether two pointers have the same address modulo Value. */ template - static constexpr HDI bool areSameAlignOffsets(PtrT a, PtrS b) noexcept + Pow2_FUNC_QUALIFIER bool areSameAlignOffsets(PtrT a, PtrS b) noexcept { Pow2_CHECK_TYPE(PtrT); Pow2_CHECK_TYPE(PtrS); Type x, y; - if constexpr (Pow2_IsRepresentableAs(PtrT)) + if constexpr (Pow2_IS_REPRESENTABLE_AS(PtrT)) x = Type(mod(a)); else x = mod(reinterpret_cast(a)); - if constexpr (Pow2_IsRepresentableAs(PtrS)) + if constexpr (Pow2_IS_REPRESENTABLE_AS(PtrS)) y = Type(mod(b)); else y = mod(reinterpret_cast(b)); @@ -132,29 +134,31 @@ struct Pow2 { /** Get this or next Value-aligned address (in bytes) or integral. */ template - static constexpr HDI PtrT roundUp(PtrT p) noexcept + Pow2_FUNC_QUALIFIER PtrT roundUp(PtrT p) noexcept { Pow2_CHECK_TYPE(PtrT); - if constexpr (Pow2_IsRepresentableAs(PtrT)) return p + PtrT(Mask) - mod(p + PtrT(Mask)); - if constexpr (!Pow2_IsRepresentableAs(PtrT)) { + if constexpr (Pow2_IS_REPRESENTABLE_AS(PtrT)) return (p + PtrT(Mask)) & PtrT(~Mask); + if constexpr (!Pow2_IS_REPRESENTABLE_AS(PtrT)) { auto x = reinterpret_cast(p); - return reinterpret_cast(x + Mask - mod(x + Mask)); + return reinterpret_cast((x + Mask) & (~Mask)); } } /** Get this or previous Value-aligned address (in bytes) or integral. */ template - static constexpr HDI PtrT roundDown(PtrT p) noexcept + Pow2_FUNC_QUALIFIER PtrT roundDown(PtrT p) noexcept { Pow2_CHECK_TYPE(PtrT); - if constexpr (Pow2_IsRepresentableAs(PtrT)) return p - mod(p); - if constexpr (!Pow2_IsRepresentableAs(PtrT)) { + if constexpr (Pow2_IS_REPRESENTABLE_AS(PtrT)) return p & PtrT(~Mask); + if constexpr (!Pow2_IS_REPRESENTABLE_AS(PtrT)) { auto x = reinterpret_cast(p); - return reinterpret_cast(x - mod(x)); + return reinterpret_cast(x & (~Mask)); } } #undef Pow2_CHECK_TYPE -#undef Pow2_IsRepresentableAs +#undef Pow2_IS_REPRESENTABLE_AS +#undef Pow2_FUNC_QUALIFIER +#undef Pow2_WHEN_INTEGRAL }; }; // namespace raft diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index b37c671525..5bf836e8d5 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -63,6 +63,7 @@ add_executable(test_raft test/linalg/unary_op.cu test/matrix/math.cu test/matrix/matrix.cu + test/matrix/linewise_op.cu test/mr/device/buffer.cpp test/mr/host/buffer.cpp test/mst.cu diff --git a/cpp/test/matrix/linewise_op.cu b/cpp/test/matrix/linewise_op.cu new file mode 100644 index 0000000000..1cd00b8adc --- /dev/null +++ b/cpp/test/matrix/linewise_op.cu @@ -0,0 +1,277 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../linalg/matrix_vector_op.cuh" +#include "../test_utils.h" +#include +#include +#include +#include +#include +#include +#include +#include + +namespace raft { +namespace matrix { + +constexpr std::size_t PTR_PADDING = 128; + +struct LinewiseTestParams { + double tolerance; + std::size_t workSizeBytes; + uint64_t seed; + bool checkCorrectness; + int inAlignOffset; + int outAlignOffset; +}; + +template +struct LinewiseTest : public ::testing::TestWithParam { + const LinewiseTestParams params; + const raft::handle_t handle; + rmm::cuda_stream_view stream; + + LinewiseTest() + : testing::TestWithParam(), + params( + ParamsReader::read(::testing::TestWithParam::GetParam())), + handle(), + stream(handle.get_stream()) + { + } + + void runLinewiseSum( + T* out, const T* in, const I lineLen, const I nLines, const bool alongLines, const T* vec) + { + auto f = [] __device__(T a, T b) -> T { return a + b; }; + matrix::linewiseOp(out, in, lineLen, nLines, alongLines, f, stream, vec); + } + + void runLinewiseSum(T* out, + const T* in, + const I lineLen, + const I nLines, + const bool alongLines, + const T* vec1, + const T* vec2) + { + auto f = [] __device__(T a, T b, T c) -> T { return a + b + c; }; + matrix::linewiseOp(out, in, lineLen, nLines, alongLines, f, stream, vec1, vec2); + } + + rmm::device_uvector genData(size_t workSizeBytes) + { + raft::random::Rng r(params.seed); + const std::size_t workSizeElems = workSizeBytes / sizeof(T); + rmm::device_uvector blob(workSizeElems, stream); + r.uniform(blob.data(), workSizeElems, T(-1.0), T(1.0), stream); + return blob; + } + + /** + * Suggest multiple versions of matrix dimensions (n, m), such that + * + * (2 * n * m + numVectors * m + minUnused) * sizeof(T) <= workSize. + * + * This way I know I can create two matrices and numVectors vectors of size m, + * such that they fit into the allocated workSet. + */ + std::vector> suggestDimensions(I numVectors) + { + const std::size_t workSizeElems = params.workSizeBytes / sizeof(T); + std::vector> out; + const double b = double(numVectors); + const double s = double(workSizeElems) - double(PTR_PADDING * 2 * (2 + b)); + double squareN = 0.25 * (sqrt(8.0 * s + b * b) - b); + + auto solveForN = [s, b](I m) -> double { return (s - b * double(m)) / double(2 * m); }; + auto solveForM = [s, b](I n) -> double { return s / double(2 * n + b); }; + auto addIfMakesSense = [&out](double x, double y) { + if (x <= 0 || y <= 0) return; + I n = I(floor(x)); + I m = I(floor(y)); + if (n > 0 && m > 0) out.push_back(std::make_tuple(n, m)); + }; + std::vector sizes = {15, 16, 17, 256, 257, 263, 1024}; + addIfMakesSense(squareN, squareN); + for (I k : sizes) { + addIfMakesSense(solveForN(k), k); + addIfMakesSense(k, solveForM(k)); + } + + return out; + } + + std::tuple assignSafePtrs(rmm::device_uvector& blob, + I n, + I m) + { + typedef raft::Pow2 Align; + T* out = Align::roundUp(blob.data()) + params.outAlignOffset; + const T* in = + const_cast(Align::roundUp(out + n * m + PTR_PADDING)) + params.inAlignOffset; + const T* vec1 = Align::roundUp(in + n * m + PTR_PADDING); + const T* vec2 = Align::roundUp(vec1 + m + PTR_PADDING); + ASSERT(blob.data() + blob.size() >= vec2 + PTR_PADDING, + "Failed to allocate pointers: the workset is not big enough."); + return std::make_tuple(out, in, vec1, vec2); + } + + testing::AssertionResult run(std::vector>&& dims, rmm::device_uvector&& blob) + { + rmm::device_uvector blob_val(params.checkCorrectness ? blob.size() / 2 : 0, stream); + + stream.synchronize(); + cudaProfilerStart(); + testing::AssertionResult r = testing::AssertionSuccess(); + for (auto [n, m] : dims) { + if (!r) break; + auto [out, in, vec1, vec2] = assignSafePtrs(blob, n, m); + common::nvtx::range dims_scope("Dims-%zu-%zu", std::size_t(n), std::size_t(m)); + for (auto alongRows : ::testing::Bool()) { + common::nvtx::range dir_scope(alongRows ? "alongRows" : "acrossRows"); + auto lineLen = alongRows ? m : n; + auto nLines = alongRows ? n : m; + { + { + common::nvtx::range vecs_scope("one vec"); + runLinewiseSum(out, in, lineLen, nLines, alongRows, vec1); + } + if (params.checkCorrectness) { + linalg::naiveMatVec( + blob_val.data(), in, vec1, lineLen, nLines, true, alongRows, T(1), stream); + r = devArrMatch(blob_val.data(), out, n * m, CompareApprox(params.tolerance)) + << " " << (alongRows ? "alongRows" : "acrossRows") + << " with one vec; lineLen: " << lineLen << "; nLines " << nLines; + if (!r) break; + } + { + common::nvtx::range vecs_scope("two vecs"); + runLinewiseSum(out, in, lineLen, nLines, alongRows, vec1, vec2); + } + if (params.checkCorrectness) { + linalg::naiveMatVec( + blob_val.data(), in, vec1, vec2, lineLen, nLines, true, alongRows, T(1), stream); + r = devArrMatch(blob_val.data(), out, n * m, CompareApprox(params.tolerance)) + << " " << (alongRows ? "alongRows" : "acrossRows") + << " with two vecs; lineLen: " << lineLen << "; nLines " << nLines; + if (!r) break; + } + } + } + } + cudaProfilerStop(); + + return r; + } + + testing::AssertionResult run() + { + return run(suggestDimensions(2), genData(params.workSizeBytes)); + } + + testing::AssertionResult runEdgeCases() + { + std::vector sizes = {1, 2, 3, 4, 7, 16}; + std::vector> dims; + for (auto m : sizes) { + for (auto n : sizes) { + dims.push_back(std::make_tuple(n, m)); + dims.push_back(std::make_tuple(m, n)); + } + } + + return run(std::move(dims), genData(1024 * 1024)); + } +}; + +#define TEST_IT(fun, TestClass, ElemType, IndexType) \ + typedef LinewiseTest TestClass##_##ElemType##_##IndexType; \ + TEST_P(TestClass##_##ElemType##_##IndexType, fun) { ASSERT_TRUE(fun()); } \ + INSTANTIATE_TEST_SUITE_P(LinewiseOp, TestClass##_##ElemType##_##IndexType, TestClass##Params) + +auto TinyParams = ::testing::Combine(::testing::Values(0, 1, 2, 4), ::testing::Values(0, 1, 2, 3)); + +struct Tiny { + typedef std::tuple Params; + static LinewiseTestParams read(Params ps) + { + return {/** .tolerance */ 0.00001, + /** .workSizeBytes */ 0 /* not used anyway */, + /** .seed */ 42ULL, + /** .checkCorrectness */ true, + /** .inAlignOffset */ std::get<0>(ps), + /** .outAlignOffset */ std::get<1>(ps)}; + } +}; + +auto MegabyteParams = TinyParams; + +struct Megabyte { + typedef std::tuple Params; + static LinewiseTestParams read(Params ps) + { + return {/** .tolerance */ 0.00001, + /** .workSizeBytes */ 1024 * 1024, + /** .seed */ 42ULL, + /** .checkCorrectness */ true, + /** .inAlignOffset */ std::get<0>(ps), + /** .outAlignOffset */ std::get<1>(ps)}; + } +}; + +auto GigabyteParams = ::testing::Combine(::testing::Values(0, 1, 2), ::testing::Values(0, 1, 2)); + +struct Gigabyte { + typedef std::tuple Params; + static LinewiseTestParams read(Params ps) + { + return {/** .tolerance */ 0.00001, + /** .workSizeBytes */ 1024 * 1024 * 1024, + /** .seed */ 42ULL, + /** .checkCorrectness */ false, + /** .inAlignOffset */ std::get<0>(ps), + /** .outAlignOffset */ std::get<1>(ps)}; + } +}; + +auto TenGigsParams = GigabyteParams; + +struct TenGigs { + typedef std::tuple Params; + static LinewiseTestParams read(Params ps) + { + return {/** .tolerance */ 0.00001, + /** .workSizeBytes */ 10ULL * 1024ULL * 1024ULL * 1024ULL, + /** .seed */ 42ULL, + /** .checkCorrectness */ false, + /** .inAlignOffset */ std::get<0>(ps), + /** .outAlignOffset */ std::get<1>(ps)}; + } +}; + +TEST_IT(runEdgeCases, Tiny, float, int); +TEST_IT(runEdgeCases, Tiny, double, int); +TEST_IT(run, Megabyte, float, int); +TEST_IT(run, Megabyte, double, int); +TEST_IT(run, Gigabyte, float, int); +TEST_IT(run, Gigabyte, double, int); +TEST_IT(run, TenGigs, float, uint64_t); +TEST_IT(run, TenGigs, double, uint64_t); + +} // namespace matrix +} // end namespace raft