From af05bcce28dcfa764121879fe6e6dcb374be4d69 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Malte=20F=C3=B6rster?= <97973773+mfoerste4@users.noreply.github.com> Date: Thu, 27 Oct 2022 04:42:05 +0200 Subject: [PATCH] Adding padded layout 'layout_padded_general' (#725) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is a different approach / followup PR of https://github.com/rapidsai/raft/pull/663 for issue https://github.com/rapidsai/raft/issues/497. I implemented a `layout_padded_general` within raft to statically enforce padding on mdpsan accesses. * The layout has template parameters for `ValueType`, `StorageOrder `(default `row_major_t`), and `ByteAlignment `(default 128) * in order to *not* require changes upstream I skipped `submdspan `functionality right now. I have a branch on a mdspan fork where I tested this though (https://github.com/mfoerste4/mdspan/tree/layout_padded). Authors: - Malte Förster (https://github.com/mfoerste4) Approvers: - Artem M. Chirkin (https://github.com/achirkin) - Corey J. Nolet (https://github.com/cjnolet) URL: https://github.com/rapidsai/raft/pull/725 --- cpp/include/raft/core/device_mdspan.hpp | 45 + cpp/include/raft/core/host_mdspan.hpp | 45 + cpp/include/raft/core/mdspan.hpp | 35 + .../raft/matrix/detail/linewise_op.cuh | 233 ++++++ cpp/include/raft/matrix/linewise_op.cuh | 39 +- .../__p0009_bits/aligned_accessor.hpp | 189 +++++ .../__p0009_bits/layout_padded.hpp | 787 ++++++++++++++++++ .../mdspan/include/experimental/mdspan | 2 + cpp/test/matrix/linewise_op.cu | 157 +++- cpp/test/mdarray.cu | 420 ++++++++++ 10 files changed, 1942 insertions(+), 10 deletions(-) create mode 100644 cpp/include/raft/thirdparty/mdspan/include/experimental/__p0009_bits/aligned_accessor.hpp create mode 100644 cpp/include/raft/thirdparty/mdspan/include/experimental/__p0009_bits/layout_padded.hpp diff --git a/cpp/include/raft/core/device_mdspan.hpp b/cpp/include/raft/core/device_mdspan.hpp index ffbbe43d01..7257b65f58 100644 --- a/cpp/include/raft/core/device_mdspan.hpp +++ b/cpp/include/raft/core/device_mdspan.hpp @@ -161,6 +161,51 @@ template using device_matrix_view = device_mdspan, LayoutPolicy>; +/** + * @brief Shorthand for 128 byte aligned device matrix view. + * @tparam ElementType the data type of the matrix elements + * @tparam IndexType the index type of the extents + * @tparam LayoutPolicy must be of type layout_{left/right}_padded + */ +template , + typename = enable_if_layout_padded> +using device_aligned_matrix_view = + device_mdspan, + LayoutPolicy, + std::experimental::aligned_accessor>; + +/** + * @brief Create a 2-dim 128 byte aligned mdspan instance for device pointer. It's + * expected that the given layout policy match the layout of the underlying + * pointer. 
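+ *
+ * A minimal usage sketch (illustrative): `d_ptr` is assumed to be a `float*`
+ * to device memory that is 128 byte aligned and padded to the matching row
+ * pitch, with `n_rows`/`n_cols` of type `int`.
+ * @code
+ *   // LayoutPolicy defaults to raft::layout_right_padded<float> (row-major,
+ *   // each row padded to a multiple of 128 bytes).
+ *   auto padded_view = raft::make_device_aligned_matrix_view(d_ptr, n_rows, n_cols);
+ *   // padded_view(i, j) addresses row i, column j; the per-row padding
+ *   // elements are not part of the extents.
+ * @endcode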
+ * @tparam ElementType the data type of the matrix elements + * @tparam LayoutPolicy must be of type layout_{left/right}_padded + * @tparam IndexType the index type of the extents + * @param[in] ptr on device to wrap + * @param[in] n_rows number of rows in pointer + * @param[in] n_cols number of columns in pointer + */ +template > +auto make_device_aligned_matrix_view(ElementType* ptr, IndexType n_rows, IndexType n_cols) +{ + using data_handle_type = + typename std::experimental::aligned_accessor::data_handle_type; + static_assert(std::is_same>::value || + std::is_same>::value); + assert(ptr == alignTo(ptr, detail::alignment::value)); + + data_handle_type aligned_pointer = ptr; + + matrix_extent extents{n_rows, n_cols}; + return device_aligned_matrix_view{aligned_pointer, extents}; +} + /** * @brief Create a raft::managed_mdspan * @tparam ElementType the data type of the matrix elements diff --git a/cpp/include/raft/core/host_mdspan.hpp b/cpp/include/raft/core/host_mdspan.hpp index 3e76dbb9ce..961a7a7ccb 100644 --- a/cpp/include/raft/core/host_mdspan.hpp +++ b/cpp/include/raft/core/host_mdspan.hpp @@ -110,6 +110,51 @@ template using host_matrix_view = host_mdspan, LayoutPolicy>; +/** + * @brief Shorthand for 128 byte aligned host matrix view. + * @tparam ElementType the data type of the matrix elements + * @tparam IndexType the index type of the extents + * @tparam LayoutPolicy must be of type layout_{left/right}_padded + */ +template , + typename = enable_if_layout_padded> +using host_aligned_matrix_view = + host_mdspan, + LayoutPolicy, + std::experimental::aligned_accessor>; + +/** + * @brief Create a 2-dim 128 byte aligned mdspan instance for host pointer. It's + * expected that the given layout policy match the layout of the underlying + * pointer. + * @tparam ElementType the data type of the matrix elements + * @tparam LayoutPolicy must be of type layout_{left/right}_padded + * @tparam IndexType the index type of the extents + * @param[in] ptr on host to wrap + * @param[in] n_rows number of rows in pointer + * @param[in] n_cols number of columns in pointer + */ +template > +auto make_host_aligned_matrix_view(ElementType* ptr, IndexType n_rows, IndexType n_cols) +{ + using data_handle_type = + typename std::experimental::aligned_accessor::data_handle_type; + + static_assert(std::is_same>::value || + std::is_same>::value); + assert(ptr == alignTo(ptr, detail::alignment::value)); + data_handle_type aligned_pointer = ptr; + + matrix_extent extents{n_rows, n_cols}; + return host_aligned_matrix_view{aligned_pointer, extents}; +} + /** * @brief Create a 0-dim (scalar) mdspan instance for host value. 
* diff --git a/cpp/include/raft/core/mdspan.hpp b/cpp/include/raft/core/mdspan.hpp index 1b98a7a937..1faac44cc8 100644 --- a/cpp/include/raft/core/mdspan.hpp +++ b/cpp/include/raft/core/mdspan.hpp @@ -32,6 +32,40 @@ template > using mdspan = std::experimental::mdspan; +namespace detail { + +// keeping ByteAlignment as optional to allow testing +template +struct padding { + static_assert(std::is_same, ValueType>::value, + "std::experimental::padding ValueType has to be provided without " + "const or volatile specifiers."); + static_assert(ByteAlignment % sizeof(ValueType) == 0 || sizeof(ValueType) % ByteAlignment == 0, + "std::experimental::padding sizeof(ValueType) has to be multiple or " + "divider of ByteAlignment."); + static constexpr size_t value = std::max(ByteAlignment / sizeof(ValueType), 1ul); +}; + +// alignment fixed to 128 bytes +struct alignment { + static constexpr size_t value = 128; +}; + +} // namespace detail + +template +using layout_right_padded = std::experimental::layout_right_padded< + detail::padding>>::value>; + +template +using layout_left_padded = std::experimental::layout_left_padded< + detail::padding>>::value>; + +template +using enable_if_layout_padded = + std::enable_if_t>::value || + std::is_same>::value>; + /** * Ensure all types listed in the parameter pack `Extents` are integral types. * Usage: @@ -254,4 +288,5 @@ RAFT_INLINE_FUNCTION auto unravel_index(Idx idx, return unravel_index_impl(static_cast(idx), shape); } } + } // namespace raft diff --git a/cpp/include/raft/matrix/detail/linewise_op.cuh b/cpp/include/raft/matrix/detail/linewise_op.cuh index 15f5204382..8180b88c8a 100644 --- a/cpp/include/raft/matrix/detail/linewise_op.cuh +++ b/cpp/include/raft/matrix/detail/linewise_op.cuh @@ -16,6 +16,8 @@ #pragma once +#include +#include #include #include #include @@ -176,6 +178,37 @@ struct Linewise { return out; } } + + /** + * @brief Same as loadVec, but padds data with Ones + * + * @param shm + * @param p + * @param blockOffset + * @param rowLen + * @param rowLenPadded + * @return a contiguous chunk of a vector, suitable for `vectorRows`. + */ + static __device__ __forceinline__ Vec loadVecPadded(Type* shm, + const Type* p, + const IdxType blockOffset, + const IdxType rowLen, + const IdxType rowLenPadded) noexcept + { + IdxType j = blockOffset + threadIdx.x; +#pragma unroll VecElems + for (int k = threadIdx.x; k < VecElems * BlockSize; k += BlockSize, j += BlockSize) { + while (j >= rowLenPadded) + j -= rowLenPadded; + shm[k] = j < rowLen ? p[j] : Type(1); + } + __syncthreads(); + { + Vec out; + *out.vectorized_data() = reinterpret_cast(shm)[threadIdx.x]; + return out; + } + } }; /** @@ -325,6 +358,48 @@ __global__ void __launch_bounds__(BlockSize) (workOffset ^= workSize, L::loadVec(shm + workOffset, vecs, blockOffset, rowLen))...); } +/** + * Simplified version of `matrixLinewiseVecRowsMainKernel` for use with padded data. + * Data is required to be aligned and padded. 
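+ *
+ * Note (derived from `loadVecPadded` above): vector elements that fall into
+ * the padding tail of a row (indices in [rowLen, rowLenPadded)) are
+ * substituted by `Type(1)`, so `op` also runs over the padded positions of
+ * the matrix; the values written there lie outside the unpadded extents and
+ * are never read through the aligned matrix view.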
+ * + * @param [out] out the start of the *aligned* part of the output matrix + * @param [in] in the start of the *aligned* part of the input matrix + * @param [in] arrOffset such an offset into the matrices that makes them aligned to `VecBytes` + * @param [in] rowLen number of elements in a row (= the vector size) + * @param [in] len the total length of the aligned part of the matrices + * @param [in] op the function to apply + * @param [in] vecs pointers to the argument vectors + */ +template +__global__ void __launch_bounds__(BlockSize) + matrixLinewiseVecRowsSpanKernel(Type* out, + const Type* in, + const IdxType rowLen, + const IdxType rowLenPadded, + const IdxType lenPadded, + Lambda op, + Vecs... vecs) +{ + typedef Linewise L; + constexpr uint workSize = L::VecElems * BlockSize; + uint workOffset = workSize; + __shared__ __align__(sizeof(Type) * L::VecElems) + Type shm[workSize * ((sizeof...(Vecs)) > 1 ? 2 : 1)]; + const IdxType blockOffset = (BlockSize * L::VecElems * blockIdx.x) % rowLenPadded; + return L::vectorRows( + reinterpret_cast(out), + reinterpret_cast(in), + L::AlignElems::div(lenPadded), + op, + (workOffset ^= workSize, + L::loadVecPadded(shm + workOffset, vecs, blockOffset, rowLen, rowLenPadded))...); +} + /** * This kernel is similar to `matrixLinewiseVecRowsMainKernel`, but processes only the unaligned * head and tail parts of the matrix. @@ -444,6 +519,59 @@ void matrixLinewiseVecCols(Type* out, } } +/** + * input/output data is expected to be aligned and padded + * we simply extend the operation over the padded elements to be fully aligned + */ +template +void matrixLinewiseVecColsSpan( + raft::device_aligned_matrix_view out, + raft::device_aligned_matrix_view in, + const IdxType rowLen, + const IdxType nRows, + Lambda op, + cudaStream_t stream, + Vecs... vecs) +{ + typedef raft::Pow2 AlignBytes; + constexpr std::size_t VecElems = VecBytes / sizeof(Type); + + typedef raft::Pow2::padding> AlignPadding; + + const uint paddedRowLen = AlignPadding::roundUp(rowLen); + const IdxType alignedLen = paddedRowLen * nRows; + + if (rowLen * nRows > 0) { + constexpr dim3 bs(BlockSize, 1, 1); + // Minimum size of the grid to make the device well occupied + const uint occupy = getOptimalGridSize(); + // does not make sense to have more blocks than this + const uint maxBlocks = raft::ceildiv(uint(alignedLen), bs.x * VecElems); + const dim3 gs(std::min(maxBlocks, occupy), 1, 1); + // The work arrangement is blocked on the block and warp levels; + // see more details at Linewise::vectorCols. + // The value below determines how many scalar elements are processed by on thread in total. + const IdxType elemsPerThread = + raft::ceildiv(alignedLen, gs.x * VecElems * BlockSize) * VecElems; + matrixLinewiseVecColsMainKernel + <<>>(out.data_handle(), + in.data_handle(), + 0, + paddedRowLen, + alignedLen, + elemsPerThread, + op, + vecs...); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + } +} + template +void matrixLinewiseVecRowsSpan( + raft::device_aligned_matrix_view out, + raft::device_aligned_matrix_view in, + const IdxType rowLen, + const IdxType nRows, + Lambda op, + cudaStream_t stream, + Vecs... 
vecs) +{ + constexpr std::size_t VecElems = VecBytes / sizeof(Type); + typedef raft::Pow2 AlignBytes; + + typedef raft::Pow2::padding> AlignPadding; + + const uint paddedRowLen = AlignPadding::roundUp(rowLen); + const IdxType alignedLen = paddedRowLen * nRows; + + if (rowLen * nRows > 0) { + constexpr dim3 bs(BlockSize, 1, 1); + // The work arrangement is striped; + // see more details at Linewise::vectorRows. + // Below is the work amount performed by one block in one iteration. + constexpr uint block_work_size = bs.x * uint(VecElems); + /* Here I would define `grid_work_size = lcm(block_work_size, rowLen)` (Least Common Multiple) + This way, the grid spans a set of one or more rows each iteration, and, most importantly, + on every iteration each row processes the same set of indices within a row (= the same set + of vector indices). + This means, each block needs to load the values from the vector arguments only once. + Sadly, sometimes `grid_work_size > rowLen*nRows`, and sometimes grid_work_size > UINT_MAX. + That's why I don't declare it here explicitly. + Instead, I straightaway compute the + expected_grid_size = lcm(block_work_size, rowLen) / block_work_size + */ + const uint expected_grid_size = paddedRowLen / raft::gcd(block_work_size, uint(paddedRowLen)); + // Minimum size of the grid to make the device well occupied + const uint occupy = getOptimalGridSize(); + const dim3 gs(std::min( + // does not make sense to have more blocks than this + raft::ceildiv(uint(alignedLen), block_work_size), + // increase the grid size to be not less than `occupy` while + // still being the multiple of `expected_grid_size` + raft::ceildiv(occupy, expected_grid_size) * expected_grid_size), + 1, + 1); + + matrixLinewiseVecRowsSpanKernel + <<>>( + out.data_handle(), in.data_handle(), rowLen, paddedRowLen, alignedLen, op, vecs...); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + } +} + /** * Select one of the implementations: * a. vectors applied along/across lines @@ -541,6 +733,47 @@ struct MatrixLinewiseOp { return matrixLinewiseVecCols( out, in, lineLen, nLines, op, stream, vecs...); } + + template + static void runPadded(raft::device_aligned_matrix_view out, + raft::device_aligned_matrix_view in, + const IdxType lineLen, + const IdxType nLines, + const bool alongLines, + Lambda op, + cudaStream_t stream, + Vecs... vecs) + { + constexpr auto is_rowmajor = std::is_same_v>; + constexpr auto is_colmajor = std::is_same_v>; + + static_assert(is_rowmajor || is_colmajor, + "layout for in and out must be either padded row or col major"); + + // also statically assert padded matrix alignment == 2^i*VecBytes + assert(raft::Pow2::areSameAlignOffsets(in, out)); + + if (alongLines) + return matrixLinewiseVecRowsSpan(out, in, lineLen, nLines, op, stream, vecs...); + else + return matrixLinewiseVecColsSpan(out, in, lineLen, nLines, op, stream, vecs...); + } }; } // end namespace detail diff --git a/cpp/include/raft/matrix/linewise_op.cuh b/cpp/include/raft/matrix/linewise_op.cuh index 6b383b14f5..77f70239ea 100644 --- a/cpp/include/raft/matrix/linewise_op.cuh +++ b/cpp/include/raft/matrix/linewise_op.cuh @@ -42,9 +42,12 @@ namespace raft::matrix { * @param [in] alongLines whether vectors are indices along or across lines. * @param [in] op the operation applied on each line: * for i in [0..lineLen) and j in [0..nLines): + * out[j, i] = op(in[j, i], vec1[i], vec2[i], ... veck[i]) if alongLines = true + * out[j, i] = op(in[j, i], vec1[j], vec2[j], ... 
veck[j]) if alongLines = false + * where matrix indexing is row-major ([j, i] = [i + lineLen * j]). * out[i, j] = op(in[i, j], vec1[i], vec2[i], ... veck[i]) if alongLines = true * out[i, j] = op(in[i, j], vec1[j], vec2[j], ... veck[j]) if alongLines = false - * where matrix indexing is row-major ([i, j] = [i + lineLen * j]). + * where matrix indexing is col-major ([i, j] = [i + lineLen * j]). * @param [in] vecs zero or more vectors to be passed as arguments, * size of each vector is `alongLines ? lineLen : nLines`. */ @@ -67,8 +70,8 @@ void linewise_op(const raft::handle_t& handle, static_assert(is_rowmajor || is_colmajor, "layout for in and out must be either row or col major"); - const idx_t lineLen = is_rowmajor ? in.extent(0) : in.extent(1); - const idx_t nLines = is_rowmajor ? in.extent(1) : in.extent(0); + const idx_t nLines = is_rowmajor ? in.extent(0) : in.extent(1); + const idx_t lineLen = is_rowmajor ? in.extent(1) : in.extent(0); RAFT_EXPECTS(out.extent(0) == in.extent(0) && out.extent(1) == in.extent(1), "Input and output must have the same shape."); @@ -82,4 +85,34 @@ void linewise_op(const raft::handle_t& handle, handle.get_stream(), vecs.data_handle()...); } + +template > +void linewise_op(const raft::handle_t& handle, + raft::device_aligned_matrix_view in, + raft::device_aligned_matrix_view out, + const bool alongLines, + Lambda op, + vec_t... vecs) +{ + constexpr auto is_rowmajor = std::is_same_v>; + constexpr auto is_colmajor = std::is_same_v>; + + static_assert(is_rowmajor || is_colmajor, + "layout for in and out must be either padded row or col major"); + + const idx_t nLines = is_rowmajor ? in.extent(0) : in.extent(1); + const idx_t lineLen = is_rowmajor ? in.extent(1) : in.extent(0); + + RAFT_EXPECTS(out.extent(0) == in.extent(0) && out.extent(1) == in.extent(1), + "Input and output must have the same shape."); + + detail::MatrixLinewiseOp<16, 256>::runPadded( + out, in, lineLen, nLines, alongLines, op, handle.get_stream(), vecs.data_handle()...); +} + } // namespace raft::matrix diff --git a/cpp/include/raft/thirdparty/mdspan/include/experimental/__p0009_bits/aligned_accessor.hpp b/cpp/include/raft/thirdparty/mdspan/include/experimental/__p0009_bits/aligned_accessor.hpp new file mode 100644 index 0000000000..67356785c0 --- /dev/null +++ b/cpp/include/raft/thirdparty/mdspan/include/experimental/__p0009_bits/aligned_accessor.hpp @@ -0,0 +1,189 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2019) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + + +// NOTE: This code is prematurely taken from an example based on +// https://github.com/kokkos/mdspan/pull/176 + +#pragma once + +#include "macros.hpp" +#include "trait_backports.hpp" +#include "default_accessor.hpp" +#include "extents.hpp" +#include +#include +#include + +namespace std { +namespace experimental { + +namespace stdex = std::experimental; + + +// Prefer std::assume_aligned if available, as it is in the C++ Standard. +// Otherwise, use a compiler-specific equivalent if available. + +// NOTE (mfh 2022/08/08) BYTE_ALIGNMENT must be unsigned and a power of 2. +#if defined(__cpp_lib_assume_aligned) +# define _MDSPAN_ASSUME_ALIGNED( ELEMENT_TYPE, POINTER, BYTE_ALIGNMENT ) (std::assume_aligned< BYTE_ALIGNMENT >( POINTER )) + constexpr char assume_aligned_method[] = "std::assume_aligned"; +#elif defined(__ICL) +# define _MDSPAN_ASSUME_ALIGNED( ELEMENT_TYPE, POINTER, BYTE_ALIGNMENT ) POINTER + constexpr char assume_aligned_method[] = "(none)"; +#elif defined(__ICC) +# define _MDSPAN_ASSUME_ALIGNED( ELEMENT_TYPE, POINTER, BYTE_ALIGNMENT ) POINTER + constexpr char assume_aligned_method[] = "(none)"; +#elif defined(__clang__) +# define _MDSPAN_ASSUME_ALIGNED( ELEMENT_TYPE, POINTER, BYTE_ALIGNMENT ) POINTER + constexpr char assume_aligned_method[] = "(none)"; +#elif defined(__GNUC__) + // __builtin_assume_aligned returns void* +# define _MDSPAN_ASSUME_ALIGNED( ELEMENT_TYPE, POINTER, BYTE_ALIGNMENT ) reinterpret_cast< ELEMENT_TYPE* >(__builtin_assume_aligned( POINTER, BYTE_ALIGNMENT )) + constexpr char assume_aligned_method[] = "__builtin_assume_aligned"; +#else +# define _MDSPAN_ASSUME_ALIGNED( ELEMENT_TYPE, POINTER, BYTE_ALIGNMENT ) POINTER + constexpr char assume_aligned_method[] = "(none)"; +#endif + +// Some compilers other than Clang or GCC like to define __clang__ or __GNUC__. +// Thus, we order the tests from most to least specific. 
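+// (For example, the classic Intel compilers also define __GNUC__, so the
+// __ICL / __ICC branches must be tested before the generic GCC/Clang ones.)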
+#if defined(__ICL) +# define _MDSPAN_ALIGN_VALUE_ATTRIBUTE( BYTE_ALIGNMENT ) __declspec(align_value( BYTE_ALIGNMENT )); + constexpr char align_attribute_method[] = "__declspec(align_value(BYTE_ALIGNMENT))"; +#elif defined(__ICC) +# define _MDSPAN_ALIGN_VALUE_ATTRIBUTE( BYTE_ALIGNMENT ) __attribute__((align_value( BYTE_ALIGNMENT ))); + constexpr char align_attribute_method[] = "__attribute__((align_value(BYTE_ALIGNMENT)))"; +#elif defined(__clang__) +# define _MDSPAN_ALIGN_VALUE_ATTRIBUTE( BYTE_ALIGNMENT ) __attribute__((align_value( BYTE_ALIGNMENT ))); + constexpr char align_attribute_method[] = "__attribute__((align_value(BYTE_ALIGNMENT)))"; +#else +# define _MDSPAN_ALIGN_VALUE_ATTRIBUTE( BYTE_ALIGNMENT ) + constexpr char align_attribute_method[] = "(none)"; +#endif + +constexpr bool +is_nonzero_power_of_two(const std::size_t x) +{ +// Just checking __cpp_lib_int_pow2 isn't enough for some GCC versions. +// The header exists, but std::has_single_bit does not. +#if defined(__cpp_lib_int_pow2) && __cplusplus >= 202002L + return std::has_single_bit(x); +#else + return x != 0 && (x & (x - 1)) == 0; +#endif +} + +template +constexpr bool +valid_byte_alignment(const std::size_t byte_alignment) +{ + return is_nonzero_power_of_two(byte_alignment) && byte_alignment >= alignof(ElementType); +} + +// We define aligned_pointer_t through a struct +// so we can check whether the byte alignment is valid. +// This makes it impossible to use the alias +// with an invalid byte alignment. +template +struct aligned_pointer { + static_assert(valid_byte_alignment(byte_alignment), + "byte_alignment must be a power of two no less than " + "the minimum required alignment of ElementType."); + using type = ElementType* _MDSPAN_ALIGN_VALUE_ATTRIBUTE( byte_alignment ); +}; + + +template +using aligned_pointer_t = typename aligned_pointer::type; + +template +aligned_pointer_t +bless(ElementType* ptr, std::integral_constant /* ba */ ) +{ + return _MDSPAN_ASSUME_ALIGNED( ElementType, ptr, byte_alignment ); +} + + +template +struct aligned_accessor { + using offset_policy = stdex::default_accessor; + using element_type = ElementType; + using reference = ElementType&; + using data_handle_type = aligned_pointer_t; + + constexpr aligned_accessor() noexcept = default; + + MDSPAN_TEMPLATE_REQUIRES( + class OtherElementType, + std::size_t other_byte_alignment, + /* requires */ (std::is_convertible::value && other_byte_alignment == byte_alignment) + ) + constexpr aligned_accessor(aligned_accessor) noexcept {} + + constexpr reference access(data_handle_type p, size_t i) const noexcept { + // This may declare alignment twice, depending on + // if we have an attribute for marking pointer types. + return _MDSPAN_ASSUME_ALIGNED( ElementType, p, byte_alignment )[i]; + } + + constexpr typename offset_policy::data_handle_type + offset(data_handle_type p, size_t i) const noexcept { + return p + i; + } +}; + +template +struct delete_raw { + void operator()(ElementType* p) const { + if (p != nullptr) { + // All the aligned allocation methods below go with std::free. + // If we implement a new method that uses a different + // deallocation function, that function would go here. 
+ std::free(p); + } + } +}; + +} // end namespace experimental +} // end namespace std diff --git a/cpp/include/raft/thirdparty/mdspan/include/experimental/__p0009_bits/layout_padded.hpp b/cpp/include/raft/thirdparty/mdspan/include/experimental/__p0009_bits/layout_padded.hpp new file mode 100644 index 0000000000..cd9c9c19bf --- /dev/null +++ b/cpp/include/raft/thirdparty/mdspan/include/experimental/__p0009_bits/layout_padded.hpp @@ -0,0 +1,787 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2019) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + + +// NOTE: This code is prematurely taken from https://github.com/kokkos/mdspan/pull/180 +// and matches requirements described in https://github.com/ORNL/cpp-proposals-pub/pull/296 +// Some parts (as submdspan integration) are missing +// EDIT: the meaning of the template argument 'padding_stride' was adjusted from a +// fixed stride to a padding alignment, allowing dimensions > padding_stride to be padded +// to multiples of 'padding_stride' + +#pragma once + +#include "macros.hpp" +#include "trait_backports.hpp" +#include "extents.hpp" +#include "layout_left.hpp" +#include "layout_right.hpp" +#include +#include +#include + +namespace std { +namespace experimental { + +namespace stdex = std::experimental; + +namespace details { + + // offset_index_sequence idea comes from "offset_sequence" here: + // https://devblogs.microsoft.com/oldnewthing/20200625-00/?p=103903 + // + // offset_index_sequence adds N to each element of the given IndexSequence. + // We can't just template on the parameter pack of indices directly; + // the pack needs to be contained in some type. 
+ // We choose index_sequence because it stores no run-time data. + template struct offset_index_sequence; + + template + struct offset_index_sequence> + { + using type = std::index_sequence<(Indices + N)...>; + }; + + template + using offset_index_sequence_t = typename offset_index_sequence::type; + + static_assert(std::is_same< + offset_index_sequence_t<3, std::make_index_sequence<4>>, + std::index_sequence<3, 4, 5, 6>>::value, + "offset_index_sequence defined incorrectly." ); + + // iota_index_sequence defines the half-open sequence + // begin, begin+1, begin+2, ..., end-1. + // If end == begin, then the sequence is empty (we permit this). + // + // Defining the struct first, rather than going straight to the type alias, + // lets us check the template arguments. + template + struct iota_index_sequence { + static_assert(end >= begin, "end must be >= begin."); + using type = + offset_index_sequence_t< begin, std::make_index_sequence >; + }; + + // iota_index_sequence_t is like make_index_sequence, + // except that it starts with begin instead of 0. + template + using iota_index_sequence_t = typename iota_index_sequence::type; + + static_assert(std::is_same< + iota_index_sequence_t<3, 6>, + std::index_sequence<3, 4, 5>>::value, + "iota_index_sequence defined incorrectly." ); + + static_assert(std::is_same< + iota_index_sequence_t<3, 3>, + std::index_sequence<>>::value, + "iota_index_sequence defined incorrectly." ); + + static_assert(std::is_same< + iota_index_sequence_t<3, 4>, + std::index_sequence<3>>::value, + "iota_index_sequence defined incorrectly." ); + + template + constexpr IndexType ceildiv(IndexType a, IndexType b) + { + return (a + b - 1) / b; + } + + template + constexpr IndexType alignTo(IndexType a, IndexType b) + { + return ceildiv(a, b) * b; + } + +} // namespace details + +// layout_padded_left implementation + +namespace details { + + + // The *_helper functions work around not having C++20 + // templated lambdas: []{} . + + // The third argument should always be + // iota_index_sequence_t<1, ReturnExtents::rank()>. + template + MDSPAN_INLINE_FUNCTION constexpr ReturnExtents + layout_left_extents_helper(const stdex::extents& unpadded_extent, + const InnerExtents& inner_extents, + std::index_sequence) + { + static_assert(sizeof...(TrailingIndices) + 1 == ReturnExtents::rank(), + "sizeof...(TrailingIndices) + 1 != ReturnExtents::rank()"); + static_assert(InnerExtents::rank() == ReturnExtents::rank(), + "InnerExtents::rank() != ReturnExtents::rank()"); + using index_type = typename ReturnExtents::index_type; + return ReturnExtents{ + unpadded_extent.extent(0), + index_type(inner_extents.extent(TrailingIndices))... + }; + } + + // The third argument should always be + // iota_index_sequence_t<0, ReturnExtents::rank() - 1>. 
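+  // It reassembles the user-visible extents: the leading rank()-1 extents are
+  // taken from the inner (padded) mapping unchanged, while the trailing,
+  // padded extent is replaced by the stored unpadded extent.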
+ template + MDSPAN_INLINE_FUNCTION constexpr ReturnExtents + layout_right_extents_helper(const InnerExtents& inner_extents, + const stdex::extents& unpadded_extent, + std::index_sequence) + { + static_assert(sizeof...(LeadingIndices) + 1 == ReturnExtents::rank(), + "sizeof...(LeadingIndices) + 1 != ReturnExtents::rank()"); + static_assert(InnerExtents::rank() == ReturnExtents::rank(), + "InnerExtents::rank() != ReturnExtents::rank()"); + using index_type = typename ReturnExtents::index_type; + return ReturnExtents{ + index_type(inner_extents.extent(LeadingIndices))..., + unpadded_extent.extent(0) + }; + } + + template + MDSPAN_INLINE_FUNCTION constexpr ReturnExtents + layout_left_extents(const stdex::extents& unpadded_extent, + const stdex::extents& inner_extents) + { + return layout_left_extents_helper( + unpadded_extent, + inner_extents, + details::iota_index_sequence_t<1, ReturnExtents::rank()>{} + ); + } + + // Rank-0 unpadded_extent means rank-0 input, + // but the latter turns out not to matter here. + + template + MDSPAN_INLINE_FUNCTION constexpr ReturnExtents + layout_left_extents(const stdex::extents& /* unpadded_extent */ , + const stdex::extents& inner_extents) + { + return inner_extents; + } + + template + MDSPAN_INLINE_FUNCTION constexpr ReturnExtents + layout_right_extents(const stdex::extents& inner_extents, + const stdex::extents& unpadded_extent) + { + // If rank() is zero, size_t(-1) would be a very large upper bound. + static_assert(ReturnExtents::rank() != 0, + "ReturnExtents::rank() must not be 0"); + return layout_right_extents_helper( + inner_extents, + unpadded_extent, + details::iota_index_sequence_t<0, ReturnExtents::rank() - 1>{} + ); + } + + // Rank-0 unpadded_extent means rank-0 input, + // but the latter turns out not to matter here. + + template + MDSPAN_INLINE_FUNCTION constexpr ReturnExtents + layout_right_extents(const stdex::extents& inner_extents, + const stdex::extents& /* unpadded_extent */ ) + { + return inner_extents; + } + + template< + class InputExtentsType, + std::size_t PaddingExtent, + std::size_t ... Indices + > + MDSPAN_INLINE_FUNCTION constexpr auto + pad_extents_left_helper(const InputExtentsType& input, + const stdex::extents& padding, + std::index_sequence) + { + // NOTE (mfh 2022/09/04) This can be if constexpr, + // if the compiler supports it. + if /* constexpr */ (PaddingExtent == stdex::dynamic_extent) { + assert(padding.extent(0) != stdex::dynamic_extent); + } + using input_type = std::remove_cv_t>; + using index_type = typename input_type::index_type; + constexpr std::size_t rank = input_type::rank(); + static_assert(sizeof...(Indices) == std::size_t(rank - 1), + "Indices pack has the wrong size."); + using return_type = stdex::extents< + index_type, + stdex::dynamic_extent, + input_type::static_extent(Indices)... + >; + return return_type{ + index_type(details::alignTo(input.extent(0), padding.extent(0))), + input.extent(Indices)... + }; + } + + template< + class InputExtentsType, + std::size_t PaddingExtent, + std::size_t ... Indices + > + MDSPAN_INLINE_FUNCTION constexpr auto + pad_extents_right_helper(const InputExtentsType& input, + const stdex::extents& padding, + std::index_sequence) + { + // NOTE (mfh 2022/09/04) This can be if constexpr, + // if the compiler supports it. 
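+    // The run-time assert below only applies to the dynamic-extent case: it
+    // verifies that an actual padding value was supplied rather than the
+    // dynamic_extent sentinel.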
+ if /* constexpr */ (PaddingExtent == stdex::dynamic_extent) { + assert(padding.extent(0) != stdex::dynamic_extent); + } + using input_type = std::remove_cv_t>; + using index_type = typename input_type::index_type; + constexpr std::size_t rank = input_type::rank(); + static_assert(sizeof...(Indices) == std::size_t(rank - 1), + "Indices pack has the wrong size."); + + using return_type = stdex::extents< + index_type, + input_type::static_extent(Indices)..., + stdex::dynamic_extent + >; + return return_type{ + input.extent(Indices)..., + index_type(details::alignTo(input.extent(rank - 1), padding.extent(0))) + }; + } + + // Rank-0 and rank-1 mdspan don't need extra padding from their layout. + // They rely on an "aligned_accessor" and on the data_handle's alignment. + + MDSPAN_TEMPLATE_REQUIRES( + class IndexType, + std::size_t PaddingExtent, + std::size_t ... InputExtents, + /* requires */ (sizeof...(InputExtents) <= std::size_t(1)) + ) + MDSPAN_INLINE_FUNCTION constexpr auto + pad_extents_left(const stdex::extents& input, + const stdex::extents /* padding */ ) + { + return input; + } + + MDSPAN_TEMPLATE_REQUIRES( + class IndexType, + std::size_t PaddingExtent, + std::size_t ... InputExtents, + /* requires */ (sizeof...(InputExtents) <= std::size_t(1)) + ) + MDSPAN_INLINE_FUNCTION constexpr auto + pad_extents_right(const stdex::extents& input, + const stdex::extents /* padding */ ) + { + return input; + } + + // rank > 1 case follows. + + MDSPAN_TEMPLATE_REQUIRES( + class IndexType, + std::size_t PaddingExtent, + std::size_t ... InputExtents, + /* requires */ (sizeof...(InputExtents) > std::size_t(1)) + ) + MDSPAN_INLINE_FUNCTION constexpr auto + pad_extents_left(const stdex::extents& input, + const stdex::extents padding) + { + constexpr std::size_t rank = sizeof...(InputExtents); + return details::pad_extents_left_helper + (input, padding, details::iota_index_sequence_t<1, rank>{}); + } + + MDSPAN_TEMPLATE_REQUIRES( + class IndexType, + std::size_t PaddingExtent, + std::size_t ... InputExtents, + /* requires */ (sizeof...(InputExtents) > std::size_t(1)) + ) + MDSPAN_INLINE_FUNCTION constexpr auto + pad_extents_right(const stdex::extents& input, + const stdex::extents padding) + { + constexpr std::size_t rank = sizeof...(InputExtents); + return details::pad_extents_right_helper + (input, padding, details::iota_index_sequence_t<0, rank - 1>{}); + } + + MDSPAN_TEMPLATE_REQUIRES( + class IndexType, + std::size_t ... InputExtents, + /* requires */ (sizeof...(InputExtents) != std::size_t(0)) + ) + MDSPAN_INLINE_FUNCTION constexpr auto + unpadded_extent_left(const stdex::extents& input) + { + using input_type = stdex::extents; + return stdex::extents{input.extent(0)}; + } + + MDSPAN_TEMPLATE_REQUIRES( + class IndexType, + std::size_t ... InputExtents, + /* requires */ (sizeof...(InputExtents) != std::size_t(0)) + ) + MDSPAN_INLINE_FUNCTION constexpr auto + unpadded_extent_right(const stdex::extents& input) + { + using input_type = stdex::extents; + const auto rank = input_type::rank(); + return stdex::extents{input.extent(rank - 1)}; + } + + template + MDSPAN_INLINE_FUNCTION constexpr auto + unpadded_extent_left(const stdex::extents& /* input */ ) + { + return stdex::extents{}; + } + + template + MDSPAN_INLINE_FUNCTION constexpr auto + unpadded_extent_right(const stdex::extents& /* input */ ) + { + return stdex::extents{}; + } + + // Helper functions to work around C++14's lack of "if constexpr." 
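+  // left_padding_extents / right_padding_extents below select between the
+  // rank-0 and rank >= 1 cases by overloading on an integral_constant rank tag.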
+ + template + MDSPAN_INLINE_FUNCTION constexpr PaddingExtentsType + left_padding_extents(const InnerMappingType& inner_mapping, + std::integral_constant /* rank */ ) + { + return PaddingExtentsType{inner_mapping.extent(0)}; + } + + template + MDSPAN_INLINE_FUNCTION constexpr PaddingExtentsType + left_padding_extents(const InnerMappingType& /* inner_mapping */ , + std::integral_constant /* rank */ ) + { + return PaddingExtentsType{}; + } + + template + MDSPAN_INLINE_FUNCTION constexpr PaddingExtentsType + right_padding_extents(const InnerMappingType& inner_mapping, + std::integral_constant /* rank */ ) + { + return PaddingExtentsType{inner_mapping.extent(Rank - 1)}; + } + + template + MDSPAN_INLINE_FUNCTION constexpr PaddingExtentsType + right_padding_extents(const InnerMappingType& /* inner_mapping */ , + std::integral_constant /* rank */ ) + { + return PaddingExtentsType{}; + } + +} // namespace details + + + +// TODO (mfh 2022/08/30) Private inheritance from layout_left::mapping +// resp. layout_right::mapping would reduce inlining depth. + +// layout_left_padded is like layout_left, +// except that stride(0) == 1 always, +// and the leftmost extent may be padded +// (so that stride(1) could possibly be greater than extent(0)). +// +// This layout exists for two reasons: +// +// 1. Appropriate choice of padding, plus use of overaligned memory, +// can ensure any desired power-of-two overalignment of the +// beginning of each contiguous segment of elements in an mdspan. +// This is useful for hardware that optimizes for overaligned +// access. +// +// 2. For rank-2 mdspan, this is exactly the layout supported by the +// BLAS and LAPACK (where the "leading dimension" of the matrix +// (LDA), i.e., the stride, is greater than or equal to the number +// of rows). +// +// The padding can be either a compile-time value or a run-time value. +// It is a template parameter of layout_left_padded (the "tag type"), +// and NOT of the mapping, because mdspan requires that the mapping be +// a metafunction of the tag type and the extents specialization type. +template +struct layout_left_padded { + static constexpr size_t padding = padding_stride; + template + class mapping { + public: + using extents_type = Extents; + using index_type = typename extents_type::index_type; + using size_type = typename extents_type::size_type; + using rank_type = typename extents_type::rank_type; + using layout_type = layout_left_padded; + + private: + using padding_extents_type = + stdex::extents; + using inner_layout_type = stdex::layout_left; + using inner_extents_type = decltype( + details::pad_extents_left( + std::declval(), + std::declval() + ) + ); + using inner_mapping_type = + typename inner_layout_type::template mapping; + using unpadded_extent_type = + decltype(details::unpadded_extent_left(std::declval())); + + inner_mapping_type inner_mapping_; + unpadded_extent_type unpadded_extent_; + + padding_extents_type padding_extents() const { + return details::left_padding_extents( + inner_mapping_, + std::integral_constant{}); + } + + public: + // mapping constructor that takes ONLY an extents_type. + // + // This constructor makes it possible to construct an mdspan + // from a pointer and extents, since that requires that + // the mapping be constructible from extents alone. 
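+    //
+    // Illustrative example: with layout_left_padded<4> and run-time extents
+    // {6, 7}, the leftmost extent is padded up to alignTo(6, 4) = 8, so
+    // stride(1) == 8 and required_span_size() == 8 * 7, while extents()
+    // still reports {6, 7}.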
+ MDSPAN_INLINE_FUNCTION constexpr + mapping(const extents_type& ext) : + inner_mapping_(details::pad_extents_left( + ext, + padding_extents_type{padding_stride})), + unpadded_extent_(details::unpadded_extent_left(ext)) + {} + + // mapping constructor that takes an extents_type, + // AND an integral padding_value. + // + // This constructor always exists, even if padding is known at + // compile time -- just like the extents constructor lets you pass + // in all rank() extents, even if some of them are known at + // compile time. + template + MDSPAN_INLINE_FUNCTION constexpr + mapping(const extents_type& ext, + Size padding_value, + std::enable_if_t< + std::is_convertible::value && + std::is_nothrow_constructible::value + >* = nullptr) : + inner_mapping_(details::pad_extents_left(ext, padding_extents_type{padding_value})), + unpadded_extent_(details::unpadded_extent_left(ext)) + { + // We don't have to check padding_value here, because the + // padding_extents_type constructor already has a precondition. + } + + // Pass in the padding as an extents object. + MDSPAN_INLINE_FUNCTION constexpr + mapping(const extents_type& ext, + const stdex::extents& padding_extents) : + inner_mapping_(details::pad_extents_left(ext, padding_extents)), + unpadded_extent_(details::unpadded_extent_left(ext)) + {} + + // FIXME (mfh 2022/09/28) Converting constructor taking + // layout_right_padded::mapping + // is in the proposal, but missing here. + + // layout_stride::mapping deliberately only defines the copy + // constructor and copy assignment operator, not the move + // constructor or move assignment operator. This is fine because + // all the storage is std::array-like; there's no advantage to + // move construction or move assignment. We imitate this. + MDSPAN_INLINE_FUNCTION_DEFAULTED + constexpr mapping(const mapping&) noexcept = default; + MDSPAN_INLINE_FUNCTION_DEFAULTED _MDSPAN_CONSTEXPR_14_DEFAULTED + mapping& operator=(const mapping&) noexcept = default; + + MDSPAN_INLINE_FUNCTION + constexpr extents_type extents() const noexcept + { + return details::layout_left_extents( + unpadded_extent_, + inner_mapping_.extents() + ); + } + + MDSPAN_INLINE_FUNCTION + constexpr std::array + strides() const noexcept + { + return inner_mapping_.strides(); + } + + MDSPAN_INLINE_FUNCTION + constexpr index_type required_span_size() const noexcept + { + return inner_mapping_.required_span_size(); + } + + MDSPAN_TEMPLATE_REQUIRES( + class... Indices, + /* requires */ (sizeof...(Indices) == Extents::rank() && + _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(std::is_convertible, Indices, index_type) /*&& ...*/ ) && + _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, Indices) /*&& ...*/) + ) + ) + MDSPAN_INLINE_FUNCTION + constexpr size_t operator()(Indices... idxs) const noexcept { + // TODO (mfh 2022/08/30) in debug mode, check precondition before forwarding to inner mapping. + return inner_mapping_(std::forward(idxs)...); + } + + MDSPAN_INLINE_FUNCTION static constexpr bool is_always_unique() noexcept { return true; } + MDSPAN_INLINE_FUNCTION static constexpr bool is_always_exhaustive() noexcept { + return extents_type::rank() == 0 ? 
true : + (extents_type::static_extent(0) != stdex::dynamic_extent && + extents_type::static_extent(0) == unpadded_extent_type::static_extent(0)); + } + MDSPAN_INLINE_FUNCTION static constexpr bool is_always_strided() noexcept { return true; } + + MDSPAN_INLINE_FUNCTION static constexpr bool is_unique() noexcept { return true; } + MDSPAN_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 bool is_exhaustive() const noexcept { + return extents_type::rank() == 0 ? true : + inner_mapping_.extent(0) == unpadded_extent_.extent(0); + } + MDSPAN_INLINE_FUNCTION static constexpr bool is_strided() noexcept { return true; } + + MDSPAN_INLINE_FUNCTION + constexpr index_type stride(rank_type r) const noexcept { + return inner_mapping_.stride(r); + } + }; +}; + +template +struct layout_right_padded { + static constexpr size_t padding = padding_stride; + template + class mapping { + public: + using extents_type = Extents; + using index_type = typename extents_type::index_type; + using size_type = typename extents_type::size_type; + using rank_type = typename extents_type::rank_type; + using layout_type = layout_right_padded; + + private: + using padding_extents_type = + stdex::extents; + using inner_layout_type = stdex::layout_right; + using inner_extents_type = decltype( + details::pad_extents_right( + std::declval(), + std::declval() + ) + ); + using inner_mapping_type = + typename inner_layout_type::template mapping; + using unpadded_extent_type = + decltype(details::unpadded_extent_right(std::declval())); + + inner_mapping_type inner_mapping_; + unpadded_extent_type unpadded_extent_; + + padding_extents_type padding_extents() const { + return details::right_padding_extents( + inner_mapping_, + std::integral_constant{}); + } + + public: + // mapping constructor that takes ONLY an extents_type. + // + // This constructor makes it possible to construct an mdspan + // from a pointer and extents, since that requires that + // the mapping be constructible from extents alone. + MDSPAN_INLINE_FUNCTION constexpr + mapping(const extents_type& ext) : + inner_mapping_(details::pad_extents_right( + ext, + padding_extents_type{padding_stride})), + unpadded_extent_(details::unpadded_extent_right(ext)) + {} + + // mapping constructor that takes an extents_type, + // AND an integral padding_value. + // + // This constructor always exists, even if padding is known at + // compile time -- just like the extents constructor lets you pass + // in all rank() extents, even if some of them are known at + // compile time. + template + MDSPAN_INLINE_FUNCTION constexpr + mapping(const extents_type& ext, + Size padding_value, + std::enable_if_t< + std::is_convertible::value && + std::is_nothrow_constructible::value + >* = nullptr) : + inner_mapping_(details::pad_extents_right(ext, padding_extents_type{padding_value})), + unpadded_extent_(details::unpadded_extent_right(ext)) + { + // We don't have to check padding_value here, because the + // padding_extents_type constructor already has a precondition. + } + + // Pass in the padding as an extents object. + MDSPAN_INLINE_FUNCTION constexpr + mapping(const extents_type& ext, + const stdex::extents& padding_extents) : + inner_mapping_(details::pad_extents_right(ext, padding_extents)), + unpadded_extent_(details::unpadded_extent_right(ext)) + {} + + // FIXME (mfh 2022/09/28) The converting constructor taking + // layout_left_padded::mapping + // is in the proposal (missing other_padding_stride in R0), + // but missing here. 
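+
+    // Illustrative example: with layout_right_padded<32> (the padding RAFT
+    // derives for float at 128 byte alignment) and run-time extents {6, 7},
+    // the rightmost extent is padded up to alignTo(7, 32) = 32, so
+    // stride(0) == 32 and required_span_size() == 6 * 32, while extents()
+    // still reports {6, 7}.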
+ + // layout_stride::mapping deliberately only defines the copy + // constructor and copy assignment operator, not the move + // constructor or move assignment operator. This is fine because + // all the storage is std::array-like; there's no advantage to + // move construction or move assignment. We imitate this. + MDSPAN_INLINE_FUNCTION_DEFAULTED + constexpr mapping(const mapping&) noexcept = default; + MDSPAN_INLINE_FUNCTION_DEFAULTED _MDSPAN_CONSTEXPR_14_DEFAULTED + mapping& operator=(const mapping&) noexcept = default; + + MDSPAN_INLINE_FUNCTION + constexpr extents_type extents() const noexcept + { + return details::layout_right_extents( + inner_mapping_.extents(), + unpadded_extent_ + ); + } + + MDSPAN_INLINE_FUNCTION + constexpr std::array + strides() const noexcept + { + return inner_mapping_.strides(); + } + + MDSPAN_INLINE_FUNCTION + constexpr index_type required_span_size() const noexcept + { + return inner_mapping_.required_span_size(); + } + + MDSPAN_TEMPLATE_REQUIRES( + class... Indices, + /* requires */ (sizeof...(Indices) == Extents::rank() && + _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(std::is_convertible, Indices, index_type) /*&& ...*/ ) && + _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, Indices) /*&& ...*/) + ) + ) + MDSPAN_INLINE_FUNCTION + constexpr size_t operator()(Indices... idxs) const noexcept { + // TODO (mfh 2022/08/30) in debug mode, check precondition before forwarding to inner mapping. + return inner_mapping_(std::forward(idxs)...); + } + + MDSPAN_INLINE_FUNCTION static constexpr bool is_always_unique() noexcept { return true; } + MDSPAN_INLINE_FUNCTION static constexpr bool is_always_exhaustive() noexcept { + return extents_type::rank() == 0 ? true : + (extents_type::static_extent(Extents::rank() - 1) != stdex::dynamic_extent && + extents_type::static_extent(Extents::rank() - 1) == unpadded_extent_type::static_extent(0)); + } + MDSPAN_INLINE_FUNCTION static constexpr bool is_always_strided() noexcept { return true; } + + MDSPAN_INLINE_FUNCTION static constexpr bool is_unique() noexcept { return true; } + MDSPAN_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 bool is_exhaustive() const noexcept { + return extents_type::rank() == 0 ? 
true : + inner_mapping_.extent(Extents::rank() - 1) == unpadded_extent_.extent(0); + } + MDSPAN_INLINE_FUNCTION static constexpr bool is_strided() noexcept { return true; } + + MDSPAN_INLINE_FUNCTION + constexpr index_type stride(rank_type r) const noexcept { + return inner_mapping_.stride(r); + } + }; +}; + +} // end namespace experimental +} // end namespace std diff --git a/cpp/include/raft/thirdparty/mdspan/include/experimental/mdspan b/cpp/include/raft/thirdparty/mdspan/include/experimental/mdspan index ca6f6b8686..c5fac4c0bd 100644 --- a/cpp/include/raft/thirdparty/mdspan/include/experimental/mdspan +++ b/cpp/include/raft/thirdparty/mdspan/include/experimental/mdspan @@ -44,6 +44,7 @@ #pragma once #include "__p0009_bits/default_accessor.hpp" +#include "__p0009_bits/aligned_accessor.hpp" #include "__p0009_bits/full_extent_t.hpp" #include "__p0009_bits/mdspan.hpp" #include "__p0009_bits/dynamic_extent.hpp" @@ -51,6 +52,7 @@ #include "__p0009_bits/layout_stride.hpp" #include "__p0009_bits/layout_left.hpp" #include "__p0009_bits/layout_right.hpp" +#include "__p0009_bits/layout_padded.hpp" #include "__p0009_bits/macros.hpp" #include "__p0009_bits/static_array.hpp" #include "__p0009_bits/submdspan.hpp" diff --git a/cpp/test/matrix/linewise_op.cu b/cpp/test/matrix/linewise_op.cu index 9d3d5af51e..931c3ecb16 100644 --- a/cpp/test/matrix/linewise_op.cu +++ b/cpp/test/matrix/linewise_op.cu @@ -64,10 +64,10 @@ struct LinewiseTest : public ::testing::TestWithParam(in, m, n); - auto out_view = raft::make_device_matrix_view(out, m, n); + auto in_view = raft::make_device_matrix_view(in, n, m); + auto out_view = raft::make_device_matrix_view(out, n, m); - auto vec_view = raft::make_device_vector_view(vec, m); + auto vec_view = raft::make_device_vector_view(vec, lineLen); matrix::linewise_op(handle, in_view, out_view, raft::is_row_major(in_view), f, vec_view); } @@ -81,10 +81,10 @@ struct LinewiseTest : public ::testing::TestWithParam(in, m, n); - auto out_view = raft::make_device_matrix_view(out, m, n); - auto vec1_view = raft::make_device_vector_view(vec1, m); - auto vec2_view = raft::make_device_vector_view(vec2, m); + auto in_view = raft::make_device_matrix_view(in, n, m); + auto out_view = raft::make_device_matrix_view(out, n, m); + auto vec1_view = raft::make_device_vector_view(vec1, lineLen); + auto vec2_view = raft::make_device_vector_view(vec2, lineLen); matrix::linewise_op( handle, in_view, out_view, raft::is_row_major(in_view), f, vec1_view, vec2_view); @@ -99,6 +99,19 @@ struct LinewiseTest : public ::testing::TestWithParam + void runLinewiseSumPadded(raft::device_aligned_matrix_view out, + raft::device_aligned_matrix_view in, + const I lineLen, + const I nLines, + const bool alongLines, + const T* vec) + { + auto f = [] __device__(T a, T b) -> T { return a + b; }; + auto vec_view = raft::make_device_vector_view(vec, lineLen); + matrix::linewise_op(handle, in, out, alongLines, f, vec_view); + } + /** * Suggest multiple versions of matrix dimensions (n, m), such that * @@ -205,11 +218,129 @@ struct LinewiseTest : public ::testing::TestWithParam>&& dims, + rmm::device_uvector&& blob) + { + rmm::device_uvector blob_val(params.checkCorrectness ? 
blob.size() / 2 : 0, stream); + + stream.synchronize(); + cudaProfilerStart(); + testing::AssertionResult r = testing::AssertionSuccess(); + for (auto alongRows : ::testing::Bool()) { + for (auto [n, m] : dims) { + if (!r) break; + // take dense testdata + auto [out, in, vec1, vec2] = assignSafePtrs(blob, n, m); + common::nvtx::range dims_scope("Dims-%zu-%zu", std::size_t(n), std::size_t(m)); + common::nvtx::range dir_scope(alongRows ? "alongRows" : "acrossRows"); + + auto lineLen = m; + auto nLines = n; + + // create a padded span based on testdata (just for functional testing) + size_t matrix_size_padded; + if (alongRows) { + auto extents = matrix_extent{n, m}; + typename raft::layout_right_padded::mapping> layout{extents}; + matrix_size_padded = layout.required_span_size(); + } else { + auto extents = matrix_extent{m, n}; + typename raft::layout_left_padded::mapping> layout{extents}; + matrix_size_padded = layout.required_span_size(); + } + + rmm::device_uvector blob_in(matrix_size_padded, stream); + rmm::device_uvector blob_out(matrix_size_padded, stream); + + { + auto in2 = in; + + // actual testrun + common::nvtx::range vecs_scope("one vec"); + if (alongRows) { + auto inSpan = make_device_aligned_matrix_view>( + blob_in.data(), nLines, lineLen); + auto outSpan = make_device_aligned_matrix_view>( + blob_out.data(), nLines, lineLen); + // prep padded input data + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_counting_iterator(0ul), + nLines * lineLen, + [inSpan, in2, lineLen] __device__(size_t i) { + inSpan(i / lineLen, i % lineLen) = in2[i]; + }); + auto inSpanConst = + make_device_aligned_matrix_view>( + blob_in.data(), nLines, lineLen); + runLinewiseSumPadded>( + outSpan, inSpanConst, lineLen, nLines, alongRows, vec1); + + if (params.checkCorrectness) { + runLinewiseSum(out, in, lineLen, nLines, vec1); + auto out_dense = blob_val.data(); + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_counting_iterator(0ul), + nLines * lineLen, + [outSpan, out_dense, lineLen] __device__(size_t i) { + out_dense[i] = outSpan(i / lineLen, i % lineLen); + }); + r = devArrMatch(out_dense, out, n * m, CompareApprox(params.tolerance)) + << " " << (alongRows ? "alongRows" : "acrossRows") + << " with one vec; lineLen: " << lineLen << "; nLines " << nLines; + if (!r) break; + } + + } else { + auto inSpan = make_device_aligned_matrix_view>( + blob_in.data(), lineLen, nLines); + auto outSpan = make_device_aligned_matrix_view>( + blob_out.data(), lineLen, nLines); + // prep padded input data + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_counting_iterator(0ul), + nLines * lineLen, + [inSpan, in2, lineLen] __device__(size_t i) { + inSpan(i % lineLen, i / lineLen) = in2[i]; + }); + auto inSpanConst = + make_device_aligned_matrix_view>( + blob_in.data(), lineLen, nLines); + runLinewiseSumPadded>( + outSpan, inSpanConst, lineLen, nLines, alongRows, vec1); + + if (params.checkCorrectness) { + runLinewiseSum(out, in, lineLen, nLines, vec1); + auto out_dense = blob_val.data(); + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_counting_iterator(0ul), + nLines * lineLen, + [outSpan, out_dense, lineLen] __device__(size_t i) { + out_dense[i] = outSpan(i % lineLen, i / lineLen); + }); + r = devArrMatch(out_dense, out, n * m, CompareApprox(params.tolerance)) + << " " << (alongRows ? 
"alongRows" : "acrossRows") + << " with one vec; lineLen: " << lineLen << "; nLines " << nLines; + if (!r) break; + } + } + } + } + } + cudaProfilerStop(); + + return r; + } + testing::AssertionResult run() { return run(suggestDimensions(2), genData(params.workSizeBytes)); } + testing::AssertionResult runWithPaddedSpan() + { + return runWithPaddedSpan(suggestDimensions(2), genData(params.workSizeBytes)); + } + testing::AssertionResult runEdgeCases() { std::vector sizes = {1, 2, 3, 4, 7, 16}; @@ -230,6 +361,13 @@ struct LinewiseTest : public ::testing::TestWithParam TestClass##Span_##ElemType##_##IndexType; \ + TEST_P(TestClass##Span_##ElemType##_##IndexType, fun) { ASSERT_TRUE(fun()); } \ + INSTANTIATE_TEST_SUITE_P(LinewiseOpSpan, TestClass##Span_##ElemType##_##IndexType, SpanParams) + +auto SpanParams = ::testing::Combine(::testing::Values(0), ::testing::Values(0)); + auto TinyParams = ::testing::Combine(::testing::Values(0, 1, 2, 4), ::testing::Values(0, 1, 2, 3)); struct Tiny { @@ -299,5 +437,10 @@ TEST_IT(run, Gigabyte, double, int); TEST_IT(run, TenGigs, float, uint64_t); TEST_IT(run, TenGigs, double, uint64_t); +TEST_IT_SPAN(runWithPaddedSpan, Megabyte, float, int); +TEST_IT_SPAN(runWithPaddedSpan, Megabyte, double, int); +TEST_IT_SPAN(runWithPaddedSpan, Gigabyte, float, int); +TEST_IT_SPAN(runWithPaddedSpan, Gigabyte, double, int); + } // namespace matrix } // end namespace raft diff --git a/cpp/test/mdarray.cu b/cpp/test/mdarray.cu index 3931ff224c..c292feb894 100644 --- a/cpp/test/mdarray.cu +++ b/cpp/test/mdarray.cu @@ -436,6 +436,425 @@ TEST(MDArray, FuncArg) } } +void test_mdspan_layout_right_padded() +{ + { + // 5x2 example, + constexpr int n_rows = 2; + constexpr int n_cols = 5; + constexpr int alignment = 8; + constexpr int alignment_bytes = sizeof(int) * alignment; + + int data_row_major[] = { + 1, + 2, + 3, + 4, + 5, /* X X X */ + 6, + 7, + 8, + 9, + 10 /* X X X */ + }; + // manually aligning the above, using -1 as filler + static constexpr int X = -1; + int data_padded[] = {1, 2, 3, 4, 5, X, X, X, 6, 7, 8, 9, 10, X, X, X}; + + using extents_type = stdex::extents; + using padded_layout_row_major = + stdex::layout_right_padded::value>; + using padded_mdspan = stdex::mdspan; + using row_major_mdspan = stdex::mdspan; + + padded_layout_row_major::mapping layout{extents_type{n_rows, n_cols}}; + + auto padded = padded_mdspan(data_padded, layout); + auto row_major = row_major_mdspan(data_row_major, n_rows, n_cols); + + int failures = 0; + for (int irow = 0; irow < n_rows; ++irow) { + for (int icol = 0; icol < n_cols; ++icol) { + if (padded(irow, icol) != row_major(irow, icol)) { ++failures; } + } + } + ASSERT_EQ(failures, 0); + } +} + +TEST(MDSpan, LayoutRightPadded) { test_mdspan_layout_right_padded(); } + +void test_mdarray_padding() +{ + using extents_type = stdex::extents; + auto s = rmm::cuda_stream_default; + + { + constexpr int rows = 6; + constexpr int cols = 7; + constexpr int alignment = 5; + constexpr int alignment_bytes = sizeof(int) * alignment; + + /** + * padded device array + */ + using padded_layout_row_major = + stdex::layout_right_padded::value>; + + using padded_mdarray_type = device_mdarray; + padded_layout_row_major::mapping layout(extents_type(rows, cols)); + + auto device_policy = padded_mdarray_type::container_policy_type{s}; + static_assert(std::is_same_v>); + padded_mdarray_type padded_device_array{layout, device_policy}; + + // direct access mdarray + padded_device_array(0, 3) = 1; + ASSERT_EQ(padded_device_array(0, 3), 1); + + // non-const access via 
mdspan + auto d_view = padded_device_array.view(); + static_assert(!decltype(d_view)::accessor_type::is_host_type::value); + + thrust::device_vector status(1, 0); + auto p_status = status.data().get(); + thrust::for_each_n(rmm::exec_policy(s), + thrust::make_counting_iterator(0ul), + 1, + [d_view, p_status] __device__(size_t i) { + if (d_view(0, 3) != 1) { myAtomicAdd(p_status, 1); } + d_view(0, 2) = 3; + if (d_view(0, 2) != 3) { myAtomicAdd(p_status, 1); } + }); + check_status(p_status, s); + + // const ref access via mdspan + auto const& arr = padded_device_array; + ASSERT_EQ(arr(0, 3), 1); + auto const_d_view = arr.view(); + thrust::for_each_n(rmm::exec_policy(s), + thrust::make_counting_iterator(0ul), + 1, + [const_d_view, p_status] __device__(size_t i) { + if (const_d_view(0, 3) != 1) { myAtomicAdd(p_status, 1); } + }); + check_status(p_status, s); + + // initialize with sequence + thrust::for_each_n( + rmm::exec_policy(s), + thrust::make_counting_iterator(0ul), + rows * cols, + [d_view, rows, cols] __device__(size_t i) { d_view(i / cols, i % cols) = i; }); + + // manually create span with layout + { + auto data_padded = padded_device_array.data_handle(); + using padded_mdspan_type = device_mdspan; + auto padded_span = padded_mdspan_type(data_padded, layout); + thrust::for_each_n(rmm::exec_policy(s), + thrust::make_counting_iterator(0ul), + rows * cols, + [padded_span, rows, cols, p_status] __device__(size_t i) { + if (padded_span(i / cols, i % cols) != i) myAtomicAdd(p_status, 1); + }); + check_status(p_status, s); + } + + // utilities + static_assert(padded_device_array.rank_dynamic() == 2); + static_assert(padded_device_array.rank() == 2); + static_assert(padded_device_array.is_unique()); + static_assert(padded_device_array.is_strided()); + + static_assert( + !std::is_nothrow_default_constructible::value); // cuda stream + static_assert(std::is_nothrow_move_constructible::value); + static_assert(std::is_nothrow_move_assignable::value); + } +} + +TEST(MDArray, Padding) { test_mdarray_padding(); } + +// Test deactivated as submdspan support requires upstream changes +/*void test_submdspan_padding() +{ + using extents_type = stdex::extents; + auto s = rmm::cuda_stream_default; + + { + constexpr int rows = 6; + constexpr int cols = 7; + constexpr int alignment = 5; + constexpr int alignment_bytes = sizeof(int) * alignment; + + using layout_padded_general = + stdex::layout_padded_general; + using padded_mdarray_type = device_mdarray; + using padded_mdspan_type = device_mdspan; + layout_padded_general::mapping layout{extents_type{rows, cols}}; + + auto device_policy = padded_mdarray_type::container_policy_type{s}; + static_assert(std::is_same_v>); + padded_mdarray_type padded_device_array{layout, device_policy}; + + // test status + thrust::device_vector status(1, 0); + auto p_status = status.data().get(); + + // initialize with sequence + { + auto d_view = padded_device_array.view(); + static_assert(std::is_same_v); + thrust::for_each_n( + rmm::exec_policy(s), + thrust::make_counting_iterator(0ul), + rows * cols, + [d_view, rows, cols] __device__(size_t i) { d_view(i / cols, i % cols) = i; }); + } + + // get mdspan manually from raw data + { + auto data_padded = padded_device_array.data(); + auto padded_span = padded_mdspan_type(data_padded, layout); + thrust::for_each_n(rmm::exec_policy(s), + thrust::make_counting_iterator(0ul), + rows * cols, + [padded_span, rows, cols, p_status] __device__(size_t i) { + if (padded_span(i / cols, i % cols) != i) myAtomicAdd(p_status, 1); + }); + 
check_status(p_status, s); + } + + // full subspan + { + auto padded_span = padded_device_array.view(); + auto subspan_full = stdex::submdspan(padded_span, stdex::full_extent, stdex::full_extent); + thrust::for_each_n(rmm::exec_policy(s), + thrust::make_counting_iterator(0ul), + cols * rows, + [subspan_full, padded_span, rows, cols, p_status] __device__(size_t i) { + if (subspan_full(i / cols, i % cols) != padded_span(i / cols, i % cols)) + myAtomicAdd(p_status, 1); + }); + check_status(p_status, s); + + // resulting submdspan should still be padded + static_assert( + std::is_same_v); + } + + // slicing a row + { + auto padded_span = padded_device_array.view(); + auto row3 = stdex::submdspan(padded_span, 3, stdex::full_extent); + thrust::for_each_n(rmm::exec_policy(s), + thrust::make_counting_iterator(0ul), + cols, + [row3, padded_span, p_status] __device__(size_t i) { + if (row3(i) != padded_span(3, i)) myAtomicAdd(p_status, 1); + }); + check_status(p_status, s); + + // resulting submdspan should still be padded + static_assert(std::is_same_v); + } + + // slicing a column + { + auto padded_span = padded_device_array.view(); + auto col1 = stdex::submdspan(padded_span, stdex::full_extent, 1); + thrust::for_each_n(rmm::exec_policy(s), + thrust::make_counting_iterator(0ul), + rows, + [col1, padded_span, p_status] __device__(size_t i) { + if (col1(i) != padded_span(i, 1)) myAtomicAdd(p_status, 1); + }); + check_status(p_status, s); + + // resulting submdspan is *NOT* padded anymore + static_assert(std::is_same_v); + } + + // sub-rectangle of 6x7 + { + auto padded_span = padded_device_array.view(); + auto subspan = + stdex::submdspan(padded_span, std::make_tuple(1ul, 4ul), std::make_tuple(2ul, 5ul)); + thrust::for_each_n(rmm::exec_policy(s), + thrust::make_counting_iterator(0ul), + (rows - 1) * (cols - 2), + [subspan, rows, cols, padded_span, p_status] __device__(size_t i) { + size_t idx = i / (cols - 2); + size_t idy = i % (cols - 2); + // elements > subspan range can be accessed as well + if (subspan(idx, idy) != padded_span(idx + 1, idy + 2)) + myAtomicAdd(p_status, 1); + }); + check_status(p_status, s); + + // resulting submdspan is *NOT* padded anymore + static_assert(std::is_same_v); + } + + // sub-rectangle retaining padded layout + { + auto padded_span = padded_device_array.view(); + auto subspan = + stdex::submdspan(padded_span, std::make_tuple(1ul, 4ul), std::make_tuple(2ul, 5ul)); + thrust::for_each_n(rmm::exec_policy(s), + thrust::make_counting_iterator(0ul), + (rows - 1) * (cols - 2), + [subspan, rows, cols, padded_span, p_status] __device__(size_t i) { + size_t idx = i / (cols - 2); + size_t idy = i % (cols - 2); + // elements > subspan range can be accessed as well + if (subspan(idx, idy) != padded_span(idx + 1, idy + 2)) + myAtomicAdd(p_status, 1); + }); + check_status(p_status, s); + + // resulting submdspan is *NOT* padded anymore + static_assert(std::is_same_v); + } + } +} + +TEST(MDSpan, SubmdspanPadding) { test_submdspan_padding(); }*/ + +struct TestElement1 { + int a, b; +}; + +void test_mdspan_padding_by_type() +{ + using extents_type = stdex::extents; + auto s = rmm::cuda_stream_default; + + { + constexpr int rows = 6; + constexpr int cols = 7; + constexpr int alignment_bytes = 16; + + thrust::device_vector status(1, 0); + auto p_status = status.data().get(); + + // manually check strides for row major (c style) padding + { + using padded_layout_row_major = stdex::layout_right_padded< + detail::padding>, + alignment_bytes>::value>; + + using padded_mdarray_type = + 
device_mdarray; + auto device_policy = padded_mdarray_type::container_policy_type{s}; + + padded_layout_row_major::mapping layout{extents_type{rows, cols}}; + padded_mdarray_type padded_device_array{layout, device_policy}; + int alignment_elements = detail::padding::value; + auto padded_span = padded_device_array.view(); + thrust::for_each_n( + rmm::exec_policy(s), + thrust::make_counting_iterator(0ul), + rows * cols, + [rows, cols, padded_span, alignment_elements, p_status] __device__(size_t i) { + size_t idx = i / cols; + size_t idy = i % cols; + if ((&(padded_span(idx, idy)) - &(padded_span(0, idy))) % alignment_elements != 0) + myAtomicAdd(p_status, 1); + if ((&(padded_span(idx, idy)) - &(padded_span(idx, 0))) != idy) myAtomicAdd(p_status, 1); + }); + check_status(p_status, s); + } + + // manually check strides for col major (f style) padding + { + using padded_layout_col_major = stdex::layout_left_padded< + detail::padding>, + alignment_bytes>::value>; + using padded_mdarray_type = + device_mdarray; + auto device_policy = padded_mdarray_type::container_policy_type{s}; + + padded_layout_col_major::mapping layout{extents_type{rows, cols}}; + padded_mdarray_type padded_device_array{layout, device_policy}; + int alignment_elements = detail::padding::value; + auto padded_span = padded_device_array.view(); + thrust::for_each_n( + rmm::exec_policy(s), + thrust::make_counting_iterator(0ul), + rows * cols, + [rows, cols, padded_span, alignment_elements, p_status] __device__(size_t i) { + size_t idx = i / cols; + size_t idy = i % cols; + if ((&(padded_span(idx, idy)) - &(padded_span(idx, 0))) % alignment_elements != 0) + myAtomicAdd(p_status, 1); + if ((&(padded_span(idx, idy)) - &(padded_span(0, idy))) != idx) myAtomicAdd(p_status, 1); + }); + check_status(p_status, s); + } + } +} + +TEST(MDSpan, MDSpanPaddingType) { test_mdspan_padding_by_type(); } + +void test_mdspan_aligned_matrix() +{ + using extents_type = stdex::extents; + constexpr int rows = 2; + constexpr int cols = 10; + + // manually aligning the above, using -1 as filler + static constexpr int X = -1; + long data_padded[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, X, X, X, X, X, X, + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, X, X, X, X, X, X}; + + auto my_aligned_host_span = + make_host_aligned_matrix_view>(data_padded, rows, cols); + + int failures = 0; + for (int irow = 0; irow < rows; ++irow) { + for (int icol = 0; icol < cols; ++icol) { + if (my_aligned_host_span(irow, icol) != irow * cols + icol) { ++failures; } + } + } + ASSERT_EQ(failures, 0); + + // now work with device memory + // use simple 1D array to allocate some space + auto s = rmm::cuda_stream_default; + using extent_1d = stdex::extents; + layout_c_contiguous::mapping layout_1d{extent_1d{rows * 32}}; + using mdarray_t = device_mdarray; + auto device_policy = mdarray_t::container_policy_type{s}; + mdarray_t device_array_1d{layout_1d, device_policy}; + + // direct access mdarray -- initialize with above data + for (int i = 0; i < 32; ++i) { + device_array_1d(i) = data_padded[i]; + } + + auto my_aligned_device_span = + make_device_aligned_matrix_view>( + device_array_1d.data_handle(), rows, cols); + + thrust::device_vector status(1, 0); + auto p_status = status.data().get(); + thrust::for_each_n(rmm::exec_policy(s), + thrust::make_counting_iterator(0ul), + rows * cols, + [rows, cols, my_aligned_device_span, p_status] __device__(size_t i) { + size_t idx = i / cols; + size_t idy = i % cols; + if (my_aligned_device_span(idx, idy) != i) myAtomicAdd(p_status, 1); + }); + 
check_status(p_status, s); +} + +TEST(MDSpan, MDSpanAlignedMatrix) { test_mdspan_aligned_matrix(); } + namespace { void test_mdarray_unravel() { @@ -527,4 +946,5 @@ void test_mdarray_unravel() } // anonymous namespace TEST(MDArray, Unravel) { test_mdarray_unravel(); } + } // namespace raft
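
For anyone trying the new views outside the test suite, below is a minimal usage sketch. It assumes the `raft::layout_right_padded<T>` alias and the default 128-byte alignment that the `make_*_aligned_matrix_view` factories in this patch are built around; the buffer sizes and names are illustrative only and not part of the patch itself.

#include <raft/core/device_mdspan.hpp>
#include <raft/core/host_mdspan.hpp>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>

int main()
{
  constexpr int n_rows = 2;
  constexpr int n_cols = 5;

  // 128-byte alignment for float means a padded row stride of 128 / sizeof(float) = 32
  // elements, so each logical row of 5 values occupies 32 slots of the backing buffer.
  alignas(128) float host_data[n_rows * 32] = {};

  auto host_view =
    raft::make_host_aligned_matrix_view<float, int, raft::layout_right_padded<float>>(
      host_data, n_rows, n_cols);
  host_view(1, 4) = 42.0f;  // maps to host_data[1 * 32 + 4]

  // Device side: any 128-byte aligned allocation works; the default rmm memory resource
  // returns pointers aligned to at least 256 bytes, which satisfies the factory's
  // alignment assert.
  rmm::cuda_stream_view stream = rmm::cuda_stream_default;
  rmm::device_uvector<float> device_data(n_rows * 32, stream);
  auto device_view =
    raft::make_device_aligned_matrix_view<float, int, raft::layout_right_padded<float>>(
      device_data.data(), n_rows, n_cols);
  // device_view can now be captured by value in device lambdas or kernels, in the same way
  // the padded spans are exercised in test_mdspan_aligned_matrix above.

  return host_view(1, 4) == 42.0f ? 0 : 1;
}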