From 4af551cc94998d01a3c37c867a363c77d678d4c2 Mon Sep 17 00:00:00 2001 From: fis Date: Wed, 27 Apr 2022 18:04:09 +0800 Subject: [PATCH 1/5] Implement unravel index for row-major array. --- cpp/include/raft/core/mdarray.hpp | 2 +- cpp/include/raft/detail/mdarray.hpp | 102 ++++++++++++++++++++++++++++ cpp/test/mdarray.cu | 62 ++++++++++++++++- 3 files changed, 164 insertions(+), 2 deletions(-) diff --git a/cpp/include/raft/core/mdarray.hpp b/cpp/include/raft/core/mdarray.hpp index 595c0161cd..0354068c54 100644 --- a/cpp/include/raft/core/mdarray.hpp +++ b/cpp/include/raft/core/mdarray.hpp @@ -647,4 +647,4 @@ auto make_device_vector(raft::handle_t const& handle, size_t n) { return make_device_vector(n, handle.get_stream()); } -} // namespace raft \ No newline at end of file +} // namespace raft diff --git a/cpp/include/raft/detail/mdarray.hpp b/cpp/include/raft/detail/mdarray.hpp index 624c7a4d07..33f98355c0 100644 --- a/cpp/include/raft/detail/mdarray.hpp +++ b/cpp/include/raft/detail/mdarray.hpp @@ -21,6 +21,7 @@ * limitations under the License. */ #pragma once +#include #include #include // dynamic_extent #include @@ -238,4 +239,105 @@ namespace stdex = std::experimental; using vector_extent = stdex::extents; using matrix_extent = stdex::extents; using scalar_extent = stdex::extents<1>; + +template +auto native_popc(T v) -> int32_t +{ + int c = 0; + for (; v != 0; v &= v - 1) + c++; + return c; +} + +inline __host__ __device__ auto popc(uint32_t v) -> int32_t +{ +#if defined(__CUDA_ARCH__) + return __popc(v); +#elif defined(__GNUC__) || defined(__clang__) + return __builtin_popcount(v); +#else + return native_popc(v); +#endif // compiler +} + +inline __host__ __device__ auto popc(uint64_t v) -> int32_t +{ +#if defined(__CUDA_ARCH__) + return __popcll(v); +#elif defined(__GNUC__) || defined(__clang__) + return __builtin_popcountll(v); +#else + return native_popc(v); +#endif // compiler +} + +template +MDSPAN_INLINE_FUNCTION constexpr auto arr_to_tup(T (&arr)[N], std::index_sequence) +{ + return cuda::std::make_tuple(arr[Idx]...); +} + +template +MDSPAN_INLINE_FUNCTION constexpr auto arr_to_tup(T (&arr)[N]) +{ + return arr_to_tup(arr, std::make_index_sequence{}); +} + +// uint division optimization inspired by the CIndexer in cupy. Division operation is +// slow on both CPU and GPU, especially 64 bit integer. So here we first try to avoid 64 +// bit when the index is smaller, then try to avoid division when it's exp of 2. +template +MDSPAN_INLINE_FUNCTION auto unravel_index_impl(I idx, stdex::extents shape) +{ + constexpr auto kRank = static_cast(shape.rank()); + size_t index[shape.rank()]{0}; // NOLINT + static_assert(std::is_signed::value, + "Don't change the type without changing the for loop."); + for (int32_t dim = kRank; --dim > 0;) { + auto s = static_cast>>(shape.extent(dim)); + if (s & (s - 1)) { + auto t = idx / s; + index[dim] = idx - t * s; + idx = t; + } else { // exp of 2 + index[dim] = idx & (s - 1); + idx >>= popc(s - 1); + } + } + index[0] = idx; + return arr_to_tup(index); +} + +/** + * \brief Turns linear index into coordinate. Similar to numpy unravel_index. This is not + * exposed to public as it's not part of the mdspan proposal, the returned tuple + * can not be directly used for indexing into mdspan and we might change the return + * type in the future. + * + * \code + * auto m = make_host_matrix(7, 6); + * auto m_v = m.view(); + * auto coord = detail::unravel_index(2, m.extents(), typename decltype(m)::layout_type{}); + * cuda::std::apply(m_v, coord) = 2; + * \endcode + * + * \param idx The linear index. + * \param shape The shape of the array to use. + * \param layout Must be `layout_right` (row-major) in current version. + * + * \return A cuda::std::tuple that represents the coordinate. + */ +template +MDSPAN_INLINE_FUNCTION auto unravel_index(size_t idx, + detail::stdex::extents shape, + LayoutPolicy const&) +{ + static_assert(std::is_same::value, + "Only C layout is supported."); + if (idx > std::numeric_limits::max()) { + return unravel_index_impl(static_cast(idx), shape); + } else { + return unravel_index_impl(static_cast(idx), shape); + } +} } // namespace raft::detail diff --git a/cpp/test/mdarray.cu b/cpp/test/mdarray.cu index 961a703a8b..62ac5edb24 100644 --- a/cpp/test/mdarray.cu +++ b/cpp/test/mdarray.cu @@ -15,9 +15,9 @@ */ #include #include +#include #include #include -#include #include #include #include @@ -417,4 +417,64 @@ TEST(MDArray, FuncArg) // check_matrix_layout(slice); } } + +TEST(MDArray, Unravel) +{ + { + uint32_t v{0}; + ASSERT_EQ(detail::native_popc(v), 0); + ASSERT_EQ(detail::popc(v), 0); + v = 1; + ASSERT_EQ(detail::native_popc(v), 1); + ASSERT_EQ(detail::popc(v), 1); + v = 0xffffffff; + ASSERT_EQ(detail::native_popc(v), 32); + ASSERT_EQ(detail::popc(v), 32); + } + { + uint64_t v{0}; + ASSERT_EQ(detail::native_popc(v), 0); + ASSERT_EQ(detail::popc(v), 0); + v = 1; + ASSERT_EQ(detail::native_popc(v), 1); + ASSERT_EQ(detail::popc(v), 1); + v = 0xffffffff; + ASSERT_EQ(detail::native_popc(v), 32); + ASSERT_EQ(detail::popc(v), 32); + v = 0xffffffffffffffff; + ASSERT_EQ(detail::native_popc(v), 64); + ASSERT_EQ(detail::popc(v), 64); + } + + // examples in numpy unravel_index + { + auto coord = detail::unravel_index(22, detail::matrix_extent{7, 6}, stdex::layout_right{}); + static_assert(cuda::std::tuple_size::value == 2); + ASSERT_EQ(cuda::std::get<0>(coord), 3); + ASSERT_EQ(cuda::std::get<1>(coord), 4); + } + { + auto coord = detail::unravel_index(41, detail::matrix_extent{7, 6}, stdex::layout_right{}); + static_assert(cuda::std::tuple_size::value == 2); + ASSERT_EQ(cuda::std::get<0>(coord), 6); + ASSERT_EQ(cuda::std::get<1>(coord), 5); + } + { + auto coord = detail::unravel_index(37, detail::matrix_extent{7, 6}, stdex::layout_right{}); + static_assert(cuda::std::tuple_size::value == 2); + ASSERT_EQ(cuda::std::get<0>(coord), 6); + ASSERT_EQ(cuda::std::get<1>(coord), 1); + + auto m = make_host_matrix(7, 6); + auto m_v = m.view(); + for (size_t i = 0; i < m.size(); ++i) { + auto coord = detail::unravel_index(i, m.extents(), typename decltype(m)::layout_type{}); + cuda::std::apply(m_v, coord) = i; + } + for (size_t i = 0; i < m.size(); ++i) { + auto coord = detail::unravel_index(i, m.extents(), typename decltype(m)::layout_type{}); + ASSERT_EQ(cuda::std::apply(m_v, coord), i); + } + } +} } // namespace raft From 9427773e3ef653ed925dba893dbf6f9510a9b700 Mon Sep 17 00:00:00 2001 From: fis Date: Wed, 27 Apr 2022 18:07:29 +0800 Subject: [PATCH 2/5] format. --- cpp/test/mdarray.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/test/mdarray.cu b/cpp/test/mdarray.cu index 62ac5edb24..6e1ad7158d 100644 --- a/cpp/test/mdarray.cu +++ b/cpp/test/mdarray.cu @@ -465,7 +465,7 @@ TEST(MDArray, Unravel) ASSERT_EQ(cuda::std::get<0>(coord), 6); ASSERT_EQ(cuda::std::get<1>(coord), 1); - auto m = make_host_matrix(7, 6); + auto m = make_host_matrix(7, 6); auto m_v = m.view(); for (size_t i = 0; i < m.size(); ++i) { auto coord = detail::unravel_index(i, m.extents(), typename decltype(m)::layout_type{}); From b79b13b0a7c428ab79aeae0cf56e02dbf5f01258 Mon Sep 17 00:00:00 2001 From: fis Date: Wed, 27 Apr 2022 18:09:27 +0800 Subject: [PATCH 3/5] Avoid change. --- cpp/include/raft/core/mdarray.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/raft/core/mdarray.hpp b/cpp/include/raft/core/mdarray.hpp index 0354068c54..595c0161cd 100644 --- a/cpp/include/raft/core/mdarray.hpp +++ b/cpp/include/raft/core/mdarray.hpp @@ -647,4 +647,4 @@ auto make_device_vector(raft::handle_t const& handle, size_t n) { return make_device_vector(n, handle.get_stream()); } -} // namespace raft +} // namespace raft \ No newline at end of file From 0f6eaa3b8b46b24ad198ed666d77ca8a880aca5d Mon Sep 17 00:00:00 2001 From: fis Date: Wed, 27 Apr 2022 18:17:56 +0800 Subject: [PATCH 4/5] cleanup. --- cpp/include/raft/detail/mdarray.hpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/cpp/include/raft/detail/mdarray.hpp b/cpp/include/raft/detail/mdarray.hpp index 33f98355c0..7dc96e3f51 100644 --- a/cpp/include/raft/detail/mdarray.hpp +++ b/cpp/include/raft/detail/mdarray.hpp @@ -241,15 +241,16 @@ using matrix_extent = stdex::extents; using scalar_extent = stdex::extents<1>; template -auto native_popc(T v) -> int32_t +MDSPAN_INLINE_FUNCTION auto native_popc(T v) -> int32_t { int c = 0; - for (; v != 0; v &= v - 1) + for (; v != 0; v &= v - 1) { c++; + } return c; } -inline __host__ __device__ auto popc(uint32_t v) -> int32_t +MDSPAN_INLINE_FUNCTION auto popc(uint32_t v) -> int32_t { #if defined(__CUDA_ARCH__) return __popc(v); @@ -260,7 +261,7 @@ inline __host__ __device__ auto popc(uint32_t v) -> int32_t #endif // compiler } -inline __host__ __device__ auto popc(uint64_t v) -> int32_t +MDSPAN_INLINE_FUNCTION auto popc(uint64_t v) -> int32_t { #if defined(__CUDA_ARCH__) return __popcll(v); From fd6d0a74fad34c3556c47dde3a1fc1b3d1616138 Mon Sep 17 00:00:00 2001 From: fis Date: Wed, 27 Apr 2022 23:16:44 +0800 Subject: [PATCH 5/5] Use thrust tuple instead. --- cpp/include/raft/detail/mdarray.hpp | 32 +++++++++++++--- cpp/test/mdarray.cu | 58 ++++++++++++++++++++++------- 2 files changed, 72 insertions(+), 18 deletions(-) diff --git a/cpp/include/raft/detail/mdarray.hpp b/cpp/include/raft/detail/mdarray.hpp index 7dc96e3f51..1797e5469f 100644 --- a/cpp/include/raft/detail/mdarray.hpp +++ b/cpp/include/raft/detail/mdarray.hpp @@ -21,12 +21,12 @@ * limitations under the License. */ #pragma once -#include #include #include // dynamic_extent #include #include #include +#include namespace raft::detail { /** @@ -275,7 +275,7 @@ MDSPAN_INLINE_FUNCTION auto popc(uint64_t v) -> int32_t template MDSPAN_INLINE_FUNCTION constexpr auto arr_to_tup(T (&arr)[N], std::index_sequence) { - return cuda::std::make_tuple(arr[Idx]...); + return thrust::make_tuple(arr[Idx]...); } template @@ -319,14 +319,14 @@ MDSPAN_INLINE_FUNCTION auto unravel_index_impl(I idx, stdex::extents * auto m = make_host_matrix(7, 6); * auto m_v = m.view(); * auto coord = detail::unravel_index(2, m.extents(), typename decltype(m)::layout_type{}); - * cuda::std::apply(m_v, coord) = 2; + * detail::apply(m_v, coord) = 2; * \endcode * * \param idx The linear index. * \param shape The shape of the array to use. - * \param layout Must be `layout_right` (row-major) in current version. + * \param layout Must be `layout_right` (row-major) in current implementation. * - * \return A cuda::std::tuple that represents the coordinate. + * \return A thrust::tuple that represents the coordinate. */ template MDSPAN_INLINE_FUNCTION auto unravel_index(size_t idx, @@ -341,4 +341,26 @@ MDSPAN_INLINE_FUNCTION auto unravel_index(size_t idx, return unravel_index_impl(static_cast(idx), shape); } } + +template +MDSPAN_INLINE_FUNCTION auto constexpr apply_impl(Fn&& f, Tup&& t, std::index_sequence) + -> decltype(auto) +{ + return f(thrust::get(t)...); +} + +/** + * C++ 17 style apply for thrust tuple. + * + * \param f function to apply + * \param t tuple of arguments + */ +template >::value> +MDSPAN_INLINE_FUNCTION auto constexpr apply(Fn&& f, Tup&& t) -> decltype(auto) +{ + return apply_impl( + std::forward(f), std::forward(t), std::make_index_sequence{}); +} } // namespace raft::detail diff --git a/cpp/test/mdarray.cu b/cpp/test/mdarray.cu index 6e1ad7158d..1b3ed1a0ee 100644 --- a/cpp/test/mdarray.cu +++ b/cpp/test/mdarray.cu @@ -418,7 +418,8 @@ TEST(MDArray, FuncArg) } } -TEST(MDArray, Unravel) +namespace { +void test_mdarray_unravel() { { uint32_t v{0}; @@ -449,32 +450,63 @@ TEST(MDArray, Unravel) // examples in numpy unravel_index { auto coord = detail::unravel_index(22, detail::matrix_extent{7, 6}, stdex::layout_right{}); - static_assert(cuda::std::tuple_size::value == 2); - ASSERT_EQ(cuda::std::get<0>(coord), 3); - ASSERT_EQ(cuda::std::get<1>(coord), 4); + static_assert(thrust::tuple_size::value == 2); + ASSERT_EQ(thrust::get<0>(coord), 3); + ASSERT_EQ(thrust::get<1>(coord), 4); } { auto coord = detail::unravel_index(41, detail::matrix_extent{7, 6}, stdex::layout_right{}); - static_assert(cuda::std::tuple_size::value == 2); - ASSERT_EQ(cuda::std::get<0>(coord), 6); - ASSERT_EQ(cuda::std::get<1>(coord), 5); + static_assert(thrust::tuple_size::value == 2); + ASSERT_EQ(thrust::get<0>(coord), 6); + ASSERT_EQ(thrust::get<1>(coord), 5); } { auto coord = detail::unravel_index(37, detail::matrix_extent{7, 6}, stdex::layout_right{}); - static_assert(cuda::std::tuple_size::value == 2); - ASSERT_EQ(cuda::std::get<0>(coord), 6); - ASSERT_EQ(cuda::std::get<1>(coord), 1); - + static_assert(thrust::tuple_size::value == 2); + ASSERT_EQ(thrust::get<0>(coord), 6); + ASSERT_EQ(thrust::get<1>(coord), 1); + } + // assignment + { auto m = make_host_matrix(7, 6); auto m_v = m.view(); for (size_t i = 0; i < m.size(); ++i) { auto coord = detail::unravel_index(i, m.extents(), typename decltype(m)::layout_type{}); - cuda::std::apply(m_v, coord) = i; + detail::apply(m_v, coord) = i; } for (size_t i = 0; i < m.size(); ++i) { auto coord = detail::unravel_index(i, m.extents(), typename decltype(m)::layout_type{}); - ASSERT_EQ(cuda::std::apply(m_v, coord), i); + ASSERT_EQ(detail::apply(m_v, coord), i); } } + + { + handle_t handle; + auto m = make_device_matrix(handle, 7, 6); + auto m_v = m.view(); + thrust::for_each_n(handle.get_thrust_policy(), + thrust::make_counting_iterator(0ul), + m_v.size(), + [=] __device__(size_t i) { + auto coord = detail::unravel_index( + i, m_v.extents(), typename decltype(m_v)::layout_type{}); + detail::apply(m_v, coord) = static_cast(i); + }); + thrust::device_vector status(1, 0); + auto p_status = status.data().get(); + thrust::for_each_n(handle.get_thrust_policy(), + thrust::make_counting_iterator(0ul), + m_v.size(), + [=] __device__(size_t i) { + auto coord = detail::unravel_index( + i, m_v.extents(), typename decltype(m_v)::layout_type{}); + auto v = detail::apply(m_v, coord); + if (v != static_cast(i)) { raft::myAtomicAdd(p_status, 1); } + }); + check_status(p_status, handle.get_stream()); + } } +} // anonymous namespace + +TEST(MDArray, Unravel) { test_mdarray_unravel(); } } // namespace raft