Implement unravel_index for row-major array. #631

Closed
125 changes: 125 additions & 0 deletions cpp/include/raft/detail/mdarray.hpp
@@ -26,6 +26,7 @@
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>
#include <thrust/device_ptr.h>
#include <thrust/tuple.h>

namespace raft::detail {
/**
@@ -238,4 +239,128 @@ namespace stdex = std::experimental;
using vector_extent = stdex::extents<dynamic_extent>;
using matrix_extent = stdex::extents<dynamic_extent, dynamic_extent>;
using scalar_extent = stdex::extents<1>;

template <typename T>
MDSPAN_INLINE_FUNCTION auto native_popc(T v) -> int32_t
{
  int c = 0;
  for (; v != 0; v &= v - 1) {
    c++;
  }
  return c;
}

MDSPAN_INLINE_FUNCTION auto popc(uint32_t v) -> int32_t
{
#if defined(__CUDA_ARCH__)
  return __popc(v);
#elif defined(__GNUC__) || defined(__clang__)
  return __builtin_popcount(v);
#else
  return native_popc(v);
#endif  // compiler
}

MDSPAN_INLINE_FUNCTION auto popc(uint64_t v) -> int32_t
{
#if defined(__CUDA_ARCH__)
  return __popcll(v);
#elif defined(__GNUC__) || defined(__clang__)
  return __builtin_popcountll(v);
#else
  return native_popc(v);
#endif  // compiler
}

template <class T, std::size_t N, std::size_t... Idx>
MDSPAN_INLINE_FUNCTION constexpr auto arr_to_tup(T (&arr)[N], std::index_sequence<Idx...>)
{
  return thrust::make_tuple(arr[Idx]...);
}

template <class T, std::size_t N>
MDSPAN_INLINE_FUNCTION constexpr auto arr_to_tup(T (&arr)[N])
{
  return arr_to_tup(arr, std::make_index_sequence<N>{});
}

// uint division optimization inspired by the CIndexer in cupy. Division is slow
// on both CPU and GPU, especially for 64-bit integers, so we first fall back to
// 32-bit arithmetic when the index fits, then avoid division entirely when an
// extent is a power of 2.
template <typename I, size_t... Extents>
MDSPAN_INLINE_FUNCTION auto unravel_index_impl(I idx, stdex::extents<Extents...> shape)
{
  constexpr auto kRank = static_cast<int32_t>(shape.rank());
  size_t index[shape.rank()]{0};  // NOLINT
  static_assert(std::is_signed<decltype(kRank)>::value,
                "Don't change the type without changing the for loop.");
  for (int32_t dim = kRank; --dim > 0;) {
    auto s = static_cast<std::remove_const_t<std::remove_reference_t<I>>>(shape.extent(dim));
    if (s & (s - 1)) {
      auto t     = idx / s;
      index[dim] = idx - t * s;
      idx        = t;
    } else {  // power of 2
      index[dim] = idx & (s - 1);
      idx >>= popc(s - 1);
    }
  }
  index[0] = idx;
  return arr_to_tup(index);
}

Review thread on this doc comment:

Member: I think it should be okay to expose this to the public, as we are doing with flatten and reshape in #601?

Member (Author): If others think it's ready for the public then I'm happy to expose it. For me these are the concerns:

  • what if one day mdspan decides to accept std::extents as an index?
  • what if we use other implementations of tuple in the future, like cuda::std::tuple or just std::tuple instead of thrust::tuple?

Member: @cjnolet had suggested that it's better we rely less on thrust as time goes on. I find value in using cuda::std::tuple over thrust::tuple, as I have a feeling the latter is going to be replaced by the former. But maybe Corey feels differently about introducing libcu++ back as a core dependency.

Member: @trivialfis I prefer to use std::tuple here. I'd like to keep both thrust and libcu++ out of public APIs. The C++ stdlib is fine.

Member: @cjnolet but it doesn't always work in CUDA kernels. Also, it's hidden in the detail namespace.

Member: The challenge w/ header-only here is that the dependencies for all included headers are required at compile time by consumers downstream. That includes things pulled in transitively from the detail namespace. The goal here is to draw the line and make it so that all the public APIs in the core/ directory can be safely exposed by our consumers through their own public APIs while not imposing any additional dependencies on their users outside of RMM and the CTK libs. Currently, our mdarray header in detail pulls in thrust, and I'd like to eventually separate that out so that thrust isn't a hard requirement just for including mdarray.

If we want to allow this function to be included by files in core/, I would propose we find or create some other object to contain the unraveled indices. Though, if this is truly internal code and meant to stay that way, maybe we should consider separating this function out into a different header for now which is documented accordingly (for example, mentioning that it uses thrust, so it shouldn't be included by any headers in core/). Maybe something like detail/mdarray_internal_utils.hpp, just to be very explicit?

Member (Author): I will hide this function as an internal function. Closing this PR for now; I will submit it again when it's actually used.

/**
 * \brief Turns a linear index into a coordinate. Similar to numpy's unravel_index. This is
 * not exposed to the public API: it is not part of the mdspan proposal, the returned tuple
 * cannot be used directly for indexing into an mdspan, and we might change the return type
 * in the future.
 *
 * \code
 *   auto m   = make_host_matrix<float>(7, 6);
 *   auto m_v = m.view();
 *   auto coord = detail::unravel_index(2, m.extents(), typename decltype(m)::layout_type{});
 *   detail::apply(m_v, coord) = 2;
 * \endcode
 *
 * \param idx The linear index.
 * \param shape The shape of the array to use.
 * \param layout Must be `layout_right` (row-major) in the current implementation.
 *
 * \return A thrust::tuple that represents the coordinate.
 */
template <typename LayoutPolicy, std::size_t... Exts>
MDSPAN_INLINE_FUNCTION auto unravel_index(size_t idx,
                                          detail::stdex::extents<Exts...> shape,
                                          LayoutPolicy const&)
{
  static_assert(std::is_same<LayoutPolicy, stdex::layout_right>::value,
                "Only C layout is supported.");
  if (idx > std::numeric_limits<uint32_t>::max()) {
    return unravel_index_impl<uint64_t, Exts...>(static_cast<uint64_t>(idx), shape);
  } else {
    return unravel_index_impl<uint32_t, Exts...>(static_cast<uint32_t>(idx), shape);
  }
}

Review thread on the layout parameter:

Member: Why is this function arg needed? To stay compatible with NumPy args? I think it's acceptable that we are able to get this information directly from the template type, and thus to remove this arg.

Member (Author) (@trivialfis, Apr 28, 2022): Depends on how you like to call the function:

    unravel_index<stdex::layout_right>(idx, shape)

or

    unravel_index(idx, shape, stdex::layout_right{})

I chose the second one as it feels more aligned with the mdspan design:

    submdspan(array, std::full_extent_t{})  // uses the constructor of full_extent_t here

Member: Hmmm, that is fair. I see value in deduced template types as compared to explicit ones.

template <typename Fn, typename Tup, size_t... I>
MDSPAN_INLINE_FUNCTION auto constexpr apply_impl(Fn&& f, Tup&& t, std::index_sequence<I...>)
  -> decltype(auto)
{
  return f(thrust::get<I>(t)...);
}

Review thread on this helper:

Member: Very much like this, and that it just works. I wager this would be very useful in accessing and assigning for COOs.

/**
 * C++17-style apply for a thrust tuple.
 *
 * \param f function to apply
 * \param t tuple of arguments
 */
template <typename Fn,
          typename Tup,
          std::size_t kTupSize = thrust::tuple_size<std::remove_reference_t<Tup>>::value>
MDSPAN_INLINE_FUNCTION auto constexpr apply(Fn&& f, Tup&& t) -> decltype(auto)
{
  return apply_impl(
    std::forward<Fn>(f), std::forward<Tup>(t), std::make_index_sequence<kTupSize>{});
}

Review thread on the return type:

Member: The trailing return type here is redundant.

Member (Author): It's required to denote that the return type might be a reference.

Member: auto&& for universal references?

Member: Actually, I'm not sure if this will work. I'm okay with the former.
} // namespace raft::detail
94 changes: 93 additions & 1 deletion cpp/test/mdarray.cu
@@ -15,9 +15,9 @@
*/
#include <experimental/mdspan>
#include <gtest/gtest.h>
#include <raft/core/mdarray.hpp>
#include <raft/cuda_utils.cuh>
#include <raft/cudart_utils.h>
#include <raft/mdarray.hpp>
#include <rmm/cuda_stream.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/device_vector.hpp>
@@ -417,4 +417,96 @@ TEST(MDArray, FuncArg)
// check_matrix_layout(slice);
}
}

namespace {
void test_mdarray_unravel()
{
  {
    uint32_t v{0};
    ASSERT_EQ(detail::native_popc(v), 0);
    ASSERT_EQ(detail::popc(v), 0);
    v = 1;
    ASSERT_EQ(detail::native_popc(v), 1);
    ASSERT_EQ(detail::popc(v), 1);
    v = 0xffffffff;
    ASSERT_EQ(detail::native_popc(v), 32);
    ASSERT_EQ(detail::popc(v), 32);
  }
  {
    uint64_t v{0};
    ASSERT_EQ(detail::native_popc(v), 0);
    ASSERT_EQ(detail::popc(v), 0);
    v = 1;
    ASSERT_EQ(detail::native_popc(v), 1);
    ASSERT_EQ(detail::popc(v), 1);
    v = 0xffffffff;
    ASSERT_EQ(detail::native_popc(v), 32);
    ASSERT_EQ(detail::popc(v), 32);
    v = 0xffffffffffffffff;
    ASSERT_EQ(detail::native_popc(v), 64);
    ASSERT_EQ(detail::popc(v), 64);
  }

  // examples in numpy unravel_index
  {
    auto coord = detail::unravel_index(22, detail::matrix_extent{7, 6}, stdex::layout_right{});
    static_assert(thrust::tuple_size<decltype(coord)>::value == 2);
    ASSERT_EQ(thrust::get<0>(coord), 3);
    ASSERT_EQ(thrust::get<1>(coord), 4);
  }
  {
    auto coord = detail::unravel_index(41, detail::matrix_extent{7, 6}, stdex::layout_right{});
    static_assert(thrust::tuple_size<decltype(coord)>::value == 2);
    ASSERT_EQ(thrust::get<0>(coord), 6);
    ASSERT_EQ(thrust::get<1>(coord), 5);
  }
  {
    auto coord = detail::unravel_index(37, detail::matrix_extent{7, 6}, stdex::layout_right{});
    static_assert(thrust::tuple_size<decltype(coord)>::value == 2);
    ASSERT_EQ(thrust::get<0>(coord), 6);
    ASSERT_EQ(thrust::get<1>(coord), 1);
  }
  // assignment
  {
    auto m   = make_host_matrix<float>(7, 6);
    auto m_v = m.view();
    for (size_t i = 0; i < m.size(); ++i) {
      auto coord = detail::unravel_index(i, m.extents(), typename decltype(m)::layout_type{});
      detail::apply(m_v, coord) = i;
    }
    for (size_t i = 0; i < m.size(); ++i) {
      auto coord = detail::unravel_index(i, m.extents(), typename decltype(m)::layout_type{});
      ASSERT_EQ(detail::apply(m_v, coord), i);
    }
  }

  {
    handle_t handle;
    auto m   = make_device_matrix<float>(handle, 7, 6);
    auto m_v = m.view();
    thrust::for_each_n(handle.get_thrust_policy(),
                       thrust::make_counting_iterator(0ul),
                       m_v.size(),
                       [=] __device__(size_t i) {
                         auto coord = detail::unravel_index(
                           i, m_v.extents(), typename decltype(m_v)::layout_type{});
                         detail::apply(m_v, coord) = static_cast<float>(i);
                       });
    thrust::device_vector<int32_t> status(1, 0);
    auto p_status = status.data().get();
    thrust::for_each_n(handle.get_thrust_policy(),
                       thrust::make_counting_iterator(0ul),
                       m_v.size(),
                       [=] __device__(size_t i) {
                         auto coord = detail::unravel_index(
                           i, m_v.extents(), typename decltype(m_v)::layout_type{});
                         auto v = detail::apply(m_v, coord);
                         if (v != static_cast<float>(i)) { raft::myAtomicAdd(p_status, 1); }
                       });
    check_status(p_status, handle.get_stream());
  }
}
}  // anonymous namespace

TEST(MDArray, Unravel) { test_mdarray_unravel(); }
} // namespace raft