From 4af551cc94998d01a3c37c867a363c77d678d4c2 Mon Sep 17 00:00:00 2001
From: fis <jm.yuan@outlook.com>
Date: Wed, 27 Apr 2022 18:04:09 +0800
Subject: [PATCH 1/5] Implement unravel index for row-major array.

---
 cpp/include/raft/core/mdarray.hpp   |   2 +-
 cpp/include/raft/detail/mdarray.hpp | 102 ++++++++++++++++++++++++++++
 cpp/test/mdarray.cu                 |  62 ++++++++++++++++-
 3 files changed, 164 insertions(+), 2 deletions(-)
diff --git a/cpp/include/raft/core/mdarray.hpp b/cpp/include/raft/core/mdarray.hpp
index 595c0161cd..0354068c54 100644
--- a/cpp/include/raft/core/mdarray.hpp
+++ b/cpp/include/raft/core/mdarray.hpp
@@ -647,4 +647,4 @@ auto make_device_vector(raft::handle_t const& handle, size_t n)
 {
   return make_device_vector<ElementType>(n, handle.get_stream());
 }
-}  // namespace raft
\ No newline at end of file
+}  // namespace raft
diff --git a/cpp/include/raft/detail/mdarray.hpp b/cpp/include/raft/detail/mdarray.hpp
index 624c7a4d07..33f98355c0 100644
--- a/cpp/include/raft/detail/mdarray.hpp
+++ b/cpp/include/raft/detail/mdarray.hpp
@@ -21,6 +21,7 @@
  * limitations under the License.
  */
 #pragma once
+#include <cuda/std/tuple>
 #include <experimental/mdspan>
 #include <raft/detail/span.hpp>  // dynamic_extent
 #include <rmm/cuda_stream_view.hpp>
@@ -238,4 +239,105 @@ namespace stdex = std::experimental;
 using vector_extent = stdex::extents<dynamic_extent>;
 using matrix_extent = stdex::extents<dynamic_extent, dynamic_extent>;
 using scalar_extent = stdex::extents<1>;
+
+template <typename T>
+auto native_popc(T v) -> int32_t
+{
+  int c = 0;
+  for (; v != 0; v &= v - 1)
+    c++;
+  return c;
+}
+
+inline __host__ __device__ auto popc(uint32_t v) -> int32_t
+{
+#if defined(__CUDA_ARCH__)
+  return __popc(v);
+#elif defined(__GNUC__) || defined(__clang__)
+  return __builtin_popcount(v);
+#else
+  return native_popc(v);
+#endif  // compiler
+}
+
+inline __host__ __device__ auto popc(uint64_t v) -> int32_t
+{
+#if defined(__CUDA_ARCH__)
+  return __popcll(v);
+#elif defined(__GNUC__) || defined(__clang__)
+  return __builtin_popcountll(v);
+#else
+  return native_popc(v);
+#endif  // compiler
+}
+
+template <class T, std::size_t N, std::size_t... Idx>
+MDSPAN_INLINE_FUNCTION constexpr auto arr_to_tup(T (&arr)[N], std::index_sequence<Idx...>)
+{
+  return cuda::std::make_tuple(arr[Idx]...);
+}
+
+template <class T, std::size_t N>
+MDSPAN_INLINE_FUNCTION constexpr auto arr_to_tup(T (&arr)[N])
+{
+  return arr_to_tup(arr, std::make_index_sequence<N>{});
+}
+
+// uint division optimization inspired by the CIndexer in cupy.  Division operation is
+// slow on both CPU and GPU, especially 64 bit integer.  So here we first try to avoid 64
+// bit when the index is smaller, then try to avoid division when it's exp of 2.
+template <typename I, size_t... Extents>
+MDSPAN_INLINE_FUNCTION auto unravel_index_impl(I idx, stdex::extents<Extents...> shape)
+{
+  constexpr auto kRank = static_cast<int32_t>(shape.rank());
+  size_t index[shape.rank()]{0};  // NOLINT
+  static_assert(std::is_signed<decltype(kRank)>::value,
+                "Don't change the type without changing the for loop.");
+  for (int32_t dim = kRank; --dim > 0;) {
+    auto s = static_cast<std::remove_const_t<std::remove_reference_t<I>>>(shape.extent(dim));
+    if (s & (s - 1)) {
+      auto t     = idx / s;
+      index[dim] = idx - t * s;
+      idx        = t;
+    } else {  // exp of 2
+      index[dim] = idx & (s - 1);
+      idx >>= popc(s - 1);
+    }
+  }
+  index[0] = idx;
+  return arr_to_tup(index);
+}
+
+/**
+ * \brief Turns linear index into coordinate.  Similar to numpy unravel_index. This is not
+ *        exposed to public as it's not part of the mdspan proposal, the returned tuple
+ *        can not be directly used for indexing into mdspan and we might change the return
+ *        type in the future.
+ *
+ * \code
+ *   auto m = make_host_matrix<float>(7, 6);
+ *   auto m_v = m.view();
+ *   auto coord = detail::unravel_index(2, m.extents(), typename decltype(m)::layout_type{});
+ *   cuda::std::apply(m_v, coord) = 2;
+ * \endcode
+ *
+ * \param idx    The linear index.
+ * \param shape  The shape of the array to use.
+ * \param layout Must be `layout_right` (row-major) in current version.
+ *
+ * \return A cuda::std::tuple that represents the coordinate.
+ */
+template <typename LayoutPolicy, std::size_t... Exts>
+MDSPAN_INLINE_FUNCTION auto unravel_index(size_t idx,
+                                          detail::stdex::extents<Exts...> shape,
+                                          LayoutPolicy const&)
+{
+  static_assert(std::is_same<LayoutPolicy, stdex::layout_right>::value,
+                "Only C layout is supported.");
+  if (idx > std::numeric_limits<uint32_t>::max()) {
+    return unravel_index_impl<uint64_t, Exts...>(static_cast<uint64_t>(idx), shape);
+  } else {
+    return unravel_index_impl<uint32_t, Exts...>(static_cast<uint32_t>(idx), shape);
+  }
+}
 }  // namespace raft::detail
diff --git a/cpp/test/mdarray.cu b/cpp/test/mdarray.cu
index 961a703a8b..62ac5edb24 100644
--- a/cpp/test/mdarray.cu
+++ b/cpp/test/mdarray.cu
@@ -15,9 +15,9 @@
  */
 #include <experimental/mdspan>
 #include <gtest/gtest.h>
+#include <raft/core/mdarray.hpp>
 #include <raft/cuda_utils.cuh>
 #include <raft/cudart_utils.h>
-#include <raft/mdarray.hpp>
 #include <rmm/cuda_stream.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/device_vector.hpp>
@@ -417,4 +417,64 @@ TEST(MDArray, FuncArg)
     // check_matrix_layout(slice);
   }
 }
+
+TEST(MDArray, Unravel)
+{
+  {
+    uint32_t v{0};
+    ASSERT_EQ(detail::native_popc(v), 0);
+    ASSERT_EQ(detail::popc(v), 0);
+    v = 1;
+    ASSERT_EQ(detail::native_popc(v), 1);
+    ASSERT_EQ(detail::popc(v), 1);
+    v = 0xffffffff;
+    ASSERT_EQ(detail::native_popc(v), 32);
+    ASSERT_EQ(detail::popc(v), 32);
+  }
+  {
+    uint64_t v{0};
+    ASSERT_EQ(detail::native_popc(v), 0);
+    ASSERT_EQ(detail::popc(v), 0);
+    v = 1;
+    ASSERT_EQ(detail::native_popc(v), 1);
+    ASSERT_EQ(detail::popc(v), 1);
+    v = 0xffffffff;
+    ASSERT_EQ(detail::native_popc(v), 32);
+    ASSERT_EQ(detail::popc(v), 32);
+    v = 0xffffffffffffffff;
+    ASSERT_EQ(detail::native_popc(v), 64);
+    ASSERT_EQ(detail::popc(v), 64);
+  }
+
+  // examples in numpy unravel_index
+  {
+    auto coord = detail::unravel_index(22, detail::matrix_extent{7, 6}, stdex::layout_right{});
+    static_assert(cuda::std::tuple_size<decltype(coord)>::value == 2);
+    ASSERT_EQ(cuda::std::get<0>(coord), 3);
+    ASSERT_EQ(cuda::std::get<1>(coord), 4);
+  }
+  {
+    auto coord = detail::unravel_index(41, detail::matrix_extent{7, 6}, stdex::layout_right{});
+    static_assert(cuda::std::tuple_size<decltype(coord)>::value == 2);
+    ASSERT_EQ(cuda::std::get<0>(coord), 6);
+    ASSERT_EQ(cuda::std::get<1>(coord), 5);
+  }
+  {
+    auto coord = detail::unravel_index(37, detail::matrix_extent{7, 6}, stdex::layout_right{});
+    static_assert(cuda::std::tuple_size<decltype(coord)>::value == 2);
+    ASSERT_EQ(cuda::std::get<0>(coord), 6);
+    ASSERT_EQ(cuda::std::get<1>(coord), 1);
+
+    auto m = make_host_matrix<float>(7, 6);
+    auto m_v = m.view();
+    for (size_t i = 0; i < m.size(); ++i) {
+      auto coord = detail::unravel_index(i, m.extents(), typename decltype(m)::layout_type{});
+      cuda::std::apply(m_v, coord) = i;
+    }
+    for (size_t i = 0; i < m.size(); ++i) {
+      auto coord = detail::unravel_index(i, m.extents(), typename decltype(m)::layout_type{});
+      ASSERT_EQ(cuda::std::apply(m_v, coord), i);
+    }
+  }
+}
 }  // namespace raft

From 9427773e3ef653ed925dba893dbf6f9510a9b700 Mon Sep 17 00:00:00 2001
From: fis <jm.yuan@outlook.com>
Date: Wed, 27 Apr 2022 18:07:29 +0800
Subject: [PATCH 2/5] format.

---
 cpp/test/mdarray.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/test/mdarray.cu b/cpp/test/mdarray.cu
index 62ac5edb24..6e1ad7158d 100644
--- a/cpp/test/mdarray.cu
+++ b/cpp/test/mdarray.cu
@@ -465,7 +465,7 @@ TEST(MDArray, Unravel)
     ASSERT_EQ(cuda::std::get<0>(coord), 6);
     ASSERT_EQ(cuda::std::get<1>(coord), 1);
 
-    auto m = make_host_matrix<float>(7, 6);
+    auto m   = make_host_matrix<float>(7, 6);
     auto m_v = m.view();
     for (size_t i = 0; i < m.size(); ++i) {
       auto coord = detail::unravel_index(i, m.extents(), typename decltype(m)::layout_type{});

From b79b13b0a7c428ab79aeae0cf56e02dbf5f01258 Mon Sep 17 00:00:00 2001
From: fis <jm.yuan@outlook.com>
Date: Wed, 27 Apr 2022 18:09:27 +0800
Subject: [PATCH 3/5] Avoid change.

---
 cpp/include/raft/core/mdarray.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/include/raft/core/mdarray.hpp b/cpp/include/raft/core/mdarray.hpp
index 0354068c54..595c0161cd 100644
--- a/cpp/include/raft/core/mdarray.hpp
+++ b/cpp/include/raft/core/mdarray.hpp
@@ -647,4 +647,4 @@ auto make_device_vector(raft::handle_t const& handle, size_t n)
 {
   return make_device_vector<ElementType>(n, handle.get_stream());
 }
-}  // namespace raft
+}  // namespace raft
\ No newline at end of file

From 0f6eaa3b8b46b24ad198ed666d77ca8a880aca5d Mon Sep 17 00:00:00 2001
From: fis <jm.yuan@outlook.com>
Date: Wed, 27 Apr 2022 18:17:56 +0800
Subject: [PATCH 4/5] cleanup.

---
 cpp/include/raft/detail/mdarray.hpp | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/cpp/include/raft/detail/mdarray.hpp b/cpp/include/raft/detail/mdarray.hpp
index 33f98355c0..7dc96e3f51 100644
--- a/cpp/include/raft/detail/mdarray.hpp
+++ b/cpp/include/raft/detail/mdarray.hpp
@@ -241,15 +241,16 @@ using matrix_extent = stdex::extents<dynamic_extent, dynamic_extent>;
 using scalar_extent = stdex::extents<1>;
 
 template <typename T>
-auto native_popc(T v) -> int32_t
+MDSPAN_INLINE_FUNCTION auto native_popc(T v) -> int32_t
 {
   int c = 0;
-  for (; v != 0; v &= v - 1)
+  for (; v != 0; v &= v - 1) {
     c++;
+  }
   return c;
 }
 
-inline __host__ __device__ auto popc(uint32_t v) -> int32_t
+MDSPAN_INLINE_FUNCTION auto popc(uint32_t v) -> int32_t
 {
 #if defined(__CUDA_ARCH__)
   return __popc(v);
@@ -260,7 +261,7 @@ inline __host__ __device__ auto popc(uint32_t v) -> int32_t
 #endif  // compiler
 }
 
-inline __host__ __device__ auto popc(uint64_t v) -> int32_t
+MDSPAN_INLINE_FUNCTION auto popc(uint64_t v) -> int32_t
 {
 #if defined(__CUDA_ARCH__)
   return __popcll(v);

From fd6d0a74fad34c3556c47dde3a1fc1b3d1616138 Mon Sep 17 00:00:00 2001
From: fis <jm.yuan@outlook.com>
Date: Wed, 27 Apr 2022 23:16:44 +0800
Subject: [PATCH 5/5] Use thrust tuple instead.

---
 cpp/include/raft/detail/mdarray.hpp | 32 +++++++++++++---
 cpp/test/mdarray.cu                 | 58 ++++++++++++++++++++++-------
 2 files changed, 72 insertions(+), 18 deletions(-)

diff --git a/cpp/include/raft/detail/mdarray.hpp b/cpp/include/raft/detail/mdarray.hpp
index 7dc96e3f51..1797e5469f 100644
--- a/cpp/include/raft/detail/mdarray.hpp
+++ b/cpp/include/raft/detail/mdarray.hpp
@@ -21,12 +21,12 @@
  * limitations under the License.
  */
 #pragma once
-#include <cuda/std/tuple>
 #include <experimental/mdspan>
 #include <raft/detail/span.hpp>  // dynamic_extent
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 #include <thrust/device_ptr.h>
+#include <thrust/tuple.h>
 
 namespace raft::detail {
 /**
@@ -275,7 +275,7 @@ MDSPAN_INLINE_FUNCTION auto popc(uint64_t v) -> int32_t
 template <class T, std::size_t N, std::size_t... Idx>
 MDSPAN_INLINE_FUNCTION constexpr auto arr_to_tup(T (&arr)[N], std::index_sequence<Idx...>)
 {
-  return cuda::std::make_tuple(arr[Idx]...);
+  return thrust::make_tuple(arr[Idx]...);
 }
 
 template <class T, std::size_t N>
@@ -319,14 +319,14 @@ MDSPAN_INLINE_FUNCTION auto unravel_index_impl(I idx, stdex::extents<Extents...>
  *   auto m = make_host_matrix<float>(7, 6);
  *   auto m_v = m.view();
  *   auto coord = detail::unravel_index(2, m.extents(), typename decltype(m)::layout_type{});
- *   cuda::std::apply(m_v, coord) = 2;
+ *   detail::apply(m_v, coord) = 2;
  * \endcode
  *
  * \param idx    The linear index.
  * \param shape  The shape of the array to use.
- * \param layout Must be `layout_right` (row-major) in current version.
+ * \param layout Must be `layout_right` (row-major) in current implementation.
  *
- * \return A cuda::std::tuple that represents the coordinate.
+ * \return A thrust::tuple that represents the coordinate.
  */
 template <typename LayoutPolicy, std::size_t... Exts>
 MDSPAN_INLINE_FUNCTION auto unravel_index(size_t idx,
@@ -341,4 +341,26 @@ MDSPAN_INLINE_FUNCTION auto unravel_index(size_t idx,
     return unravel_index_impl<uint32_t, Exts...>(static_cast<uint32_t>(idx), shape);
   }
 }
+
+template <typename Fn, typename Tup, size_t... I>
+MDSPAN_INLINE_FUNCTION auto constexpr apply_impl(Fn&& f, Tup&& t, std::index_sequence<I...>)
+  -> decltype(auto)
+{
+  return f(thrust::get<I>(t)...);
+}
+
+/**
+ * C++ 17 style apply for thrust tuple.
+ *
+ * \param f function to apply
+ * \param t tuple of arguments
+ */
+template <typename Fn,
+          typename Tup,
+          std::size_t kTupSize = thrust::tuple_size<std::remove_reference_t<Tup>>::value>
+MDSPAN_INLINE_FUNCTION auto constexpr apply(Fn&& f, Tup&& t) -> decltype(auto)
+{
+  return apply_impl(
+    std::forward<Fn>(f), std::forward<Tup>(t), std::make_index_sequence<kTupSize>{});
+}
 }  // namespace raft::detail
diff --git a/cpp/test/mdarray.cu b/cpp/test/mdarray.cu
index 6e1ad7158d..1b3ed1a0ee 100644
--- a/cpp/test/mdarray.cu
+++ b/cpp/test/mdarray.cu
@@ -418,7 +418,8 @@ TEST(MDArray, FuncArg)
   }
 }
 
-TEST(MDArray, Unravel)
+namespace {
+void test_mdarray_unravel()
 {
   {
     uint32_t v{0};
@@ -449,32 +450,63 @@ TEST(MDArray, Unravel)
   // examples in numpy unravel_index
   {
     auto coord = detail::unravel_index(22, detail::matrix_extent{7, 6}, stdex::layout_right{});
-    static_assert(cuda::std::tuple_size<decltype(coord)>::value == 2);
-    ASSERT_EQ(cuda::std::get<0>(coord), 3);
-    ASSERT_EQ(cuda::std::get<1>(coord), 4);
+    static_assert(thrust::tuple_size<decltype(coord)>::value == 2);
+    ASSERT_EQ(thrust::get<0>(coord), 3);
+    ASSERT_EQ(thrust::get<1>(coord), 4);
   }
   {
     auto coord = detail::unravel_index(41, detail::matrix_extent{7, 6}, stdex::layout_right{});
-    static_assert(cuda::std::tuple_size<decltype(coord)>::value == 2);
-    ASSERT_EQ(cuda::std::get<0>(coord), 6);
-    ASSERT_EQ(cuda::std::get<1>(coord), 5);
+    static_assert(thrust::tuple_size<decltype(coord)>::value == 2);
+    ASSERT_EQ(thrust::get<0>(coord), 6);
+    ASSERT_EQ(thrust::get<1>(coord), 5);
   }
   {
     auto coord = detail::unravel_index(37, detail::matrix_extent{7, 6}, stdex::layout_right{});
-    static_assert(cuda::std::tuple_size<decltype(coord)>::value == 2);
-    ASSERT_EQ(cuda::std::get<0>(coord), 6);
-    ASSERT_EQ(cuda::std::get<1>(coord), 1);
-
+    static_assert(thrust::tuple_size<decltype(coord)>::value == 2);
+    ASSERT_EQ(thrust::get<0>(coord), 6);
+    ASSERT_EQ(thrust::get<1>(coord), 1);
+  }
+  // assignment
+  {
     auto m   = make_host_matrix<float>(7, 6);
     auto m_v = m.view();
     for (size_t i = 0; i < m.size(); ++i) {
       auto coord = detail::unravel_index(i, m.extents(), typename decltype(m)::layout_type{});
-      cuda::std::apply(m_v, coord) = i;
+      detail::apply(m_v, coord) = i;
     }
     for (size_t i = 0; i < m.size(); ++i) {
       auto coord = detail::unravel_index(i, m.extents(), typename decltype(m)::layout_type{});
-      ASSERT_EQ(cuda::std::apply(m_v, coord), i);
+      ASSERT_EQ(detail::apply(m_v, coord), i);
     }
   }
+
+  {
+    handle_t handle;
+    auto m   = make_device_matrix<float>(handle, 7, 6);
+    auto m_v = m.view();
+    thrust::for_each_n(handle.get_thrust_policy(),
+                       thrust::make_counting_iterator(0ul),
+                       m_v.size(),
+                       [=] __device__(size_t i) {
+                         auto coord = detail::unravel_index(
+                           i, m_v.extents(), typename decltype(m_v)::layout_type{});
+                         detail::apply(m_v, coord) = static_cast<float>(i);
+                       });
+    thrust::device_vector<int32_t> status(1, 0);
+    auto p_status = status.data().get();
+    thrust::for_each_n(handle.get_thrust_policy(),
+                       thrust::make_counting_iterator(0ul),
+                       m_v.size(),
+                       [=] __device__(size_t i) {
+                         auto coord = detail::unravel_index(
+                           i, m_v.extents(), typename decltype(m_v)::layout_type{});
+                         auto v = detail::apply(m_v, coord);
+                         if (v != static_cast<float>(i)) { raft::myAtomicAdd(p_status, 1); }
+                       });
+    check_status(p_status, handle.get_stream());
+  }
 }
+}  // anonymous namespace
+
+TEST(MDArray, Unravel) { test_mdarray_unravel(); }
 }  // namespace raft