Implement unravel_index for row-major array. #631
@@ -26,6 +26,7 @@
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>
#include <thrust/device_ptr.h>
#include <thrust/tuple.h>

namespace raft::detail {
/**
@@ -238,4 +239,128 @@ namespace stdex = std::experimental;
using vector_extent = stdex::extents<dynamic_extent>;
using matrix_extent = stdex::extents<dynamic_extent, dynamic_extent>;
using scalar_extent = stdex::extents<1>;

template <typename T>
MDSPAN_INLINE_FUNCTION auto native_popc(T v) -> int32_t
{
  int c = 0;
  for (; v != 0; v &= v - 1) {
    c++;
  }
  return c;
}
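The loop in native_popc is Kernighan's bit count: each v &= v - 1 clears the lowest set bit, so it iterates once per set bit rather than once per bit position. A quick host-side sanity check (a sketch, assuming this header is included; the demo function is not part of the diff):

#include <cassert>
#include <cstdint>

void native_popc_demo()
{
  // 0b101101 has set bits at positions 0, 2, 3 and 5; the loop clears
  // them one at a time: 101101 -> 101100 -> 101000 -> 100000 -> 0.
  assert(raft::detail::native_popc(uint32_t{0b101101}) == 4);
  assert(raft::detail::native_popc(uint64_t{0}) == 0);
}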
MDSPAN_INLINE_FUNCTION auto popc(uint32_t v) -> int32_t
{
#if defined(__CUDA_ARCH__)
  return __popc(v);
#elif defined(__GNUC__) || defined(__clang__)
  return __builtin_popcount(v);
#else
  return native_popc(v);
#endif  // compiler
}

MDSPAN_INLINE_FUNCTION auto popc(uint64_t v) -> int32_t
{
#if defined(__CUDA_ARCH__)
  return __popcll(v);
#elif defined(__GNUC__) || defined(__clang__)
  return __builtin_popcountll(v);
#else
  return native_popc(v);
#endif  // compiler
}
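Both overloads use a hardware popcount where one exists (__popc/__popcll under __CUDA_ARCH__, the GCC/Clang builtins otherwise) and fall back to native_popc on other compilers. In unravel_index_impl below, popcount doubles as a cheap log2 for power-of-two extents; a minimal illustration of the identity it relies on (a sketch, not part of the diff):

#include <cassert>
#include <cstdint>

void popc_demo()
{
  // For a power of two s, s - 1 is a mask of log2(s) low bits, so
  // popc(s - 1) == log2(s) and idx >> popc(s - 1) equals idx / s.
  uint32_t s = 8;
  assert(raft::detail::popc(s - 1) == 3);
  assert((27u >> raft::detail::popc(s - 1)) == 27u / s);
}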
template <class T, std::size_t N, std::size_t... Idx>
MDSPAN_INLINE_FUNCTION constexpr auto arr_to_tup(T (&arr)[N], std::index_sequence<Idx...>)
{
  return thrust::make_tuple(arr[Idx]...);
}

template <class T, std::size_t N>
MDSPAN_INLINE_FUNCTION constexpr auto arr_to_tup(T (&arr)[N])
{
  return arr_to_tup(arr, std::make_index_sequence<N>{});
}
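arr_to_tup converts a C array into a thrust::tuple by expanding an std::index_sequence parameter pack, which is needed because the rank, and hence the tuple arity, is only known at compile time. A small usage illustration (a sketch; the demo function name is made up):

#include <cstddef>
#include <thrust/tuple.h>

void arr_to_tup_demo()
{
  std::size_t coords[3] = {4, 3, 1};
  // Expands to thrust::make_tuple(coords[0], coords[1], coords[2]).
  auto tup = raft::detail::arr_to_tup(coords);
  static_assert(thrust::tuple_size<decltype(tup)>::value == 3, "rank preserved");
}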
// uint division optimization inspired by the CIndexer in cupy. Division is slow on
// both CPU and GPU, especially on 64-bit integers, so we first try to avoid 64-bit
// arithmetic when the index is small enough, then avoid division entirely when the
// extent is a power of 2.
template <typename I, size_t... Extents>
MDSPAN_INLINE_FUNCTION auto unravel_index_impl(I idx, stdex::extents<Extents...> shape)
{
  constexpr auto kRank = static_cast<int32_t>(shape.rank());
  size_t index[shape.rank()]{0};  // NOLINT
  static_assert(std::is_signed<decltype(kRank)>::value,
                "Don't change the type without changing the for loop.");
  for (int32_t dim = kRank; --dim > 0;) {
    auto s = static_cast<std::remove_const_t<std::remove_reference_t<I>>>(shape.extent(dim));
    if (s & (s - 1)) {
      auto t     = idx / s;
      index[dim] = idx - t * s;
      idx        = t;
    } else {  // power of 2
      index[dim] = idx & (s - 1);
      idx >>= popc(s - 1);
    }
  }
  index[0] = idx;
  return arr_to_tup(index);
}
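Worked through by hand for linear index 27 in a 7x6 row-major matrix: the trailing extent 6 is not a power of two (6 & 5 != 0), so the division path runs once, t = 27 / 6 = 4 and index[1] = 27 - 4 * 6 = 3, and the leftover idx becomes the leading coordinate, giving (4, 3). Had the trailing extent been 8 instead, the power-of-two path would have produced index[1] = 27 & 7 = 3 and idx = 27 >> 3 with no division at all. A host-side check of that trace (a sketch, not part of the diff):

#include <cassert>
#include <cstdint>
#include <thrust/tuple.h>

void unravel_impl_demo()
{
  // 27 = 4 * 6 + 3 in a 7x6 row-major matrix -> coordinate (4, 3).
  auto coord =
    raft::detail::unravel_index_impl<uint32_t>(27, raft::detail::matrix_extent{7, 6});
  assert(thrust::get<0>(coord) == 4);
  assert(thrust::get<1>(coord) == 3);
}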
/**
 * \brief Turns a linear index into a coordinate. Similar to numpy's unravel_index.
 *        This is not exposed publicly as it's not part of the mdspan proposal, the
 *        returned tuple cannot be used directly for indexing into an mdspan, and we
 *        might change the return type in the future.
 *
 * \code
 *   auto m   = make_host_matrix<float>(7, 6);
 *   auto m_v = m.view();
 *   auto coord = detail::unravel_index(2, m.extents(), typename decltype(m)::layout_type{});
 *   detail::apply(m_v, coord) = 2;
 * \endcode
 *
 * \param idx The linear index.
 * \param shape The shape of the array to use.
 * \param layout Must be `layout_right` (row-major) in the current implementation.
 *
 * \return A thrust::tuple representing the coordinate.
 */
template <typename LayoutPolicy, std::size_t... Exts>
MDSPAN_INLINE_FUNCTION auto unravel_index(size_t idx,
                                          detail::stdex::extents<Exts...> shape,
                                          LayoutPolicy const&)
[Review thread on the LayoutPolicy const& parameter]
Reviewer: Why is this function arg needed? To stay compatible with NumPy's arguments? I think it's acceptable that we are able to get this information directly from the template type, and thus to remove this arg.
Author: Depends on how you like to call the function: unravel_index<stdex::layout_right>(idx, shape) or unravel_index(idx, shape, stdex::layout_right{}). I chose the second one as it feels more aligned with mdspan's design: submdspan(array, std::full_extent_t{})  // uses the constructor of full_extent_t here.
Reviewer: Hmmm, that is fair. I see value in deduced template types as compared to explicit ones.
{
  static_assert(std::is_same<LayoutPolicy, stdex::layout_right>::value,
                "Only C layout is supported.");
  if (idx > std::numeric_limits<uint32_t>::max()) {
    return unravel_index_impl<uint64_t, Exts...>(static_cast<uint64_t>(idx), shape);
  } else {
    return unravel_index_impl<uint32_t, Exts...>(static_cast<uint32_t>(idx), shape);
  }
}
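The branch on std::numeric_limits<uint32_t>::max() keeps small indices in 32-bit arithmetic, where the division loop in unravel_index_impl is markedly cheaper on GPUs; both branches return the same thrust::tuple type because the coordinate array is always size_t. A minimal call sketch (assumes this header is included; the demo function is made up):

void unravel_index_demo()
{
  auto shape = raft::detail::matrix_extent{7, 6};
  // idx = 2 fits in 32 bits, so the uint32_t specialization runs
  // internally; the result is a thrust::tuple of size_t coordinates.
  auto coord = raft::detail::unravel_index(2, shape, raft::detail::stdex::layout_right{});
  (void)coord;
}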
template <typename Fn, typename Tup, size_t... I>
MDSPAN_INLINE_FUNCTION auto constexpr apply_impl(Fn&& f, Tup&& t, std::index_sequence<I...>)
  -> decltype(auto)
{
  return f(thrust::get<I>(t)...);
}

[Review comment on apply]
Reviewer: Very much like this, and that it just works. I wager this would be very useful in accessing and assigning for COOs.

/**
 * C++17-style apply for thrust tuple.
 *
 * \param f function to apply
 * \param t tuple of arguments
 */
template <typename Fn,
          typename Tup,
          std::size_t kTupSize = thrust::tuple_size<std::remove_reference_t<Tup>>::value>
MDSPAN_INLINE_FUNCTION auto constexpr apply(Fn&& f, Tup&& t) -> decltype(auto)

[Review thread on the trailing return type]
Reviewer: The trailing return type here is redundant.
Author: It's required to denote that the return type might be a reference.
Reviewer: Actually, I'm not sure if this will work. I'm okay with the former.
{
  return apply_impl(
    std::forward<Fn>(f), std::forward<Tup>(t), std::make_index_sequence<kTupSize>{});
}
}  // namespace raft::detail
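Outside of the mdspan use case in the doc comment above, apply works with any callable and thrust tuple; a small standalone illustration (a sketch; the lambda and values are made up):

#include <cassert>
#include <thrust/tuple.h>

void apply_demo()
{
  auto t = thrust::make_tuple(2, 3);
  // apply_impl expands this to f(thrust::get<0>(t), thrust::get<1>(t)).
  auto sum = raft::detail::apply([](int a, int b) { return a + b; }, t);
  assert(sum == 5);
}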
[General discussion]
Reviewer: I think it should be okay to expose this publicly, as we are doing with flatten and reshape in #601?
Author: If others think it's ready to be public then I'm happy to expose it. For me these are the concerns: cuda::std::tuple or just std::tuple instead of thrust::tuple.
Reviewer: @cjnolet had suggested that it's better we rely less on thrust as time goes on. I find value in using cuda::std::tuple over thrust::tuple, as I have a feeling the latter is going to be replaced by the former. But maybe Corey feels differently about introducing libcu++ back as a core dependency.
Reviewer: @trivialfis I prefer to use std::tuple here. I'd like to keep both thrust and libcu++ out of public APIs. The C++ stdlib is fine.
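For reference, a hypothetical sketch of what a std::tuple-based variant of arr_to_tup could look like (the names here are made up, not part of the PR; note that std::make_tuple is not __device__-annotated, so unlike the thrust version this sketch is host-only without extra machinery such as relaxed-constexpr builds):

#include <cstddef>
#include <tuple>
#include <utility>

// Hypothetical std::tuple-returning counterpart to arr_to_tup; with this,
// std::apply could replace the custom apply/apply_impl pair on the host.
template <class T, std::size_t N, std::size_t... Idx>
constexpr auto arr_to_std_tup(T (&arr)[N], std::index_sequence<Idx...>)
{
  return std::make_tuple(arr[Idx]...);
}

template <class T, std::size_t N>
constexpr auto arr_to_std_tup(T (&arr)[N])
{
  return arr_to_std_tup(arr, std::make_index_sequence<N>{});
}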
Reviewer: The challenge with header-only here is that the dependencies for all included headers are required at compile time by consumers downstream. That includes things pulled in transitively from the detail namespace. The goal here is to draw the line and make it so that all the public APIs in the core/ directory can be safely exposed by our consumers through their own public APIs while not imposing any additional dependencies on their users outside of RMM and the CTK libs. Currently, our mdarray header in detail pulls in thrust, and I'd like to eventually separate that out so that thrust isn't a hard requirement just for including mdarray.

If we want to allow this function to be included by files in core/, I would propose we find or create some other object to contain the unraveled indices. Though, if this is truly internal code and meant to stay that way, maybe we should consider separating this function out into a different header for now, documented accordingly (for example, mentioning that it uses thrust, so it shouldn't be included by any headers in core/). Maybe something like detail/mdarray_internal_utils.hpp, just to be very explicit?

Author: I will hide this function as an internal function. Closing this PR for now and will submit it again when it's actually used.