Skip to content

Commit

Permalink
Optionally nullify out-of-bounds indices in segmented_gather(). (#9318)
Browse files Browse the repository at this point in the history
The behaviour of `cudf::lists::segmented_gather()` is currently undefined for any
index value `i` that falls outside the range `[-n, n)`, where `n` is the number of
elements in the list row.

This commit adds support to explicitly specify an `out_of_bounds_policy`, like in
`cudf::gather()`. The erstwhile behaviour is retained when the bounds policy is set
to `DONT_CHECK`. If the bounds policy is specified as `NULLIFY`, then for any
index falling outside the range `[-n, n)`, the list element is set to `null`.

E.g.
```c++
auto source_column = [{"a", "b", "c", "d"}, {"1", "2", "3", "4"}, {"x", "y", "z"}];
auto gather_map    = [{0, -1, 4, -5}, {1, 3, 5}, {}];
auto result = segmented_gather(source_column, gather_map, NULLIFY);
result == [{"a", "d", null, null}, {"2", "4", null}, {}];
```

Authors:
  - MithunR (https://github.com/mythrocks)

Approvers:
  - Conor Hoekstra (https://github.com/codereport)
  - Karthikeyan (https://github.com/karthikeyann)
  - Nghia Truong (https://github.com/ttnghia)

URL: #9318
  • Loading branch information
mythrocks authored Sep 30, 2021
1 parent ef50796 commit 5cea6b5
Show file tree
Hide file tree
Showing 4 changed files with 331 additions and 193 deletions.
2 changes: 2 additions & 0 deletions cpp/include/cudf/lists/detail/gather.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -283,13 +283,15 @@ std::unique_ptr<column> gather_list_leaf(
/**
* @copydoc cudf::lists::segmented_gather(lists_column_view const& source_column,
* lists_column_view const& gather_map_list,
* out_of_bounds_policy bounds_policy,
* rmm::mr::device_memory_resource* mr)
*
* @param stream CUDA stream on which to execute kernels
*/
std::unique_ptr<column> segmented_gather(
lists_column_view const& source_column,
lists_column_view const& gather_map_list,
out_of_bounds_policy bounds_policy = out_of_bounds_policy::DONT_CHECK,
rmm::cuda_stream_view stream = rmm::cuda_stream_default,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

Expand Down
19 changes: 17 additions & 2 deletions cpp/include/cudf/lists/gather.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

#include <cudf/column/column.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/copying.hpp>
#include <cudf/lists/lists_column_view.hpp>

namespace cudf {
Expand All @@ -32,7 +33,7 @@ namespace lists {
*
* `source_column` may have any depth; `gather_map_list` must have a depth of 1.
*
* * @code{.pseudo}
* @code{.pseudo}
* source_column : [{"a", "b", "c", "d"}, {"1", "2", "3", "4"}, {"x", "y", "z"}]
* gather_map_list : [{0, 1, 3, 2}, {1, 3, 2}, {}]
*
Expand All @@ -44,18 +45,32 @@ namespace lists {
* @throws cudf::logic_error if `gather_map_list` is not a list column of an index type.
*
* If indices in `gather_map_list` are outside the range `[-n, n)`, where `n` is the number of
* elements in corresponding row of the source column, the behavior is undefined.
* elements in the corresponding row of the source column, the behaviour is as follows:
* 1. If `bounds_policy` is set to `DONT_CHECK`, the behaviour is undefined.
* 2. If `bounds_policy` is set to `NULLIFY`, the corresponding element in the list row
* is set to null in the output column.
*
* @code{.pseudo}
* source_column : [{"a", "b", "c", "d"}, {"1", "2", "3", "4"}, {"x", "y", "z"}]
* gather_map_list : [{0, -1, 4, -5}, {1, 3, 5}, {}]
*
* result_with_nullify : [{"a", "d", null, null}, {"2", "4", null}, {}]
* @endcode
*
* @param source_column View into the list column to gather from
* @param gather_map_list View into a non-nullable list column of integral indices that, for each
* row, selects which elements of the source list row appear in the corresponding output list row.
* @param bounds_policy Can be `DONT_CHECK` or `NULLIFY`. Selects whether or not to nullify the
* output list row's element when the gather index falls outside the range `[-n, n)`,
* where `n` is the number of elements in the source list row corresponding to the gather-map row.
* @param mr Device memory resource to allocate any returned objects
* @return Lists column in which each row holds the elements gathered from the corresponding row
* of `source_column`, as selected by `gather_map_list`
*
*/
std::unique_ptr<column> segmented_gather(
lists_column_view const& source_column,
lists_column_view const& gather_map_list,
out_of_bounds_policy bounds_policy = out_of_bounds_policy::DONT_CHECK,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/** @} */ // end of group
Expand Down
43 changes: 29 additions & 14 deletions cpp/src/lists/copying/segmented_gather.cu
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#include <cudf/lists/detail/gather.cuh>

#include <thrust/binary_search.h>

#include <rmm/cuda_stream_view.hpp>

namespace cudf {
Expand All @@ -29,6 +30,7 @@ namespace detail {

std::unique_ptr<column> segmented_gather(lists_column_view const& value_column,
lists_column_view const& gather_map,
out_of_bounds_policy bounds_policy,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
Expand All @@ -38,35 +40,46 @@ std::unique_ptr<column> segmented_gather(lists_column_view const& value_column,
CUDF_EXPECTS(value_column.size() == gather_map.size(),
"Gather map and list column should be same size");

auto gather_map_sliced_child = gather_map.get_sliced_child(stream);
auto const gather_map_size = gather_map_sliced_child.size();
auto gather_index_begin = gather_map.offsets().begin<size_type>() + 1 + gather_map.offset();
auto gather_index_end = gather_index_begin + gather_map.size();
auto value_offsets = value_column.offsets().begin<size_type>() + value_column.offset();
auto map_begin = cudf::detail::indexalator_factory::make_input_iterator(gather_map_sliced_child);
auto const gather_map_sliced_child = gather_map.get_sliced_child(stream);
auto const gather_map_size = gather_map_sliced_child.size();
auto const gather_index_begin = gather_map.offsets_begin() + 1;
auto const gather_index_end = gather_map.offsets_end();
auto const value_offsets = value_column.offsets_begin();
auto const map_begin =
cudf::detail::indexalator_factory::make_input_iterator(gather_map_sliced_child);
auto const out_of_bounds = [] __device__(auto const index, auto const list_size) {
return index >= list_size || (index < 0 && -index > list_size);
};

// Calculate flattened gather indices (value_offsets[row] + sub_index)
auto transformer = [value_offsets, map_begin, gather_index_begin, gather_index_end] __device__(
size_type index) -> size_type {
auto transformer = [value_offsets,
map_begin,
gather_index_begin,
gather_index_end,
bounds_policy,
out_of_bounds] __device__(size_type index) -> size_type {
// Get each row's offset. (Each row is a list).
auto offset_idx =
thrust::upper_bound(
thrust::seq, gather_index_begin, gather_index_end, gather_index_begin[-1] + index) -
gather_index_begin;
// Get each sub_index in list in each row of gather_map.
auto sub_index = map_begin[index];
auto list_size = value_offsets[offset_idx + 1] - value_offsets[offset_idx];
auto wrapped_sub_index = (sub_index % list_size + list_size) % list_size;
auto sub_index = map_begin[index];
auto list_size = value_offsets[offset_idx + 1] - value_offsets[offset_idx];
auto wrapped_sub_index = sub_index < 0 ? sub_index + list_size : sub_index;
auto constexpr null_idx = cuda::std::numeric_limits<cudf::size_type>::max();
// Add sub_index to value_column offsets, to get gather indices of child of value_column
return value_offsets[offset_idx] + wrapped_sub_index - value_offsets[0];
return (bounds_policy == out_of_bounds_policy::NULLIFY && out_of_bounds(sub_index, list_size))
? null_idx
: value_offsets[offset_idx] + wrapped_sub_index - value_offsets[0];
};
auto child_gather_index_begin = cudf::detail::make_counting_transform_iterator(0, transformer);

// Call gather on child of value_column
auto child_table = cudf::detail::gather(table_view({value_column.get_sliced_child(stream)}),
child_gather_index_begin,
child_gather_index_begin + gather_map_size,
out_of_bounds_policy::DONT_CHECK,
bounds_policy,
stream,
mr);
auto child = std::move(child_table->release().front());
Expand Down Expand Up @@ -94,9 +107,11 @@ std::unique_ptr<column> segmented_gather(lists_column_view const& value_column,

std::unique_ptr<column> segmented_gather(lists_column_view const& source_column,
lists_column_view const& gather_map_list,
out_of_bounds_policy bounds_policy,
rmm::mr::device_memory_resource* mr)
{
return detail::segmented_gather(source_column, gather_map_list, rmm::cuda_stream_default, mr);
return detail::segmented_gather(
source_column, gather_map_list, bounds_policy, rmm::cuda_stream_default, mr);
}

} // namespace lists
Expand Down
Loading

0 comments on commit 5cea6b5

Please sign in to comment.