Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add cudf::explode_position #7376

Merged
merged 9 commits into from
Feb 26, 2021
43 changes: 43 additions & 0 deletions cpp/include/cudf/reshape.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,49 @@ std::unique_ptr<table> explode(
size_type explode_column_idx,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Explodes a list column's elements and includes a position column.
*
* Any list is exploded, which means the elements of the list in each row are expanded into new rows
* in the output. The corresponding rows for other columns in the input are duplicated. A position
*column is added that has the index inside the original list for each row. Example:
hyperbolic2346 marked this conversation as resolved.
Show resolved Hide resolved
* ```
* [[5,10,15], 100],
* [[20,25], 200],
* [[30], 300],
* returns
* [5, 0, 100],
* [10, 1, 100],
* [15, 2, 100],
* [20, 0, 200],
* [25, 1, 200],
* [30, 0, 300],
* ```
*
* Nulls and empty lists propagate in different ways depending on what is null or empty.
*```
* [[5,null,15], 100],
* [null, 200],
* [[], 300],
* returns
* [5, 0, 100],
* [null, 1, 100],
* [15, 2, 100],
* ```
* Note that null lists are completely removed from the output
* and nulls and empty lists inside lists are pulled out and remain.
hyperbolic2346 marked this conversation as resolved.
Show resolved Hide resolved
*
* @param input_table Table to explode.
* @param explode_column_idx Column index to explode inside the table.
* @param mr Device memory resource used to allocate the returned column's device memory.
*
* @return A new table with explode_col exploded.
*/
std::unique_ptr<table> pos_explode(
hyperbolic2346 marked this conversation as resolved.
Show resolved Hide resolved
table_view const& input_table,
size_type explode_column_idx,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/** @} */ // end of group

} // namespace cudf
99 changes: 65 additions & 34 deletions cpp/src/reshape/explode.cu
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
*/

#include <cudf/column/column_device_view.cuh>
#include <cudf/detail/gather.hpp>
#include <cudf/detail/gather.cuh>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/lists/lists_column_view.hpp>
#include <cudf/reshape.hpp>
Expand All @@ -40,9 +40,13 @@ namespace {
* @brief Function object for exploding a column.
*/
struct explode_functor {
/**
* @brief Function object for exploding a column.
*/
template <typename T>
std::unique_ptr<table> operator()(table_view const& input_table,
size_type explode_column_idx,
int const explode_column_idx,
bool include_pos,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr) const
{
Expand All @@ -55,58 +59,72 @@ struct explode_functor {
template <>
std::unique_ptr<table> explode_functor::operator()<list_view>(
table_view const& input_table,
size_type explode_column_idx,
int const explode_column_idx,
hyperbolic2346 marked this conversation as resolved.
Show resolved Hide resolved
bool include_pos,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr) const
{
/* we explode by building a gather map that includes the number of entries in each list inside
the column for each index. Interestingly, this can be done with lower_bound across the offsets
as values between the offsets will all map down to the index below. We have some off-by-one
manipulations we need to do with the output, but it's almost our gather map by itself. Once we
build the gather map we need to remove the explode column from the table and run gather on it.
Next we build the explode column, which turns out is simply lifting the child column out of the
explode column. This unrolls the top level of lists. Then we need to insert the explode column
back into the table and return it. */
lists_column_view lc{input_table.column(explode_column_idx)};
auto sliced_child = lc.get_sliced_child(stream);
rmm::device_uvector<size_type> gather_map_indices(sliced_child.size(), stream, mr);
rmm::device_uvector<size_type> gather_map_indices(sliced_child.size(), stream);

// sliced columns can make this a little tricky. We have to start iterating at the start of the
// offsets for this column, which could be > 0. Then we also have to handle rebasing the offsets
// as we go.
auto offsets = lc.offsets().begin<size_type>() + lc.offset();
// Sliced columns may require rebasing of the offsets.
auto offsets = lc.offsets().begin<int32_t>() + lc.offset();
hyperbolic2346 marked this conversation as resolved.
Show resolved Hide resolved
auto offsets_minus_one = thrust::make_transform_iterator(
offsets, [offsets] __device__(auto i) { return (i - offsets[0]) - 1; });
auto counting_iter = thrust::make_counting_iterator(0);

rmm::device_uvector<size_type> pos(include_pos ? sliced_child.size() : 0, stream, mr);

// This looks like an off-by-one bug, but what is going on here is that we need to reduce each
// result from `lower_bound` by 1 to build the correct gather map. It was pointed out that
// this can be accomplished by simply skipping the first entry and using the result of
// `lower_bound` directly.
thrust::lower_bound(rmm::exec_policy(stream),
offsets_minus_one + 1,
offsets_minus_one + lc.size() + 1,
counting_iter,
counting_iter + gather_map_indices.size(),
gather_map_indices.begin());
// result from `lower_bound` by 1 to build the correct gather map. This can be accomplished by
// skipping the first entry and using the result of `lower_bound` directly.
if (include_pos) {
thrust::transform(
rmm::exec_policy(stream),
counting_iter,
counting_iter + gather_map_indices.size(),
gather_map_indices.begin(),
[position_array = pos.data(), offsets_minus_one, offsets, offset_size = lc.size()] __device__(
auto idx) -> size_type {
auto lb_idx =
thrust::lower_bound(
thrust::seq, offsets_minus_one + 1, offsets_minus_one + offset_size + 1, idx) -
(offsets_minus_one + 1);
hyperbolic2346 marked this conversation as resolved.
Show resolved Hide resolved
position_array[idx] = idx - (offsets[lb_idx] - offsets[0]);
return lb_idx;
});
} else {
thrust::lower_bound(rmm::exec_policy(stream),
offsets_minus_one + 1,
offsets_minus_one + lc.size() + 1,
counting_iter,
counting_iter + gather_map_indices.size(),
gather_map_indices.begin());
}

auto select_iter = thrust::make_transform_iterator(
thrust::make_counting_iterator(0),
[explode_column_idx](size_type i) { return i >= explode_column_idx ? i + 1 : i; });
std::vector<size_type> selected_columns(select_iter, select_iter + input_table.num_columns() - 1);

auto gathered_table = cudf::detail::gather(
input_table.select(selected_columns),
column_view(data_type(type_to_id<size_type>()), sliced_child.size(), gather_map_indices.data()),
cudf::out_of_bounds_policy::DONT_CHECK,
cudf::detail::negative_index_policy::ALLOWED,
stream,
mr);
auto gathered_table = cudf::detail::gather(input_table.select(selected_columns),
gather_map_indices.begin(),
gather_map_indices.end(),
cudf::out_of_bounds_policy::DONT_CHECK,
stream,
mr);

std::vector<std::unique_ptr<column>> columns = gathered_table.release()->release();

columns.insert(columns.begin() + explode_column_idx,
std::make_unique<column>(column(sliced_child, stream, mr)));
std::make_unique<column>(sliced_child, stream, mr));

if (include_pos) {
columns.insert(columns.begin() + explode_column_idx,
std::make_unique<column>(
data_type(type_to_id<size_type>()), sliced_child.size(), pos.release()));
}

return std::make_unique<table>(std::move(columns));
}
Expand All @@ -120,13 +138,15 @@ std::unique_ptr<table> explode_functor::operator()<list_view>(
*/
std::unique_ptr<table> explode(table_view const& input_table,
size_type explode_column_idx,
bool include_pos,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
return type_dispatcher(input_table.column(explode_column_idx).type(),
explode_functor{},
input_table,
explode_column_idx,
include_pos,
stream,
mr);
}
Expand All @@ -141,7 +161,18 @@ std::unique_ptr<table> explode(table_view const& input_table,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
return detail::explode(input_table, explode_column_idx, rmm::cuda_stream_default, mr);
return detail::explode(input_table, explode_column_idx, false, rmm::cuda_stream_default, mr);
}

/**
* @copydoc cudf::pos_explode(input_table,explode_column_idx,rmm::mr::device_memory_resource)
*/
std::unique_ptr<table> pos_explode(table_view const& input_table,
size_type explode_column_idx,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
return detail::explode(input_table, explode_column_idx, true, rmm::cuda_stream_default, mr);
}

} // namespace cudf
78 changes: 76 additions & 2 deletions cpp/tests/reshape/explode_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,13 @@ TEST_F(ExplodeTest, Empty)
cudf::table_view expected({expected_a, expected_b});

CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected);

auto pos_ret = cudf::pos_explode(t, 0);

fixed_width_column_wrapper<int32_t> expected_c{};
cudf::table_view pos_expected({expected_a, expected_b, expected_c});

CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected);
}

TEST_F(ExplodeTest, NonList)
Expand All @@ -57,6 +64,7 @@ TEST_F(ExplodeTest, NonList)
cudf::table_view t({a, b});

EXPECT_THROW(cudf::explode(t, 1), cudf::logic_error);
EXPECT_THROW(cudf::pos_explode(t, 1), cudf::logic_error);
}

TEST_F(ExplodeTest, Basics)
Expand Down Expand Up @@ -85,6 +93,12 @@ TEST_F(ExplodeTest, Basics)
auto ret = cudf::explode(t, 1);

CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected);

fixed_width_column_wrapper<int32_t> expected_pos_col{0, 1, 2, 0, 1, 0, 1};
cudf::table_view pos_expected({expected_a, expected_pos_col, expected_b, expected_c});

auto pos_ret = cudf::pos_explode(t, 1);
CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected);
}

TEST_F(ExplodeTest, SingleNull)
Expand Down Expand Up @@ -116,6 +130,12 @@ TEST_F(ExplodeTest, SingleNull)
auto ret = cudf::explode(t, 0);

CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected);

fixed_width_column_wrapper<int32_t> expected_pos_col{0, 1, 0, 1};
cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b});

auto pos_ret = cudf::pos_explode(t, 0);
CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected);
}

TEST_F(ExplodeTest, Nulls)
Expand Down Expand Up @@ -147,6 +167,12 @@ TEST_F(ExplodeTest, Nulls)
auto ret = cudf::explode(t, 0);

CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected);

fixed_width_column_wrapper<int32_t> expected_pos_col{0, 1, 2, 0, 1};
cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b});

auto pos_ret = cudf::pos_explode(t, 0);
CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected);
}

TEST_F(ExplodeTest, NullsInList)
Expand Down Expand Up @@ -178,6 +204,12 @@ TEST_F(ExplodeTest, NullsInList)
auto ret = cudf::explode(t, 0);

CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected);

fixed_width_column_wrapper<int32_t> expected_pos_col{0, 1, 2, 0, 1, 2, 3, 0, 1, 2};
cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b});

auto pos_ret = cudf::pos_explode(t, 0);
CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected);
}

TEST_F(ExplodeTest, Nested)
Expand Down Expand Up @@ -214,6 +246,12 @@ TEST_F(ExplodeTest, Nested)
auto ret = cudf::explode(t, 0);

CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected);

fixed_width_column_wrapper<int32_t> expected_pos_col{0, 1, 0, 0, 1, 2, 3};
cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b});

auto pos_ret = cudf::pos_explode(t, 0);
CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected);
}

TEST_F(ExplodeTest, NestedNulls)
Expand Down Expand Up @@ -253,6 +291,12 @@ TEST_F(ExplodeTest, NestedNulls)
auto ret = cudf::explode(t, 0);

CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected);

fixed_width_column_wrapper<int32_t> expected_pos_col{0, 1, 0, 1, 2};
cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b});

auto pos_ret = cudf::pos_explode(t, 0);
CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected);
}

TEST_F(ExplodeTest, NullsInNested)
Expand Down Expand Up @@ -290,6 +334,12 @@ TEST_F(ExplodeTest, NullsInNested)
auto ret = cudf::explode(t, 0);

CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected);

fixed_width_column_wrapper<int32_t> expected_pos_col{0, 1, 0, 0, 1, 2};
cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b});

auto pos_ret = cudf::pos_explode(t, 0);
CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected);
}

TEST_F(ExplodeTest, NullsInNestedDoubleExplode)
Expand Down Expand Up @@ -322,10 +372,16 @@ TEST_F(ExplodeTest, NullsInNestedDoubleExplode)
cudf::table_view t({a, b});
cudf::table_view expected({expected_a, expected_b});

auto ret = cudf::explode(t, 0);
ret = cudf::explode(ret->view(), 0);
auto first_explode_ret = cudf::explode(t, 0);
auto ret = cudf::explode(first_explode_ret->view(), 0);

CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected);

fixed_width_column_wrapper<int32_t> expected_pos_col{0, 1, 0, 1, 2, 0, 1, 0, 1, 0, 0, 1};
cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b});

auto pos_ret = cudf::pos_explode(first_explode_ret->view(), 0);
CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected);
}

TEST_F(ExplodeTest, NestedStructs)
Expand Down Expand Up @@ -367,6 +423,12 @@ TEST_F(ExplodeTest, NestedStructs)
auto ret = cudf::explode(t, 0);

CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected);

fixed_width_column_wrapper<int32_t> expected_pos_col{0, 1, 0, 0, 1, 2};
cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b});

auto pos_ret = cudf::pos_explode(t, 0);
CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected);
}

TYPED_TEST(ExplodeTypedTest, ListOfStructs)
Expand Down Expand Up @@ -406,6 +468,12 @@ TYPED_TEST(ExplodeTypedTest, ListOfStructs)
cudf::table_view expected({expected_a->view(), expected_b});

CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected);

fixed_width_column_wrapper<int32_t> expected_pos_col{0, 1, 0, 1, 0, 1, 0, 1, 0, 1};
cudf::table_view pos_expected({expected_pos_col, expected_a->view(), expected_b});

auto pos_ret = cudf::pos_explode(t, 0);
CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected);
}

TEST_F(ExplodeTest, SlicedList)
Expand Down Expand Up @@ -453,4 +521,10 @@ TEST_F(ExplodeTest, SlicedList)
auto ret = cudf::explode(sliced_t[0], 0);

CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected);

fixed_width_column_wrapper<int32_t> expected_pos_col{0, 1, 2, 0, 1, 2};
cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b});

auto pos_ret = cudf::pos_explode(sliced_t[0], 0);
CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected);
}