diff --git a/cpp/src/lists/explode.cu b/cpp/src/lists/explode.cu
index 8233635050e..2b495deb47f 100644
--- a/cpp/src/lists/explode.cu
+++ b/cpp/src/lists/explode.cu
@@ -188,7 +188,7 @@ std::unique_ptr
explode_outer(table_view const& input_table,
});
thrust::inclusive_scan(rmm::exec_policy(stream),
null_or_empty,
- null_or_empty + sliced_child.size(),
+ null_or_empty + explode_col.size(),
null_or_empty_offset.begin());
auto null_or_empty_count =
@@ -209,41 +209,48 @@ std::unique_ptr explode_outer(table_view const& input_table,
// offsets + 1 here to skip the 0th offset, which removes a - 1 operation later.
auto offsets_minus_one = thrust::make_transform_iterator(
thrust::next(offsets), [offsets] __device__(auto i) { return (i - offsets[0]) - 1; });
+
+ auto fill_gather_maps = [offsets_minus_one,
+ gather_map_p = gather_map.begin(),
+ explode_col_gather_map_p = explode_col_gather_map.begin(),
+ position_array = pos.begin(),
+ sliced_child_size = sliced_child.size(),
+ null_or_empty_offset_p = null_or_empty_offset.begin(),
+ include_position,
+ offsets,
+ null_or_empty,
+ offset_size = explode_col.offsets().size() - 1] __device__(auto idx) {
+ if (idx < sliced_child_size) {
+ auto lb_idx =
+ thrust::distance(offsets_minus_one,
+ thrust::lower_bound(
+ thrust::seq, offsets_minus_one, offsets_minus_one + (offset_size), idx));
+ auto index_to_write = null_or_empty_offset_p[lb_idx] + idx;
+ gather_map_p[index_to_write] = lb_idx;
+ explode_col_gather_map_p[index_to_write] = idx;
+ if (include_position) {
+ position_array[index_to_write] = idx - (offsets[lb_idx] - offsets[0]);
+ }
+ }
+ if (null_or_empty[idx]) {
+ auto invalid_index = null_or_empty_offset_p[idx] == 0
+ ? offsets[idx]
+ : offsets[idx] + null_or_empty_offset_p[idx] - 1;
+ gather_map_p[invalid_index] = idx;
+
+ // negative one to indicate a null value
+ explode_col_gather_map_p[invalid_index] = -1;
+ if (include_position) { position_array[invalid_index] = 0; }
+ }
+ };
+
+ // we need to do this loop at least explode_col times or we may not properly fill in null and
+ // empty entries.
+ auto loop_count = std::max(sliced_child.size(), explode_col.size());
+
// Fill in gather map with all the child column's entries
- thrust::for_each(rmm::exec_policy(stream),
- counting_iter,
- counting_iter + sliced_child.size(),
- [offsets_minus_one,
- gather_map = gather_map.begin(),
- explode_col_gather_map = explode_col_gather_map.begin(),
- position_array = pos.begin(),
- include_position,
- offsets,
- null_or_empty_offset = null_or_empty_offset.begin(),
- null_or_empty,
- offset_size = explode_col.offsets().size() - 1] __device__(auto idx) {
- auto lb_idx = thrust::distance(
- offsets_minus_one,
- thrust::lower_bound(
- thrust::seq, offsets_minus_one, offsets_minus_one + (offset_size), idx));
- auto index_to_write = null_or_empty_offset[lb_idx] + idx;
- gather_map[index_to_write] = lb_idx;
- explode_col_gather_map[index_to_write] = idx;
- if (include_position) {
- position_array[index_to_write] = idx - (offsets[lb_idx] - offsets[0]);
- }
- if (null_or_empty[idx]) {
- auto invalid_index = null_or_empty_offset[idx] == 0
- ? offsets[idx]
- : offsets[idx] + null_or_empty_offset[idx] - 1;
- gather_map[invalid_index] = idx;
-
- // negative one to indicate a null value
- explode_col_gather_map[invalid_index] = -1;
-
- if (include_position) { position_array[invalid_index] = 0; }
- }
- });
+ thrust::for_each(
+ rmm::exec_policy(stream), counting_iter, counting_iter + loop_count, fill_gather_maps);
return build_table(
input_table,
diff --git a/cpp/tests/lists/explode_tests.cpp b/cpp/tests/lists/explode_tests.cpp
index 2ec9294d118..4c7ded0efd7 100644
--- a/cpp/tests/lists/explode_tests.cpp
+++ b/cpp/tests/lists/explode_tests.cpp
@@ -102,15 +102,17 @@ TEST_F(ExplodeTest, Basics)
TEST_F(ExplodeTest, SingleNull)
{
// a b
- // [1, 2, 7] 100
+ // null 100
// [5, 6] 200
// [] 300
// [0, 3] 400
+ constexpr auto null = 0;
+
auto first_invalid =
cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i == 0 ? false : true; });
- LCW a({LCW{1, 2, 7}, LCW{5, 6}, LCW{}, LCW{0, 3}}, first_invalid);
+ LCW a({LCW{null}, LCW{5, 6}, LCW{}, LCW{0, 3}}, first_invalid);
FCW b({100, 200, 300, 400});
FCW expected_a{5, 6, 0, 3};
@@ -134,15 +136,17 @@ TEST_F(ExplodeTest, Nulls)
{
// a b
// [1, 2, 7] 100
- // [5, 6] 200
+ // null 200
// [0, 3] 300
+ constexpr auto null = 0;
+
auto valids = cudf::detail::make_counting_transform_iterator(
0, [](auto i) { return i % 2 == 0 ? true : false; });
auto always_valid =
cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; });
- LCW a({LCW{1, 2, 7}, LCW{5, 6}, LCW{0, 3}}, valids);
+ LCW a({LCW{1, 2, 7}, LCW{null}, LCW{0, 3}}, valids);
FCW b({100, 200, 300}, valids);
FCW expected_a({1, 2, 7, 0, 3});
@@ -165,18 +169,21 @@ TEST_F(ExplodeTest, Nulls)
TEST_F(ExplodeTest, NullsInList)
{
// a b
- // [1, 2, 7] 100
- // [5, 6, 0, 9] 200
+ // [1, null, 7] 100
+ // [5, null, 0, null] 200
// [] 300
- // [0, 3, 8] 400
+ // [0, null, 8] 400
+
+ constexpr auto null = 0;
auto valids = cudf::detail::make_counting_transform_iterator(
0, [](auto i) { return i % 2 == 0 ? true : false; });
- LCW a{LCW({1, 2, 7}, valids), LCW({5, 6, 0, 9}, valids), LCW{}, LCW({0, 3, 8}, valids)};
+ LCW a{
+ LCW({1, null, 7}, valids), LCW({5, null, 0, null}, valids), LCW{}, LCW({0, null, 8}, valids)};
FCW b{100, 200, 300, 400};
- FCW expected_a({1, 2, 7, 5, 6, 0, 9, 0, 3, 8}, {1, 0, 1, 1, 0, 1, 0, 1, 0, 1});
+ FCW expected_a({1, null, 7, 5, null, 0, null, 0, null, 8}, {1, 0, 1, 1, 0, 1, 0, 1, 0, 1});
FCW expected_b{100, 100, 100, 200, 200, 200, 200, 400, 400, 400};
cudf::table_view t({a, b});
@@ -224,16 +231,18 @@ TEST_F(ExplodeTest, NestedNulls)
{
// a b
// [[1, 2], [7, 6, 5]] 100
- // [[5, 6]] 200
+ // null null
// [[0, 3],[5],[2, 1]] 300
+ constexpr auto null = 0;
+
auto valids = cudf::detail::make_counting_transform_iterator(
0, [](auto i) { return i % 2 == 0 ? true : false; });
auto always_valid =
cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; });
- LCW a({LCW{LCW{1, 2}, LCW{7, 6, 5}}, LCW{LCW{5, 6}}, LCW{LCW{0, 3}, LCW{5}, LCW{2, 1}}}, valids);
- FCW b({100, 200, 300}, valids);
+ LCW a({LCW{LCW{1, 2}, LCW{7, 6, 5}}, LCW{LCW{null}}, LCW{LCW{0, 3}, LCW{5}, LCW{2, 1}}}, valids);
+ FCW b({100, null, 300}, valids);
LCW expected_a{LCW{1, 2}, LCW{7, 6, 5}, LCW{0, 3}, LCW{5}, LCW{2, 1}};
FCW expected_b({100, 100, 300, 300, 300}, always_valid);
@@ -254,21 +263,23 @@ TEST_F(ExplodeTest, NestedNulls)
TEST_F(ExplodeTest, NullsInNested)
{
- // a b
- // [[1, 2], [7, 6, 5]] 100
- // [[5, 6]] 200
- // [[0, 3],[5],[2, 1]] 300
+ // a b
+ // [[1, null], [7, 6, 5]] 100
+ // [[5, 6]] 200
+ // [[0, 3],[5],[2, null]] 300
+
+ constexpr auto null = 0;
auto valids = cudf::detail::make_counting_transform_iterator(
0, [](auto i) { return i % 2 == 0 ? true : false; });
- LCW a({LCW{LCW({1, 2}, valids), LCW{7, 6, 5}},
+ LCW a({LCW{LCW({1, null}, valids), LCW{7, 6, 5}},
LCW{LCW{5, 6}},
- LCW{LCW{0, 3}, LCW{5}, LCW({2, 1}, valids)}});
+ LCW{LCW{0, 3}, LCW{5}, LCW({2, null}, valids)}});
FCW b({100, 200, 300});
LCW expected_a{
- LCW({1, 2}, valids), LCW{7, 6, 5}, LCW{5, 6}, LCW{0, 3}, LCW{5}, LCW({2, 1}, valids)};
+ LCW({1, null}, valids), LCW{7, 6, 5}, LCW{5, 6}, LCW{0, 3}, LCW{5}, LCW({2, null}, valids)};
FCW expected_b{100, 100, 200, 300, 300, 300};
cudf::table_view t({a, b});
@@ -287,20 +298,22 @@ TEST_F(ExplodeTest, NullsInNested)
TEST_F(ExplodeTest, NullsInNestedDoubleExplode)
{
- // a b
- // [[1, 2], [], [7, 6, 5]] 100
- // [[5, 6]] 200
- // [[0, 3],[5],[2, 1]] 300
+ // a b
+ // [[1, null], [], [7, 6, 5]] 100
+ // [[5, 6]] 200
+ // [[0, 3],[5],[2, null]] 300
+
+ constexpr auto null = 0;
auto valids = cudf::detail::make_counting_transform_iterator(
0, [](auto i) { return i % 2 == 0 ? true : false; });
- LCW a{LCW{LCW({1, 2}, valids), LCW{}, LCW{7, 6, 5}},
+ LCW a{LCW{LCW({1, null}, valids), LCW{}, LCW{7, 6, 5}},
LCW{LCW{5, 6}},
- LCW{LCW{0, 3}, LCW{5}, LCW({2, 1}, valids)}};
+ LCW{LCW{0, 3}, LCW{5}, LCW({2, null}, valids)}};
FCW b{100, 200, 300};
- FCW expected_a({1, 2, 7, 6, 5, 5, 6, 0, 3, 5, 2, 1}, {1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0});
+ FCW expected_a({1, null, 7, 6, 5, 5, 6, 0, 3, 5, 2, null}, {1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0});
FCW expected_b{100, 100, 100, 100, 100, 200, 200, 300, 300, 300, 300, 300};
cudf::table_view t({a, b});
@@ -320,23 +333,25 @@ TEST_F(ExplodeTest, NullsInNestedDoubleExplode)
TEST_F(ExplodeTest, NestedStructs)
{
- // a b
- // [[1, 2], [7, 6, 5]] {100, "100"}
- // [[5, 6]] {200, "200"}
- // [[0, 3],[5],[2, 1]] {300, "300"}
+ // a b
+ // [[1, null], [7, 6, 5]] {100, "100"}
+ // [[5, 6]] {200, "200"}
+ // [[0, 3],[5],[2, null]] {300, "300"}
+
+ constexpr auto null = 0;
auto valids = cudf::detail::make_counting_transform_iterator(
0, [](auto i) { return i % 2 == 0 ? true : false; });
- LCW a({LCW{LCW({1, 2}, valids), LCW{7, 6, 5}},
+ LCW a({LCW{LCW({1, null}, valids), LCW{7, 6, 5}},
LCW{LCW{5, 6}},
- LCW{LCW{0, 3}, LCW{5}, LCW({2, 1}, valids)}});
+ LCW{LCW{0, 3}, LCW{5}, LCW({2, null}, valids)}});
FCW b1({100, 200, 300});
strings_column_wrapper b2{"100", "200", "300"};
structs_column_wrapper b({b1, b2});
LCW expected_a{
- LCW({1, 2}, valids), LCW{7, 6, 5}, LCW{5, 6}, LCW{0, 3}, LCW{5}, LCW({2, 1}, valids)};
+ LCW({1, null}, valids), LCW{7, 6, 5}, LCW{5, 6}, LCW{0, 3}, LCW{5}, LCW({2, null}, valids)};
FCW expected_b1{100, 100, 200, 300, 300, 300};
strings_column_wrapper expected_b2{"100", "100", "200", "300", "300", "300"};
structs_column_wrapper expected_b({expected_b1, expected_b2});
@@ -397,15 +412,17 @@ TYPED_TEST(ExplodeTypedTest, ListOfStructs)
TEST_F(ExplodeTest, SlicedList)
{
- // a b
- // [[1, 2],[7, 6, 5]] 100
- // [[5, 6]] 200
- // [[0, 3],[5],[2, 1]] 300
- // [[8, 3],[],[4, 3, 1, 2]] 400
- // [[2, 3, 4],[9, 8]] 500
+ // a b
+ // [[1, null],[7, 6, 5]] 100
+ // [[5, 6]] 200
+ // [[0, 3],[5],[2, null]] 300
+ // [[8, 3],[],[4, null, 1, null]] 400
+ // [[2, 3, 4],[9, 8]] 500
// slicing the top 2 rows and the bottom row off
+ constexpr auto null = 0;
+
auto valids = cudf::detail::make_counting_transform_iterator(
0, [](auto i) { return i % 2 == 0 ? true : false; });
@@ -417,7 +434,7 @@ TEST_F(ExplodeTest, SlicedList)
FCW b({100, 200, 300, 400, 500});
LCW expected_a{
- LCW{0, 3}, LCW{5}, LCW({2, 1}, valids), LCW{8, 3}, LCW{}, LCW({4, 3, 1, 2}, valids)};
+ LCW{0, 3}, LCW{5}, LCW({2, null}, valids), LCW{8, 3}, LCW{}, LCW({4, null, 1, null}, valids)};
FCW expected_b{300, 300, 300, 400, 400, 400};
cudf::table_view t({a, b});
@@ -490,19 +507,21 @@ TEST_F(ExplodeOuterTest, Basics)
TEST_F(ExplodeOuterTest, SingleNull)
{
- // a b
- // [1, 2, 7] 100
- // [5, 6] 200
- // [] 300
- // [0, 3] 400
+ // a b
+ // null 100
+ // [5, 6] 200
+ // [] 300
+ // [0, 3] 400
+
+ constexpr auto null = 0;
auto first_invalid =
cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i == 0 ? false : true; });
- LCW a({LCW{1, 2, 7}, LCW{5, 6}, LCW{}, LCW{0, 3}}, first_invalid);
+ LCW a({LCW{null}, LCW{5, 6}, LCW{}, LCW{0, 3}}, first_invalid);
FCW b({100, 200, 300, 400});
- FCW expected_a{{0, 5, 6, 0, 0, 3}, {0, 1, 1, 0, 1, 1}};
+ FCW expected_a{{null, 5, 6, 0, 0, 3}, {0, 1, 1, 0, 1, 1}};
FCW expected_b{100, 200, 200, 300, 400, 400};
cudf::table_view t({a, b});
@@ -522,17 +541,19 @@ TEST_F(ExplodeOuterTest, Nulls)
{
// a b
// [1, 2, 7] 100
- // [5, 6] 200
+ // null null
// [0, 3] 300
+ constexpr auto null = 0;
+
auto valids = cudf::detail::make_counting_transform_iterator(
0, [](auto i) { return i % 2 == 0 ? true : false; });
- LCW a({LCW{1, 2, 7}, LCW{5, 6}, LCW{0, 3}}, valids);
- FCW b({100, 200, 300}, valids);
+ LCW a({LCW{1, 2, 7}, LCW{null}, LCW{0, 3}}, valids);
+ FCW b({100, null, 300}, valids);
- FCW expected_a({1, 2, 7, 0, 0, 3}, {1, 1, 1, 0, 1, 1});
- FCW expected_b({100, 100, 100, 200, 300, 300}, {1, 1, 1, 0, 1, 1});
+ FCW expected_a({1, 2, 7, null, 0, 3}, {1, 1, 1, 0, 1, 1});
+ FCW expected_b({100, 100, 100, null, 300, 300}, {1, 1, 1, 0, 1, 1});
cudf::table_view t({a, b});
cudf::table_view expected({expected_a, expected_b});
@@ -547,21 +568,182 @@ TEST_F(ExplodeOuterTest, Nulls)
CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected);
}
+TEST_F(ExplodeOuterTest, AllNulls)
+{
+ // a b
+ // null 100
+ // null 200
+ // null 300
+
+ constexpr auto null = 0;
+
+ auto non_valid = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return false; });
+
+ LCW a({LCW{null}, LCW{null}, LCW{null}}, non_valid);
+ FCW b({100, 200, 300});
+
+ FCW expected_a({null, null, null}, {0, 0, 0});
+ FCW expected_b({100, 200, 300});
+
+ cudf::table_view t({a, b});
+ cudf::table_view expected({expected_a, expected_b});
+
+ auto ret = cudf::explode_outer(t, 0);
+ CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected);
+
+ FCW expected_pos_col{0, 0, 0};
+ cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b});
+
+ auto pos_ret = cudf::explode_outer_position(t, 0);
+ CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected);
+}
+
+TEST_F(ExplodeOuterTest, SequentialNulls)
+{
+ // a b
+ // [1, 2, null] 100
+ // [3, 4] 200
+ // [] 300
+ // [] 400
+ // [5, 6, 7] 500
+
+ constexpr auto null = 0;
+
+ auto third_invalid =
+ cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i == 2 ? false : true; });
+
+ LCW a{LCW({1, 2, null}, third_invalid), LCW{3, 4}, LCW{}, LCW{}, LCW{5, 6, 7}};
+ FCW b{100, 200, 300, 400, 500};
+
+ FCW expected_a({1, 2, null, 3, 4, null, null, 5, 6, 7}, {1, 1, 0, 1, 1, 0, 0, 1, 1, 1});
+ FCW expected_b({100, 100, 100, 200, 200, 300, 400, 500, 500, 500});
+
+ cudf::table_view t({a, b});
+ cudf::table_view expected({expected_a, expected_b});
+
+ auto ret = cudf::explode_outer(t, 0);
+ CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected);
+
+ FCW expected_pos_col{0, 1, 2, 0, 1, 0, 0, 0, 1, 2};
+ cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b});
+
+ auto pos_ret = cudf::explode_outer_position(t, 0);
+ CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected);
+}
+
+TEST_F(ExplodeOuterTest, MoreEmptyThanData)
+{
+ // a b
+ // [1, 2] 100
+ // [] 200
+ // [] 300
+ // [] 400
+ // [] 500
+ // [3] 600
+
+ constexpr auto null = 0;
+
+ LCW a{LCW{1, 2}, LCW{}, LCW{}, LCW{}, LCW{}, LCW{3}};
+ FCW b{100, 200, 300, 400, 500, 600};
+
+ FCW expected_a({1, 2, null, null, null, null, 3}, {1, 1, 0, 0, 0, 0, 1});
+ FCW expected_b({100, 100, 200, 300, 400, 500, 600});
+
+ cudf::table_view t({a, b});
+ cudf::table_view expected({expected_a, expected_b});
+
+ auto ret = cudf::explode_outer(t, 0);
+ CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected);
+
+ FCW expected_pos_col{0, 1, 0, 0, 0, 0, 0};
+ cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b});
+
+ auto pos_ret = cudf::explode_outer_position(t, 0);
+ CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected);
+}
+
+TEST_F(ExplodeOuterTest, TrailingEmptys)
+{
+ // a b
+ // [1, 2] 100
+ // [] 200
+ // [] 300
+ // [] 400
+ // [] 500
+
+ constexpr auto null = 0;
+
+ LCW a{LCW{1, 2}, LCW{}, LCW{}, LCW{}, LCW{}};
+ FCW b{100, 200, 300, 400, 500};
+
+ FCW expected_a({1, 2, null, null, null, null}, {1, 1, 0, 0, 0, 0});
+ FCW expected_b({100, 100, 200, 300, 400, 500});
+
+ cudf::table_view t({a, b});
+ cudf::table_view expected({expected_a, expected_b});
+
+ auto ret = cudf::explode_outer(t, 0);
+ CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected);
+
+ FCW expected_pos_col{0, 1, 0, 0, 0, 0};
+ cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b});
+
+ auto pos_ret = cudf::explode_outer_position(t, 0);
+ CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected);
+}
+
+TEST_F(ExplodeOuterTest, LeadingNulls)
+{
+ // a b
+ // null 100
+ // null 200
+ // null 300
+ // null 400
+ // [1, 2] 500
+
+ constexpr auto null = 0;
+
+ auto valids =
+ cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i == 4 ? true : false; });
+
+ LCW a({LCW{null}, LCW{null}, LCW{null}, LCW{null}, LCW{1, 2}}, valids);
+ FCW b{100, 200, 300, 400, 500};
+
+ FCW expected_a({null, null, null, null, 1, 2}, {0, 0, 0, 0, 1, 1});
+ FCW expected_b({100, 200, 300, 400, 500, 500});
+
+ cudf::table_view t({a, b});
+ cudf::table_view expected({expected_a, expected_b});
+
+ auto ret = cudf::explode_outer(t, 0);
+ CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected);
+
+ FCW expected_pos_col{0, 0, 0, 0, 0, 1};
+ cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b});
+
+ auto pos_ret = cudf::explode_outer_position(t, 0);
+ CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected);
+}
+
TEST_F(ExplodeOuterTest, NullsInList)
{
// a b
- // [1, 2, 7] 100
- // [5, 6, 0, 9] 200
+ // [1, null, 7] 100
+ // [5, null, 0, null] 200
// [] 300
- // [0, 3, 8] 400
+ // [0, null, 8] 400
+
+ constexpr auto null = 0;
auto valids = cudf::detail::make_counting_transform_iterator(
0, [](auto i) { return i % 2 == 0 ? true : false; });
- LCW a{LCW({1, 2, 7}, valids), LCW({5, 6, 0, 9}, valids), LCW{}, LCW({0, 3, 8}, valids)};
+ LCW a{
+ LCW({1, null, 7}, valids), LCW({5, null, 0, null}, valids), LCW{}, LCW({0, null, 8}, valids)};
FCW b{100, 200, 300, 400};
- FCW expected_a({1, 2, 7, 5, 6, 0, 9, 0, 0, 3, 8}, {1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1});
+ FCW expected_a({1, null, 7, 5, null, 0, null, null, 0, null, 8},
+ {1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1});
FCW expected_b{100, 100, 100, 200, 200, 200, 200, 300, 400, 400, 400};
cudf::table_view t({a, b});
@@ -612,15 +794,18 @@ TEST_F(ExplodeOuterTest, NestedNulls)
// [[5, 6]] 200
// [[0, 3],[5],[2, 1]] 300
+ constexpr auto null = 0;
+
auto valids = cudf::detail::make_counting_transform_iterator(
0, [](auto i) { return i % 2 == 0 ? true : false; });
- LCW a({LCW{LCW{1, 2}, LCW{7, 6, 5}}, LCW{LCW{5, 6}}, LCW{LCW{0, 3}, LCW{5}, LCW{2, 1}}}, valids);
+ LCW a({LCW{LCW{1, 2}, LCW{7, 6, 5}}, LCW{LCW{null}}, LCW{LCW{0, 3}, LCW{5}, LCW{2, 1}}}, valids);
FCW b({100, 200, 300});
auto expected_valids =
cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i == 2 ? false : true; });
- LCW expected_a({LCW{1, 2}, LCW{7, 6, 5}, LCW{}, LCW{0, 3}, LCW{5}, LCW{2, 1}}, expected_valids);
+ LCW expected_a({LCW{1, 2}, LCW{7, 6, 5}, LCW{null}, LCW{0, 3}, LCW{5}, LCW{2, 1}},
+ expected_valids);
FCW expected_b({100, 100, 200, 300, 300, 300});
cudf::table_view t({a, b});
cudf::table_view expected({expected_a, expected_b});
@@ -637,21 +822,23 @@ TEST_F(ExplodeOuterTest, NestedNulls)
TEST_F(ExplodeOuterTest, NullsInNested)
{
- // a b
- // [[1, 2], [7, 6, 5]] 100
- // [[5, 6]] 200
- // [[0, 3],[5],[2, 1]] 300
+ // a b
+ // [[1, null], [7, 6, 5]] 100
+ // [[5, 6]] 200
+ // [[0, 3],[5],[2, null]] 300
+
+ constexpr auto null = 0;
auto valids = cudf::detail::make_counting_transform_iterator(
0, [](auto i) { return i % 2 == 0 ? true : false; });
- LCW a({LCW{LCW({1, 2}, valids), LCW{7, 6, 5}},
+ LCW a({LCW{LCW({1, null}, valids), LCW{7, 6, 5}},
LCW{LCW{5, 6}},
- LCW{LCW{0, 3}, LCW{5}, LCW({2, 1}, valids)}});
+ LCW{LCW{0, 3}, LCW{5}, LCW({2, null}, valids)}});
FCW b({100, 200, 300});
LCW expected_a{
- LCW({1, 2}, valids), LCW{7, 6, 5}, LCW{5, 6}, LCW{0, 3}, LCW{5}, LCW({2, 1}, valids)};
+ LCW({1, null}, valids), LCW{7, 6, 5}, LCW{5, 6}, LCW{0, 3}, LCW{5}, LCW({2, null}, valids)};
FCW expected_b{100, 100, 200, 300, 300, 300};
cudf::table_view t({a, b});
@@ -670,20 +857,23 @@ TEST_F(ExplodeOuterTest, NullsInNested)
TEST_F(ExplodeOuterTest, NullsInNestedDoubleExplode)
{
- // a b
- // [[1, 2], [], [7, 6, 5]] 100
- // [[5, 6]] 200
- // [[0, 3],[5],[2, 1]] 300
+ // a b
+ // [[1, null], [], [7, 6, 5]] 100
+ // [[5, 6]] 200
+ // [[0, 3],[5],[2, null]] 300
+
+ constexpr auto null = 0;
auto valids = cudf::detail::make_counting_transform_iterator(
0, [](auto i) { return i % 2 == 0 ? true : false; });
- LCW a{LCW{LCW({1, 2}, valids), LCW{}, LCW{7, 6, 5}},
+ LCW a{LCW{LCW({1, null}, valids), LCW{}, LCW{7, 6, 5}},
LCW{LCW{5, 6}},
- LCW{LCW{0, 3}, LCW{5}, LCW({2, 1}, valids)}};
+ LCW{LCW{0, 3}, LCW{5}, LCW({2, null}, valids)}};
FCW b{100, 200, 300};
- FCW expected_a({1, 2, 0, 7, 6, 5, 5, 6, 0, 3, 5, 2, 1}, {1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0});
+ FCW expected_a({1, null, null, 7, 6, 5, 5, 6, 0, 3, 5, 2, null},
+ {1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0});
FCW expected_b{100, 100, 100, 100, 100, 100, 200, 200, 300, 300, 300, 300, 300};
cudf::table_view t({a, b});
@@ -703,23 +893,25 @@ TEST_F(ExplodeOuterTest, NullsInNestedDoubleExplode)
TEST_F(ExplodeOuterTest, NestedStructs)
{
- // a b
- // [[1, 2], [7, 6, 5]] {100, "100"}
- // [[5, 6]] {200, "200"}
- // [[0, 3],[5],[2, 1]] {300, "300"}
+ // a b
+ // [[1, null], [7, 6, 5]] {100, "100"}
+ // [[5, 6]] {200, "200"}
+ // [[0, 3],[5],[2, null]] {300, "300"}
+
+ constexpr auto null = 0;
auto valids = cudf::detail::make_counting_transform_iterator(
0, [](auto i) { return i % 2 == 0 ? true : false; });
- LCW a({LCW{LCW({1, 2}, valids), LCW{7, 6, 5}},
+ LCW a({LCW{LCW({1, null}, valids), LCW{7, 6, 5}},
LCW{LCW{5, 6}},
- LCW{LCW{0, 3}, LCW{5}, LCW({2, 1}, valids)}});
+ LCW{LCW{0, 3}, LCW{5}, LCW({2, null}, valids)}});
FCW b1({100, 200, 300});
strings_column_wrapper b2{"100", "200", "300"};
structs_column_wrapper b({b1, b2});
LCW expected_a{
- LCW({1, 2}, valids), LCW{7, 6, 5}, LCW{5, 6}, LCW{0, 3}, LCW{5}, LCW({2, 1}, valids)};
+ LCW({1, null}, valids), LCW{7, 6, 5}, LCW{5, 6}, LCW{0, 3}, LCW{5}, LCW({2, null}, valids)};
FCW expected_b1{100, 100, 200, 300, 300, 300};
strings_column_wrapper expected_b2{"100", "100", "200", "300", "300", "300"};
structs_column_wrapper expected_b({expected_b1, expected_b2});
@@ -780,27 +972,29 @@ TYPED_TEST(ExplodeOuterTypedTest, ListOfStructs)
TEST_F(ExplodeOuterTest, SlicedList)
{
- // a b
- // [[1, 2],[7, 6, 5]] 100
- // [[5, 6]] 200
- // [[0, 3],[5],[2, 1]] 300
- // [[8, 3],[],[4, 3, 1, 2]] 400
- // [[2, 3, 4],[9, 8]] 500
+ // a b
+ // [[1, null],[7, 6, 5]] 100
+ // [[5, 6]] 200
+ // [[0, 3],[5],[2, null]] 300
+ // [[8, 3],[],[4, null, 1, null]] 400
+ // [[2, 3, 4],[9, 8]] 500
// slicing the top 2 rows and the bottom row off
+ constexpr auto null = 0;
+
auto valids = cudf::detail::make_counting_transform_iterator(
0, [](auto i) { return i % 2 == 0 ? true : false; });
- LCW a({LCW{LCW({1, 2}, valids), LCW{7, 6, 5}},
+ LCW a({LCW{LCW({1, null}, valids), LCW{7, 6, 5}},
LCW{LCW{5, 6}},
- LCW{LCW{0, 3}, LCW{5}, LCW({2, 1}, valids)},
- LCW{LCW{8, 3}, LCW{}, LCW({4, 3, 1, 2}, valids)},
+ LCW{LCW{0, 3}, LCW{5}, LCW({2, null}, valids)},
+ LCW{LCW{8, 3}, LCW{}, LCW({4, null, 1, null}, valids)},
LCW{LCW{2, 3, 4}, LCW{9, 8}}});
FCW b({100, 200, 300, 400, 500});
LCW expected_a{
- LCW{0, 3}, LCW{5}, LCW({2, 1}, valids), LCW{8, 3}, LCW{}, LCW({4, 3, 1, 2}, valids)};
+ LCW{0, 3}, LCW{5}, LCW({2, null}, valids), LCW{8, 3}, LCW{}, LCW({4, null, 1, null}, valids)};
FCW expected_b{300, 300, 300, 400, 400, 400};
cudf::table_view t({a, b});
diff --git a/python/cudf/cudf/_lib/table.pyx b/python/cudf/cudf/_lib/table.pyx
index f97b45d8abf..93d79ba6843 100644
--- a/python/cudf/cudf/_lib/table.pyx
+++ b/python/cudf/cudf/_lib/table.pyx
@@ -99,22 +99,30 @@ cdef class Table:
cdef vector[unique_ptr[column]].iterator it = columns.begin()
# First construct the index, if any
+ cdef int i
+
index = None
if index_names is not None:
- index_columns = []
- for _ in index_names:
- index_columns.append(Column.from_unique_ptr(
- move(dereference(it))
- ))
- it += 1
- index = Table(dict(zip(index_names, index_columns)))
+ index_data = ColumnAccessor._create_unsafe(
+ {
+ name: Column.from_unique_ptr(
+ move(dereference(it + i))
+ )
+ for i, name in enumerate(index_names)
+ }
+ )
+ index = Table(data=index_data)
# Construct the data dict
- data_columns = []
- for _ in column_names:
- data_columns.append(Column.from_unique_ptr(move(dereference(it))))
- it += 1
- data = dict(zip(column_names, data_columns))
+ cdef int n_index_columns = len(index_names) if index_names else 0
+ data = ColumnAccessor._create_unsafe(
+ {
+ name: Column.from_unique_ptr(
+ move(dereference(it + i + n_index_columns))
+ )
+ for i, name in enumerate(column_names)
+ }
+ )
return Table(data=data, index=index)
diff --git a/python/cudf/cudf/core/buffer.py b/python/cudf/cudf/core/buffer.py
index 350346a87f9..9fc5570e35a 100644
--- a/python/cudf/cudf/core/buffer.py
+++ b/python/cudf/cudf/core/buffer.py
@@ -42,6 +42,10 @@ def __init__(
self.ptr = data.ptr
self.size = data.size
self._owner = owner or data._owner
+ elif isinstance(data, rmm.DeviceBuffer):
+ self.ptr = data.ptr
+ self.size = data.size
+ self._owner = data
elif hasattr(data, "__array_interface__") or hasattr(
data, "__cuda_array_interface__"
):
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index b2b2874eeb4..dd06d97d105 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -1017,7 +1017,9 @@ def distinct_count(
return cpp_distinct_count(self, ignore_nulls=dropna)
def astype(self, dtype: Dtype, **kwargs) -> ColumnBase:
- if is_categorical_dtype(dtype):
+ if is_numerical_dtype(dtype):
+ return self.as_numerical_column(dtype)
+ elif is_categorical_dtype(dtype):
return self.as_categorical_column(dtype, **kwargs)
elif pd.api.types.pandas_dtype(dtype).type in {
np.str_,
@@ -1548,6 +1550,16 @@ def build_column(
"""
dtype = pd.api.types.pandas_dtype(dtype)
+ if is_numerical_dtype(dtype):
+ assert data is not None
+ return cudf.core.column.NumericalColumn(
+ data=data,
+ dtype=dtype,
+ mask=mask,
+ size=size,
+ offset=offset,
+ null_count=null_count,
+ )
if is_categorical_dtype(dtype):
if not len(children) == 1:
raise ValueError(
@@ -1634,15 +1646,7 @@ def build_column(
children=children,
)
else:
- assert data is not None
- return cudf.core.column.NumericalColumn(
- data=data,
- dtype=dtype,
- mask=mask,
- size=size,
- offset=offset,
- null_count=null_count,
- )
+ raise TypeError(f"Unrecognized dtype: {dtype}")
def build_categorical_column(
diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py
index 0c580132290..33bae5c1328 100644
--- a/python/cudf/cudf/core/column_accessor.py
+++ b/python/cudf/cudf/core/column_accessor.py
@@ -19,11 +19,7 @@
import cudf
from cudf.core import column
-from cudf.utils.utils import (
- cached_property,
- to_flat_dict,
- to_nested_dict,
-)
+from cudf.utils.utils import cached_property, to_flat_dict, to_nested_dict
if TYPE_CHECKING:
from cudf.core.column import ColumnBase
@@ -84,6 +80,21 @@ def __init__(
self.multiindex = multiindex
self._level_names = level_names
+ @classmethod
+ def _create_unsafe(
+ cls,
+ data: Dict[Any, ColumnBase],
+ multiindex: bool = False,
+ level_names=None,
+ ) -> ColumnAccessor:
+ # create a ColumnAccessor without verifying column
+ # type or size
+ obj = cls()
+ obj._data = data
+ obj.multiindex = multiindex
+ obj._level_names = level_names
+ return obj
+
def __iter__(self):
return self._data.__iter__()
@@ -167,7 +178,7 @@ def _column_length(self):
return 0
def _clear_cache(self):
- cached_properties = "columns", "names", "_grouped_data"
+ cached_properties = ("columns", "names", "_grouped_data")
for attr in cached_properties:
try:
self.__delattr__(attr)
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index e6898b8c606..ecff3dee573 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -2408,7 +2408,9 @@ def _copy_type_metadata(
for name, col, other_col in zip(
self._data.keys(), self._data.values(), other._data.values()
):
- self._data[name] = other_col._copy_type_metadata(col)
+ self._data.set_by_label(
+ name, other_col._copy_type_metadata(col), validate=False
+ )
if include_index:
if self._index is not None and other._index is not None:
diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py
index 1438421bb12..8875a36dba8 100644
--- a/python/cudf/cudf/utils/dtypes.py
+++ b/python/cudf/cudf/utils/dtypes.py
@@ -144,16 +144,13 @@ def numeric_normalize_types(*args):
def is_numerical_dtype(obj):
- if is_categorical_dtype(obj):
+ # TODO: we should handle objects with a `.dtype` attribute,
+ # e.g., arrays, here.
+ try:
+ dtype = np.dtype(obj)
+ except TypeError:
return False
- if is_list_dtype(obj):
- return False
- return (
- np.issubdtype(obj, np.bool_)
- or np.issubdtype(obj, np.floating)
- or np.issubdtype(obj, np.signedinteger)
- or np.issubdtype(obj, np.unsignedinteger)
- )
+ return dtype.kind in "biuf"
def is_string_dtype(obj):