diff --git a/cpp/src/lists/explode.cu b/cpp/src/lists/explode.cu index 8233635050e..2b495deb47f 100644 --- a/cpp/src/lists/explode.cu +++ b/cpp/src/lists/explode.cu @@ -188,7 +188,7 @@ std::unique_ptr explode_outer(table_view const& input_table, }); thrust::inclusive_scan(rmm::exec_policy(stream), null_or_empty, - null_or_empty + sliced_child.size(), + null_or_empty + explode_col.size(), null_or_empty_offset.begin()); auto null_or_empty_count = @@ -209,41 +209,48 @@ std::unique_ptr
explode_outer(table_view const& input_table, // offsets + 1 here to skip the 0th offset, which removes a - 1 operation later. auto offsets_minus_one = thrust::make_transform_iterator( thrust::next(offsets), [offsets] __device__(auto i) { return (i - offsets[0]) - 1; }); + + auto fill_gather_maps = [offsets_minus_one, + gather_map_p = gather_map.begin(), + explode_col_gather_map_p = explode_col_gather_map.begin(), + position_array = pos.begin(), + sliced_child_size = sliced_child.size(), + null_or_empty_offset_p = null_or_empty_offset.begin(), + include_position, + offsets, + null_or_empty, + offset_size = explode_col.offsets().size() - 1] __device__(auto idx) { + if (idx < sliced_child_size) { + auto lb_idx = + thrust::distance(offsets_minus_one, + thrust::lower_bound( + thrust::seq, offsets_minus_one, offsets_minus_one + (offset_size), idx)); + auto index_to_write = null_or_empty_offset_p[lb_idx] + idx; + gather_map_p[index_to_write] = lb_idx; + explode_col_gather_map_p[index_to_write] = idx; + if (include_position) { + position_array[index_to_write] = idx - (offsets[lb_idx] - offsets[0]); + } + } + if (null_or_empty[idx]) { + auto invalid_index = null_or_empty_offset_p[idx] == 0 + ? offsets[idx] + : offsets[idx] + null_or_empty_offset_p[idx] - 1; + gather_map_p[invalid_index] = idx; + + // negative one to indicate a null value + explode_col_gather_map_p[invalid_index] = -1; + if (include_position) { position_array[invalid_index] = 0; } + } + }; + + // we need to do this loop at least explode_col times or we may not properly fill in null and + // empty entries. + auto loop_count = std::max(sliced_child.size(), explode_col.size()); + // Fill in gather map with all the child column's entries - thrust::for_each(rmm::exec_policy(stream), - counting_iter, - counting_iter + sliced_child.size(), - [offsets_minus_one, - gather_map = gather_map.begin(), - explode_col_gather_map = explode_col_gather_map.begin(), - position_array = pos.begin(), - include_position, - offsets, - null_or_empty_offset = null_or_empty_offset.begin(), - null_or_empty, - offset_size = explode_col.offsets().size() - 1] __device__(auto idx) { - auto lb_idx = thrust::distance( - offsets_minus_one, - thrust::lower_bound( - thrust::seq, offsets_minus_one, offsets_minus_one + (offset_size), idx)); - auto index_to_write = null_or_empty_offset[lb_idx] + idx; - gather_map[index_to_write] = lb_idx; - explode_col_gather_map[index_to_write] = idx; - if (include_position) { - position_array[index_to_write] = idx - (offsets[lb_idx] - offsets[0]); - } - if (null_or_empty[idx]) { - auto invalid_index = null_or_empty_offset[idx] == 0 - ? offsets[idx] - : offsets[idx] + null_or_empty_offset[idx] - 1; - gather_map[invalid_index] = idx; - - // negative one to indicate a null value - explode_col_gather_map[invalid_index] = -1; - - if (include_position) { position_array[invalid_index] = 0; } - } - }); + thrust::for_each( + rmm::exec_policy(stream), counting_iter, counting_iter + loop_count, fill_gather_maps); return build_table( input_table, diff --git a/cpp/tests/lists/explode_tests.cpp b/cpp/tests/lists/explode_tests.cpp index 2ec9294d118..4c7ded0efd7 100644 --- a/cpp/tests/lists/explode_tests.cpp +++ b/cpp/tests/lists/explode_tests.cpp @@ -102,15 +102,17 @@ TEST_F(ExplodeTest, Basics) TEST_F(ExplodeTest, SingleNull) { // a b - // [1, 2, 7] 100 + // null 100 // [5, 6] 200 // [] 300 // [0, 3] 400 + constexpr auto null = 0; + auto first_invalid = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i == 0 ? false : true; }); - LCW a({LCW{1, 2, 7}, LCW{5, 6}, LCW{}, LCW{0, 3}}, first_invalid); + LCW a({LCW{null}, LCW{5, 6}, LCW{}, LCW{0, 3}}, first_invalid); FCW b({100, 200, 300, 400}); FCW expected_a{5, 6, 0, 3}; @@ -134,15 +136,17 @@ TEST_F(ExplodeTest, Nulls) { // a b // [1, 2, 7] 100 - // [5, 6] 200 + // null 200 // [0, 3] 300 + constexpr auto null = 0; + auto valids = cudf::detail::make_counting_transform_iterator( 0, [](auto i) { return i % 2 == 0 ? true : false; }); auto always_valid = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); - LCW a({LCW{1, 2, 7}, LCW{5, 6}, LCW{0, 3}}, valids); + LCW a({LCW{1, 2, 7}, LCW{null}, LCW{0, 3}}, valids); FCW b({100, 200, 300}, valids); FCW expected_a({1, 2, 7, 0, 3}); @@ -165,18 +169,21 @@ TEST_F(ExplodeTest, Nulls) TEST_F(ExplodeTest, NullsInList) { // a b - // [1, 2, 7] 100 - // [5, 6, 0, 9] 200 + // [1, null, 7] 100 + // [5, null, 0, null] 200 // [] 300 - // [0, 3, 8] 400 + // [0, null, 8] 400 + + constexpr auto null = 0; auto valids = cudf::detail::make_counting_transform_iterator( 0, [](auto i) { return i % 2 == 0 ? true : false; }); - LCW a{LCW({1, 2, 7}, valids), LCW({5, 6, 0, 9}, valids), LCW{}, LCW({0, 3, 8}, valids)}; + LCW a{ + LCW({1, null, 7}, valids), LCW({5, null, 0, null}, valids), LCW{}, LCW({0, null, 8}, valids)}; FCW b{100, 200, 300, 400}; - FCW expected_a({1, 2, 7, 5, 6, 0, 9, 0, 3, 8}, {1, 0, 1, 1, 0, 1, 0, 1, 0, 1}); + FCW expected_a({1, null, 7, 5, null, 0, null, 0, null, 8}, {1, 0, 1, 1, 0, 1, 0, 1, 0, 1}); FCW expected_b{100, 100, 100, 200, 200, 200, 200, 400, 400, 400}; cudf::table_view t({a, b}); @@ -224,16 +231,18 @@ TEST_F(ExplodeTest, NestedNulls) { // a b // [[1, 2], [7, 6, 5]] 100 - // [[5, 6]] 200 + // null null // [[0, 3],[5],[2, 1]] 300 + constexpr auto null = 0; + auto valids = cudf::detail::make_counting_transform_iterator( 0, [](auto i) { return i % 2 == 0 ? true : false; }); auto always_valid = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); - LCW a({LCW{LCW{1, 2}, LCW{7, 6, 5}}, LCW{LCW{5, 6}}, LCW{LCW{0, 3}, LCW{5}, LCW{2, 1}}}, valids); - FCW b({100, 200, 300}, valids); + LCW a({LCW{LCW{1, 2}, LCW{7, 6, 5}}, LCW{LCW{null}}, LCW{LCW{0, 3}, LCW{5}, LCW{2, 1}}}, valids); + FCW b({100, null, 300}, valids); LCW expected_a{LCW{1, 2}, LCW{7, 6, 5}, LCW{0, 3}, LCW{5}, LCW{2, 1}}; FCW expected_b({100, 100, 300, 300, 300}, always_valid); @@ -254,21 +263,23 @@ TEST_F(ExplodeTest, NestedNulls) TEST_F(ExplodeTest, NullsInNested) { - // a b - // [[1, 2], [7, 6, 5]] 100 - // [[5, 6]] 200 - // [[0, 3],[5],[2, 1]] 300 + // a b + // [[1, null], [7, 6, 5]] 100 + // [[5, 6]] 200 + // [[0, 3],[5],[2, null]] 300 + + constexpr auto null = 0; auto valids = cudf::detail::make_counting_transform_iterator( 0, [](auto i) { return i % 2 == 0 ? true : false; }); - LCW a({LCW{LCW({1, 2}, valids), LCW{7, 6, 5}}, + LCW a({LCW{LCW({1, null}, valids), LCW{7, 6, 5}}, LCW{LCW{5, 6}}, - LCW{LCW{0, 3}, LCW{5}, LCW({2, 1}, valids)}}); + LCW{LCW{0, 3}, LCW{5}, LCW({2, null}, valids)}}); FCW b({100, 200, 300}); LCW expected_a{ - LCW({1, 2}, valids), LCW{7, 6, 5}, LCW{5, 6}, LCW{0, 3}, LCW{5}, LCW({2, 1}, valids)}; + LCW({1, null}, valids), LCW{7, 6, 5}, LCW{5, 6}, LCW{0, 3}, LCW{5}, LCW({2, null}, valids)}; FCW expected_b{100, 100, 200, 300, 300, 300}; cudf::table_view t({a, b}); @@ -287,20 +298,22 @@ TEST_F(ExplodeTest, NullsInNested) TEST_F(ExplodeTest, NullsInNestedDoubleExplode) { - // a b - // [[1, 2], [], [7, 6, 5]] 100 - // [[5, 6]] 200 - // [[0, 3],[5],[2, 1]] 300 + // a b + // [[1, null], [], [7, 6, 5]] 100 + // [[5, 6]] 200 + // [[0, 3],[5],[2, null]] 300 + + constexpr auto null = 0; auto valids = cudf::detail::make_counting_transform_iterator( 0, [](auto i) { return i % 2 == 0 ? true : false; }); - LCW a{LCW{LCW({1, 2}, valids), LCW{}, LCW{7, 6, 5}}, + LCW a{LCW{LCW({1, null}, valids), LCW{}, LCW{7, 6, 5}}, LCW{LCW{5, 6}}, - LCW{LCW{0, 3}, LCW{5}, LCW({2, 1}, valids)}}; + LCW{LCW{0, 3}, LCW{5}, LCW({2, null}, valids)}}; FCW b{100, 200, 300}; - FCW expected_a({1, 2, 7, 6, 5, 5, 6, 0, 3, 5, 2, 1}, {1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0}); + FCW expected_a({1, null, 7, 6, 5, 5, 6, 0, 3, 5, 2, null}, {1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0}); FCW expected_b{100, 100, 100, 100, 100, 200, 200, 300, 300, 300, 300, 300}; cudf::table_view t({a, b}); @@ -320,23 +333,25 @@ TEST_F(ExplodeTest, NullsInNestedDoubleExplode) TEST_F(ExplodeTest, NestedStructs) { - // a b - // [[1, 2], [7, 6, 5]] {100, "100"} - // [[5, 6]] {200, "200"} - // [[0, 3],[5],[2, 1]] {300, "300"} + // a b + // [[1, null], [7, 6, 5]] {100, "100"} + // [[5, 6]] {200, "200"} + // [[0, 3],[5],[2, null]] {300, "300"} + + constexpr auto null = 0; auto valids = cudf::detail::make_counting_transform_iterator( 0, [](auto i) { return i % 2 == 0 ? true : false; }); - LCW a({LCW{LCW({1, 2}, valids), LCW{7, 6, 5}}, + LCW a({LCW{LCW({1, null}, valids), LCW{7, 6, 5}}, LCW{LCW{5, 6}}, - LCW{LCW{0, 3}, LCW{5}, LCW({2, 1}, valids)}}); + LCW{LCW{0, 3}, LCW{5}, LCW({2, null}, valids)}}); FCW b1({100, 200, 300}); strings_column_wrapper b2{"100", "200", "300"}; structs_column_wrapper b({b1, b2}); LCW expected_a{ - LCW({1, 2}, valids), LCW{7, 6, 5}, LCW{5, 6}, LCW{0, 3}, LCW{5}, LCW({2, 1}, valids)}; + LCW({1, null}, valids), LCW{7, 6, 5}, LCW{5, 6}, LCW{0, 3}, LCW{5}, LCW({2, null}, valids)}; FCW expected_b1{100, 100, 200, 300, 300, 300}; strings_column_wrapper expected_b2{"100", "100", "200", "300", "300", "300"}; structs_column_wrapper expected_b({expected_b1, expected_b2}); @@ -397,15 +412,17 @@ TYPED_TEST(ExplodeTypedTest, ListOfStructs) TEST_F(ExplodeTest, SlicedList) { - // a b - // [[1, 2],[7, 6, 5]] 100 - // [[5, 6]] 200 - // [[0, 3],[5],[2, 1]] 300 - // [[8, 3],[],[4, 3, 1, 2]] 400 - // [[2, 3, 4],[9, 8]] 500 + // a b + // [[1, null],[7, 6, 5]] 100 + // [[5, 6]] 200 + // [[0, 3],[5],[2, null]] 300 + // [[8, 3],[],[4, null, 1, null]] 400 + // [[2, 3, 4],[9, 8]] 500 // slicing the top 2 rows and the bottom row off + constexpr auto null = 0; + auto valids = cudf::detail::make_counting_transform_iterator( 0, [](auto i) { return i % 2 == 0 ? true : false; }); @@ -417,7 +434,7 @@ TEST_F(ExplodeTest, SlicedList) FCW b({100, 200, 300, 400, 500}); LCW expected_a{ - LCW{0, 3}, LCW{5}, LCW({2, 1}, valids), LCW{8, 3}, LCW{}, LCW({4, 3, 1, 2}, valids)}; + LCW{0, 3}, LCW{5}, LCW({2, null}, valids), LCW{8, 3}, LCW{}, LCW({4, null, 1, null}, valids)}; FCW expected_b{300, 300, 300, 400, 400, 400}; cudf::table_view t({a, b}); @@ -490,19 +507,21 @@ TEST_F(ExplodeOuterTest, Basics) TEST_F(ExplodeOuterTest, SingleNull) { - // a b - // [1, 2, 7] 100 - // [5, 6] 200 - // [] 300 - // [0, 3] 400 + // a b + // null 100 + // [5, 6] 200 + // [] 300 + // [0, 3] 400 + + constexpr auto null = 0; auto first_invalid = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i == 0 ? false : true; }); - LCW a({LCW{1, 2, 7}, LCW{5, 6}, LCW{}, LCW{0, 3}}, first_invalid); + LCW a({LCW{null}, LCW{5, 6}, LCW{}, LCW{0, 3}}, first_invalid); FCW b({100, 200, 300, 400}); - FCW expected_a{{0, 5, 6, 0, 0, 3}, {0, 1, 1, 0, 1, 1}}; + FCW expected_a{{null, 5, 6, 0, 0, 3}, {0, 1, 1, 0, 1, 1}}; FCW expected_b{100, 200, 200, 300, 400, 400}; cudf::table_view t({a, b}); @@ -522,17 +541,19 @@ TEST_F(ExplodeOuterTest, Nulls) { // a b // [1, 2, 7] 100 - // [5, 6] 200 + // null null // [0, 3] 300 + constexpr auto null = 0; + auto valids = cudf::detail::make_counting_transform_iterator( 0, [](auto i) { return i % 2 == 0 ? true : false; }); - LCW a({LCW{1, 2, 7}, LCW{5, 6}, LCW{0, 3}}, valids); - FCW b({100, 200, 300}, valids); + LCW a({LCW{1, 2, 7}, LCW{null}, LCW{0, 3}}, valids); + FCW b({100, null, 300}, valids); - FCW expected_a({1, 2, 7, 0, 0, 3}, {1, 1, 1, 0, 1, 1}); - FCW expected_b({100, 100, 100, 200, 300, 300}, {1, 1, 1, 0, 1, 1}); + FCW expected_a({1, 2, 7, null, 0, 3}, {1, 1, 1, 0, 1, 1}); + FCW expected_b({100, 100, 100, null, 300, 300}, {1, 1, 1, 0, 1, 1}); cudf::table_view t({a, b}); cudf::table_view expected({expected_a, expected_b}); @@ -547,21 +568,182 @@ TEST_F(ExplodeOuterTest, Nulls) CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); } +TEST_F(ExplodeOuterTest, AllNulls) +{ + // a b + // null 100 + // null 200 + // null 300 + + constexpr auto null = 0; + + auto non_valid = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return false; }); + + LCW a({LCW{null}, LCW{null}, LCW{null}}, non_valid); + FCW b({100, 200, 300}); + + FCW expected_a({null, null, null}, {0, 0, 0}); + FCW expected_b({100, 200, 300}); + + cudf::table_view t({a, b}); + cudf::table_view expected({expected_a, expected_b}); + + auto ret = cudf::explode_outer(t, 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + FCW expected_pos_col{0, 0, 0}; + cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); + + auto pos_ret = cudf::explode_outer_position(t, 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); +} + +TEST_F(ExplodeOuterTest, SequentialNulls) +{ + // a b + // [1, 2, null] 100 + // [3, 4] 200 + // [] 300 + // [] 400 + // [5, 6, 7] 500 + + constexpr auto null = 0; + + auto third_invalid = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i == 2 ? false : true; }); + + LCW a{LCW({1, 2, null}, third_invalid), LCW{3, 4}, LCW{}, LCW{}, LCW{5, 6, 7}}; + FCW b{100, 200, 300, 400, 500}; + + FCW expected_a({1, 2, null, 3, 4, null, null, 5, 6, 7}, {1, 1, 0, 1, 1, 0, 0, 1, 1, 1}); + FCW expected_b({100, 100, 100, 200, 200, 300, 400, 500, 500, 500}); + + cudf::table_view t({a, b}); + cudf::table_view expected({expected_a, expected_b}); + + auto ret = cudf::explode_outer(t, 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + FCW expected_pos_col{0, 1, 2, 0, 1, 0, 0, 0, 1, 2}; + cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); + + auto pos_ret = cudf::explode_outer_position(t, 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); +} + +TEST_F(ExplodeOuterTest, MoreEmptyThanData) +{ + // a b + // [1, 2] 100 + // [] 200 + // [] 300 + // [] 400 + // [] 500 + // [3] 600 + + constexpr auto null = 0; + + LCW a{LCW{1, 2}, LCW{}, LCW{}, LCW{}, LCW{}, LCW{3}}; + FCW b{100, 200, 300, 400, 500, 600}; + + FCW expected_a({1, 2, null, null, null, null, 3}, {1, 1, 0, 0, 0, 0, 1}); + FCW expected_b({100, 100, 200, 300, 400, 500, 600}); + + cudf::table_view t({a, b}); + cudf::table_view expected({expected_a, expected_b}); + + auto ret = cudf::explode_outer(t, 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + FCW expected_pos_col{0, 1, 0, 0, 0, 0, 0}; + cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); + + auto pos_ret = cudf::explode_outer_position(t, 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); +} + +TEST_F(ExplodeOuterTest, TrailingEmptys) +{ + // a b + // [1, 2] 100 + // [] 200 + // [] 300 + // [] 400 + // [] 500 + + constexpr auto null = 0; + + LCW a{LCW{1, 2}, LCW{}, LCW{}, LCW{}, LCW{}}; + FCW b{100, 200, 300, 400, 500}; + + FCW expected_a({1, 2, null, null, null, null}, {1, 1, 0, 0, 0, 0}); + FCW expected_b({100, 100, 200, 300, 400, 500}); + + cudf::table_view t({a, b}); + cudf::table_view expected({expected_a, expected_b}); + + auto ret = cudf::explode_outer(t, 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + FCW expected_pos_col{0, 1, 0, 0, 0, 0}; + cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); + + auto pos_ret = cudf::explode_outer_position(t, 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); +} + +TEST_F(ExplodeOuterTest, LeadingNulls) +{ + // a b + // null 100 + // null 200 + // null 300 + // null 400 + // [1, 2] 500 + + constexpr auto null = 0; + + auto valids = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i == 4 ? true : false; }); + + LCW a({LCW{null}, LCW{null}, LCW{null}, LCW{null}, LCW{1, 2}}, valids); + FCW b{100, 200, 300, 400, 500}; + + FCW expected_a({null, null, null, null, 1, 2}, {0, 0, 0, 0, 1, 1}); + FCW expected_b({100, 200, 300, 400, 500, 500}); + + cudf::table_view t({a, b}); + cudf::table_view expected({expected_a, expected_b}); + + auto ret = cudf::explode_outer(t, 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected); + + FCW expected_pos_col{0, 0, 0, 0, 0, 1}; + cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b}); + + auto pos_ret = cudf::explode_outer_position(t, 0); + CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected); +} + TEST_F(ExplodeOuterTest, NullsInList) { // a b - // [1, 2, 7] 100 - // [5, 6, 0, 9] 200 + // [1, null, 7] 100 + // [5, null, 0, null] 200 // [] 300 - // [0, 3, 8] 400 + // [0, null, 8] 400 + + constexpr auto null = 0; auto valids = cudf::detail::make_counting_transform_iterator( 0, [](auto i) { return i % 2 == 0 ? true : false; }); - LCW a{LCW({1, 2, 7}, valids), LCW({5, 6, 0, 9}, valids), LCW{}, LCW({0, 3, 8}, valids)}; + LCW a{ + LCW({1, null, 7}, valids), LCW({5, null, 0, null}, valids), LCW{}, LCW({0, null, 8}, valids)}; FCW b{100, 200, 300, 400}; - FCW expected_a({1, 2, 7, 5, 6, 0, 9, 0, 0, 3, 8}, {1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1}); + FCW expected_a({1, null, 7, 5, null, 0, null, null, 0, null, 8}, + {1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1}); FCW expected_b{100, 100, 100, 200, 200, 200, 200, 300, 400, 400, 400}; cudf::table_view t({a, b}); @@ -612,15 +794,18 @@ TEST_F(ExplodeOuterTest, NestedNulls) // [[5, 6]] 200 // [[0, 3],[5],[2, 1]] 300 + constexpr auto null = 0; + auto valids = cudf::detail::make_counting_transform_iterator( 0, [](auto i) { return i % 2 == 0 ? true : false; }); - LCW a({LCW{LCW{1, 2}, LCW{7, 6, 5}}, LCW{LCW{5, 6}}, LCW{LCW{0, 3}, LCW{5}, LCW{2, 1}}}, valids); + LCW a({LCW{LCW{1, 2}, LCW{7, 6, 5}}, LCW{LCW{null}}, LCW{LCW{0, 3}, LCW{5}, LCW{2, 1}}}, valids); FCW b({100, 200, 300}); auto expected_valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i == 2 ? false : true; }); - LCW expected_a({LCW{1, 2}, LCW{7, 6, 5}, LCW{}, LCW{0, 3}, LCW{5}, LCW{2, 1}}, expected_valids); + LCW expected_a({LCW{1, 2}, LCW{7, 6, 5}, LCW{null}, LCW{0, 3}, LCW{5}, LCW{2, 1}}, + expected_valids); FCW expected_b({100, 100, 200, 300, 300, 300}); cudf::table_view t({a, b}); cudf::table_view expected({expected_a, expected_b}); @@ -637,21 +822,23 @@ TEST_F(ExplodeOuterTest, NestedNulls) TEST_F(ExplodeOuterTest, NullsInNested) { - // a b - // [[1, 2], [7, 6, 5]] 100 - // [[5, 6]] 200 - // [[0, 3],[5],[2, 1]] 300 + // a b + // [[1, null], [7, 6, 5]] 100 + // [[5, 6]] 200 + // [[0, 3],[5],[2, null]] 300 + + constexpr auto null = 0; auto valids = cudf::detail::make_counting_transform_iterator( 0, [](auto i) { return i % 2 == 0 ? true : false; }); - LCW a({LCW{LCW({1, 2}, valids), LCW{7, 6, 5}}, + LCW a({LCW{LCW({1, null}, valids), LCW{7, 6, 5}}, LCW{LCW{5, 6}}, - LCW{LCW{0, 3}, LCW{5}, LCW({2, 1}, valids)}}); + LCW{LCW{0, 3}, LCW{5}, LCW({2, null}, valids)}}); FCW b({100, 200, 300}); LCW expected_a{ - LCW({1, 2}, valids), LCW{7, 6, 5}, LCW{5, 6}, LCW{0, 3}, LCW{5}, LCW({2, 1}, valids)}; + LCW({1, null}, valids), LCW{7, 6, 5}, LCW{5, 6}, LCW{0, 3}, LCW{5}, LCW({2, null}, valids)}; FCW expected_b{100, 100, 200, 300, 300, 300}; cudf::table_view t({a, b}); @@ -670,20 +857,23 @@ TEST_F(ExplodeOuterTest, NullsInNested) TEST_F(ExplodeOuterTest, NullsInNestedDoubleExplode) { - // a b - // [[1, 2], [], [7, 6, 5]] 100 - // [[5, 6]] 200 - // [[0, 3],[5],[2, 1]] 300 + // a b + // [[1, null], [], [7, 6, 5]] 100 + // [[5, 6]] 200 + // [[0, 3],[5],[2, null]] 300 + + constexpr auto null = 0; auto valids = cudf::detail::make_counting_transform_iterator( 0, [](auto i) { return i % 2 == 0 ? true : false; }); - LCW a{LCW{LCW({1, 2}, valids), LCW{}, LCW{7, 6, 5}}, + LCW a{LCW{LCW({1, null}, valids), LCW{}, LCW{7, 6, 5}}, LCW{LCW{5, 6}}, - LCW{LCW{0, 3}, LCW{5}, LCW({2, 1}, valids)}}; + LCW{LCW{0, 3}, LCW{5}, LCW({2, null}, valids)}}; FCW b{100, 200, 300}; - FCW expected_a({1, 2, 0, 7, 6, 5, 5, 6, 0, 3, 5, 2, 1}, {1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0}); + FCW expected_a({1, null, null, 7, 6, 5, 5, 6, 0, 3, 5, 2, null}, + {1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0}); FCW expected_b{100, 100, 100, 100, 100, 100, 200, 200, 300, 300, 300, 300, 300}; cudf::table_view t({a, b}); @@ -703,23 +893,25 @@ TEST_F(ExplodeOuterTest, NullsInNestedDoubleExplode) TEST_F(ExplodeOuterTest, NestedStructs) { - // a b - // [[1, 2], [7, 6, 5]] {100, "100"} - // [[5, 6]] {200, "200"} - // [[0, 3],[5],[2, 1]] {300, "300"} + // a b + // [[1, null], [7, 6, 5]] {100, "100"} + // [[5, 6]] {200, "200"} + // [[0, 3],[5],[2, null]] {300, "300"} + + constexpr auto null = 0; auto valids = cudf::detail::make_counting_transform_iterator( 0, [](auto i) { return i % 2 == 0 ? true : false; }); - LCW a({LCW{LCW({1, 2}, valids), LCW{7, 6, 5}}, + LCW a({LCW{LCW({1, null}, valids), LCW{7, 6, 5}}, LCW{LCW{5, 6}}, - LCW{LCW{0, 3}, LCW{5}, LCW({2, 1}, valids)}}); + LCW{LCW{0, 3}, LCW{5}, LCW({2, null}, valids)}}); FCW b1({100, 200, 300}); strings_column_wrapper b2{"100", "200", "300"}; structs_column_wrapper b({b1, b2}); LCW expected_a{ - LCW({1, 2}, valids), LCW{7, 6, 5}, LCW{5, 6}, LCW{0, 3}, LCW{5}, LCW({2, 1}, valids)}; + LCW({1, null}, valids), LCW{7, 6, 5}, LCW{5, 6}, LCW{0, 3}, LCW{5}, LCW({2, null}, valids)}; FCW expected_b1{100, 100, 200, 300, 300, 300}; strings_column_wrapper expected_b2{"100", "100", "200", "300", "300", "300"}; structs_column_wrapper expected_b({expected_b1, expected_b2}); @@ -780,27 +972,29 @@ TYPED_TEST(ExplodeOuterTypedTest, ListOfStructs) TEST_F(ExplodeOuterTest, SlicedList) { - // a b - // [[1, 2],[7, 6, 5]] 100 - // [[5, 6]] 200 - // [[0, 3],[5],[2, 1]] 300 - // [[8, 3],[],[4, 3, 1, 2]] 400 - // [[2, 3, 4],[9, 8]] 500 + // a b + // [[1, null],[7, 6, 5]] 100 + // [[5, 6]] 200 + // [[0, 3],[5],[2, null]] 300 + // [[8, 3],[],[4, null, 1, null]] 400 + // [[2, 3, 4],[9, 8]] 500 // slicing the top 2 rows and the bottom row off + constexpr auto null = 0; + auto valids = cudf::detail::make_counting_transform_iterator( 0, [](auto i) { return i % 2 == 0 ? true : false; }); - LCW a({LCW{LCW({1, 2}, valids), LCW{7, 6, 5}}, + LCW a({LCW{LCW({1, null}, valids), LCW{7, 6, 5}}, LCW{LCW{5, 6}}, - LCW{LCW{0, 3}, LCW{5}, LCW({2, 1}, valids)}, - LCW{LCW{8, 3}, LCW{}, LCW({4, 3, 1, 2}, valids)}, + LCW{LCW{0, 3}, LCW{5}, LCW({2, null}, valids)}, + LCW{LCW{8, 3}, LCW{}, LCW({4, null, 1, null}, valids)}, LCW{LCW{2, 3, 4}, LCW{9, 8}}}); FCW b({100, 200, 300, 400, 500}); LCW expected_a{ - LCW{0, 3}, LCW{5}, LCW({2, 1}, valids), LCW{8, 3}, LCW{}, LCW({4, 3, 1, 2}, valids)}; + LCW{0, 3}, LCW{5}, LCW({2, null}, valids), LCW{8, 3}, LCW{}, LCW({4, null, 1, null}, valids)}; FCW expected_b{300, 300, 300, 400, 400, 400}; cudf::table_view t({a, b}); diff --git a/python/cudf/cudf/_lib/table.pyx b/python/cudf/cudf/_lib/table.pyx index f97b45d8abf..93d79ba6843 100644 --- a/python/cudf/cudf/_lib/table.pyx +++ b/python/cudf/cudf/_lib/table.pyx @@ -99,22 +99,30 @@ cdef class Table: cdef vector[unique_ptr[column]].iterator it = columns.begin() # First construct the index, if any + cdef int i + index = None if index_names is not None: - index_columns = [] - for _ in index_names: - index_columns.append(Column.from_unique_ptr( - move(dereference(it)) - )) - it += 1 - index = Table(dict(zip(index_names, index_columns))) + index_data = ColumnAccessor._create_unsafe( + { + name: Column.from_unique_ptr( + move(dereference(it + i)) + ) + for i, name in enumerate(index_names) + } + ) + index = Table(data=index_data) # Construct the data dict - data_columns = [] - for _ in column_names: - data_columns.append(Column.from_unique_ptr(move(dereference(it)))) - it += 1 - data = dict(zip(column_names, data_columns)) + cdef int n_index_columns = len(index_names) if index_names else 0 + data = ColumnAccessor._create_unsafe( + { + name: Column.from_unique_ptr( + move(dereference(it + i + n_index_columns)) + ) + for i, name in enumerate(column_names) + } + ) return Table(data=data, index=index) diff --git a/python/cudf/cudf/core/buffer.py b/python/cudf/cudf/core/buffer.py index 350346a87f9..9fc5570e35a 100644 --- a/python/cudf/cudf/core/buffer.py +++ b/python/cudf/cudf/core/buffer.py @@ -42,6 +42,10 @@ def __init__( self.ptr = data.ptr self.size = data.size self._owner = owner or data._owner + elif isinstance(data, rmm.DeviceBuffer): + self.ptr = data.ptr + self.size = data.size + self._owner = data elif hasattr(data, "__array_interface__") or hasattr( data, "__cuda_array_interface__" ): diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index b2b2874eeb4..dd06d97d105 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1017,7 +1017,9 @@ def distinct_count( return cpp_distinct_count(self, ignore_nulls=dropna) def astype(self, dtype: Dtype, **kwargs) -> ColumnBase: - if is_categorical_dtype(dtype): + if is_numerical_dtype(dtype): + return self.as_numerical_column(dtype) + elif is_categorical_dtype(dtype): return self.as_categorical_column(dtype, **kwargs) elif pd.api.types.pandas_dtype(dtype).type in { np.str_, @@ -1548,6 +1550,16 @@ def build_column( """ dtype = pd.api.types.pandas_dtype(dtype) + if is_numerical_dtype(dtype): + assert data is not None + return cudf.core.column.NumericalColumn( + data=data, + dtype=dtype, + mask=mask, + size=size, + offset=offset, + null_count=null_count, + ) if is_categorical_dtype(dtype): if not len(children) == 1: raise ValueError( @@ -1634,15 +1646,7 @@ def build_column( children=children, ) else: - assert data is not None - return cudf.core.column.NumericalColumn( - data=data, - dtype=dtype, - mask=mask, - size=size, - offset=offset, - null_count=null_count, - ) + raise TypeError(f"Unrecognized dtype: {dtype}") def build_categorical_column( diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index 0c580132290..33bae5c1328 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -19,11 +19,7 @@ import cudf from cudf.core import column -from cudf.utils.utils import ( - cached_property, - to_flat_dict, - to_nested_dict, -) +from cudf.utils.utils import cached_property, to_flat_dict, to_nested_dict if TYPE_CHECKING: from cudf.core.column import ColumnBase @@ -84,6 +80,21 @@ def __init__( self.multiindex = multiindex self._level_names = level_names + @classmethod + def _create_unsafe( + cls, + data: Dict[Any, ColumnBase], + multiindex: bool = False, + level_names=None, + ) -> ColumnAccessor: + # create a ColumnAccessor without verifying column + # type or size + obj = cls() + obj._data = data + obj.multiindex = multiindex + obj._level_names = level_names + return obj + def __iter__(self): return self._data.__iter__() @@ -167,7 +178,7 @@ def _column_length(self): return 0 def _clear_cache(self): - cached_properties = "columns", "names", "_grouped_data" + cached_properties = ("columns", "names", "_grouped_data") for attr in cached_properties: try: self.__delattr__(attr) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index e6898b8c606..ecff3dee573 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -2408,7 +2408,9 @@ def _copy_type_metadata( for name, col, other_col in zip( self._data.keys(), self._data.values(), other._data.values() ): - self._data[name] = other_col._copy_type_metadata(col) + self._data.set_by_label( + name, other_col._copy_type_metadata(col), validate=False + ) if include_index: if self._index is not None and other._index is not None: diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 1438421bb12..8875a36dba8 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -144,16 +144,13 @@ def numeric_normalize_types(*args): def is_numerical_dtype(obj): - if is_categorical_dtype(obj): + # TODO: we should handle objects with a `.dtype` attribute, + # e.g., arrays, here. + try: + dtype = np.dtype(obj) + except TypeError: return False - if is_list_dtype(obj): - return False - return ( - np.issubdtype(obj, np.bool_) - or np.issubdtype(obj, np.floating) - or np.issubdtype(obj, np.signedinteger) - or np.issubdtype(obj, np.unsignedinteger) - ) + return dtype.kind in "biuf" def is_string_dtype(obj):