From d791e20a6b29ef3576bf0eba97b84620d7dcaa6e Mon Sep 17 00:00:00 2001 From: nvdbaranec <56695930+nvdbaranec@users.noreply.github.com> Date: Tue, 12 Jan 2021 14:22:27 -0600 Subject: [PATCH] Handle nested string columns with no children in contiguous_split.(#6864) Fixes a specific corner case: String columns with no children (a special form of empty string column that can happen) that are nested inside a list (or struct) column. This would be useful as a 0.17 PR but isn't strictly necessary, since it's pretty late. Edit: Updated the fix so that it always includes a record for src/dst buffers, even if they are of size 0 or have null data pointers. The previous method that only checked the data pointer being null was unclean and didn't handle a particularly strange case that came up with the Spark plugin: the plugin was reconstructing columns (on the receiver side of a shuffle) that had size 0 but a non-null data pointer. This is technically legal but super weird. Authors: - Dave Baranec - Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Approvers: - Karthikeyan (@karthikeyann) - Alfred Xu (@sperlingxx) - Karthikeyan (@karthikeyann) - Alfred Xu (@sperlingxx) - Karthikeyan (@karthikeyann) - Devavret Makkar (@devavret) URL: https://github.com/rapidsai/cudf/pull/6864 --- CHANGELOG.md | 1 + cpp/src/copying/contiguous_split.cu | 120 ++++++++++------- cpp/tests/copying/split_tests.cpp | 195 ++++++++++++++++++++++++++++ 3 files changed, 269 insertions(+), 47 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2d218b5a853..a3c84ba1b72 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -227,6 +227,7 @@ - PR #6855 Fix `.str.replace_with_backrefs` docs examples - PR #6853 Fix contiguous split of null string columns - PR #6861 Fix compile error in type_dispatch_benchmark.cu +- PR #6864 Handle contiguous_split corner case for nested string columns with no children - PR #6869 Avoid dependency resolution failure in latest version of pip by explicitly specifying versions for dask and distributed - PR #6806 Force install of local conda artifacts - PR #6887 Fix typo and `0-d` numpy array handling in binary operation diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index b896fafcbf6..10e2dfbdaeb 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -271,7 +271,7 @@ template size_t compute_offset_stack_size(InputIter begin, InputIter end, int offset_depth = 0) { return std::accumulate(begin, end, 0, [offset_depth](auto stack_size, column_view const& col) { - auto const num_buffers = (col.head() != nullptr ? 1 : 0) + (col.nullable() ? 1 : 0); + auto const num_buffers = 1 + (col.nullable() ? 1 : 0); return stack_size + (offset_depth * num_buffers) + compute_offset_stack_size( col.child_begin(), col.child_end(), offset_depth + is_offset_type(col.type().id())); @@ -300,11 +300,11 @@ OutputIter setup_src_buf_data(InputIter begin, InputIter end, OutputIter out_buf out_buf++; } // NOTE: we're always returning the base pointer here. column-level offset is accounted - // for later. - if (col.head() != nullptr) { - *out_buf = const_cast(col.head()); - out_buf++; - } + // for later. Also, for some column types (string, list, struct) this pointer will be null + // because there is no associated data with the root column. + *out_buf = const_cast(col.head()); + out_buf++; + out_buf = setup_src_buf_data(col.child_begin(), col.child_end(), out_buf); }); return out_buf; @@ -329,8 +329,7 @@ template size_type count_src_bufs(InputIter begin, InputIter end) { auto buf_iter = thrust::make_transform_iterator(begin, [](column_view const& col) { - return (col.head() != nullptr ? 1 : 0) + (col.nullable() ? 1 : 0) + - count_src_bufs(col.child_begin(), col.child_end()); + return 1 + (col.nullable() ? 1 : 0) + count_src_bufs(col.child_begin(), col.child_end()); }); return std::accumulate(buf_iter, buf_iter + std::distance(begin, end), 0); } @@ -417,38 +416,53 @@ std::pair buf_info_functor::operator()>(), - offset_stack_pos, - parent_offset_index, - false, - col.offset()); + // string columns don't necessarily have children + if (col.num_children() > 0) { + CUDF_EXPECTS(col.num_children() == 2, "Encountered malformed string column"); + strings_column_view scv(col); + + // info for the offsets buffer + auto offset_col = current; + CUDF_EXPECTS(scv.offsets().nullable() == false, "Encountered nullable string offsets column"); + *current = src_buf_info(type_id::INT32, + // note: offsets can be null in the case where the string column + // has been created with empty_like(). + scv.offsets().begin>(), + offset_stack_pos, + parent_offset_index, + false, + col.offset()); - // prevent appending buf_info for non-exist chars buffer - if (scv.chars_size() > 0) { current++; offset_stack_pos += offset_depth; - // since we are crossing an offset boundary, our offset_depth and parent_offset_index go up. + // since we are crossing an offset boundary, calculate our new depth and parent offset index. offset_depth++; parent_offset_index = offset_col - head; + // prevent appending buf_info for non-existent chars buffer + CUDF_EXPECTS(scv.chars().nullable() == false, "Encountered nullable string chars column"); + // info for the chars buffer *current = src_buf_info( type_id::INT8, nullptr, offset_stack_pos, parent_offset_index, false, col.offset()); + current++; + offset_stack_pos += offset_depth; } - return {current + 1, offset_stack_pos + offset_depth}; + return {current, offset_stack_pos}; } template <> @@ -466,10 +480,20 @@ std::pair buf_info_functor::operator()>(), offset_stack_pos, parent_offset_index, @@ -478,7 +502,7 @@ std::pair buf_info_functor::operator() buf_info_functor::operator()(nullptr), 0); }(); - uint8_t const* data_ptr; - size_type size; - std::tie(data_ptr, size) = [&]() { - if (src.head()) { - auto const ptr = base_ptr + current_info->dst_offset; - // if we have data, num_elements will always be the correct size. - // we don't want to use num_rows because if we are an offset column, num_rows - // represents the # of rows of our owning parent. num_elements always represents - // the proper size for this column - auto const size = current_info->num_elements; - ++current_info; - return std::make_pair(ptr, size); - } - // Parent columns w/o data (e.g., strings, lists) don't have an associated `dst_buf_info`, - // therefore, use the first child's info if it has at least one child. Their num_rows value - // will be correct (also see comment above) - auto const size = (src.num_children() == 0) ? 0 : current_info->num_rows; - return std::make_pair(static_cast(nullptr), size); - }(); + + // size/data pointer for the column + auto const size = current_info->num_elements; + uint8_t const* data_ptr = + size == 0 || src.head() == nullptr ? nullptr : base_ptr + current_info->dst_offset; + ++current_info; + + // children auto children = std::vector{}; children.reserve(src.num_children()); current_info = build_output_columns( src.child_begin(), src.child_end(), current_info, std::back_inserter(children), base_ptr); + return column_view{src.type(), size, data_ptr, bitmask_ptr, null_count, 0, std::move(children)}; }); @@ -819,8 +840,13 @@ std::vector contiguous_split(cudf::table_view const& in int row_end = d_indices[split_index + 1] + src_info.column_offset; while (stack_size > 0) { stack_size--; - row_start = d_src_buf_info[offset_stack[stack_size]].offsets[row_start]; - row_end = d_src_buf_info[offset_stack[stack_size]].offsets[row_end]; + auto const offsets = d_src_buf_info[offset_stack[stack_size]].offsets; + // this case can happen when you have empty string or list columns constructed with + // empty_like() + if (offsets != nullptr) { + row_start = offsets[row_start]; + row_end = offsets[row_end]; + } } // final row indices and row count @@ -832,7 +858,7 @@ std::vector contiguous_split(cudf::table_view const& in int const bit_shift = src_info.is_validity ? row_start % 32 : 0; // # of rows isn't necessarily the same as # of elements to be copied. auto const num_elements = [&]() { - if (src_info.offsets != nullptr) { + if (src_info.offsets != nullptr && num_rows > 0) { return num_rows + 1; } else if (src_info.is_validity) { return (num_rows + 31) / 32; diff --git a/cpp/tests/copying/split_tests.cpp b/cpp/tests/copying/split_tests.cpp index e965df6db81..3c95e5e5da1 100644 --- a/cpp/tests/copying/split_tests.cpp +++ b/cpp/tests/copying/split_tests.cpp @@ -16,6 +16,7 @@ #include #include +#include #include #include @@ -899,6 +900,81 @@ void split_structs(bool include_validity, SplitFunc Split, CompareFunc Compare) } } +template +void split_structs_no_children(SplitFunc Split, CompareFunc Compare) +{ + using namespace cudf::test; + + // no nulls + { + auto struct_column = cudf::make_structs_column(4, {}, 0, rmm::device_buffer{}); + auto expected = cudf::make_structs_column(2, {}, 0, rmm::device_buffer{}); + + // split + std::vector splits{2}; + auto result = Split(*struct_column, splits); + + EXPECT_EQ(result.size(), 2); + Compare(*expected, result[0]); + Compare(*expected, result[1]); + } + + // all nulls + { + std::vector struct_validity{false, false, false, false}; + auto struct_column = cudf::make_structs_column( + 4, {}, 4, detail::make_null_mask(struct_validity.begin(), struct_validity.end())); + + std::vector expected_validity{false, false}; + auto expected = cudf::make_structs_column( + 2, {}, 2, detail::make_null_mask(expected_validity.begin(), expected_validity.end())); + + // split + std::vector splits{2}; + auto result = Split(*struct_column, splits); + + EXPECT_EQ(result.size(), 2); + Compare(*expected, result[0]); + Compare(*expected, result[1]); + } + + // no nulls, empty output column + { + auto struct_column = cudf::make_structs_column(4, {}, 0, rmm::device_buffer{}); + auto expected0 = cudf::make_structs_column(4, {}, 0, rmm::device_buffer{}); + auto expected1 = cudf::make_structs_column(0, {}, 0, rmm::device_buffer{}); + + // split + std::vector splits{4}; + auto result = Split(*struct_column, splits); + + EXPECT_EQ(result.size(), 2); + Compare(*expected0, result[0]); + Compare(*expected1, result[1]); + } + + // all nulls, empty output column + { + std::vector struct_validity{false, false, false, false}; + auto struct_column = cudf::make_structs_column( + 4, {}, 4, detail::make_null_mask(struct_validity.begin(), struct_validity.end())); + + std::vector expected_validity0{false, false, false, false}; + auto expected0 = cudf::make_structs_column( + 4, {}, 4, detail::make_null_mask(expected_validity0.begin(), expected_validity0.end())); + + auto expected1 = cudf::make_structs_column(0, {}, 0, rmm::device_buffer{}); + + // split + std::vector splits{4}; + auto result = Split(*struct_column, splits); + + EXPECT_EQ(result.size(), 2); + Compare(*expected0, result[0]); + Compare(*expected1, result[1]); + } +} + template void split_nested_struct_of_list(SplitFunc Split, CompareFunc Compare) { @@ -1012,6 +1088,17 @@ TEST_F(SplitNestedTypesTest, StructsWithNulls) }); } +TEST_F(SplitNestedTypesTest, StructsNoChildren) +{ + split_structs_no_children( + [](cudf::column_view const& t, std::vector const& splits) { + return cudf::split(t, splits); + }, + [](cudf::column_view const& expected, cudf::column_view const& result) { + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, result); + }); +} + TEST_F(SplitNestedTypesTest, StructsOfList) { split_nested_struct_of_list( @@ -1276,6 +1363,102 @@ TEST_F(ContiguousSplitTableCornerCases, PreSplitTable) } } +TEST_F(ContiguousSplitTableCornerCases, NestedEmpty) +{ + // this produces an empty strings column with no children, + // nested inside a list + { + auto empty_string = cudf::strings::detail::make_empty_strings_column(); + auto offsets = cudf::test::fixed_width_column_wrapper({0, 0}); + auto list = cudf::make_lists_column( + 1, offsets.release(), std::move(empty_string), 0, rmm::device_buffer{0}); + + cudf::table_view src_table({static_cast(*list)}); + + std::vector splits({0}); + EXPECT_NO_THROW(contiguous_split(src_table, splits)); + + std::vector splits2({1}); + EXPECT_NO_THROW(contiguous_split(src_table, splits2)); + } + + // this produces an empty strings column with children that have no data, + // nested inside a list + { + cudf::test::strings_column_wrapper str{"abc"}; + auto empty_string = cudf::empty_like(str); + auto offsets = cudf::test::fixed_width_column_wrapper({0, 0}); + auto list = cudf::make_lists_column( + 1, offsets.release(), std::move(empty_string), 0, rmm::device_buffer{0}); + + cudf::table_view src_table({static_cast(*list)}); + + std::vector splits({0}); + EXPECT_NO_THROW(contiguous_split(src_table, splits)); + + std::vector splits2({1}); + EXPECT_NO_THROW(contiguous_split(src_table, splits2)); + } + + // this produces an empty lists column with children that have no data, + // nested inside a list + { + cudf::test::lists_column_wrapper listw{{1.0f, 2.0f}, {3.0f, 4.0f}}; + auto empty_list = cudf::empty_like(listw); + auto offsets = cudf::test::fixed_width_column_wrapper({0, 0}); + auto list = cudf::make_lists_column( + 1, offsets.release(), std::move(empty_list), 0, rmm::device_buffer{0}); + + cudf::table_view src_table({static_cast(*list)}); + + std::vector splits({0}); + EXPECT_NO_THROW(contiguous_split(src_table, splits)); + + std::vector splits2({1}); + EXPECT_NO_THROW(contiguous_split(src_table, splits2)); + } + + // this produces an empty lists column with children that have no data, + // nested inside a list + { + cudf::test::lists_column_wrapper listw{{1.0f, 2.0f}, {3.0f, 4.0f}}; + auto empty_list = cudf::empty_like(listw); + auto offsets = cudf::test::fixed_width_column_wrapper({0, 0}); + auto list = cudf::make_lists_column( + 1, offsets.release(), std::move(empty_list), 0, rmm::device_buffer{0}); + + cudf::table_view src_table({static_cast(*list)}); + + std::vector splits({0}); + EXPECT_NO_THROW(contiguous_split(src_table, splits)); + + std::vector splits2({1}); + EXPECT_NO_THROW(contiguous_split(src_table, splits2)); + } + + // this produces an empty struct column with children that have no data, + // nested inside a list + { + cudf::test::fixed_width_column_wrapper ints{0, 1, 2, 3, 4}; + cudf::test::fixed_width_column_wrapper floats{4, 3, 2, 1, 0}; + auto struct_column = cudf::test::structs_column_wrapper({ints, floats}); + auto empty_struct = cudf::empty_like(struct_column); + auto offsets = cudf::test::fixed_width_column_wrapper({0, 0}); + auto list = cudf::make_lists_column( + 1, offsets.release(), std::move(empty_struct), 0, rmm::device_buffer{0}); + + cudf::table_view src_table({static_cast(*list)}); + + std::vector splits({0}); + EXPECT_NO_THROW(contiguous_split(src_table, splits)); + + std::vector splits2({1}); + EXPECT_NO_THROW(contiguous_split(src_table, splits2)); + } +} + +TEST_F(ContiguousSplitTableCornerCases, MalformedColumns) {} + struct ContiguousSplitNestedTypesTest : public cudf::test::BaseFixture { }; @@ -1329,6 +1512,18 @@ TEST_F(ContiguousSplitNestedTypesTest, StructsWithNulls) }); } +TEST_F(ContiguousSplitNestedTypesTest, StructsNoChildren) +{ + split_structs_no_children( + [](cudf::column_view const& c, std::vector const& splits) { + cudf::table_view t({c}); + return cudf::contiguous_split(t, splits); + }, + [](cudf::column_view const& expected, cudf::contiguous_split_result const& result) { + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, result.table.column(0)); + }); +} + TEST_F(ContiguousSplitNestedTypesTest, StructsOfList) { split_nested_struct_of_list(