diff --git a/cpp/src/io/json/host_tree_algorithms.cu b/cpp/src/io/json/host_tree_algorithms.cu
index 7fafa885c66..7b9fc25d1cc 100644
--- a/cpp/src/io/json/host_tree_algorithms.cu
+++ b/cpp/src/io/json/host_tree_algorithms.cu
@@ -222,18 +222,19 @@ struct json_column_data {
 using hashmap_of_device_columns =
   std::unordered_map<NodeIndexT, std::reference_wrapper<device_json_column>>;
 
-std::pair<cudf::detail::host_vector<bool>, hashmap_of_device_columns> build_tree(
-  device_json_column& root,
-  host_span<uint8_t const> is_str_column_all_nulls,
-  tree_meta_t& d_column_tree,
-  device_span<NodeIndexT> d_unique_col_ids,
-  device_span<size_type> d_max_row_offsets,
-  std::vector<std::string> const& column_names,
-  NodeIndexT row_array_parent_col_id,
-  bool is_array_of_arrays,
-  cudf::io::json_reader_options const& options,
-  rmm::cuda_stream_view stream,
-  rmm::device_async_resource_ref mr);
+std::
+  tuple<cudf::detail::host_vector<bool>, cudf::detail::host_vector<bool>, hashmap_of_device_columns>
+  build_tree(device_json_column& root,
+             host_span<uint8_t const> is_str_column_all_nulls,
+             tree_meta_t& d_column_tree,
+             device_span<NodeIndexT> d_unique_col_ids,
+             device_span<size_type> d_max_row_offsets,
+             std::vector<std::string> const& column_names,
+             NodeIndexT row_array_parent_col_id,
+             bool is_array_of_arrays,
+             cudf::io::json_reader_options const& options,
+             rmm::cuda_stream_view stream,
+             rmm::device_async_resource_ref mr);
 
 void scatter_offsets(tree_meta_t const& tree,
                      device_span<NodeIndexT const> col_ids,
@@ -242,6 +243,7 @@ void scatter_offsets(tree_meta_t const& tree,
                      device_span<NodeIndexT> sorted_col_ids,  // Reuse this for parent_col_ids
                      tree_meta_t const& d_column_tree,
                      host_span<bool const> ignore_vals,
+                     host_span<bool const> is_mixed_pruned,
                      hashmap_of_device_columns const& columns,
                      rmm::cuda_stream_view stream);
 
@@ -363,17 +365,17 @@ void make_device_json_column(device_span<SymbolT const> input,
     }
     return std::vector<std::string>();
   }();
-  auto const [ignore_vals, columns] = build_tree(root,
-                                                 is_str_column_all_nulls,
-                                                 d_column_tree,
-                                                 d_unique_col_ids,
-                                                 d_max_row_offsets,
-                                                 column_names,
-                                                 row_array_parent_col_id,
-                                                 is_array_of_arrays,
-                                                 options,
-                                                 stream,
-                                                 mr);
+  auto const [ignore_vals, is_mixed_pruned, columns] = build_tree(root,
+                                                                  is_str_column_all_nulls,
+                                                                  d_column_tree,
+                                                                  d_unique_col_ids,
+                                                                  d_max_row_offsets,
+                                                                  column_names,
+                                                                  row_array_parent_col_id,
+                                                                  is_array_of_arrays,
+                                                                  options,
+                                                                  stream,
+                                                                  mr);
 
   if (ignore_vals.empty()) return;
   scatter_offsets(tree,
                   col_ids,
@@ -382,22 +384,24 @@ void make_device_json_column(device_span<SymbolT const> input,
                   sorted_col_ids,
                   d_column_tree,
                   ignore_vals,
+                  is_mixed_pruned,
                   columns,
                   stream);
 }
 
-std::pair<cudf::detail::host_vector<bool>, hashmap_of_device_columns> build_tree(
-  device_json_column& root,
-  host_span<uint8_t const> is_str_column_all_nulls,
-  tree_meta_t& d_column_tree,
-  device_span<NodeIndexT> d_unique_col_ids,
-  device_span<size_type> d_max_row_offsets,
-  std::vector<std::string> const& column_names,
-  NodeIndexT row_array_parent_col_id,
-  bool is_array_of_arrays,
-  cudf::io::json_reader_options const& options,
-  rmm::cuda_stream_view stream,
-  rmm::device_async_resource_ref mr)
+std::
+  tuple<cudf::detail::host_vector<bool>, cudf::detail::host_vector<bool>, hashmap_of_device_columns>
+  build_tree(device_json_column& root,
+             host_span<uint8_t const> is_str_column_all_nulls,
+             tree_meta_t& d_column_tree,
+             device_span<NodeIndexT> d_unique_col_ids,
+             device_span<size_type> d_max_row_offsets,
+             std::vector<std::string> const& column_names,
+             NodeIndexT row_array_parent_col_id,
+             bool is_array_of_arrays,
+             cudf::io::json_reader_options const& options,
+             rmm::cuda_stream_view stream,
+             rmm::device_async_resource_ref mr)
 {
   bool const is_enabled_lines                 = options.is_enabled_lines();
   bool const is_enabled_mixed_types_as_string = options.is_enabled_mixed_types_as_string();
@@ -488,7 +492,9 @@ std::pair<cudf::detail::host_vector<bool>, hashmap_of_device_columns> build_tree
   // NoPruning: iterate through schema and enforce type.
   if (adj[parent_node_sentinel].empty())
-    return {cudf::detail::make_host_vector<bool>(0, stream), {}};  // for empty file
+    return {cudf::detail::make_host_vector<bool>(0, stream),
+            cudf::detail::make_host_vector<bool>(0, stream),
+            {}};  // for empty file
   CUDF_EXPECTS(adj[parent_node_sentinel].size() == 1, "Should be 1");
   auto expected_types = cudf::detail::make_host_vector<NodeT>(num_columns, stream);
   std::fill_n(expected_types.begin(), num_columns, NUM_NODE_CLASSES);
@@ -551,11 +557,14 @@ std::pair<cudf::detail::host_vector<bool>, hashmap_of_device_columns> build_tree
       auto list_child = schema.child_types.at(this_list_child_name);
       for (auto const& child_id : child_ids)
         mark_is_pruned(child_id, list_child);
+      // TODO: Store a null map of non-target types for list children to mark the list entry as null.
     }
   };
   if (is_array_of_arrays) {
     if (adj[adj[parent_node_sentinel][0]].empty())
-      return {cudf::detail::make_host_vector<bool>(0, stream), {}};
+      return {cudf::detail::make_host_vector<bool>(0, stream),
+              cudf::detail::make_host_vector<bool>(0, stream),
+              {}};
     auto root_list_col_id =
       is_enabled_lines ? adj[parent_node_sentinel][0] : adj[adj[parent_node_sentinel][0]][0];
     // mark root and row array col_id as not pruned.
@@ -647,8 +656,12 @@ std::pair<cudf::detail::host_vector<bool>, hashmap_of_device_columns> build_tree
       ? adj[parent_node_sentinel][0]
       : (adj[adj[parent_node_sentinel][0]].empty() ? -1 : adj[adj[parent_node_sentinel][0]][0]);
 
+  // List children that are pruned as mixed types; used later to nullify the parent list row.
+  auto is_mixed_pruned = cudf::detail::make_host_vector<bool>(num_columns, stream);
+  std::fill_n(is_mixed_pruned.begin(), num_columns, false);
   auto handle_mixed_types = [&column_categories,
                              &is_str_column_all_nulls,
+                             &is_mixed_pruned,
                              &is_pruned,
                              &expected_types,
                              &is_enabled_mixed_types_as_string,
@@ -794,6 +807,14 @@ std::pair<cudf::detail::host_vector<bool>, hashmap_of_device_columns> build_tree
                    "list child column insertion failed, duplicate column name in the parent");
       ref.get().column_order.emplace_back(list_child_name);
       auto this_ref = std::ref(ref.get().child_columns.at(list_child_name));
+      if (options.is_enabled_experimental()) {
+        for (auto const& child_id : child_ids) {
+          if (is_pruned[child_id]) {
+            // Store this child_id so the mixed-type parent list row can be nullified later.
+            is_mixed_pruned[child_id] = is_pruned[child_id];
+          }
+        }
+      }
       // Mixed type handling
       handle_mixed_types(child_ids);
       if (child_ids.empty()) {
@@ -829,7 +850,7 @@ std::pair<cudf::detail::host_vector<bool>, hashmap_of_device_columns> build_tree
     [](auto exp, auto cat) { return exp == NUM_NODE_CLASSES ? cat : exp; });
   cudf::detail::cuda_memcpy_async(d_column_tree.node_categories, expected_types, stream);
-  return {is_pruned, columns};
+  return {is_pruned, is_mixed_pruned, columns};
 }
 
 void scatter_offsets(tree_meta_t const& tree,
@@ -839,6 +860,7 @@ void scatter_offsets(tree_meta_t const& tree,
                      device_span<NodeIndexT> sorted_col_ids,  // Reuse this for parent_col_ids
                      tree_meta_t const& d_column_tree,
                      host_span<bool const> ignore_vals,
+                     host_span<bool const> is_mixed_pruned,
                      hashmap_of_device_columns const& columns,
                      rmm::cuda_stream_view stream)
 {
@@ -857,6 +879,8 @@ void scatter_offsets(tree_meta_t const& tree,
 
   auto d_ignore_vals = cudf::detail::make_device_uvector_async(
     ignore_vals, stream, cudf::get_current_device_resource_ref());
+  auto d_is_mixed_pruned = cudf::detail::make_device_uvector_async(
+    is_mixed_pruned, stream, cudf::get_current_device_resource_ref());
   auto d_columns_data = cudf::detail::make_device_uvector_async(
     columns_data, stream, cudf::get_current_device_resource_ref());
 
@@ -921,9 +945,31 @@ void scatter_offsets(tree_meta_t const& tree,
           column_categories[col_ids[parent_node_id]] == NC_LIST and
           (!d_ignore_vals[col_ids[parent_node_id]]);
     });
+  // For children of a list column that were pruned as mixed types, find the parent node id and
+  // clear the corresponding bit in the parent's null mask, i.e. set mixed-type list rows to null.
+  auto const num_list_children = thrust::distance(
+    thrust::make_zip_iterator(node_ids.begin(), parent_col_ids.begin()), list_children_end);
+  thrust::for_each_n(
+    rmm::exec_policy_nosync(stream),
+    thrust::make_counting_iterator(0),
+    num_list_children,
+    [node_ids          = node_ids.begin(),
+     parent_node_ids   = tree.parent_node_ids.begin(),
+     column_categories = d_column_tree.node_categories.begin(),
+     col_ids           = col_ids.begin(),
+     row_offsets       = row_offsets.begin(),
+     d_is_mixed_pruned = d_is_mixed_pruned.begin(),
+     d_ignore_vals     = d_ignore_vals.begin(),
+     d_columns_data    = d_columns_data.begin()] __device__(size_type i) {
+      auto const node_id        = node_ids[i];
+      auto const parent_node_id = parent_node_ids[node_id];
+      if (parent_node_id == parent_node_sentinel or d_ignore_vals[col_ids[parent_node_id]]) return;
+      if (column_categories[col_ids[parent_node_id]] == NC_LIST and
+          d_is_mixed_pruned[col_ids[node_id]]) {
+        clear_bit(d_columns_data[col_ids[parent_node_id]].validity, row_offsets[parent_node_id]);
+      }
+    });
-  auto const num_list_children =
-    list_children_end - thrust::make_zip_iterator(node_ids.begin(), parent_col_ids.begin());
 
   thrust::stable_sort_by_key(rmm::exec_policy_nosync(stream),
                              parent_col_ids.begin(),
                              parent_col_ids.begin() + num_list_children,
diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu
index 30a154fdda2..1fe58a0449f 100644
--- a/cpp/src/io/json/json_column.cu
+++ b/cpp/src/io/json/json_column.cu
@@ -464,46 +464,49 @@ std::pair<std::unique_ptr<column>, std::vector<column_name_info>> device_json_column_to_cudf_column(
       column_names.emplace_back(
         json_col.child_columns.empty() ? list_child_name : json_col.child_columns.begin()->first);
-      // Note: json_col modified here, reuse the memory
+      // If the child column is not present, the null mask is still set correctly, but the offsets
+      // are all zero and the child column is empty. Note: json_col is modified here; its memory is reused.
       auto offsets_column = std::make_unique<column>(data_type{type_id::INT32},
                                                      num_rows + 1,
                                                      json_col.child_offsets.release(),
                                                      rmm::device_buffer{},
                                                      0);
       // Create children column
-      auto child_schema_element =
-        json_col.child_columns.empty() ? std::optional<schema_element>{} : get_list_child_schema();
-      auto [child_column, names] =
-        json_col.child_columns.empty() or (prune_columns and !child_schema_element.has_value())
-          ? std::pair<std::unique_ptr<column>,
-                      // EMPTY type could not used because gather throws exception on EMPTY type.
-                      std::vector<column_name_info>>{std::make_unique<column>(
-                                                       data_type{type_id::INT8},
-                                                       0,
-                                                       rmm::device_buffer{},
-                                                       rmm::device_buffer{},
-                                                       0),
-                                                     std::vector<column_name_info>{}}
-          : device_json_column_to_cudf_column(json_col.child_columns.begin()->second,
-                                              d_input,
-                                              options,
-                                              prune_columns,
-                                              child_schema_element,
-                                              stream,
-                                              mr);
+      auto child_schema_element = get_list_child_schema();
+      auto [child_column, names] = [&]() {
+        if (json_col.child_columns.empty()) {
+          // The EMPTY type cannot be used because gather throws an exception on the EMPTY type.
+          auto empty_col = make_empty_column(
+            child_schema_element.value_or(schema_element{data_type{type_id::INT8}}), stream, mr);
+          auto children_metadata = std::vector<column_name_info>{
+            make_column_name_info(
+              child_schema_element.value_or(schema_element{data_type{type_id::INT8}}),
+              list_child_name)
+              .children};
+
+          return std::pair<std::unique_ptr<column>, std::vector<column_name_info>>{
+            std::move(empty_col), children_metadata};
+        }
+        return device_json_column_to_cudf_column(json_col.child_columns.begin()->second,
+                                                 d_input,
+                                                 options,
+                                                 prune_columns,
+                                                 child_schema_element,
+                                                 stream,
+                                                 mr);
+      }();
       column_names.back().children = names;
       auto [result_bitmask, null_count] = make_validity(json_col);
-      auto ret_col = make_lists_column(num_rows,
-                                       std::move(offsets_column),
-                                       std::move(child_column),
-                                       0,
-                                       rmm::device_buffer{0, stream, mr},
-                                       stream,
-                                       mr);
-      // The null_mask is set after creation of list column is to skip the purge_nonempty_nulls and
-      // null validation applied in make_lists_column factory, which is not needed for json
-      // parent column cannot be null when its children is non-empty in JSON
-      if (null_count != 0) { ret_col->set_null_mask(std::move(result_bitmask), null_count); }
+      auto ret_col = make_lists_column(
+        num_rows,
+        std::move(offsets_column),
+        std::move(child_column),
+        null_count,
+        null_count == 0 ? rmm::device_buffer{0, stream, mr} : std::move(result_bitmask),
+        stream,
+        mr);
+      // Since some rows in the child column may need to be nullified due to mixed types, we
+      // cannot skip the purge_nonempty_nulls call in the make_lists_column factory.
       return {std::move(ret_col), std::move(column_names)};
     }
     default: CUDF_FAIL("Unsupported column type"); break;
diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp
index 4989fff4b30..2f6942fe139 100644
--- a/cpp/src/io/json/nested_json.hpp
+++ b/cpp/src/io/json/nested_json.hpp
@@ -429,6 +429,18 @@ table_with_metadata device_parse_nested_json(device_span<SymbolT const> input,
                                              rmm::cuda_stream_view stream,
                                              rmm::device_async_resource_ref mr);
 
+/**
+ * @brief Create an empty column of a given nested schema
+ *
+ * @param schema The schema of the column to create
+ * @param stream The CUDA stream to which kernels are dispatched
+ * @param mr Device memory resource used to allocate the returned column
+ * @return The empty column
+ */
+std::unique_ptr<column> make_empty_column(schema_element const& schema,
+                                          rmm::cuda_stream_view stream,
+                                          rmm::device_async_resource_ref mr);
+
 /**
  * @brief Create all null column of a given nested schema
  *
diff --git a/cpp/src/io/json/parser_features.cpp b/cpp/src/io/json/parser_features.cpp
index ced7acb9cde..2da320b2af3 100644
--- a/cpp/src/io/json/parser_features.cpp
+++ b/cpp/src/io/json/parser_features.cpp
@@ -159,7 +159,17 @@ struct empty_column_functor {
     std::unique_ptr<column> child = cudf::type_dispatcher(
       schema.child_types.at(child_name).type, *this, schema.child_types.at(child_name));
     auto offsets = make_empty_column(data_type(type_to_id<size_type>()));
-    return make_lists_column(0, std::move(offsets), std::move(child), 0, {}, stream, mr);
+    std::vector<std::unique_ptr<column>> child_columns;
+    child_columns.push_back(std::move(offsets));
+    child_columns.push_back(std::move(child));
+    // Do not use `cudf::make_lists_column` since we do not need to call `purge_nonempty_nulls` on
+    // the child column as it does not have non-empty nulls. See issue #17356.
+    return std::make_unique<column>(cudf::data_type{type_id::LIST},
+                                    0,
+                                    rmm::device_buffer{},
+                                    rmm::device_buffer{},
+                                    0,
+                                    std::move(child_columns));
   }
 
   template <typename T, CUDF_ENABLE_IF(std::is_same_v<T, cudf::struct_view>)>
@@ -174,6 +184,13 @@ struct empty_column_functor {
   }
 };
 
+std::unique_ptr<column> make_empty_column(schema_element const& schema,
+                                          rmm::cuda_stream_view stream,
+                                          rmm::device_async_resource_ref mr)
+{
+  return cudf::type_dispatcher(schema.type, empty_column_functor{stream, mr}, schema);
+}
+
 /// Created all null column of the specified schema
 struct allnull_column_functor {
   rmm::cuda_stream_view stream;
@@ -198,10 +215,9 @@ struct allnull_column_functor {
   std::unique_ptr<column> operator()(schema_element const& schema, size_type size) const
   {
     CUDF_EXPECTS(schema.child_types.size() == 1, "Dictionary column should have only one child");
-    auto const& child_name = schema.child_types.begin()->first;
-    std::unique_ptr<column> child = cudf::type_dispatcher(schema.child_types.at(child_name).type,
-                                                          empty_column_functor{stream, mr},
-                                                          schema.child_types.at(child_name));
+    auto const& child_name = schema.child_types.begin()->first;
+    std::unique_ptr<column> child =
+      make_empty_column(schema.child_types.at(child_name), stream, mr);
     return make_fixed_width_column(schema.type, size, mask_state::ALL_NULL, stream, mr);
     auto indices   = make_zeroed_offsets(size - 1);
     auto null_mask = cudf::detail::create_null_mask(size, mask_state::ALL_NULL, stream, mr);
@@ -221,14 +237,22 @@ struct allnull_column_functor {
   std::unique_ptr<column> operator()(schema_element const& schema, size_type size) const
   {
     CUDF_EXPECTS(schema.child_types.size() == 1, "List column should have only one child");
-    auto const& child_name = schema.child_types.begin()->first;
-    std::unique_ptr<column> child = cudf::type_dispatcher(schema.child_types.at(child_name).type,
-                                                          empty_column_functor{stream, mr},
-                                                          schema.child_types.at(child_name));
-    auto offsets = make_zeroed_offsets(size);
+    auto const& child_name = schema.child_types.begin()->first;
+    std::unique_ptr<column> child =
+      make_empty_column(schema.child_types.at(child_name), stream, mr);
+    auto offsets   = make_zeroed_offsets(size);
     auto null_mask = cudf::detail::create_null_mask(size, mask_state::ALL_NULL, stream, mr);
-    return make_lists_column(
-      size, std::move(offsets), std::move(child), size, std::move(null_mask), stream, mr);
+    std::vector<std::unique_ptr<column>> child_columns;
+    child_columns.push_back(std::move(offsets));
+    child_columns.push_back(std::move(child));
+    // Do not use `cudf::make_lists_column` since we do not need to call `purge_nonempty_nulls` on
+    // the child column as it does not have non-empty nulls. See issue #17356.
+    return std::make_unique<column>(cudf::data_type{type_id::LIST},
+                                    size,
+                                    rmm::device_buffer{},
+                                    std::move(null_mask),
+                                    size,
+                                    std::move(child_columns));
   }
 
   template <typename T, CUDF_ENABLE_IF(std::is_same_v<T, cudf::struct_view>)>
@@ -240,8 +264,14 @@ struct allnull_column_functor {
         schema.child_types.at(child_name).type, *this, schema.child_types.at(child_name), size));
     }
     auto null_mask = cudf::detail::create_null_mask(size, mask_state::ALL_NULL, stream, mr);
-    return make_structs_column(
-      size, std::move(child_columns), size, std::move(null_mask), stream, mr);
+    // Do not use `cudf::make_structs_column` since we do not need to call `superimpose_nulls` on
+    // the children columns. See issue #17356.
+    return std::make_unique<column>(cudf::data_type{type_id::STRUCT},
+                                    size,
+                                    rmm::device_buffer{},
+                                    std::move(null_mask),
+                                    size,
+                                    std::move(child_columns));
   }
 };
 
diff --git a/cpp/tests/io/json/json_test.cpp b/cpp/tests/io/json/json_test.cpp
index 3c8db99c3c7..37a750330fa 100644
--- a/cpp/tests/io/json/json_test.cpp
+++ b/cpp/tests/io/json/json_test.cpp
@@ -56,6 +56,8 @@ using int16_wrapper = wrapper<int16_t>;
 using int64_wrapper        = wrapper<int64_t>;
 using timestamp_ms_wrapper = wrapper<cudf::timestamp_ms, cudf::timestamp_ms::rep>;
 using bool_wrapper         = wrapper<bool>;
+using size_type_wrapper    = wrapper<cudf::size_type>;
+using strings_wrapper      = cudf::test::strings_column_wrapper;
 
 using cudf::data_type;
 using cudf::type_id;
@@ -3253,6 +3255,144 @@ TEST_F(JsonReaderTest, JsonNestedDtypeFilterWithOrder)
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(2), *wrapped);
   }
 }
+
+  // Test an all-null list of empty structs of empty strings
+  {
+    std::string json_stringl = R"(
+    {"a" : [1], "c2": [1, 2]}
+    {}
+    )";
+    auto lines = true;
+    cudf::io::json_reader_options in_options =
+      cudf::io::json_reader_options::builder(
+        cudf::io::source_info{json_stringl.data(), json_stringl.size()})
+        .prune_columns(true)
+        .experimental(true)
+        .lines(lines);
+
+    cudf::io::schema_element dtype_schema{
+      data_type{cudf::type_id::STRUCT},
+      {
+        {"a", {data_type{cudf::type_id::LIST}, {{"element", {dtype<int64_t>()}}}}},
+        {"c2",
+         {data_type{cudf::type_id::LIST},
+          {{"element",
+            {data_type{cudf::type_id::STRUCT},
+             {
+               {"d", {data_type{cudf::type_id::STRING}}},
+             },
+             {{"d"}}}}}}},
+      },
+      {{"a", "c2"}}};
+    in_options.set_dtypes(dtype_schema);
+    cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
+    // Make sure we have column "a": [int64_t]
+    ASSERT_EQ(result.tbl->num_columns(), 2);
+    ASSERT_EQ(result.metadata.schema_info.size(), 2);
+    EXPECT_EQ(result.metadata.schema_info[0].name, "a");
+    ASSERT_EQ(result.metadata.schema_info[0].children.size(), 2);
+    EXPECT_EQ(result.metadata.schema_info[0].children[0].name, "offsets");
+    EXPECT_EQ(result.metadata.schema_info[0].children[1].name, "element");
+    // Make sure we have the all-null list "c2": [{"d": ""}]
+    EXPECT_EQ(result.metadata.schema_info[1].name, "c2");
+    ASSERT_EQ(result.metadata.schema_info[1].children.size(), 2);
+    EXPECT_EQ(result.metadata.schema_info[1].children[0].name, "offsets");
+    EXPECT_EQ(result.metadata.schema_info[1].children[1].name, "element");
+    ASSERT_EQ(result.metadata.schema_info[1].children[1].children.size(), 1);
+    EXPECT_EQ(result.metadata.schema_info[1].children[1].children[0].name, "d");
+
+    auto const expected0 = [&] {
+      auto const valids = std::vector{1, 0};
+      auto [null_mask, null_count] =
+        cudf::test::detail::make_null_mask(valids.begin(), valids.end());
+      return cudf::make_lists_column(2,
+                                     size_type_wrapper{0, 1, 1}.release(),
+                                     int64_wrapper{1}.release(),
+                                     null_count,
+                                     std::move(null_mask));
+    }();
+
+    auto const expected1 = [&] {
+      auto const get_structs = [] {
+        auto child = cudf::test::strings_column_wrapper{};
+        return cudf::test::structs_column_wrapper{{child}};
+      };
+      auto const valids = std::vector{0, 0};
+      auto [null_mask, null_count] =
+        cudf::test::detail::make_null_mask(valids.begin(), valids.end());
+      return cudf::make_lists_column(2,
+                                     size_type_wrapper{0, 0, 0}.release(),
+                                     get_structs().release(),
+                                     null_count,
+                                     std::move(null_mask));
+    }();
+
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected0, result.tbl->get_column(0).view());
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected1, result.tbl->get_column(1).view());
+  }
+}
+
+TEST_F(JsonReaderTest, NullifyMixedList)
+{
+  using namespace cudf::test::iterators;
+  // Test mixed-type rows within a list
+  std::string json_stringl = R"(
+    {"c2": []}
+    {"c2": [{}]}
+    {"c2": [[]]}
+    {"c2": [{}, [], {}]}
+    {"c2": [[123], {"b": "1"}]}
+    {"c2": [{"x": "y"}, {"b": "1"}]}
+    {}
+  )";
+  // [], [{null, null}], null, null, null, [{null, null}, {1, null}], null
+  // valid  1  1  0  0  0  1  0
+  // offset 0, 0, 1, 1, 1, 1, 3, 3
+  // child  {null, null}, {null, null}, {1, null}
+  cudf::io::json_reader_options in_options =
+    cudf::io::json_reader_options::builder(
+      cudf::io::source_info{json_stringl.data(), json_stringl.size()})
+      .prune_columns(true)
+      .experimental(true)
+      .lines(true);
+
+  // struct<list<struct<string, string>>>, e.g. {"c2": [{"b": "1", "c": "2"}]}
+  cudf::io::schema_element dtype_schema{data_type{cudf::type_id::STRUCT},
+                                        {
+                                          {"c2",
+                                           {data_type{cudf::type_id::LIST},
+                                            {{"element",
+                                              {data_type{cudf::type_id::STRUCT},
+                                               {
+                                                 {"b", {data_type{cudf::type_id::STRING}}},
+                                                 {"c", {data_type{cudf::type_id::STRING}}},
+                                               },
+                                               {{"b", "c"}}}}}}},
+                                        },
+                                        {{"c2"}}};
+  in_options.set_dtypes(dtype_schema);
+  cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
+  ASSERT_EQ(result.tbl->num_columns(), 1);
+  ASSERT_EQ(result.metadata.schema_info.size(), 1);
+
+  // Expected: a list of structs with two string children
+  // [], [{null, null}], null, null, null, [{null, null}, {1, null}], null
+  auto get_structs = [] {
+    strings_wrapper child0{{"", "", "1"}, nulls_at({0, 0, 1})};
+    strings_wrapper child1{{"", "", ""}, all_nulls()};
+    // Purging non-empty nulls in the list appears to retain the null mask in the struct child column
+    return cudf::test::structs_column_wrapper{{child0, child1}, no_nulls()}.release();
+  };
+  std::vector const list_nulls{1, 1, 0, 0, 0, 1, 0};
+  auto [null_mask, null_count] =
+    cudf::test::detail::make_null_mask(list_nulls.cbegin(), list_nulls.cend());
+  auto const expected = cudf::make_lists_column(
+    7,
+    cudf::test::fixed_width_column_wrapper<cudf::size_type>{0, 0, 1, 1, 1, 1, 3, 3}.release(),
+    get_structs(),
+    null_count,
+    std::move(null_mask));
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected, result.tbl->get_column(0).view());
 }
 
 struct JsonCompressedIOTest : public cudf::test::BaseFixture,