diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index 56da1095b81..10646fad354 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -392,6 +392,54 @@ std::vector copy_strings_to_host(device_span input, return to_host(d_column_names->view()); } +/** + * @brief Checks, for every column in the tree, whether all of its string/value nodes are nulls. + * Every column starts as true (so non-string columns stay true). A column is set to false as soon + * as one of its NC_STR or NC_VAL nodes is not found in the parse options' `trie_na` trie. + * + * @param input Input JSON string device data + * @param d_column_tree Column tree representation of the JSON string + * @param tree Node tree representation of the JSON string + * @param col_ids Column ids of the nodes in the tree + * @param options Parsing options specifying the parsing behaviour + * @param stream CUDA stream used for device memory operations and kernel launches + * @return Device array with one flag per column, true if the column has only null string/values. + */ +rmm::device_uvector is_all_nulls_each_column(device_span input, + tree_meta_t const& d_column_tree, + tree_meta_t const& tree, + device_span col_ids, + cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream) +{ + auto const num_nodes = col_ids.size(); + auto const num_cols = d_column_tree.node_categories.size(); + rmm::device_uvector is_all_nulls(num_cols, stream); + thrust::fill(rmm::exec_policy(stream), is_all_nulls.begin(), is_all_nulls.end(), true); + + auto parse_opt = parsing_options(options, stream); + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::counting_iterator(0), + num_nodes, + [options = parse_opt.view(), + data = input.data(), + column_categories = d_column_tree.node_categories.begin(), + col_ids = col_ids.begin(), + range_begin = tree.node_range_begin.begin(), + range_end = tree.node_range_end.begin(), + is_all_nulls = is_all_nulls.begin()] __device__(size_type i) { + auto const node_category = column_categories[col_ids[i]]; + if (node_category == NC_STR or
node_category == NC_VAL) { + auto const is_null_literal = serialized_trie_contains( + options.trie_na, + {data + range_begin[i], static_cast(range_end[i] - range_begin[i])}); + if (!is_null_literal) is_all_nulls[col_ids[i]] = false; + } + }); + return is_all_nulls; +} + /** * @brief Holds member data pointers of `d_json_column` * @@ -415,8 +463,10 @@ struct json_column_data { * @param row_offsets Row offsets of the nodes in the tree * @param root Root node of the `d_json_column` tree * @param is_array_of_arrays Whether the tree is an array of arrays - * @param is_enabled_lines Whether the input is a line-delimited JSON - * @param is_enabled_mixed_types_as_string Whether to enable reading mixed types as string + * @param options Parsing options specifying the parsing behaviour + * Options affecting the behaviour are: + * is_enabled_lines: Whether the input is a line-delimited JSON + * is_enabled_mixed_types_as_string: Whether to enable reading mixed types as string * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the device memory * of child_offets and validity members of `d_json_column` @@ -427,13 +477,15 @@ void make_device_json_column(device_span input, device_span row_offsets, device_json_column& root, bool is_array_of_arrays, - bool is_enabled_lines, - bool is_enabled_mixed_types_as_string, + cudf::io::json_reader_options const& options, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - auto num_nodes = col_ids.size(); + + bool const is_enabled_lines = options.is_enabled_lines(); + bool const is_enabled_mixed_types_as_string = options.is_enabled_mixed_types_as_string(); + auto const num_nodes = col_ids.size(); rmm::device_uvector sorted_col_ids(col_ids.size(), stream); // make a copy thrust::copy(rmm::exec_policy(stream), col_ids.begin(), col_ids.end(), sorted_col_ids.begin()); @@ -548,6 +600,12 @@ void make_device_json_column(device_span 
input, return thrust::get<0>(a) < thrust::get<0>(b); }); + std::vector is_str_column_all_nulls{}; + if (is_enabled_mixed_types_as_string) { // host reads these flags below: copy synchronously + is_str_column_all_nulls = cudf::detail::make_std_vector_sync( + is_all_nulls_each_column(input, d_column_tree, tree, col_ids, options, stream), stream); + } + // use hash map because we may skip field name's col_ids std::unordered_map> columns; // map{parent_col_id, child_col_name}> = child_col_id, used for null value column tracking @@ -592,29 +650,39 @@ void make_device_json_column(device_span input, auto& parent_col = it->second.get(); bool replaced = false; if (mapped_columns.count({parent_col_id, name}) > 0) { + auto const old_col_id = mapped_columns[{parent_col_id, name}]; // If mixed type as string is enabled, make both of them strings and merge them. // All child columns will be ignored when parsing. if (is_enabled_mixed_types_as_string) { - // VAL/STR or STRUCT or LIST - auto old_col_id = mapped_columns[{parent_col_id, name}]; - - is_mixed_type_column[this_col_id] = 1; - is_mixed_type_column[old_col_id] = 1; - // if old col type (not cat) is list or struct, replace with string. - auto& col = columns.at(old_col_id).get(); - if (col.type == json_col_t::ListColumn or col.type == json_col_t::StructColumn) { - reinitialize_as_string(old_col_id, col); - // all its children (which are already inserted) are ignored later. + bool const is_mixed_type = [&]() { + // A VAL/STR column (new or old) that holds only nulls does not make this a mixed type.
+ if (column_categories[this_col_id] == NC_VAL || + column_categories[this_col_id] == NC_STR) { + if (is_str_column_all_nulls[this_col_id]) return false; + } + if (column_categories[old_col_id] == NC_VAL || column_categories[old_col_id] == NC_STR) { + if (is_str_column_all_nulls[old_col_id]) return false; + } + return true; + }(); + if (is_mixed_type) { + is_mixed_type_column[this_col_id] = 1; + is_mixed_type_column[old_col_id] = 1; + // if old col type (not cat) is list or struct, replace with string. + auto& col = columns.at(old_col_id).get(); + if (col.type == json_col_t::ListColumn or col.type == json_col_t::StructColumn) { + reinitialize_as_string(old_col_id, col); + // all its children (which are already inserted) are ignored later. + } + columns.try_emplace(this_col_id, columns.at(old_col_id)); + continue; } - columns.try_emplace(this_col_id, columns.at(old_col_id)); - continue; } if (column_categories[this_col_id] == NC_VAL || column_categories[this_col_id] == NC_STR) { ignore_vals[this_col_id] = 1; continue; } - auto old_col_id = mapped_columns[{parent_col_id, name}]; if (column_categories[old_col_id] == NC_VAL || column_categories[old_col_id] == NC_STR) { // remap ignore_vals[old_col_id] = 1; @@ -795,15 +863,6 @@ void make_device_json_column(device_span input, } } -/** - * @brief Retrieves the parse_options to be used for type inference and type casting - * - * @param options The reader options to influence the relevant type inference and type casting - * options - */ -cudf::io::parse_options parsing_options(cudf::io::json_reader_options const& options, - rmm::cuda_stream_view stream); - std::pair, std::vector> device_json_column_to_cudf_column( device_json_column& json_col, device_span d_input, @@ -1021,8 +1080,7 @@ table_with_metadata device_parse_nested_json(device_span d_input, gpu_row_offsets, root_column, is_array_of_arrays, - options.is_enabled_lines(), - options.is_enabled_mixed_types_as_string(), + options, stream, mr); diff --git 
a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index c13daf9b9f5..f41b024bb1e 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -25,6 +25,10 @@ #include #include +// Forward declaration of parse_options from parsing_utils.cuh +namespace cudf::io { +struct parse_options; +} namespace cudf::io::json { /** @@ -284,6 +288,16 @@ reduce_to_column_tree(tree_meta_t& tree, device_span row_offsets, rmm::cuda_stream_view stream); +/** + * @brief Retrieves the parse_options to be used for type inference and type casting + * + * @param options The reader options to influence the relevant type inference and type casting + * options + * @param stream The CUDA stream to which kernels are dispatched + */ +cudf::io::parse_options parsing_options(cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream); + /** @copydoc host_parse_nested_json * All processing is done in device memory. * diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 73af983d108..a6a57c36b08 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -2042,7 +2042,8 @@ void make_json_column(json_column& root_column, * options * @param stream The CUDA stream to which kernels are dispatched */ -auto parsing_options(cudf::io::json_reader_options const& options, rmm::cuda_stream_view stream) +cudf::io::parse_options parsing_options(cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream) { auto parse_opts = cudf::io::parse_options{',', '\n', '\"', '.'}; diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index e4ed09d3962..450ea550e99 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -2052,6 +2052,9 @@ TEST_F(JsonReaderTest, JSONLinesRecoveringIgnoreExcessChars) TEST_F(JsonReaderTest, MixedTypes) { + using LCWS = cudf::test::lists_column_wrapper; + using LCWI = cudf::test::lists_column_wrapper; + using valid_t 
= std::vector; { // Simple test for mixed types std::string json_string = R"({ "foo": [1,2,3], "bar": 123 } @@ -2084,34 +2087,112 @@ TEST_F(JsonReaderTest, MixedTypes) .lines(true); cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + static int num_case = 0; + num_case++; + std::cout << "case:" << num_case << "\n"; CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), expected); }; - - // test cases. + // value + string (not mixed type case) test_fn(R"( { "a": "123" } { "a": 123 } )", cudf::test::strings_column_wrapper({"123", "123"})); + // test cases. + // STR + STRUCT, STR + LIST, STR + null + // STRUCT + STR, STRUCT + LIST, STRUCT + null + // LIST + STR, LIST + STRUCT, LIST + null + // LIST + STRUCT + STR, STRUCT + LIST + STR, STR + STRUCT + LIST, STRUCT + LIST + null + // STR + STRUCT + LIST + null + + // STRING mixed: + // STR + STRUCT, STR + LIST, STR + null test_fn(R"( -{ "a": [1,2,3] } +{ "a": "123" } { "a": { "b": 1 } } )", - cudf::test::strings_column_wrapper({"[1,2,3]", "{ \"b\": 1 }"})); + cudf::test::strings_column_wrapper({"123", "{ \"b\": 1 }"})); + test_fn(R"( +{ "a": "123" } +{ "a": [1,2,3] } +)", + cudf::test::strings_column_wrapper({"123", "[1,2,3]"})); + test_fn(R"( +{ "a": "123" } +{ "a": null } +)", + cudf::test::strings_column_wrapper({"123", ""}, std::vector{1, 0}.begin())); + // STRUCT mixed: + // STRUCT + STR, STRUCT + LIST, STRUCT + null test_fn(R"( +{ "a": { "b": 1 } } { "a": "fox" } +)", + cudf::test::strings_column_wrapper({"{ \"b\": 1 }", "fox"})); + test_fn(R"( +{ "a": { "b": 1 } } +{ "a": [1,2,3] } +)", + cudf::test::strings_column_wrapper({"{ \"b\": 1 }", "[1,2,3]"})); + cudf::test::fixed_width_column_wrapper child_int_col_wrapper{1, 2}; + test_fn(R"( { "a": { "b": 1 } } +{ "a": null } )", - cudf::test::strings_column_wrapper({"fox", "{ \"b\": 1 }"})); + cudf::test::structs_column_wrapper{ + {child_int_col_wrapper}, {1, 0} /*Validity*/ + }); + // LIST mixed: + // LIST + STR, LIST + STRUCT, LIST + null 
test_fn(R"( { "a": [1,2,3] } -{ "a": "fox" } +{ "a": "123" } +)", + cudf::test::strings_column_wrapper({"[1,2,3]", "123"})); + test_fn(R"( +{ "a": [1,2,3] } +{ "a": { "b": 1 } } +)", + cudf::test::strings_column_wrapper({"[1,2,3]", "{ \"b\": 1 }"})); + test_fn( + R"( +{ "a": [1,2,3] } +{ "a": null } +)", + cudf::test::lists_column_wrapper{{LCWI{1L, 2L, 3L}, LCWI{4L, 5L}}, valid_t{1, 0}.begin()}); + + // All mixed: + // LIST + STRUCT + STR, STRUCT + LIST + STR, STR + STRUCT + LIST, STRUCT + LIST + null + test_fn(R"( +{ "a": [1,2,3] } +{ "a": { "b": 1 } } +{ "a": "fox"} +)", + cudf::test::strings_column_wrapper({"[1,2,3]", "{ \"b\": 1 }", "fox"})); + test_fn(R"( +{ "a": { "b": 1 } } +{ "a": [1,2,3] } +{ "a": "fox"} +)", + cudf::test::strings_column_wrapper({"{ \"b\": 1 }", "[1,2,3]", "fox"})); + test_fn(R"( +{ "a": "fox"} +{ "a": { "b": 1 } } +{ "a": [1,2,3] } +)", + cudf::test::strings_column_wrapper({"fox", "{ \"b\": 1 }", "[1,2,3]"})); + test_fn(R"( +{ "a": [1,2,3] } +{ "a": { "b": 1 } } +{ "a": null} )", - cudf::test::strings_column_wrapper({"[1,2,3]", "fox"})); + cudf::test::strings_column_wrapper({"[1,2,3]", "{ \"b\": 1 }", "NA"}, + valid_t{1, 1, 0}.begin())); // RIGHT + // value + string inside list test_fn(R"( { "a": [1,2,3] } { "a": [true,false,true] } @@ -2119,36 +2200,31 @@ TEST_F(JsonReaderTest, MixedTypes) )", cudf::test::lists_column_wrapper{ {"1", "2", "3"}, {"true", "false", "true"}, {"a", "b", "c"}}); - { - std::string json_string = R"( -{ "var1": true } -{ "var1": [{ "var0": true, "var1": "hello", "var2": null }, null, [true, null, null]] } - )"; - - cudf::io::json_reader_options in_options = - cudf::io::json_reader_options::builder( - cudf::io::source_info{json_string.data(), json_string.size()}) - .mixed_types_as_string(true) - .lines(true); - cudf::io::table_with_metadata result = cudf::io::read_json(in_options); - } + // null + list of mixed types and null + test_fn(R"( +{ "var1": null } +{ "var1": [{ "var0": true, "var1": "hello", "var2": null 
}, null, [true, null, null]] } + )", + cudf::test::lists_column_wrapper( + {{"NA", "NA"}, + {{R"({ "var0": true, "var1": "hello", "var2": null })", "null", "[true, null, null]"}, + valid_t{1, 0, 1}.begin()}}, + valid_t{0, 1}.begin())); // test to confirm if reinitialize a non-string column as string affects max_rowoffsets. // max_rowoffsets is generated based on parent col id, // so, even if mixed types are present, their row offset will be correct. - using LCW = cudf::test::lists_column_wrapper; - using valid_t = std::vector; cudf::test::lists_column_wrapper expected_list{ { - cudf::test::lists_column_wrapper({LCW({"1", "2", "3"}), LCW({"4", "5", "6"})}), - cudf::test::lists_column_wrapper({LCW()}), - cudf::test::lists_column_wrapper({LCW()}), // null - cudf::test::lists_column_wrapper({LCW()}), // null - cudf::test::lists_column_wrapper({LCW({"{\"c\": -1}"}), LCW({"5"})}), - cudf::test::lists_column_wrapper({LCW({"7"}), LCW({"8", "9"})}), - cudf::test::lists_column_wrapper({LCW()}), // null + cudf::test::lists_column_wrapper({LCWS({"1", "2", "3"}), LCWS({"4", "5", "6"})}), + cudf::test::lists_column_wrapper({LCWS()}), + cudf::test::lists_column_wrapper({LCWS()}), // null + cudf::test::lists_column_wrapper({LCWS()}), // null + cudf::test::lists_column_wrapper({LCWS({"{\"c\": -1}"}), LCWS({"5"})}), + cudf::test::lists_column_wrapper({LCWS({"7"}), LCWS({"8", "9"})}), + cudf::test::lists_column_wrapper({LCWS()}), // null }, valid_t{1, 1, 0, 0, 1, 1, 0}.begin()}; test_fn(R"(