diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index 472d42b1db5..2a39a539cc7 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -98,6 +98,8 @@ class json_reader_options { // Read the file as a json object per line bool _lines = false; + // Parse mixed types as a string column + bool _mixed_types_as_string = false; // Bytes to skip from the start size_t _byte_range_offset = 0; @@ -225,6 +227,13 @@ class json_reader_options { */ bool is_enabled_lines() const { return _lines; } + /** + * @brief Whether to parse mixed types as a string column. + * + * @return `true` if mixed types are parsed as a string column + */ + bool is_enabled_mixed_types_as_string() const { return _mixed_types_as_string; } + /** * @brief Whether to parse dates as DD/MM versus MM/DD. * @@ -302,6 +311,13 @@ class json_reader_options { */ void enable_lines(bool val) { _lines = val; } + /** + * @brief Set whether to parse mixed types as a string column. + * + * @param val Boolean value to enable/disable parsing mixed types as a string column + */ + void enable_mixed_types_as_string(bool val) { _mixed_types_as_string = val; } + /** * @brief Set whether to parse dates as DD/MM versus MM/DD. * @@ -437,6 +453,18 @@ class json_reader_options_builder { return *this; } + /** + * @brief Set whether to parse mixed types as a string column. + * + * @param val Boolean value to enable/disable parsing mixed types as a string column + * @return this for chaining + */ + json_reader_options_builder& mixed_types_as_string(bool val) + { + options._mixed_types_as_string = val; + return *this; + } + /** * @brief Set whether to parse dates as DD/MM versus MM/DD. 
* diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index f1296daca26..b1dc2c9dd7f 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -277,6 +277,16 @@ reduce_to_column_tree(tree_meta_t& tree, return is_non_list_parent(parent_col_id); }); + // For Struct and List (to avoid copying entire strings when mixed type as string is enabled) + thrust::transform_if( + rmm::exec_policy(stream), + col_range_begin.begin(), + col_range_begin.end(), + column_categories.begin(), + col_range_end.begin(), + [] __device__(auto i) { return i + 1; }, + [] __device__(NodeT type) { return type == NC_STRUCT || type == NC_LIST; }); + return std::tuple{tree_meta_t{std::move(column_categories), std::move(parent_col_ids), std::move(column_levels), @@ -407,6 +417,7 @@ struct json_column_data { * @param root Root node of the `d_json_column` tree * @param is_array_of_arrays Whether the tree is an array of arrays * @param is_enabled_lines Whether the input is a line-delimited JSON + * @param is_enabled_mixed_types_as_string Whether to enable reading mixed types as string * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the device memory * of child_offets and validity members of `d_json_column` @@ -418,6 +429,7 @@ void make_device_json_column(device_span input, device_json_column& root, bool is_array_of_arrays, bool is_enabled_lines, + bool is_enabled_mixed_types_as_string, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { @@ -516,6 +528,19 @@ void make_device_json_column(device_span input, col.type = to_json_col_type(column_categories[i]); }; + auto reinitialize_as_string = [&](auto i, auto& col) { + col.string_offsets.resize(max_row_offsets[i] + 1, stream); + col.string_lengths.resize(max_row_offsets[i] + 1, stream); + init_to_zero(col.string_offsets); + init_to_zero(col.string_lengths); + col.num_rows = max_row_offsets[i] + 1; + col.validity = + cudf::detail::create_null_mask(col.num_rows, cudf::mask_state::ALL_NULL, stream, mr); + col.type = json_col_t::StringColumn; + col.child_columns.clear(); // their references should be deleted too. + col.column_order.clear(); + }; + // 2. generate nested columns tree and its device_memory // reorder unique_col_ids w.r.t. column_range_begin for order of column to be in field order. auto h_range_col_id_it = @@ -530,6 +555,7 @@ void make_device_json_column(device_span input, std::map, NodeIndexT> mapped_columns; // find column_ids which are values, but should be ignored in validity std::vector ignore_vals(num_columns, 0); + std::vector is_mixed_type_column(num_columns, 0); columns.try_emplace(parent_node_sentinel, std::ref(root)); for (auto const this_col_id : unique_col_ids) { @@ -552,6 +578,13 @@ void make_device_json_column(device_span input, } else { CUDF_FAIL("Unexpected parent column category"); } + + if (parent_col_id != parent_node_sentinel && is_mixed_type_column[parent_col_id] == 1) { + // if parent is mixed type column, ignore this column. 
+ is_mixed_type_column[this_col_id] = 1; + ignore_vals[this_col_id] = 1; + continue; + } // If the child is already found, // replace if this column is a nested column and the existing was a value column // ignore this column if this column is a value column and the existing was a nested column @@ -560,6 +593,24 @@ void make_device_json_column(device_span input, auto& parent_col = it->second.get(); bool replaced = false; if (mapped_columns.count({parent_col_id, name}) > 0) { + // If mixed type as string is enabled, make both of them strings and merge them. + // All child columns will be ignored when parsing. + if (is_enabled_mixed_types_as_string) { + // VAL/STR or STRUCT or LIST + auto old_col_id = mapped_columns[{parent_col_id, name}]; + + is_mixed_type_column[this_col_id] = 1; + is_mixed_type_column[old_col_id] = 1; + // if old col type (not cat) is list or struct, replace with string. + auto& col = columns.at(old_col_id).get(); + if (col.type == json_col_t::ListColumn or col.type == json_col_t::StructColumn) { + reinitialize_as_string(old_col_id, col); + // all its children (which are already inserted) are ignored later. + } + columns.try_emplace(this_col_id, columns.at(old_col_id)); + continue; + } + if (column_categories[this_col_id] == NC_VAL || column_categories[this_col_id] == NC_STR) { ignore_vals[this_col_id] = 1; continue; @@ -592,6 +643,28 @@ void make_device_json_column(device_span input, columns.try_emplace(this_col_id, std::ref(parent_col.child_columns.at(name))); mapped_columns.try_emplace(std::make_pair(parent_col_id, name), this_col_id); } + + if (is_enabled_mixed_types_as_string) { + // ignore all children of mixed type columns + for (auto const this_col_id : unique_col_ids) { + auto parent_col_id = column_parent_ids[this_col_id]; + if (parent_col_id != parent_node_sentinel and is_mixed_type_column[parent_col_id] == 1) { + is_mixed_type_column[this_col_id] = 1; + ignore_vals[this_col_id] = 1; + columns.erase(this_col_id); + } + // Convert only mixed type columns as string (so to copy), but not its children + if (parent_col_id != parent_node_sentinel and is_mixed_type_column[parent_col_id] == 0 and + is_mixed_type_column[this_col_id] == 1) + column_categories[this_col_id] = NC_STR; + } + cudaMemcpyAsync(d_column_tree.node_categories.begin(), + column_categories.data(), + column_categories.size() * sizeof(column_categories[0]), + cudaMemcpyDefault, + stream.value()); + } + // restore unique_col_ids order std::sort(h_range_col_id_it, h_range_col_id_it + num_columns, [](auto const& a, auto const& b) { return thrust::get<1>(a) < thrust::get<1>(b); @@ -617,14 +690,16 @@ void make_device_json_column(device_span input, rmm::exec_policy(stream), thrust::counting_iterator(0), num_nodes, - [node_categories = tree.node_categories.begin(), - col_ids = col_ids.begin(), - row_offsets = row_offsets.begin(), - range_begin = tree.node_range_begin.begin(), - range_end = tree.node_range_end.begin(), - d_ignore_vals = d_ignore_vals.begin(), - d_columns_data = d_columns_data.begin()] __device__(size_type i) { - switch (node_categories[i]) { + [column_categories = d_column_tree.node_categories.begin(), + col_ids = col_ids.begin(), + row_offsets = row_offsets.begin(), + range_begin = tree.node_range_begin.begin(), + range_end = tree.node_range_end.begin(), + d_ignore_vals = d_ignore_vals.begin(), + d_columns_data = d_columns_data.begin()] __device__(size_type i) { + if (d_ignore_vals[col_ids[i]]) return; + auto const node_category = column_categories[col_ids[i]]; + switch (node_category) { 
case NC_STRUCT: set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); break; case NC_LIST: set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); break; case NC_STR: [[fallthrough]]; @@ -662,10 +737,14 @@ void make_device_json_column(device_span input, num_nodes, thrust::make_counting_iterator(0), thrust::make_zip_iterator(node_ids.begin(), parent_col_ids.begin()), - [node_categories = tree.node_categories.begin(), - parent_node_ids = tree.parent_node_ids.begin()] __device__(size_type node_id) { + [d_ignore_vals = d_ignore_vals.begin(), + parent_node_ids = tree.parent_node_ids.begin(), + column_categories = d_column_tree.node_categories.begin(), + col_ids = col_ids.begin()] __device__(size_type node_id) { auto parent_node_id = parent_node_ids[node_id]; - return parent_node_id != parent_node_sentinel and node_categories[parent_node_id] == NC_LIST; + return parent_node_id != parent_node_sentinel and + column_categories[col_ids[parent_node_id]] == NC_LIST and + (!d_ignore_vals[col_ids[parent_node_id]]); }); auto const num_list_children = @@ -896,8 +975,11 @@ table_with_metadata device_parse_nested_json(device_span d_input, const auto [tokens_gpu, token_indices_gpu] = get_token_stream(d_input, options, stream, rmm::mr::get_current_device_resource()); // gpu tree generation - return get_tree_representation( - tokens_gpu, token_indices_gpu, stream, rmm::mr::get_current_device_resource()); + return get_tree_representation(tokens_gpu, + token_indices_gpu, + options.is_enabled_mixed_types_as_string(), + stream, + rmm::mr::get_current_device_resource()); }(); // IILE used to free memory of token data. #ifdef NJP_DEBUG_PRINT auto h_input = cudf::detail::make_host_vector_async(d_input, stream); @@ -941,6 +1023,7 @@ table_with_metadata device_parse_nested_json(device_span d_input, root_column, is_array_of_arrays, options.is_enabled_lines(), + options.is_enabled_mixed_types_as_string(), stream, mr); diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu index 9a70b987fa5..275907c19c9 100644 --- a/cpp/src/io/json/json_tree.cu +++ b/cpp/src/io/json/json_tree.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -134,6 +134,14 @@ struct node_ranges { } }; +struct is_nested_end { + PdaTokenT const* tokens; + __device__ auto operator()(NodeIndexT i) -> bool + { + return tokens[i] == token_t::StructEnd or tokens[i] == token_t::ListEnd; + } +}; + /** * @brief Returns stable sorted keys and its sorted order * @@ -184,16 +192,16 @@ std::pair, rmm::device_uvector> stable_s } /** - * @brief Propagate parent node to siblings from first sibling. + * @brief Propagate parent node from first sibling to other siblings. * * @param node_levels Node levels of each node * @param parent_node_ids parent node ids initialized for first child of each push node, * and other siblings are initialized to -1. * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ -void propagate_parent_to_siblings(cudf::device_span node_levels, - cudf::device_span parent_node_ids, - rmm::cuda_stream_view stream) +void propagate_first_sibling_to_other(cudf::device_span node_levels, + cudf::device_span parent_node_ids, + rmm::cuda_stream_view stream) { CUDF_FUNC_RANGE(); auto [sorted_node_levels, sorted_order] = stable_sorted_key_order(node_levels, stream); @@ -212,6 +220,7 @@ void propagate_parent_to_siblings(cudf::device_span node_level // Generates a tree representation of the given tokens, token_indices. tree_meta_t get_tree_representation(device_span tokens, device_span token_indices, + bool is_strict_nested_boundaries, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { @@ -297,9 +306,9 @@ tree_meta_t get_tree_representation(device_span tokens, // Node parent ids: // previous push node_id transform, stable sort by level, segmented scan with Max, reorder. rmm::device_uvector parent_node_ids(num_nodes, stream, mr); + rmm::device_uvector node_token_ids(num_nodes, stream); // needed for SE, LE later // This block of code is generalized logical stack algorithm. TODO: make this a separate function. { - rmm::device_uvector node_token_ids(num_nodes, stream); cudf::detail::copy_if_safe(thrust::make_counting_iterator(0), thrust::make_counting_iterator(0) + num_tokens, tokens.begin(), @@ -345,7 +354,7 @@ tree_meta_t get_tree_representation(device_span tokens, }); } // Propagate parent node to siblings from first sibling - inplace. - propagate_parent_to_siblings( + propagate_first_sibling_to_other( cudf::device_span{node_levels.data(), node_levels.size()}, parent_node_ids, stream); @@ -380,6 +389,105 @@ tree_meta_t get_tree_representation(device_span tokens, stream); CUDF_EXPECTS(node_range_out_end - node_range_out_it == num_nodes, "node range count mismatch"); + // Extract Struct, List range_end: + // 1. Extract Struct, List - begin & end separately, their token ids + // 2. push, pop to get levels + // 3. copy first child's parent token_id, also translate to node_id + // 4. propagate to siblings using levels, parent token id. (segmented scan) + // 5. scatter to node_range_end for only nested end tokens. + if (is_strict_nested_boundaries) { + // Whether the token is nested + auto const is_nested = [] __device__(PdaTokenT const token) -> bool { + switch (token) { + case token_t::StructBegin: + case token_t::StructEnd: + case token_t::ListBegin: + case token_t::ListEnd: return true; + default: return false; + }; + }; + auto const num_nested = + thrust::count_if(rmm::exec_policy(stream), tokens.begin(), tokens.end(), is_nested); + rmm::device_uvector token_levels(num_nested, stream); + rmm::device_uvector token_id(num_nested, stream); + rmm::device_uvector parent_node_ids(num_nested, stream); + auto const push_pop_it = thrust::make_transform_iterator( + tokens.begin(), + cuda::proclaim_return_type( + [] __device__(PdaTokenT const token) -> size_type { + if (token == token_t::StructBegin or token == token_t::ListBegin) { + return 1; + } else if (token == token_t::StructEnd or token == token_t::ListEnd) { + return -1; + } + return 0; + })); + // copy_if only struct/list's token levels, token ids, tokens. 
+ auto zipped_in_it = + thrust::make_zip_iterator(push_pop_it, thrust::make_counting_iterator(0)); + auto zipped_out_it = thrust::make_zip_iterator(token_levels.begin(), token_id.begin()); + cudf::detail::copy_if_safe( + zipped_in_it, zipped_in_it + num_tokens, tokens.begin(), zipped_out_it, is_nested, stream); + + thrust::exclusive_scan( + rmm::exec_policy(stream), token_levels.begin(), token_levels.end(), token_levels.begin()); + + // Get parent of first child of struct/list begin. + auto const nested_first_childs_parent_token_id = + [tokens_gpu = tokens.begin(), token_id = token_id.begin()] __device__(auto i) -> NodeIndexT { + if (i <= 0) { return -1; } + auto id = token_id[i - 1]; // current token's predecessor + if (tokens_gpu[id] == token_t::StructBegin or tokens_gpu[id] == token_t::ListBegin) { + return id; + } else { + return -1; + } + }; + + // copied L+S tokens, and their token ids, their token levels. + // initialize first child parent token ids + // translate token ids to node id using similar binary search. + thrust::transform( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(0) + num_nested, + parent_node_ids.begin(), + [node_ids_gpu = node_token_ids.begin(), + num_nodes, + nested_first_childs_parent_token_id] __device__(NodeIndexT const tid) -> NodeIndexT { + auto const pid = nested_first_childs_parent_token_id(tid); + // token_ids which are converted to nodes, are stored in node_ids_gpu in order + // so finding index of token_id in node_ids_gpu will return its node index. + return pid < 0 + ? parent_node_sentinel + : thrust::lower_bound(thrust::seq, node_ids_gpu, node_ids_gpu + num_nodes, pid) - + node_ids_gpu; + // parent_node_sentinel is -1, useful for segmented max operation below + }); + + // propagate parent node from first sibling to other siblings - inplace. + propagate_first_sibling_to_other( + cudf::device_span{token_levels.data(), token_levels.size()}, + parent_node_ids, + stream); + + // scatter to node_range_end for only nested end tokens. + auto token_indices_it = + thrust::make_permutation_iterator(token_indices.begin(), token_id.begin()); + auto nested_node_range_end_it = + thrust::make_transform_output_iterator(node_range_end.begin(), [] __device__(auto i) { + // add +1 to include end symbol. + return i + 1; + }); + auto stencil = thrust::make_transform_iterator(token_id.begin(), is_nested_end{tokens.begin()}); + thrust::scatter_if(rmm::exec_policy(stream), + token_indices_it, + token_indices_it + num_nested, + parent_node_ids.begin(), + stencil, + nested_node_range_end_it); + } + return {std::move(node_categories), std::move(parent_node_ids), std::move(node_levels), diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index 8d89f4ff927..c13daf9b9f5 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
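For context, a minimal sketch of how the new option is exercised through the public C++ API. It mirrors the MixedTypes test added to cpp/tests/io/json_test.cpp further below; the inline JSON input and the expected column types are taken from that test and are illustrative only, not an additional part of this change.

#include <cudf/io/json.hpp>
#include <string>

void read_mixed_types_sketch()
{
  // Two JSON lines where "foo" is a list in one row and a struct in the other.
  std::string json_string = R"({ "foo": [1,2,3], "bar": 123 }
{ "foo": { "a": 1 }, "bar": 456 })";

  cudf::io::json_reader_options in_options =
    cudf::io::json_reader_options::builder(
      cudf::io::source_info{json_string.data(), json_string.size()})
      .mixed_types_as_string(true)  // new option: parse mixed-type columns as strings
      .lines(true);

  // "foo" is returned as a STRING column holding the raw JSON text of each value
  // ("[1,2,3]" and "{ \"a\": 1 }"); "bar" stays INT64 since its type is consistent.
  cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
}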
@@ -216,6 +216,7 @@ std::pair, rmm::device_uvector> pr * * @param tokens Vector of token types in the json string * @param token_indices The indices within the input string corresponding to each token + * @param is_strict_nested_boundaries Whether to extract node end of nested types strictly * @param stream The CUDA stream to which kernels are dispatched * @param mr Optional, resource with which to allocate * @return A tree representation of the input JSON string as vectors of node type, parent index, @@ -223,6 +224,7 @@ std::pair, rmm::device_uvector> pr */ tree_meta_t get_tree_representation(device_span tokens, device_span token_indices, + bool is_strict_nested_boundaries, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index 7fce31461ef..22c2f0de924 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -2050,4 +2050,117 @@ TEST_F(JsonReaderTest, JSONLinesRecoveringIgnoreExcessChars) float64_wrapper{{0.0, 0.0, 0.0, 1.2, 0.0, 0.0, 0.0, 0.0}, c_validity.cbegin()}); } +TEST_F(JsonReaderTest, MixedTypes) +{ + { + // Simple test for mixed types + std::string json_string = R"({ "foo": [1,2,3], "bar": 123 } + { "foo": { "a": 1 }, "bar": 456 })"; + + cudf::io::json_reader_options in_options = + cudf::io::json_reader_options::builder( + cudf::io::source_info{json_string.data(), json_string.size()}) + .mixed_types_as_string(true) + .lines(true); + + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + + EXPECT_EQ(result.tbl->num_columns(), 2); + EXPECT_EQ(result.tbl->num_rows(), 2); + EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::STRING); + EXPECT_EQ(result.tbl->get_column(1).type().id(), cudf::type_id::INT64); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), + cudf::test::strings_column_wrapper({"[1,2,3]", "{ \"a\": 1 }"})); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(1), + cudf::test::fixed_width_column_wrapper({123, 456})); + } + + // Testing function for mixed types in JSON (for spark json reader) + auto test_fn = [](std::string_view json_string, cudf::column_view expected) { + cudf::io::json_reader_options in_options = + cudf::io::json_reader_options::builder( + cudf::io::source_info{json_string.data(), json_string.size()}) + .mixed_types_as_string(true) + .lines(true); + + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), expected); + }; + + // test cases. 
+ test_fn(R"( +{ "a": "123" } +{ "a": 123 } +)", + cudf::test::strings_column_wrapper({"123", "123"})); + + test_fn(R"( +{ "a": [1,2,3] } +{ "a": { "b": 1 } } +)", + cudf::test::strings_column_wrapper({"[1,2,3]", "{ \"b\": 1 }"})); + + test_fn(R"( +{ "a": "fox" } +{ "a": { "b": 1 } } +)", + cudf::test::strings_column_wrapper({"fox", "{ \"b\": 1 }"})); + + test_fn(R"( +{ "a": [1,2,3] } +{ "a": "fox" } +)", + cudf::test::strings_column_wrapper({"[1,2,3]", "fox"})); + + test_fn(R"( +{ "a": [1,2,3] } +{ "a": [true,false,true] } +{ "a": ["a", "b", "c"] } +)", + cudf::test::lists_column_wrapper{ + {"1", "2", "3"}, {"true", "false", "true"}, {"a", "b", "c"}}); + { + std::string json_string = R"( +{ "var1": true } +{ "var1": [{ "var0": true, "var1": "hello", "var2": null }, null, [true, null, null]] } + )"; + + cudf::io::json_reader_options in_options = + cudf::io::json_reader_options::builder( + cudf::io::source_info{json_string.data(), json_string.size()}) + .mixed_types_as_string(true) + .lines(true); + + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + } + + // test to confirm if reinitialize a non-string column as string affects max_rowoffsets. + // max_rowoffsets is generated based on parent col id, + // so, even if mixed types are present, their row offset will be correct. + using LCW = cudf::test::lists_column_wrapper; + using valid_t = std::vector; + + cudf::test::lists_column_wrapper expected_list{ + { + cudf::test::lists_column_wrapper({LCW({"1", "2", "3"}), LCW({"4", "5", "6"})}), + cudf::test::lists_column_wrapper({LCW()}), + cudf::test::lists_column_wrapper({LCW()}), // null + cudf::test::lists_column_wrapper({LCW()}), // null + cudf::test::lists_column_wrapper({LCW({"{\"c\": -1}"}), LCW({"5"})}), + cudf::test::lists_column_wrapper({LCW({"7"}), LCW({"8", "9"})}), + cudf::test::lists_column_wrapper({LCW()}), // null + }, + valid_t{1, 1, 0, 0, 1, 1, 0}.begin()}; + test_fn(R"( +{"b": [ [1, 2, 3], [ 4, 5, 6] ]} +{"b": [[]]} +{} +{} +{"b": [ [ {"c": -1} ], [ 5 ] ]} +{"b": [ [7], [8, 9]]} +{} +)", + expected_list); +} + CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/io/json_tree.cpp b/cpp/tests/io/json_tree.cpp index f5d03293d30..40996e4fffa 100644 --- a/cpp/tests/io/json_tree.cpp +++ b/cpp/tests/io/json_tree.cpp @@ -596,7 +596,7 @@ TEST_F(JsonTest, TreeRepresentation) // Get the JSON's tree representation auto gpu_tree = cuio_json::detail::get_tree_representation( - tokens_gpu, token_indices_gpu, stream, rmm::mr::get_current_device_resource()); + tokens_gpu, token_indices_gpu, false, stream, rmm::mr::get_current_device_resource()); // host tree generation auto cpu_tree = get_tree_representation_cpu(tokens_gpu, token_indices_gpu, options, stream); compare_trees(cpu_tree, gpu_tree); @@ -684,7 +684,7 @@ TEST_F(JsonTest, TreeRepresentation2) // Get the JSON's tree representation auto gpu_tree = cuio_json::detail::get_tree_representation( - tokens_gpu, token_indices_gpu, stream, rmm::mr::get_current_device_resource()); + tokens_gpu, token_indices_gpu, false, stream, rmm::mr::get_current_device_resource()); // host tree generation auto cpu_tree = get_tree_representation_cpu(tokens_gpu, token_indices_gpu, options, stream); compare_trees(cpu_tree, gpu_tree); @@ -759,7 +759,7 @@ TEST_F(JsonTest, TreeRepresentation3) // Get the JSON's tree representation auto gpu_tree = cuio_json::detail::get_tree_representation( - tokens_gpu, token_indices_gpu, stream, rmm::mr::get_current_device_resource()); + tokens_gpu, token_indices_gpu, false, stream, 
rmm::mr::get_current_device_resource()); // host tree generation auto cpu_tree = get_tree_representation_cpu(tokens_gpu, token_indices_gpu, options, stream); compare_trees(cpu_tree, gpu_tree); @@ -785,9 +785,10 @@ TEST_F(JsonTest, TreeRepresentationError) // Get the JSON's tree representation // This JSON is invalid and will raise an exception. - EXPECT_THROW(cuio_json::detail::get_tree_representation( - tokens_gpu, token_indices_gpu, stream, rmm::mr::get_current_device_resource()), - cudf::logic_error); + EXPECT_THROW( + cuio_json::detail::get_tree_representation( + tokens_gpu, token_indices_gpu, false, stream, rmm::mr::get_current_device_resource()), + cudf::logic_error); } /** @@ -876,7 +877,7 @@ TEST_P(JsonTreeTraversalTest, CPUvsGPUTraversal) records_orient_tree_traversal_cpu(input, cpu_tree, is_array_of_arrays, json_lines, stream); // gpu tree generation auto gpu_tree = cuio_json::detail::get_tree_representation( - tokens_gpu, token_indices_gpu, stream, rmm::mr::get_current_device_resource()); + tokens_gpu, token_indices_gpu, false, stream, rmm::mr::get_current_device_resource()); #if LIBCUDF_JSON_DEBUG_DUMP printf("BEFORE traversal (gpu_tree):\n"); diff --git a/java/src/main/java/ai/rapids/cudf/JSONOptions.java b/java/src/main/java/ai/rapids/cudf/JSONOptions.java index f98687df5fa..523d594f8ba 100644 --- a/java/src/main/java/ai/rapids/cudf/JSONOptions.java +++ b/java/src/main/java/ai/rapids/cudf/JSONOptions.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -30,12 +30,14 @@ public final class JSONOptions extends ColumnFilterOptions { private final boolean dayFirst; private final boolean lines; private final boolean recoverWithNull; + private final boolean mixedTypesAsStrings; private JSONOptions(Builder builder) { super(builder); dayFirst = builder.dayFirst; lines = builder.lines; recoverWithNull = builder.recoverWithNull; + mixedTypesAsStrings = builder.mixedTypesAsStrings; } public boolean isDayFirst() { @@ -51,6 +53,10 @@ public boolean isRecoverWithNull() { return recoverWithNull; } + public boolean isMixedTypesAsStrings() { + return mixedTypesAsStrings; + } + @Override String[] getIncludeColumnNames() { throw new UnsupportedOperationException("JSON reader didn't support column prune"); @@ -66,6 +72,8 @@ public static final class Builder extends ColumnFilterOptions.Builder= 0 && offset < buffer.length; return new TableWithMeta(readAndInferJSON(buffer.getAddress() + offset, len, - opts.isDayFirst(), opts.isLines(), opts.isRecoverWithNull())); + opts.isDayFirst(), opts.isLines(), opts.isRecoverWithNull(), + opts.isMixedTypesAsStrings())); } /** @@ -1162,7 +1166,7 @@ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer b try (TableWithMeta twm = new TableWithMeta(readJSON(schema.getColumnNames(), schema.getTypeIds(), schema.getTypeScales(), null, buffer.getAddress() + offset, len, opts.isDayFirst(), opts.isLines(), - opts.isRecoverWithNull()))) { + opts.isRecoverWithNull(), opts.isMixedTypesAsStrings()))) { return gatherJSONColumns(schema, twm); } } @@ -1178,7 +1182,7 @@ public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds) { long dsHandle = DataSourceHelper.createWrapperDataSource(ds); try (TableWithMeta twm = new TableWithMeta(readJSONFromDataSource(schema.getColumnNames(), schema.getTypeIds(), schema.getTypeScales(), 
opts.isDayFirst(), opts.isLines(), - opts.isRecoverWithNull(), dsHandle))) { + opts.isRecoverWithNull(), opts.isMixedTypesAsStrings(), dsHandle))) { return gatherJSONColumns(schema, twm); } finally { DataSourceHelper.destroyWrapperDataSource(dsHandle); diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 295574858da..1ac15a3023c 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -1392,7 +1392,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_endWriteCSVToBuffer(JNIEnv *env JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSON( JNIEnv *env, jclass, jlong buffer, jlong buffer_length, jboolean day_first, jboolean lines, - jboolean recover_with_null) { + jboolean recover_with_null, jboolean mixed_types_as_string) { JNI_NULL_CHECK(env, buffer, "buffer cannot be null", 0); if (buffer_length <= 0) { @@ -1411,7 +1411,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSON( cudf::io::json_reader_options_builder opts = cudf::io::json_reader_options::builder(source) .dayfirst(static_cast(day_first)) .lines(static_cast(lines)) - .recovery_mode(recovery_mode); + .recovery_mode(recovery_mode) + .mixed_types_as_string(mixed_types_as_string); auto result = std::make_unique(cudf::io::read_json(opts.build())); @@ -1469,7 +1470,8 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_TableWithMeta_releaseTable(JNIE JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSONFromDataSource( JNIEnv *env, jclass, jobjectArray col_names, jintArray j_types, jintArray j_scales, - jboolean day_first, jboolean lines, jboolean recover_with_null, jlong ds_handle) { + jboolean day_first, jboolean lines, jboolean recover_with_null, jboolean mixed_types_as_string, + jlong ds_handle) { JNI_NULL_CHECK(env, ds_handle, "no data source handle given", 0); @@ -1504,7 +1506,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSONFromDataSource( cudf::io::json_reader_options_builder opts = cudf::io::json_reader_options::builder(source) .dayfirst(static_cast(day_first)) .lines(static_cast(lines)) - .recovery_mode(recovery_mode); + .recovery_mode(recovery_mode) + .mixed_types_as_string(mixed_types_as_string); if (!n_col_names.is_null() && data_types.size() > 0) { if (n_col_names.size() != n_types.size()) { @@ -1536,7 +1539,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSONFromDataSource( JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON( JNIEnv *env, jclass, jobjectArray col_names, jintArray j_types, jintArray j_scales, jstring inputfilepath, jlong buffer, jlong buffer_length, jboolean day_first, jboolean lines, - jboolean recover_with_null) { + jboolean recover_with_null, jboolean mixed_types_as_string) { bool read_buffer = true; if (buffer == 0) { @@ -1586,7 +1589,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON( cudf::io::json_reader_options_builder opts = cudf::io::json_reader_options::builder(source) .dayfirst(static_cast(day_first)) .lines(static_cast(lines)) - .recovery_mode(recovery_mode); + .recovery_mode(recovery_mode) + .mixed_types_as_string(mixed_types_as_string); if (!n_col_names.is_null() && data_types.size() > 0) { if (n_col_names.size() != n_types.size()) { diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index 8df8ebea8a7..73002644858 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 
2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -87,6 +87,8 @@ public class TableTest extends CudfTestBase { private static final File TEST_SIMPLE_CSV_FILE = TestUtils.getResourceAsFile("simple.csv"); private static final File TEST_SIMPLE_JSON_FILE = TestUtils.getResourceAsFile("people.json"); private static final File TEST_JSON_ERROR_FILE = TestUtils.getResourceAsFile("people_with_invalid_lines.json"); + private static final File TEST_MIXED_TYPE_1_JSON = TestUtils.getResourceAsFile("mixed_types_1.json"); + private static final File TEST_MIXED_TYPE_2_JSON = TestUtils.getResourceAsFile("mixed_types_2.json"); private static final Schema CSV_DATA_BUFFER_SCHEMA = Schema.builder() .column(DType.INT32, "A") @@ -327,6 +329,54 @@ void testReadJSONFile() { } } + @Test + void testReadMixedType2JSONFileFeatureDisabled() { + Schema schema = Schema.builder() + .column(DType.STRING, "a") + .build(); + JSONOptions opts = JSONOptions.builder() + .withLines(true) + .withMixedTypesAsStrings(false) + .build(); + assertThrows(CudfException.class, () -> + Table.readJSON(schema, opts, TEST_MIXED_TYPE_2_JSON)); + } + + @Test + void testReadMixedType1JSONFile() { + Schema schema = Schema.builder() + .column(DType.STRING, "a") + .build(); + JSONOptions opts = JSONOptions.builder() + .withLines(true) + .withMixedTypesAsStrings(true) + .build(); + try (Table expected = new Table.TestBuilder() + .column("123", "123" ) + .build(); + Table table = Table.readJSON(schema, opts, TEST_MIXED_TYPE_1_JSON)) { + assertTablesAreEqual(expected, table); + } + } + + @Test + void testReadMixedType2JSONFile() throws IOException { + Schema schema = Schema.builder() + .column(DType.STRING, "a") + .build(); + JSONOptions opts = JSONOptions.builder() + .withLines(true) + .withMixedTypesAsStrings(true) + .build(); + try (Table expected = new Table.TestBuilder() + .column("[1,2,3]", "{ \"b\": 1 }" ) + .build(); + MultiBufferDataSource source = sourceFrom(TEST_MIXED_TYPE_2_JSON); + Table table = Table.readJSON(schema, opts, source)) { + assertTablesAreEqual(expected, table); + } + } + @Test void testReadJSONFromDataSource() throws IOException { Schema schema = Schema.builder() diff --git a/java/src/test/resources/mixed_types_1.json b/java/src/test/resources/mixed_types_1.json new file mode 100644 index 00000000000..21d625bbf2a --- /dev/null +++ b/java/src/test/resources/mixed_types_1.json @@ -0,0 +1,2 @@ +{ "a": "123" } +{ "a": 123 } diff --git a/java/src/test/resources/mixed_types_2.json b/java/src/test/resources/mixed_types_2.json new file mode 100644 index 00000000000..becad2d0db7 --- /dev/null +++ b/java/src/test/resources/mixed_types_2.json @@ -0,0 +1,2 @@ +{ "a": [1,2,3] } +{ "a": { "b": 1 } } diff --git a/python/cudf/cudf/_lib/cpp/io/json.pxd b/python/cudf/cudf/_lib/cpp/io/json.pxd index ad618cc4ed6..b916c2b7ad9 100644 --- a/python/cudf/cudf/_lib/cpp/io/json.pxd +++ b/python/cudf/cudf/_lib/cpp/io/json.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
from libc.stdint cimport uint8_t from libcpp cimport bool @@ -27,6 +27,7 @@ cdef extern from "cudf/io/json.hpp" \ size_type get_byte_range_offset() except + size_type get_byte_range_size() except + bool is_enabled_lines() except + + bool is_enabled_mixed_types_as_string() except + bool is_enabled_dayfirst() except + bool is_enabled_experimental() except + @@ -39,6 +40,7 @@ cdef extern from "cudf/io/json.hpp" \ void set_byte_range_offset(size_type offset) except + void set_byte_range_size(size_type size) except + void enable_lines(bool val) except + + void enable_mixed_types_as_string(bool val) except + void enable_dayfirst(bool val) except + void enable_experimental(bool val) except + void enable_keep_quotes(bool val) except + @@ -74,6 +76,9 @@ cdef extern from "cudf/io/json.hpp" \ json_reader_options_builder& lines( bool val ) except + + json_reader_options_builder& mixed_types_as_string( + bool val + ) except + json_reader_options_builder& dayfirst( bool val ) except + diff --git a/python/cudf/cudf/_lib/json.pyx b/python/cudf/cudf/_lib/json.pyx index c361a3f00c4..9bbad0f61c3 100644 --- a/python/cudf/cudf/_lib/json.pyx +++ b/python/cudf/cudf/_lib/json.pyx @@ -50,7 +50,8 @@ cpdef read_json(object filepaths_or_buffers, object compression, object byte_range, bool legacy, - bool keep_quotes): + bool keep_quotes, + bool mixed_types_as_string): """ Cython function to call into libcudf API, see `read_json`. @@ -128,6 +129,7 @@ cpdef read_json(object filepaths_or_buffers, opts.set_dtypes(c_dtypes_schema_map) opts.enable_keep_quotes(keep_quotes) + opts.enable_mixed_types_as_string(mixed_types_as_string) # Read JSON cdef cudf_io_types.table_with_metadata c_result diff --git a/python/cudf/cudf/io/json.py b/python/cudf/cudf/io/json.py index ae2f0203642..35d91f9c062 100644 --- a/python/cudf/cudf/io/json.py +++ b/python/cudf/cudf/io/json.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. import warnings from collections import abc @@ -25,6 +25,7 @@ def read_json( byte_range=None, keep_quotes=False, storage_options=None, + mixed_types_as_string=False, *args, **kwargs, ): @@ -116,6 +117,7 @@ def read_json( byte_range, engine == "cudf_legacy", keep_quotes, + mixed_types_as_string, ) else: warnings.warn(