From a91853d09ce4a25c10b300c8eb755028a5ea46ac Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Thu, 22 Sep 2022 08:11:43 +0200 Subject: [PATCH] Adds option to take explicit nested schema for nested JSON reader (#11682) This PR adds the option to take an explicit nested schema, allowing users to specify the target data types of the leave columns in the nested JSON reader. This PR adds the corresponding interface and implementation to libcudf. In addition, the PR makes existing JSON reader tests parametrised tests and enables those tests for dual execution of (1) the existing JSON reader and (2) the new nested JSON reader. Authors: - Elias Stehle (https://github.com/elstehle) - Vukasin Milovanovic (https://github.com/vuule) - Yunsong Wang (https://github.com/PointKernel) - Karthikeyan (https://github.com/karthikeyann) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - David Wendt (https://github.com/davidwendt) - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/11682 --- cpp/include/cudf/io/json.hpp | 45 +- cpp/src/io/json/experimental/read_json.cpp | 3 - cpp/src/io/json/nested_json_gpu.cu | 98 +++- cpp/src/io/json/reader_impl.cu | 12 + cpp/src/io/utilities/parsing_utils.cuh | 10 + cpp/tests/io/json_test.cpp | 535 ++++++++++++++++++--- cpp/tests/io/nested_json_test.cpp | 9 +- 7 files changed, 617 insertions(+), 95 deletions(-) diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index aa7dca0dad3..66ac6d74cff 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -38,6 +38,22 @@ namespace io { class json_reader_options_builder; +/** + * @brief Allows specifying the target types for nested JSON data via json_reader_options' + * `set_dtypes` method. + */ +struct schema_element { + /** + * @brief The type that this column should be converted to + */ + data_type type; + + /** + * @brief Allows specifying this column's child columns target type + */ + std::map child_types; +}; + /** * @brief Input arguments to the `read_json` interface. * @@ -65,7 +81,10 @@ class json_reader_options { source_info _source; // Data types of the column; empty to infer dtypes - std::variant, std::map> _dtypes; + std::variant, + std::map, + std::map> + _dtypes; // Specify the compression format of the source or infer from file extension compression_type _compression = compression_type::AUTO; @@ -123,7 +142,10 @@ class json_reader_options { * * @returns Data types of the columns */ - std::variant, std::map> const& get_dtypes() const + std::variant, + std::map, + std::map> const& + get_dtypes() const { return _dtypes; } @@ -227,6 +249,13 @@ class json_reader_options { */ void set_dtypes(std::map types) { _dtypes = std::move(types); } + /** + * @brief Set data types for a potentially nested column hierarchy. + * + * @param types Map of column names to schema_element to support arbitrary nesting of data types + */ + void set_dtypes(std::map types) { _dtypes = std::move(types); } + /** * @brief Set the compression type. * @@ -323,6 +352,18 @@ class json_reader_options_builder { return *this; } + /** + * @brief Set data types for columns to be read. + * + * @param types Column name -> schema_element map + * @return this for chaining + */ + json_reader_options_builder& dtypes(std::map types) + { + options._dtypes = std::move(types); + return *this; + } + /** * @brief Set the compression type. * diff --git a/cpp/src/io/json/experimental/read_json.cpp b/cpp/src/io/json/experimental/read_json.cpp index ceac40ba4f9..7d78bd34b19 100644 --- a/cpp/src/io/json/experimental/read_json.cpp +++ b/cpp/src/io/json/experimental/read_json.cpp @@ -47,9 +47,6 @@ table_with_metadata read_json(host_span> sources, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto const dtypes_empty = - std::visit([](const auto& dtypes) { return dtypes.empty(); }, reader_opts.get_dtypes()); - CUDF_EXPECTS(dtypes_empty, "user specified dtypes are not yet supported"); CUDF_EXPECTS(reader_opts.get_byte_range_offset() == 0 and reader_opts.get_byte_range_size() == 0, "specifying a byte range is not yet supported"); diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index de814cb5358..7e567aae9fe 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -1541,6 +1542,7 @@ auto parsing_options(cudf::io::json_reader_options const& options) auto parse_opts = cudf::io::parse_options{',', '\n', '\"', '.'}; auto const stream = cudf::default_stream_value; + parse_opts.dayfirst = options.is_enabled_dayfirst(); parse_opts.keepquotes = options.is_enabled_keep_quotes(); parse_opts.trie_true = cudf::detail::create_serialized_trie({"true"}, stream); parse_opts.trie_false = cudf::detail::create_serialized_trie({"false"}, stream); @@ -1552,6 +1554,7 @@ std::pair, std::vector> json_column_to json_column const& json_col, device_span d_input, cudf::io::json_reader_options const& options, + std::optional schema, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { @@ -1567,6 +1570,14 @@ std::pair, std::vector> json_column_to json_col.current_offset - json_col.valid_count}; }; + auto get_child_schema = [schema](auto child_name) -> std::optional { + if (schema.has_value()) { + auto const result = schema.value().child_types.find(child_name); + if (result != std::end(schema.value().child_types)) { return result->second; } + } + return {}; + }; + switch (json_col.type) { case json_col_t::StringColumn: { auto const col_size = json_col.string_offsets.size(); @@ -1597,9 +1608,21 @@ std::pair, std::vector> json_column_to data + thrust::get<0>(ip), static_cast(thrust::get<1>(ip))}; }); - // Infer column type - auto target_type = cudf::io::detail::infer_data_type( - parsing_options(options).json_view(), d_input, string_ranges_it, col_size, stream); + data_type target_type{}; + + if (schema.has_value()) { +#ifdef NJP_DEBUG_PRINT + std::cout << "-> explicit type: " + << (schema.has_value() ? std::to_string(static_cast(schema->type.id())) + : "n/a"); +#endif + target_type = schema.value().type; + } + // Infer column type, if we don't have an explicit type for it + else { + target_type = cudf::io::detail::infer_data_type( + parsing_options(options).json_view(), d_input, string_ranges_it, col_size, stream); + } // Convert strings to the inferred data type auto col = experimental::detail::parse_data(string_spans_it, @@ -1611,7 +1634,12 @@ std::pair, std::vector> json_column_to mr); // Reset nullable if we do not have nulls - if (col->null_count() == 0) { col->set_null_mask(rmm::device_buffer{0, stream, mr}, 0); } + // This is to match the existing JSON reader's behaviour: + // - Non-string columns will always be returned as nullable + // - String columns will be returned as nullable, iff there's at least one null entry + if (target_type.id() == type_id::STRING and col->null_count() == 0) { + col->set_null_mask(rmm::device_buffer{0, stream, mr}, 0); + } // For string columns return ["offsets", "char"] schema if (target_type.id() == type_id::STRING) { @@ -1631,9 +1659,9 @@ std::pair, std::vector> json_column_to for (auto const& col_name : json_col.column_order) { auto const& col = json_col.child_columns.find(col_name); column_names.emplace_back(col->first); - auto const& child_col = col->second; - auto [child_column, names] = - json_column_to_cudf_column(child_col, d_input, options, stream, mr); + auto const& child_col = col->second; + auto [child_column, names] = json_column_to_cudf_column( + child_col, d_input, options, get_child_schema(col_name), stream, mr); CUDF_EXPECTS(num_rows == child_column->size(), "All children columns must have the same size"); child_columns.push_back(std::move(child_column)); @@ -1657,8 +1685,13 @@ std::pair, std::vector> json_column_to auto offsets_column = std::make_unique(data_type{type_id::INT32}, num_rows, d_offsets.release()); // Create children column - auto [child_column, names] = json_column_to_cudf_column( - json_col.child_columns.begin()->second, d_input, options, stream, mr); + auto [child_column, names] = + json_column_to_cudf_column(json_col.child_columns.begin()->second, + d_input, + options, + get_child_schema(json_col.child_columns.begin()->first), + stream, + mr); column_names.back().children = names; auto [result_bitmask, null_count] = make_validity(json_col); return {make_lists_column(num_rows - 1, @@ -1738,16 +1771,61 @@ table_with_metadata parse_nested_json(host_span input, std::vector out_column_names; // Iterate over the struct's child columns and convert to cudf column + size_type column_index = 0; for (auto const& col_name : root_struct_col.column_order) { auto const& json_col = root_struct_col.child_columns.find(col_name)->second; // Insert this columns name into the schema out_column_names.emplace_back(col_name); + std::optional child_schema_element = std::visit( + cudf::detail::visitor_overload{ + [column_index](const std::vector& user_dtypes) -> std::optional { + auto ret = (static_cast(column_index) < user_dtypes.size()) + ? std::optional{{user_dtypes[column_index]}} + : std::optional{}; +#ifdef NJP_DEBUG_PRINT + std::cout << "Column by index: #" << column_index << ", type id: " + << (ret.has_value() ? std::to_string(static_cast(ret->type.id())) : "n/a") + << ", with " << (ret.has_value() ? ret->child_types.size() : 0) << " children" + << "\n"; +#endif + return ret; + }, + [col_name]( + std::map const& user_dtypes) -> std::optional { + auto ret = (user_dtypes.find(col_name) != std::end(user_dtypes)) + ? std::optional{{user_dtypes.find(col_name)->second}} + : std::optional{}; +#ifdef NJP_DEBUG_PRINT + std::cout << "Column by flat name: '" << col_name << "', type id: " + << (ret.has_value() ? std::to_string(static_cast(ret->type.id())) : "n/a") + << ", with " << (ret.has_value() ? ret->child_types.size() : 0) << " children" + << "\n"; +#endif + return ret; + }, + [col_name](std::map const& user_dtypes) + -> std::optional { + auto ret = (user_dtypes.find(col_name) != std::end(user_dtypes)) + ? user_dtypes.find(col_name)->second + : std::optional{}; +#ifdef NJP_DEBUG_PRINT + std::cout << "Column by nested name: #" << col_name << ", type id: " + << (ret.has_value() ? std::to_string(static_cast(ret->type.id())) : "n/a") + << ", with " << (ret.has_value() ? ret->child_types.size() : 0) << " children" + << "\n"; +#endif + return ret; + }}, + options.get_dtypes()); + // Get this JSON column's cudf column and schema info auto [cudf_col, col_name_info] = - json_column_to_cudf_column(json_col, d_input, options, stream, mr); + json_column_to_cudf_column(json_col, d_input, options, child_schema_element, stream, mr); out_column_names.back().children = std::move(col_name_info); out_columns.emplace_back(std::move(cudf_col)); + + column_index++; } return table_with_metadata{std::make_unique(std::move(out_columns)), diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index da6e7621449..48b2af81fcd 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -432,6 +432,18 @@ std::vector get_data_types(json_reader_options const& reader_opts, return it->second; }); return sorted_dtypes; + }, + [&](const std::map& dtypes) { + std::vector sorted_dtypes; + std::transform(std::cbegin(column_names), + std::cend(column_names), + std::back_inserter(sorted_dtypes), + [&](auto const& column_name) { + auto const it = dtypes.find(column_name); + CUDF_EXPECTS(it != dtypes.end(), "Must specify types for all columns"); + return it->second.type; + }); + return sorted_dtypes; }}, reader_opts.get_dtypes()); } else { diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh index 118fde6fdb6..388c9b28001 100644 --- a/cpp/src/io/utilities/parsing_utils.cuh +++ b/cpp/src/io/utilities/parsing_utils.cuh @@ -563,12 +563,22 @@ __inline__ __device__ T decode_value(char const* begin, char const* end, parse_options_view const& opts) { + // If this is a string value, remove quotes + if ((thrust::distance(begin, end) >= 2 && *begin == '\"' && *thrust::prev(end) == '\"')) { + thrust::advance(begin, 1); + thrust::advance(end, -1); + } return to_timestamp(begin, end, opts.dayfirst); } template ())> __inline__ __device__ T decode_value(char const* begin, char const* end, parse_options_view const&) { + // If this is a string value, remove quotes + if ((thrust::distance(begin, end) >= 2 && *begin == '\"' && *thrust::prev(end) == '\"')) { + thrust::advance(begin, 1); + thrust::advance(end, -1); + } return to_duration(begin, end); } diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index 7f698774084..5a0db6e3c64 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -27,6 +27,9 @@ #include #include #include +#include + +#include #include @@ -156,14 +159,91 @@ void check_float_column(cudf::column_view const& col, struct JsonReaderTest : public cudf::test::BaseFixture { }; -TEST_F(JsonReaderTest, BasicJsonLines) +/** + * @brief Enum class to be used to specify the test case of parametrized tests + */ +enum class json_test_t { + // Run test with the existing JSON lines reader using row-orient input data + json_lines_row_orient, + // Run test with the existing JSON lines reader using record-orient input data + json_lines_record_orient, + // Run test with the nested JSON lines reader using record-orient input data + json_experimental_record_orient +}; + +/** + * @brief Test fixture for parametrized JSON reader tests + */ +struct JsonReaderParamTest : public cudf::test::BaseFixture, + public testing::WithParamInterface { +}; + +/** + * @brief Test fixture for parametrized JSON reader tests, testing record orient-only for existing + * JSON lines reader and the new experimental reader + */ +struct JsonReaderDualTest : public cudf::test::BaseFixture, + public testing::WithParamInterface { +}; + +/** + * @brief Generates a JSON lines string that uses the record orient + * + * @param records An array of a map of key-value pairs + * @param record_delimiter The delimiter to be used to delimit a record + * @param prefix The prefix prepended to the whole string + * @param suffix The suffix to be appended after the whole string + * @return The JSON lines string that uses the record orient + */ +std::string to_records_orient(std::vector> const& records, + std::string record_delimiter, + std::string prefix = "", + std::string suffix = "") { - std::string data = "[1, 1.1]\n[2, 2.2]\n[3, 3.3]\n"; + std::string result = prefix; + for (auto record_it = std::cbegin(records); record_it != std::cend(records); record_it++) { + result += "{"; + for (auto kv_pair_it = std::cbegin(*record_it); kv_pair_it != std::cend(*record_it); + kv_pair_it++) { + auto const& [key, value] = *kv_pair_it; + result += "\"" + key + "\":" + value; + result += (kv_pair_it != std::prev(std::end(*record_it))) ? ", " : ""; + } + result += "}"; + if (record_it != std::prev(std::end(records))) { result += record_delimiter; } + } + return (result + suffix); +} + +// Parametrize qualifying JSON tests for executing both experimental reader and existing JSON lines +// reader +INSTANTIATE_TEST_CASE_P(JsonReaderParamTest, + JsonReaderParamTest, + ::testing::Values(json_test_t::json_lines_row_orient, + json_test_t::json_lines_record_orient, + json_test_t::json_experimental_record_orient)); + +// Parametrize qualifying JSON tests for executing both experimental reader and existing JSON lines +// reader +INSTANTIATE_TEST_CASE_P(JsonReaderDualTest, + JsonReaderDualTest, + ::testing::Values(json_test_t::json_lines_record_orient, + json_test_t::json_experimental_record_orient)); + +TEST_P(JsonReaderParamTest, BasicJsonLines) +{ + auto const test_opt = GetParam(); + bool const test_experimental = (test_opt == json_test_t::json_experimental_record_orient); + std::string row_orient = "[1, 1.1]\n[2, 2.2]\n[3, 3.3]\n"; + std::string record_orient = to_records_orient( + {{{"0", "1"}, {"1", "1.1"}}, {{"0", "2"}, {"1", "2.2"}}, {{"0", "3"}, {"1", "3.3"}}}, "\n"); + std::string data = (test_opt == json_test_t::json_lines_row_orient) ? row_orient : record_orient; cudf_io::json_reader_options in_options = cudf_io::json_reader_options::builder(cudf_io::source_info{data.data(), data.size()}) .dtypes(std::vector{dtype(), dtype()}) - .lines(true); + .lines(true) + .experimental(test_experimental); cudf_io::table_with_metadata result = cudf_io::read_json(in_options); EXPECT_EQ(result.tbl->num_columns(), 2); @@ -182,19 +262,36 @@ TEST_F(JsonReaderTest, BasicJsonLines) float64_wrapper{{1.1, 2.2, 3.3}, validity}); } -TEST_F(JsonReaderTest, FloatingPoint) +TEST_P(JsonReaderParamTest, FloatingPoint) { + auto const test_opt = GetParam(); + bool const test_experimental = (test_opt == json_test_t::json_experimental_record_orient); + std::string row_orient = + "[5.6]\n[0.5679e2]\n[1.2e10]\n[0.07e1]\n[3000e-3]\n[12.34e0]\n[3.1e-001]\n[-73." + "98007199999998]\n"; + std::string record_orient = to_records_orient({{{"0", "5.6"}}, + {{"0", "0.5679e2"}}, + {{"0", "1.2e10"}}, + {{"0", "0.07e1"}}, + {{"0", "3000e-3"}}, + {{"0", "12.34e0"}}, + {{"0", "3.1e-001"}}, + {{"0", "-73.98007199999998"}}}, + "\n"); + std::string data = (test_opt == json_test_t::json_lines_row_orient) ? row_orient : record_orient; + auto filepath = temp_env->get_temp_dir() + "FloatingPoint.json"; { std::ofstream outfile(filepath, std::ofstream::out); - outfile << "[5.6]\n[0.5679e2]\n[1.2e10]\n[0.07e1]\n[3000e-3]\n[12.34e0]\n[3.1e-001]\n[-73." - "98007199999998]\n"; + outfile << data; } cudf_io::json_reader_options in_options = cudf_io::json_reader_options::builder(cudf_io::source_info{filepath}) .dtypes({dtype()}) - .lines(true); + .lines(true) + .experimental(test_experimental); + cudf_io::table_with_metadata result = cudf_io::read_json(in_options); EXPECT_EQ(result.tbl->num_columns(), 1); @@ -211,14 +308,21 @@ TEST_F(JsonReaderTest, FloatingPoint) ASSERT_EQ((1u << result.tbl->get_column(0).size()) - 1, bitmask[0]); } -TEST_F(JsonReaderTest, JsonLinesStrings) +TEST_P(JsonReaderParamTest, JsonLinesStrings) { - std::string data = "[1, 1.1, \"aa \"]\n[2, 2.2, \" bbb\"]"; + auto const test_opt = GetParam(); + bool const test_experimental = (test_opt == json_test_t::json_experimental_record_orient); + std::string row_orient = "[1, 1.1, \"aa \"]\n[2, 2.2, \" bbb\"]"; + std::string record_orient = to_records_orient({{{"0", "1"}, {"1", "1.1"}, {"2", R"("aa ")"}}, + {{"0", "2"}, {"1", "2.2"}, {"2", R"(" bbb")"}}}, + "\n"); + std::string data = (test_opt == json_test_t::json_lines_row_orient) ? row_orient : record_orient; cudf_io::json_reader_options in_options = cudf_io::json_reader_options::builder(cudf_io::source_info{data.data(), data.size()}) .dtypes({{"2", dtype()}, {"0", dtype()}, {"1", dtype()}}) - .lines(true); + .lines(true) + .experimental(test_experimental); cudf_io::table_with_metadata result = cudf_io::read_json(in_options); @@ -241,8 +345,12 @@ TEST_F(JsonReaderTest, JsonLinesStrings) cudf::test::strings_column_wrapper({"aa ", " bbb"})); } -TEST_F(JsonReaderTest, MultiColumn) +TEST_P(JsonReaderParamTest, MultiColumn) { + auto const test_opt = GetParam(); + bool const test_experimental = (test_opt == json_test_t::json_experimental_record_orient); + bool const row_orient = (test_opt == json_test_t::json_lines_row_orient); + constexpr auto num_rows = 10; auto int8_values = random_values(num_rows); auto int16_values = random_values(num_rows); @@ -254,10 +362,25 @@ TEST_F(JsonReaderTest, MultiColumn) auto filepath = temp_env->get_temp_dir() + "MultiColumn.json"; { std::ostringstream line; - for (int i = 0; i < num_rows; ++i) { - line << "[" << std::to_string(int8_values[i]) << "," << int16_values[i] << "," - << int32_values[i] << "," << int64_values[i] << "," << float32_values[i] << "," - << float64_values[i] << "]\n"; + if (row_orient) { + for (int i = 0; i < num_rows; ++i) { + line << "[" << std::to_string(int8_values[i]) << "," << int16_values[i] << "," + << int32_values[i] << "," << int64_values[i] << "," << float32_values[i] << "," + << float64_values[i] << "]\n"; + } + } else { + std::vector> records; + for (int i = 0; i < num_rows; ++i) { + records.push_back({ + {"0", std::to_string(int8_values[i])}, // + {"1", std::to_string(int16_values[i])}, // + {"2", std::to_string(int32_values[i])}, // + {"3", std::to_string(int64_values[i])}, // + {"4", std::to_string(float32_values[i])}, // + {"5", std::to_string(float64_values[i])}, // + }); + } + line << to_records_orient(records, "\n"); } std::ofstream outfile(filepath, std::ofstream::out); outfile << line.str(); @@ -271,7 +394,8 @@ TEST_F(JsonReaderTest, MultiColumn) dtype(), dtype(), dtype()}) - .lines(true); + .lines(true) + .experimental(test_experimental); cudf_io::table_with_metadata result = cudf_io::read_json(in_options); auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); @@ -297,18 +421,33 @@ TEST_F(JsonReaderTest, MultiColumn) check_float_column(view.column(5), float64_values, validity); } -TEST_F(JsonReaderTest, Booleans) +TEST_P(JsonReaderParamTest, Booleans) { + auto const test_opt = GetParam(); + bool const test_experimental = (test_opt == json_test_t::json_experimental_record_orient); + std::string row_orient = "[true]\n[true]\n[false]\n[false]\n[true]"; + std::string record_orient = to_records_orient( + { + {{"0", "true"}}, + {{"0", "true"}}, + {{"0", "false"}}, + {{"0", "false"}}, + {{"0", "true"}}, + }, + "\n"); + std::string data = (test_opt == json_test_t::json_lines_row_orient) ? row_orient : record_orient; + auto filepath = temp_env->get_temp_dir() + "Booleans.json"; { std::ofstream outfile(filepath, std::ofstream::out); - outfile << "[true]\n[true]\n[false]\n[false]\n[true]"; + outfile << data; } cudf_io::json_reader_options in_options = cudf_io::json_reader_options::builder(cudf_io::source_info{filepath}) .dtypes({dtype()}) - .lines(true); + .lines(true) + .experimental(test_experimental); cudf_io::table_with_metadata result = cudf_io::read_json(in_options); // Booleans are the same (integer) data type, but valued at 0 or 1 @@ -322,21 +461,39 @@ TEST_F(JsonReaderTest, Booleans) bool_wrapper{{true, true, false, false, true}, validity}); } -TEST_F(JsonReaderTest, Dates) +TEST_P(JsonReaderParamTest, Dates) { + auto const test_opt = GetParam(); + bool const test_experimental = (test_opt == json_test_t::json_experimental_record_orient); + std::string row_orient = + "[05/03/2001]\n[31/10/2010]\n[20/10/1994]\n[18/10/1990]\n[1/1/1970]\n" + "[18/04/1995]\n[14/07/1994]\n[07/06/2006 11:20:30.400]\n" + "[16/09/2005T1:2:30.400PM]\n[2/2/1970]\n"; + std::string record_orient = to_records_orient({{{"0", R"("05/03/2001")"}}, + {{"0", R"("31/10/2010")"}}, + {{"0", R"("20/10/1994")"}}, + {{"0", R"("18/10/1990")"}}, + {{"0", R"("1/1/1970")"}}, + {{"0", R"("18/04/1995")"}}, + {{"0", R"("14/07/1994")"}}, + {{"0", R"("07/06/2006 11:20:30.400")"}}, + {{"0", R"("16/09/2005T1:2:30.400PM")"}}, + {{"0", R"("2/2/1970")"}}}, + "\n"); + std::string data = (test_opt == json_test_t::json_lines_row_orient) ? row_orient : record_orient; + auto filepath = temp_env->get_temp_dir() + "Dates.json"; { std::ofstream outfile(filepath, std::ofstream::out); - outfile << "[05/03/2001]\n[31/10/2010]\n[20/10/1994]\n[18/10/1990]\n[1/1/1970]\n"; - outfile << "[18/04/1995]\n[14/07/1994]\n[07/06/2006 11:20:30.400]\n"; - outfile << "[16/09/2005T1:2:30.400PM]\n[2/2/1970]\n"; + outfile << data; } cudf_io::json_reader_options in_options = cudf_io::json_reader_options::builder(cudf_io::source_info{filepath}) .dtypes({data_type{type_id::TIMESTAMP_MILLISECONDS}}) .lines(true) - .dayfirst(true); + .dayfirst(true) + .experimental(test_experimental); cudf_io::table_with_metadata result = cudf_io::read_json(in_options); const auto view = result.tbl->view(); @@ -359,21 +516,39 @@ TEST_F(JsonReaderTest, Dates) validity}); } -TEST_F(JsonReaderTest, Durations) +TEST_P(JsonReaderParamTest, Durations) { - auto filepath = temp_env->get_temp_dir() + "Durations.json"; + auto const test_opt = GetParam(); + bool const test_experimental = (test_opt == json_test_t::json_experimental_record_orient); + std::string row_orient = + "[-2]\n[-1]\n[0]\n" + "[1 days]\n[0 days 23:01:00]\n[0 days 00:00:00.000000123]\n" + "[0:0:0.000123]\n[0:0:0.000123000]\n[00:00:00.100000001]\n" + "[-2147483648]\n[2147483647]\n"; + std::string record_orient = to_records_orient({{{"0", "-2"}}, + {{"0", "-1"}}, + {{"0", "0"}}, + {{"0", R"("1 days")"}}, + {{"0", R"("0 days 23:01:00")"}}, + {{"0", R"("0 days 00:00:00.000000123")"}}, + {{"0", R"("0:0:0.000123")"}}, + {{"0", R"("0:0:0.000123000")"}}, + {{"0", R"("00:00:00.100000001")"}}, + {{"0", R"(-2147483648)"}}, + {{"0", R"(2147483647)"}}}, + "\n"); + std::string data = (test_opt == json_test_t::json_lines_row_orient) ? row_orient : record_orient; + auto filepath = temp_env->get_temp_dir() + "Durations.json"; { std::ofstream outfile(filepath, std::ofstream::out); - outfile << "[-2]\n[-1]\n[0]\n"; - outfile << "[1 days]\n[0 days 23:01:00]\n[0 days 00:00:00.000000123]\n"; - outfile << "[0:0:0.000123]\n[0:0:0.000123000]\n[00:00:00.100000001]\n"; - outfile << "[-2147483648]\n[2147483647]\n"; + outfile << data; } cudf_io::json_reader_options in_options = cudf_io::json_reader_options::builder(cudf_io::source_info{filepath}) .dtypes({data_type{type_id::DURATION_NANOSECONDS}}) - .lines(true); + .lines(true) + .experimental(test_experimental); cudf_io::table_with_metadata result = cudf_io::read_json(in_options); const auto view = result.tbl->view(); @@ -398,13 +573,20 @@ TEST_F(JsonReaderTest, Durations) validity}); } -TEST_F(JsonReaderTest, JsonLinesDtypeInference) +TEST_P(JsonReaderParamTest, JsonLinesDtypeInference) { - std::string data = "[100, 1.1, \"aa \"]\n[200, 2.2, \" bbb\"]"; + auto const test_opt = GetParam(); + bool const test_experimental = (test_opt == json_test_t::json_experimental_record_orient); + std::string row_orient = "[100, 1.1, \"aa \"]\n[200, 2.2, \" bbb\"]"; + std::string record_orient = to_records_orient({{{"0", "100"}, {"1", "1.1"}, {"2", R"("aa ")"}}, + {{"0", "200"}, {"1", "2.2"}, {"2", R"(" bbb")"}}}, + "\n"); + std::string data = (test_opt == json_test_t::json_lines_row_orient) ? row_orient : record_orient; cudf_io::json_reader_options in_options = cudf_io::json_reader_options::builder(cudf_io::source_info{data.data(), data.size()}) - .lines(true); + .lines(true) + .experimental(test_experimental); cudf_io::table_with_metadata result = cudf_io::read_json(in_options); @@ -427,15 +609,24 @@ TEST_F(JsonReaderTest, JsonLinesDtypeInference) cudf::test::strings_column_wrapper({"aa ", " bbb"})); } -TEST_F(JsonReaderTest, JsonLinesFileInput) +TEST_P(JsonReaderParamTest, JsonLinesFileInput) { + auto const test_opt = GetParam(); + bool const test_experimental = (test_opt == json_test_t::json_experimental_record_orient); + std::string row_orient = "[11, 1.1]\n[22, 2.2]"; + std::string record_orient = + to_records_orient({{{"0", "11"}, {"1", "1.1"}}, {{"0", "22"}, {"1", "2.2"}}}, "\n"); + std::string data = (test_opt == json_test_t::json_lines_row_orient) ? row_orient : record_orient; + const std::string fname = temp_env->get_temp_dir() + "JsonLinesFileTest.json"; std::ofstream outfile(fname, std::ofstream::out); - outfile << "[11, 1.1]\n[22, 2.2]"; + outfile << data; outfile.close(); cudf_io::json_reader_options in_options = - cudf_io::json_reader_options::builder(cudf_io::source_info{fname}).lines(true); + cudf_io::json_reader_options::builder(cudf_io::source_info{fname}) + .lines(true) + .experimental(test_experimental); cudf_io::table_with_metadata result = cudf_io::read_json(in_options); @@ -481,15 +672,19 @@ TEST_F(JsonReaderTest, JsonLinesByteRange) int64_wrapper{{3000, 4000, 5000}, validity}); } -TEST_F(JsonReaderTest, JsonLinesObjects) +TEST_P(JsonReaderDualTest, JsonLinesObjects) { - const std::string fname = temp_env->get_temp_dir() + "JsonLinesObjectsTest.json"; + auto const test_opt = GetParam(); + bool const test_experimental = (test_opt == json_test_t::json_experimental_record_orient); + const std::string fname = temp_env->get_temp_dir() + "JsonLinesObjectsTest.json"; std::ofstream outfile(fname, std::ofstream::out); outfile << " {\"co\\\"l1\" : 1, \"col2\" : 2.0} \n"; outfile.close(); cudf_io::json_reader_options in_options = - cudf_io::json_reader_options::builder(cudf_io::source_info{fname}).lines(true); + cudf_io::json_reader_options::builder(cudf_io::source_info{fname}) + .lines(true) + .experimental(test_experimental); cudf_io::table_with_metadata result = cudf_io::read_json(in_options); @@ -507,12 +702,15 @@ TEST_F(JsonReaderTest, JsonLinesObjects) CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(1), float64_wrapper{{2.0}, validity}); } -TEST_F(JsonReaderTest, JsonLinesObjectsStrings) +TEST_P(JsonReaderDualTest, JsonLinesObjectsStrings) { - auto test_json_objects = [](std::string const& data) { + auto const test_opt = GetParam(); + bool const test_experimental = (test_opt == json_test_t::json_experimental_record_orient); + auto test_json_objects = [test_experimental](std::string const& data) { cudf_io::json_reader_options in_options = cudf_io::json_reader_options::builder(cudf_io::source_info{data.data(), data.size()}) - .lines(true); + .lines(true) + .experimental(test_experimental); cudf_io::table_with_metadata result = cudf_io::read_json(in_options); @@ -545,15 +743,18 @@ TEST_F(JsonReaderTest, JsonLinesObjectsStrings) "{\"col3\":\"bbb\", \"col1\":200, \"col2\":2.2}\n"); } -TEST_F(JsonReaderTest, JsonLinesObjectsMissingData) +TEST_P(JsonReaderDualTest, JsonLinesObjectsMissingData) { + auto const test_opt = GetParam(); + bool const test_experimental = (test_opt == json_test_t::json_experimental_record_orient); // Note: columns will be ordered based on which fields appear first std::string const data = "{ \"col2\":1.1, \"col3\":\"aaa\"}\n" "{\"col1\":200, \"col3\":\"bbb\"}\n"; cudf_io::json_reader_options in_options = cudf_io::json_reader_options::builder(cudf_io::source_info{data.data(), data.size()}) - .lines(true); + .lines(true) + .experimental(test_experimental); cudf_io::table_with_metadata result = cudf_io::read_json(in_options); @@ -581,15 +782,18 @@ TEST_F(JsonReaderTest, JsonLinesObjectsMissingData) cudf::test::strings_column_wrapper({"aaa", "bbb"})); } -TEST_F(JsonReaderTest, JsonLinesObjectsOutOfOrder) +TEST_P(JsonReaderDualTest, JsonLinesObjectsOutOfOrder) { + auto const test_opt = GetParam(); + bool const test_experimental = (test_opt == json_test_t::json_experimental_record_orient); std::string const data = "{\"col1\":100, \"col2\":1.1, \"col3\":\"aaa\"}\n" "{\"col3\":\"bbb\", \"col1\":200, \"col2\":2.2}\n"; cudf_io::json_reader_options in_options = cudf_io::json_reader_options::builder(cudf_io::source_info{data.data(), data.size()}) - .lines(true); + .lines(true) + .experimental(test_experimental); cudf_io::table_with_metadata result = cudf_io::read_json(in_options); @@ -675,18 +879,31 @@ TEST_F(JsonReaderTest, ArrowFileSource) int8_wrapper{{9, 8, 7, 6, 5, 4, 3, 2}, validity}); } -TEST_F(JsonReaderTest, InvalidFloatingPoint) +TEST_P(JsonReaderParamTest, InvalidFloatingPoint) { + auto const test_opt = GetParam(); + bool const test_experimental = (test_opt == json_test_t::json_experimental_record_orient); + std::string row_orient = "[1.2e1+]\n[3.4e2-]\n[5.6e3e]\n[7.8e3A]\n[9.0Be1]\n[1C.2]"; + std::string record_orient = to_records_orient({{{"0", "1.2e1+"}}, + {{"0", "3.4e2-"}}, + {{"0", "5.6e3e"}}, + {{"0", "7.8e3A"}}, + {{"0", "9.0Be1"}}, + {{"0", "1C.2"}}}, + "\n"); + std::string data = (test_opt == json_test_t::json_lines_row_orient) ? row_orient : record_orient; + const auto filepath = temp_env->get_temp_dir() + "InvalidFloatingPoint.json"; { std::ofstream outfile(filepath, std::ofstream::out); - outfile << "[1.2e1+]\n[3.4e2-]\n[5.6e3e]\n[7.8e3A]\n[9.0Be1]\n[1C.2]"; + outfile << data; } cudf_io::json_reader_options in_options = cudf_io::json_reader_options::builder(cudf_io::source_info{filepath}) .dtypes({dtype()}) - .lines(true); + .lines(true) + .experimental(test_experimental); cudf_io::table_with_metadata result = cudf_io::read_json(in_options); EXPECT_EQ(result.tbl->num_columns(), 1); @@ -700,20 +917,30 @@ TEST_F(JsonReaderTest, InvalidFloatingPoint) ASSERT_EQ(0u, col_data.second[0]); } -TEST_F(JsonReaderTest, StringInference) +TEST_P(JsonReaderParamTest, StringInference) { - std::string buffer = "[\"-1\"]"; + auto const test_opt = GetParam(); + bool const test_experimental = (test_opt == json_test_t::json_experimental_record_orient); + std::string row_orient = "[\"-1\"]"; + std::string record_orient = to_records_orient({{{"0", R"("-1")"}}}, "\n"); + std::string data = (test_opt == json_test_t::json_lines_row_orient) ? row_orient : record_orient; + cudf_io::json_reader_options in_options = - cudf_io::json_reader_options::builder(cudf_io::source_info{buffer.c_str(), buffer.size()}) - .lines(true); + cudf_io::json_reader_options::builder(cudf_io::source_info{data.c_str(), data.size()}) + .lines(true) + .experimental(test_experimental); cudf_io::table_with_metadata result = cudf_io::read_json(in_options); EXPECT_EQ(result.tbl->num_columns(), 1); EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::STRING); } -TEST_F(JsonReaderTest, ParseInRangeIntegers) +TEST_P(JsonReaderParamTest, ParseInRangeIntegers) { + auto const test_opt = GetParam(); + bool const test_experimental = (test_opt == json_test_t::json_experimental_record_orient); + bool const row_orient = (test_opt == json_test_t::json_lines_row_orient); + constexpr auto num_rows = 4; std::vector small_int = {0, -10, 20, -30}; std::vector less_equal_int64_max = {std::numeric_limits::max() - 3, @@ -751,19 +978,41 @@ TEST_F(JsonReaderTest, ParseInRangeIntegers) auto filepath = temp_env->get_temp_dir() + "ParseInRangeIntegers.json"; { std::ostringstream line; - for (int i = 0; i < num_rows; ++i) { - line << "[" << small_int[i] << "," << less_equal_int64_max[i] << "," - << greater_equal_int64_min[i] << "," << greater_int64_max[i] << "," - << less_equal_uint64_max[i] << "," << small_int_append_zeros[i] << "," - << less_equal_int64_max_append_zeros[i] << "," << greater_equal_int64_min_append_zeros[i] - << "," << greater_int64_max_append_zeros[i] << "," - << less_equal_uint64_max_append_zeros[i] << "]\n"; + if (row_orient) { + for (int i = 0; i < num_rows; ++i) { + line << "[" << small_int[i] << "," << less_equal_int64_max[i] << "," + << greater_equal_int64_min[i] << "," << greater_int64_max[i] << "," + << less_equal_uint64_max[i] << "," << small_int_append_zeros[i] << "," + << less_equal_int64_max_append_zeros[i] << "," + << greater_equal_int64_min_append_zeros[i] << "," << greater_int64_max_append_zeros[i] + << "," << less_equal_uint64_max_append_zeros[i] << "]\n"; + } + } else { + std::vector> records; + for (int i = 0; i < num_rows; ++i) { + records.push_back({ + {"0", std::to_string(small_int[i])}, // + {"1", std::to_string(less_equal_int64_max[i])}, // + {"2", std::to_string(greater_equal_int64_min[i])}, // + {"3", std::to_string(greater_int64_max[i])}, // + {"4", std::to_string(less_equal_uint64_max[i])}, // + {"5", small_int_append_zeros[i]}, // + {"6", less_equal_int64_max_append_zeros[i]}, // + {"7", greater_equal_int64_min_append_zeros[i]}, // + {"8", greater_int64_max_append_zeros[i]}, // + {"9", less_equal_uint64_max_append_zeros[i]}, // + }); + } + line << to_records_orient(records, "\n"); } + std::ofstream outfile(filepath, std::ofstream::out); outfile << line.str(); } cudf_io::json_reader_options in_options = - cudf_io::json_reader_options::builder(cudf_io::source_info{filepath}).lines(true); + cudf_io::json_reader_options::builder(cudf_io::source_info{filepath}) + .lines(true) + .experimental(test_experimental); cudf_io::table_with_metadata result = cudf_io::read_json(in_options); @@ -782,8 +1031,12 @@ TEST_F(JsonReaderTest, ParseInRangeIntegers) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(input_less_equal_uint64_max, view.column(9)); } -TEST_F(JsonReaderTest, ParseOutOfRangeIntegers) +TEST_P(JsonReaderParamTest, ParseOutOfRangeIntegers) { + auto const test_opt = GetParam(); + bool const test_experimental = (test_opt == json_test_t::json_experimental_record_orient); + bool const row_orient = (test_opt == json_test_t::json_lines_row_orient); + constexpr auto num_rows = 4; std::vector out_of_range_positive = {"111111111111111111111", "2222222222222222222222", @@ -830,18 +1083,41 @@ TEST_F(JsonReaderTest, ParseOutOfRangeIntegers) auto filepath = temp_env->get_temp_dir() + "ParseOutOfRangeIntegers.json"; { std::ostringstream line; - for (int i = 0; i < num_rows; ++i) { - line << "[" << out_of_range_positive[i] << "," << out_of_range_negative[i] << "," - << greater_uint64_max[i] << "," << less_int64_min[i] << "," << mixed_range[i] << "," - << out_of_range_positive_append_zeros[i] << "," << out_of_range_negative_append_zeros[i] - << "," << greater_uint64_max_append_zeros[i] << "," << less_int64_min_append_zeros[i] - << "," << mixed_range_append_zeros[i] << "]\n"; + if (row_orient) { + for (int i = 0; i < num_rows; ++i) { + line << "[" << out_of_range_positive[i] << "," << out_of_range_negative[i] << "," + << greater_uint64_max[i] << "," << less_int64_min[i] << "," << mixed_range[i] << "," + << out_of_range_positive_append_zeros[i] << "," + << out_of_range_negative_append_zeros[i] << "," << greater_uint64_max_append_zeros[i] + << "," << less_int64_min_append_zeros[i] << "," << mixed_range_append_zeros[i] + << "]\n"; + } + } else { + std::vector> records; + for (int i = 0; i < num_rows; ++i) { + records.push_back({ + {"0", out_of_range_positive[i]}, // + {"1", out_of_range_negative[i]}, // + {"2", greater_uint64_max[i]}, // + {"3", less_int64_min[i]}, // + {"4", mixed_range[i]}, // + {"5", out_of_range_positive_append_zeros[i]}, // + {"6", out_of_range_negative_append_zeros[i]}, // + {"7", greater_uint64_max_append_zeros[i]}, // + {"8", less_int64_min_append_zeros[i]}, // + {"9", mixed_range_append_zeros[i]}, // + }); + } + line << to_records_orient(records, "\n"); } + std::ofstream outfile(filepath, std::ofstream::out); outfile << line.str(); } cudf_io::json_reader_options in_options = - cudf_io::json_reader_options::builder(cudf_io::source_info{filepath}).lines(true); + cudf_io::json_reader_options::builder(cudf_io::source_info{filepath}) + .lines(true) + .experimental(test_experimental); cudf_io::table_with_metadata result = cudf_io::read_json(in_options); @@ -859,20 +1135,30 @@ TEST_F(JsonReaderTest, ParseOutOfRangeIntegers) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(input_mixed_range_append, view.column(9)); } -TEST_F(JsonReaderTest, JsonLinesMultipleFileInputs) +TEST_P(JsonReaderParamTest, JsonLinesMultipleFileInputs) { + auto const test_opt = GetParam(); + bool const test_experimental = (test_opt == json_test_t::json_experimental_record_orient); + std::vector row_orient{"[11, 1.1]\n[22, 2.2]\n", "[33, 3.3]\n[44, 4.4]"}; + std::vector record_orient{ + to_records_orient({{{"0", "11"}, {"1", "1.1"}}, {{"0", "22"}, {"1", "2.2"}}}, "\n") + "\n", + to_records_orient({{{"0", "33"}, {"1", "3.3"}}, {{"0", "44"}, {"1", "4.4"}}}, "\n") + "\n"}; + auto const& data = (test_opt == json_test_t::json_lines_row_orient) ? row_orient : record_orient; + const std::string file1 = temp_env->get_temp_dir() + "JsonLinesFileTest1.json"; std::ofstream outfile(file1, std::ofstream::out); - outfile << "[11, 1.1]\n[22, 2.2]\n"; + outfile << data[0]; outfile.close(); const std::string file2 = temp_env->get_temp_dir() + "JsonLinesFileTest2.json"; std::ofstream outfile2(file2, std::ofstream::out); - outfile2 << "[33, 3.3]\n[44, 4.4]"; + outfile2 << data[1]; outfile2.close(); cudf_io::json_reader_options in_options = - cudf_io::json_reader_options::builder(cudf_io::source_info{{file1, file2}}).lines(true); + cudf_io::json_reader_options::builder(cudf_io::source_info{{file1, file2}}) + .lines(true) + .experimental(test_experimental); cudf_io::table_with_metadata result = cudf_io::read_json(in_options); @@ -1000,8 +1286,8 @@ TEST_F(JsonReaderTest, ExperimentalLinesNoOmissions) json_lines_options.enable_experimental(true); cudf::io::table_with_metadata new_reader_table = cudf::io::read_json(json_lines_options); - // Verify that the data read via non-nested JSON lines reader matches the data read via nested - // JSON reader + // Verify that the data read via non-nested JSON lines reader matches the data read via + // nested JSON reader CUDF_TEST_EXPECT_TABLES_EQUAL(current_reader_table.tbl->view(), new_reader_table.tbl->view()); } } @@ -1069,4 +1355,99 @@ TEST_F(JsonReaderTest, TestColumnOrder) CUDF_TEST_EXPECT_COLUMNS_EQUAL(col_a1, col_a.child(2)); } +TEST_P(JsonReaderParamTest, JsonDtypeSchema) +{ + auto const test_opt = GetParam(); + bool const test_experimental = (test_opt == json_test_t::json_experimental_record_orient); + std::string row_orient = "[1, 1.1, \"aa \"]\n[2, 2.2, \" bbb\"]"; + std::string record_orient = to_records_orient({{{"0", "1"}, {"1", "1.1"}, {"2", R"("aa ")"}}, + {{"0", "2"}, {"1", "2.2"}, {"2", R"(" bbb")"}}}, + "\n"); + + std::string data = (test_opt == json_test_t::json_lines_row_orient) ? row_orient : record_orient; + + std::map dtype_schema{ + {"2", {dtype()}}, {"0", {dtype()}}, {"1", {dtype()}}}; + cudf_io::json_reader_options in_options = + cudf_io::json_reader_options::builder(cudf_io::source_info{data.data(), data.size()}) + .dtypes(dtype_schema) + .lines(true) + .experimental(test_experimental); + + cudf_io::table_with_metadata result = cudf_io::read_json(in_options); + + EXPECT_EQ(result.tbl->num_columns(), 3); + EXPECT_EQ(result.tbl->num_rows(), 2); + + EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::INT32); + EXPECT_EQ(result.tbl->get_column(1).type().id(), cudf::type_id::FLOAT64); + EXPECT_EQ(result.tbl->get_column(2).type().id(), cudf::type_id::STRING); + + EXPECT_EQ(result.metadata.schema_info[0].name, "0"); + EXPECT_EQ(result.metadata.schema_info[1].name, "1"); + EXPECT_EQ(result.metadata.schema_info[2].name, "2"); + + auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), int_wrapper{{1, 2}, validity}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(1), float64_wrapper{{1.1, 2.2}, validity}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(2), + cudf::test::strings_column_wrapper({"aa ", " bbb"})); +} + +TEST_F(JsonReaderTest, JsonNestedDtypeSchema) +{ + std::string json_string = R"( [{"a":[123, {"0": 123}], "b":1.0}, {"b":1.1}, {"b":2.1}])"; + + std::map dtype_schema{ + {"a", + { + data_type{cudf::type_id::LIST}, + {{"element", {data_type{cudf::type_id::STRUCT}, {{"0", {dtype()}}}}}}, + }}, + {"b", {dtype()}}, + }; + + cudf_io::json_reader_options in_options = + cudf_io::json_reader_options::builder( + cudf_io::source_info{json_string.data(), json_string.size()}) + .dtypes(dtype_schema) + .lines(false) + .experimental(true); + + cudf_io::table_with_metadata result = cudf_io::read_json(in_options); + + // Make sure we have columns "a" and "b" + ASSERT_EQ(result.tbl->num_columns(), 2); + ASSERT_EQ(result.metadata.schema_info.size(), 2); + EXPECT_EQ(result.metadata.schema_info[0].name, "a"); + EXPECT_EQ(result.metadata.schema_info[1].name, "b"); + // Make sure column "a" is a list column (offsets and elements) + ASSERT_EQ(result.tbl->get_column(0).num_children(), 2); + ASSERT_EQ(result.metadata.schema_info[0].children.size(), 2); + // Make sure column "b" is a leaf column + ASSERT_EQ(result.tbl->get_column(1).num_children(), 0); + ASSERT_EQ(result.metadata.schema_info[1].children.size(), 0); + // Offsets child with no other child columns + ASSERT_EQ(result.tbl->get_column(0).child(0).num_children(), 0); + ASSERT_EQ(result.metadata.schema_info[0].children[0].children.size(), 0); + EXPECT_EQ(result.metadata.schema_info[0].children[0].name, "offsets"); + // Elements is the struct column with a single child column "0" + ASSERT_EQ(result.tbl->get_column(0).child(1).num_children(), 1); + ASSERT_EQ(result.metadata.schema_info[0].children[1].children.size(), 1); + EXPECT_EQ(result.metadata.schema_info[0].children[1].name, "element"); + + // Verify column "a" being a list column + EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::LIST); + // Verify column "a->element->0" is a float column + EXPECT_EQ(result.tbl->get_column(0).child(1).child(0).type().id(), cudf::type_id::FLOAT32); + // Verify column "b" is an int column + EXPECT_EQ(result.tbl->get_column(1).type().id(), cudf::type_id::INT32); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0).child(1).child(0), + float_wrapper{{0.0, 123.0}, {false, true}}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(1), + int_wrapper{{1, 1, 2}, {true, true, true}}); +} + CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/io/nested_json_test.cpp b/cpp/tests/io/nested_json_test.cpp index 65d21367bf4..bcfde4eedeb 100644 --- a/cpp/tests/io/nested_json_test.cpp +++ b/cpp/tests/io/nested_json_test.cpp @@ -470,8 +470,10 @@ TEST_F(JsonTest, ExtractColumn) auto const second_column_index = 1; EXPECT_EQ(cudf_table.tbl->num_columns(), expected_col_count); - auto expected_col1 = cudf::test::fixed_width_column_wrapper({0.0, 0.1, 0.2}); - auto expected_col2 = cudf::test::fixed_width_column_wrapper({1.0, 1.1, 1.2}); + auto expected_col1 = + cudf::test::fixed_width_column_wrapper({0.0, 0.1, 0.2}, {true, true, true}); + auto expected_col2 = + cudf::test::fixed_width_column_wrapper({1.0, 1.1, 1.2}, {true, true, true}); cudf::column_view parsed_col1 = cudf_table.tbl->get_column(first_column_index); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_col1, parsed_col1); cudf::column_view parsed_col2 = cudf_table.tbl->get_column(second_column_index); @@ -540,7 +542,8 @@ TEST_F(JsonTest, ExtractColumnWithQuotes) auto expected_col1 = cudf::test::strings_column_wrapper({R"("0.0")", R"()", R"("2.0")"}, {true, false, true}); - auto expected_col2 = cudf::test::fixed_width_column_wrapper({1.0, 1.1, 2.1}); + auto expected_col2 = + cudf::test::fixed_width_column_wrapper({1.0, 1.1, 2.1}, {true, true, true}); cudf::column_view parsed_col1 = cudf_table.tbl->get_column(0); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_col1, parsed_col1); cudf::column_view parsed_col2 = cudf_table.tbl->get_column(1);