From 38b0ad8359f892113874f1369765f2e65a0ee7a7 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Tue, 11 Oct 2022 13:49:53 +0530 Subject: [PATCH 1/5] fix empty list columns in cudf column creation (full gpu) --- cpp/src/io/json/json_column.cu | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index d54bb5c8ea9..a7a85f63068 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -689,19 +689,24 @@ std::pair, std::vector> device_json_co size_type num_rows = json_col.child_offsets.size() - 1; std::vector column_names{}; column_names.emplace_back("offsets"); - column_names.emplace_back(json_col.child_columns.begin()->first); + column_names.emplace_back( + json_col.child_columns.empty() ? "element" : json_col.child_columns.begin()->first); // Note: json_col modified here, reuse the memory auto offsets_column = std::make_unique( data_type{type_id::INT32}, num_rows + 1, json_col.child_offsets.release()); // Create children column auto [child_column, names] = - device_json_column_to_cudf_column(json_col.child_columns.begin()->second, - d_input, - options, - get_child_schema(json_col.child_columns.begin()->first), - stream, - mr); + json_col.child_columns.empty() + ? std::pair, + std::vector>{std::make_unique(), {}} + : device_json_column_to_cudf_column( + json_col.child_columns.begin()->second, + d_input, + options, + get_child_schema(json_col.child_columns.begin()->first), + stream, + mr); column_names.back().children = names; auto [result_bitmask, null_count] = make_validity(json_col); return {make_lists_column(num_rows, From 8f3aaa4903a8243dbae027142144120df53d9f41 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Tue, 11 Oct 2022 13:50:43 +0530 Subject: [PATCH 2/5] fix empty list columns in cudf column creation (partial gpu) --- cpp/src/io/json/nested_json_gpu.cu | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 5d60a564b9b..7c33e27ec9f 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -1680,7 +1680,8 @@ std::pair, std::vector> json_column_to size_type num_rows = json_col.child_offsets.size(); std::vector column_names{}; column_names.emplace_back("offsets"); - column_names.emplace_back(json_col.child_columns.begin()->first); + column_names.emplace_back( + json_col.child_columns.empty() ? "element" : json_col.child_columns.begin()->first); rmm::device_uvector d_offsets = cudf::detail::make_device_uvector_async(json_col.child_offsets, stream, mr); @@ -1688,12 +1689,15 @@ std::pair, std::vector> json_column_to std::make_unique(data_type{type_id::INT32}, num_rows, d_offsets.release()); // Create children column auto [child_column, names] = - json_column_to_cudf_column(json_col.child_columns.begin()->second, - d_input, - options, - get_child_schema(json_col.child_columns.begin()->first), - stream, - mr); + json_col.child_columns.empty() + ? std::pair, + std::vector>{std::make_unique(), {}} + : json_column_to_cudf_column(json_col.child_columns.begin()->second, + d_input, + options, + get_child_schema(json_col.child_columns.begin()->first), + stream, + mr); column_names.back().children = names; auto [result_bitmask, null_count] = make_validity(json_col); return {make_lists_column(num_rows - 1, From bbb0474ea8058c11f59d99e033e526fbb5f6c65d Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Tue, 11 Oct 2022 13:51:27 +0530 Subject: [PATCH 3/5] add empty list, struct columns in cuDF json pytest --- python/cudf/cudf/tests/test_json.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py index 1fdef44546a..fb2c24b3757 100644 --- a/python/cudf/cudf/tests/test_json.py +++ b/python/cudf/cudf/tests/test_json.py @@ -649,6 +649,24 @@ def test_json_nested_data(): assert df.to_arrow().equals(pa_table_pdf) +def test_json_empty_types(): + json_str = """ {} + {"a": [], "b": {}} + {"a": []} + {"b": {}} + {"c": {"d": []}} + {"e": [{}]} + """ + df = cudf.read_json( + StringIO(json_str), + engine="cudf_experimental", + orient="records", + lines=True, + ) + pdf = pd.read_json(StringIO(json_str), orient="records", lines=True) + assert_eq(df, pdf) + + def test_json_types_data(): # 0:<0:string,1:float> # 1:list From 819de7b932dbe08cc8dace88ccc0f57011a5942c Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Tue, 11 Oct 2022 13:52:22 +0530 Subject: [PATCH 4/5] add empty list, struct unit test in json tree traversal enable empty input test in nested json reader --- cpp/tests/io/json_test.cpp | 11 +++++++---- cpp/tests/io/json_tree.cpp | 6 +++++- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index d7ab881861a..b8cd4622484 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -813,7 +813,6 @@ TEST_P(JsonReaderDualTest, JsonLinesObjectsOutOfOrder) cudf::test::strings_column_wrapper({"aaa", "bbb"})); } -/* // currently, the json reader is strict about having non-empty input. TEST_F(JsonReaderTest, EmptyFile) { @@ -824,7 +823,9 @@ TEST_F(JsonReaderTest, EmptyFile) } cudf::io::json_reader_options in_options = - cudf::io::json_reader_options::builder(cudf::io::source_info{filepath}).lines(true); + cudf::io::json_reader_options::builder(cudf::io::source_info{filepath}) + .lines(true) + .experimental(true); auto result = cudf::io::read_json(in_options); const auto view = result.tbl->view(); @@ -832,6 +833,7 @@ TEST_F(JsonReaderTest, EmptyFile) } // currently, the json reader is strict about having non-empty input. +// experimental reader supports empty input TEST_F(JsonReaderTest, NoDataFile) { auto filepath = temp_env->get_temp_dir() + "NoDataFile.csv"; @@ -841,13 +843,14 @@ TEST_F(JsonReaderTest, NoDataFile) } cudf::io::json_reader_options in_options = - cudf::io::json_reader_options::builder(cudf::io::source_info{filepath}).lines(true); + cudf::io::json_reader_options::builder(cudf::io::source_info{filepath}) + .lines(true) + .experimental(true); cudf::io::table_with_metadata result = cudf::io::read_json(in_options); const auto view = result.tbl->view(); EXPECT_EQ(0, view.num_columns()); } -*/ TEST_F(JsonReaderTest, ArrowFileSource) { diff --git a/cpp/tests/io/json_tree.cpp b/cpp/tests/io/json_tree.cpp index 3d024fe8af8..6f7e28a2ca3 100644 --- a/cpp/tests/io/json_tree.cpp +++ b/cpp/tests/io/json_tree.cpp @@ -773,7 +773,11 @@ std::vector json_lines_list = { { "a": { "y" : 6, "z": [] }} { "a": { "y" : 6, "z": [2, 3, 4, 5] }} { "a": { "z": [4], "y" : 6 }} - { "a" : { "x" : 8, "y": 9 }, "b" : {"x": 10 , "z": 11 }} )"}; + { "a" : { "x" : 8, "y": 9 }, "b" : {"x": 10 , "z": 11 }} )", + // empty list, row. + R"( {"a" : [], "b" : {}} + {"a" : []} + {"b" : {}})"}; INSTANTIATE_TEST_SUITE_P(Mixed_And_Records, JsonTreeTraversalTest, ::testing::Combine(::testing::Values(false), From 61ae5af12a5d5eacd4a0129fa229231555773fd2 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Thu, 13 Oct 2022 06:20:14 +0530 Subject: [PATCH 5/5] move list_child_name = "element" as global constexpr placeholder --- cpp/src/io/json/json_column.cu | 4 ++-- cpp/src/io/json/nested_json.hpp | 3 +++ cpp/src/io/json/nested_json_gpu.cu | 7 ++----- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index a7a85f63068..872e742a5af 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -403,7 +403,7 @@ void make_device_json_column(device_span input, std::string name = ""; auto parent_col_id = column_parent_ids[this_col_id]; if (parent_col_id == parent_node_sentinel || column_categories[parent_col_id] == NC_LIST) { - name = "element"; + name = list_child_name; } else if (column_categories[parent_col_id] == NC_FN) { auto field_name_col_id = parent_col_id; parent_col_id = column_parent_ids[parent_col_id]; @@ -690,7 +690,7 @@ std::pair, std::vector> device_json_co std::vector column_names{}; column_names.emplace_back("offsets"); column_names.emplace_back( - json_col.child_columns.empty() ? "element" : json_col.child_columns.begin()->first); + json_col.child_columns.empty() ? list_child_name : json_col.child_columns.begin()->first); // Note: json_col modified here, reuse the memory auto offsets_column = std::make_unique( diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index 10d209b2ea6..8a0f3566d58 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -104,6 +104,9 @@ enum node_t : NodeT { */ enum class json_col_t : char { ListColumn, StructColumn, StringColumn, Unknown }; +// Default name for a list's child column +constexpr auto list_child_name{"element"}; + /** * @brief Intermediate representation of data from a nested JSON input */ diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 7c33e27ec9f..29a29a1f9d5 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -1162,9 +1162,6 @@ void make_json_column(json_column& root_column, // Range of encapsulating function that parses to internal columnar data representation CUDF_FUNC_RANGE(); - // Default name for a list's child column - std::string const list_child_name = "element"; - // Parse the JSON and get the token stream const auto [d_tokens_gpu, d_token_indices_gpu] = get_token_stream(d_input, options, stream, mr); @@ -1286,7 +1283,7 @@ void make_json_column(json_column& root_column, * (b) a list, the selected child column corresponds to single child column of * the list column. In this case, the child column may not exist yet. */ - auto get_selected_column = [&list_child_name](std::stack& current_data_path) { + auto get_selected_column = [](std::stack& current_data_path) { json_column* selected_col = current_data_path.top().current_selected_col; // If the node does not have a selected column yet @@ -1681,7 +1678,7 @@ std::pair, std::vector> json_column_to std::vector column_names{}; column_names.emplace_back("offsets"); column_names.emplace_back( - json_col.child_columns.empty() ? "element" : json_col.child_columns.begin()->first); + json_col.child_columns.empty() ? list_child_name : json_col.child_columns.begin()->first); rmm::device_uvector d_offsets = cudf::detail::make_device_uvector_async(json_col.child_offsets, stream, mr);