diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index 17fa7abdffe..e5e21e054a6 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -567,22 +567,22 @@ void make_device_json_column(device_span input, thrust::uninitialized_fill(rmm::exec_policy_nosync(stream), v.begin(), v.end(), 0); }; - auto initialize_json_columns = [&](auto i, auto& col) { - if (column_categories[i] == NC_ERR || column_categories[i] == NC_FN) { + auto initialize_json_columns = [&](auto i, auto& col, auto column_category) { + if (column_category == NC_ERR || column_category == NC_FN) { return; - } else if (column_categories[i] == NC_VAL || column_categories[i] == NC_STR) { + } else if (column_category == NC_VAL || column_category == NC_STR) { col.string_offsets.resize(max_row_offsets[i] + 1, stream); col.string_lengths.resize(max_row_offsets[i] + 1, stream); init_to_zero(col.string_offsets); init_to_zero(col.string_lengths); - } else if (column_categories[i] == NC_LIST) { + } else if (column_category == NC_LIST) { col.child_offsets.resize(max_row_offsets[i] + 2, stream); init_to_zero(col.child_offsets); } col.num_rows = max_row_offsets[i] + 1; col.validity = cudf::detail::create_null_mask(col.num_rows, cudf::mask_state::ALL_NULL, stream, mr); - col.type = to_json_col_type(column_categories[i]); + col.type = to_json_col_type(column_category); }; auto reinitialize_as_string = [&](auto i, auto& col) { @@ -764,21 +764,23 @@ void make_device_json_column(device_span input, } } + auto this_column_category = column_categories[this_col_id]; if (is_enabled_mixed_types_as_string) { - // get path of this column, check if it is a struct forced as string, and enforce it + // get path of this column, check if it is a struct/list forced as string, and enforce it auto const nt = tree_path.get_path(this_col_id); std::optional const user_dtype = get_path_data_type(nt, options); - if (column_categories[this_col_id] == NC_STRUCT and user_dtype.has_value() and - user_dtype.value().id() == type_id::STRING) { + if ((column_categories[this_col_id] == NC_STRUCT or + column_categories[this_col_id] == NC_LIST) and + user_dtype.has_value() and user_dtype.value().id() == type_id::STRING) { is_mixed_type_column[this_col_id] = 1; - column_categories[this_col_id] = NC_STR; + this_column_category = NC_STR; } } CUDF_EXPECTS(parent_col.child_columns.count(name) == 0, "duplicate column name: " + name); // move into parent device_json_column col(stream, mr); - initialize_json_columns(this_col_id, col); + initialize_json_columns(this_col_id, col, this_column_category); auto inserted = parent_col.child_columns.try_emplace(name, std::move(col)).second; CUDF_EXPECTS(inserted, "child column insertion failed, duplicate column name in the parent"); if (not replaced) parent_col.column_order.push_back(name); diff --git a/cpp/tests/io/json/json_test.cpp b/cpp/tests/io/json/json_test.cpp index 993ab82f423..0a485e26b71 100644 --- a/cpp/tests/io/json/json_test.cpp +++ b/cpp/tests/io/json/json_test.cpp @@ -2351,7 +2351,7 @@ TEST_F(JsonReaderTest, MapTypes) // Testing function for mixed types in JSON (for spark json reader) auto test_fn = [](std::string_view json_string, bool lines, std::vector types) { std::map dtype_schema{ - {"foo1", {data_type{type_id::STRING}}}, // list won't be a string + {"foo1", {data_type{type_id::STRING}}}, // list forced as a string {"foo2", {data_type{type_id::STRING}}}, // struct forced as a string {"1", {data_type{type_id::STRING}}}, {"2", {data_type{type_id::STRING}}}, @@ -2378,17 +2378,17 @@ TEST_F(JsonReaderTest, MapTypes) test_fn(R"([{ "foo1": [1,2,3], "bar": 123 }, { "foo2": { "a": 1 }, "bar": 456 }])", false, - {type_id::LIST, type_id::INT32, type_id::STRING}); + {type_id::STRING, type_id::INT32, type_id::STRING}); // jsonl test_fn(R"( { "foo1": [1,2,3], "bar": 123 } { "foo2": { "a": 1 }, "bar": 456 })", true, - {type_id::LIST, type_id::INT32, type_id::STRING}); + {type_id::STRING, type_id::INT32, type_id::STRING}); // jsonl-array test_fn(R"([123, [1,2,3]] [456, null, { "a": 1 }])", true, - {type_id::INT64, type_id::LIST, type_id::STRING}); + {type_id::INT64, type_id::STRING, type_id::STRING}); // json-array test_fn(R"([[[1,2,3], null, 123], [null, { "a": 1 }, 456 ]])", @@ -2678,38 +2678,81 @@ TEST_F(JsonReaderTest, JsonNestedDtypeFilter) TEST_F(JsonReaderTest, JSONMixedTypeChildren) { - std::string const json_str = R"( -{ "Root": { "Key": [ { "EE": "A" } ] } } -{ "Root": { "Key": { } } } -{ "Root": { "Key": [{ "YY": 1}] } } -)"; - // Column "EE" is created and destroyed - // Column "YY" should not be created - - cudf::io::json_reader_options options = - cudf::io::json_reader_options::builder(cudf::io::source_info{json_str.c_str(), json_str.size()}) - .lines(true) - .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL) - .normalize_single_quotes(true) - .normalize_whitespace(false) - .mixed_types_as_string(true) - .keep_quotes(true); - - auto result = cudf::io::read_json(options); + // struct mixed. + { + std::string const json_str = R"( + { "Root": { "Key": [ { "EE": "A" } ] } } + { "Root": { "Key": { } } } + { "Root": { "Key": [{ "YY": 1}] } } + )"; + // Column "EE" is created and destroyed + // Column "YY" should not be created + + cudf::io::json_reader_options options = + cudf::io::json_reader_options::builder( + cudf::io::source_info{json_str.c_str(), json_str.size()}) + .lines(true) + .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL) + .normalize_single_quotes(true) + .normalize_whitespace(false) + .mixed_types_as_string(true) + .keep_quotes(true); + + auto result = cudf::io::read_json(options); + + ASSERT_EQ(result.tbl->num_columns(), 1); + ASSERT_EQ(result.metadata.schema_info.size(), 1); + EXPECT_EQ(result.metadata.schema_info[0].name, "Root"); + ASSERT_EQ(result.metadata.schema_info[0].children.size(), 1); + EXPECT_EQ(result.metadata.schema_info[0].children[0].name, "Key"); + ASSERT_EQ(result.metadata.schema_info[0].children[0].children.size(), 2); + EXPECT_EQ(result.metadata.schema_info[0].children[0].children[0].name, "offsets"); + // types + EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::STRUCT); + EXPECT_EQ(result.tbl->get_column(0).child(0).type().id(), cudf::type_id::STRING); + cudf::test::strings_column_wrapper expected( + {R"([ { "EE": "A" } ])", "{ }", R"([{ "YY": 1}])"}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result.tbl->get_column(0).child(0)); + } - ASSERT_EQ(result.tbl->num_columns(), 1); - ASSERT_EQ(result.metadata.schema_info.size(), 1); - EXPECT_EQ(result.metadata.schema_info[0].name, "Root"); - ASSERT_EQ(result.metadata.schema_info[0].children.size(), 1); - EXPECT_EQ(result.metadata.schema_info[0].children[0].name, "Key"); - ASSERT_EQ(result.metadata.schema_info[0].children[0].children.size(), 2); - EXPECT_EQ(result.metadata.schema_info[0].children[0].children[0].name, "offsets"); - // types - EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::STRUCT); - EXPECT_EQ(result.tbl->get_column(0).child(0).type().id(), cudf::type_id::STRING); - cudf::test::strings_column_wrapper expected({R"([ { "EE": "A" } ])", "{ }", R"([{ "YY": 1}])"}); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result.tbl->get_column(0).child(0)); + // list mixed. + { + std::string const json_str = R"( + { "Root": { "Key": [ { "EE": "A" } ] } } + { "Root": { "Key": "abc" } } + { "Root": { "Key": [{ "YY": 1}] } } + )"; + // Column "EE" is created and destroyed + // Column "YY" should not be created + + cudf::io::json_reader_options options = + cudf::io::json_reader_options::builder( + cudf::io::source_info{json_str.c_str(), json_str.size()}) + .lines(true) + .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL) + .normalize_single_quotes(true) + .normalize_whitespace(false) + .mixed_types_as_string(true) + .keep_quotes(true); + + auto result = cudf::io::read_json(options); + + ASSERT_EQ(result.tbl->num_columns(), 1); + ASSERT_EQ(result.metadata.schema_info.size(), 1); + EXPECT_EQ(result.metadata.schema_info[0].name, "Root"); + ASSERT_EQ(result.metadata.schema_info[0].children.size(), 1); + EXPECT_EQ(result.metadata.schema_info[0].children[0].name, "Key"); + ASSERT_EQ(result.metadata.schema_info[0].children[0].children.size(), 2); + EXPECT_EQ(result.metadata.schema_info[0].children[0].children[0].name, "offsets"); + // types + EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::STRUCT); + EXPECT_EQ(result.tbl->get_column(0).child(0).type().id(), cudf::type_id::STRING); + cudf::test::strings_column_wrapper expected( + {R"([ { "EE": "A" } ])", "\"abc\"", R"([{ "YY": 1}])"}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result.tbl->get_column(0).child(0)); + } } CUDF_TEST_PROGRAM_MAIN()