From 2fccbc0ba4af7a76c47553ea578d517d2db8e297 Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Thu, 2 May 2024 14:45:03 -0500 Subject: [PATCH] Add JSON option to prune columns (#14996) Resolves https://github.com/rapidsai/cudf/issues/14951 This adds an option `prune_columns` to json_reader_options (default False). When set to True, the dtypes option is used as a filter instead of a type inference suggestion. If dtypes (a vector of dtypes, a map of dtypes, or a nested schema) is not specified, the output is an empty dataframe. Authors: - Karthikeyan (https://github.com/karthikeyann) Approvers: - Robert (Bobby) Evans (https://github.com/revans2) - MithunR (https://github.com/mythrocks) - Mike Wilson (https://github.com/hyperbolic2346) - Shruti Shivakumar (https://github.com/shrshi) - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/14996 --- cpp/include/cudf/io/json.hpp | 40 +++++ cpp/src/io/json/json_column.cu | 143 ++++++++++++------ cpp/src/io/json/nested_json.hpp | 2 +- cpp/src/io/json/parser_features.cpp | 15 +- cpp/tests/io/json_test.cpp | 205 +++++++++++++++++++++++++- python/cudf/cudf/_lib/cpp/io/json.pxd | 5 + python/cudf/cudf/_lib/json.pyx | 4 +- python/cudf/cudf/io/json.py | 2 + python/cudf/cudf/utils/ioutils.py | 16 +- 9 files changed, 377 insertions(+), 55 deletions(-) diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index a6112b8db4c..7374ffc37e6 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -101,6 +101,8 @@ class json_reader_options { bool _lines = false; // Parse mixed types as a string column bool _mixed_types_as_string = false; + // Prune columns on read, selected based on the _dtypes option + bool _prune_columns = false; // Bytes to skip from the start size_t _byte_range_offset = 0; @@ -241,6 +243,17 @@ class json_reader_options { */ bool is_enabled_mixed_types_as_string() const { return _mixed_types_as_string; } + /** + * @brief Whether to prune columns on read, selected based on the @ref set_dtypes option. + * + * When set as true, if the reader options include @ref set_dtypes, then + * the reader will only return those columns which are mentioned in @ref set_dtypes. + * If false, then all columns are returned, independent of the @ref set_dtypes setting. + * + * @return True if column pruning is enabled + */ + bool is_enabled_prune_columns() const { return _prune_columns; } + /** * @brief Whether to parse dates as DD/MM versus MM/DD. * @@ -342,6 +355,17 @@ class json_reader_options { */ void enable_mixed_types_as_string(bool val) { _mixed_types_as_string = val; } + /** + * @brief Set whether to prune columns on read, selected based on the @ref set_dtypes option. + * + * When set as true, if the reader options include @ref set_dtypes, then + * the reader will only return those columns which are mentioned in @ref set_dtypes. + * If false, then all columns are returned, independent of the @ref set_dtypes setting. + * + * @param val Boolean value to enable/disable column pruning + */ + void enable_prune_columns(bool val) { _prune_columns = val; } + /** + * @brief Set whether to parse dates as DD/MM versus MM/DD. * @@ -508,6 +532,22 @@ class json_reader_options_builder { return *this; } + /** + * @brief Set whether to prune columns on read, selected based on the @ref dtypes option.
+ * + * When set as true, if the reader options include @ref dtypes, then + * the reader will only return those columns which are mentioned in @ref dtypes. + * If false, then all columns are returned, independent of the @ref dtypes setting. + * + * @param val Boolean value to enable/disable column pruning + * @return this for chaining + */ + json_reader_options_builder& prune_columns(bool val) + { + options._prune_columns = val; + return *this; + } + /** * @brief Set whether to parse dates as DD/MM versus MM/DD. * diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index 7117af8948b..631f8adbd6d 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -564,7 +564,7 @@ void make_device_json_column(device_span input, } }; auto init_to_zero = [stream](auto& v) { - thrust::uninitialized_fill(rmm::exec_policy(stream), v.begin(), v.end(), 0); + thrust::uninitialized_fill(rmm::exec_policy_nosync(stream), v.begin(), v.end(), 0); }; auto initialize_json_columns = [&](auto i, auto& col) { @@ -625,13 +625,14 @@ void make_device_json_column(device_span input, // find column_ids which are values, but should be ignored in validity std::vector ignore_vals(num_columns, 0); std::vector is_mixed_type_column(num_columns, 0); + std::vector is_pruned(num_columns, 0); columns.try_emplace(parent_node_sentinel, std::ref(root)); - for (auto const this_col_id : unique_col_ids) { - if (column_categories[this_col_id] == NC_ERR || column_categories[this_col_id] == NC_FN) { - continue; - } - // Struct, List, String, Value + auto name_and_parent_index = [&is_array_of_arrays, + &row_array_parent_col_id, + &column_parent_ids, + &column_categories, + &column_names](auto this_col_id) { std::string name = ""; auto parent_col_id = column_parent_ids[this_col_id]; if (parent_col_id == parent_node_sentinel || column_categories[parent_col_id] == NC_LIST) { @@ -647,11 +648,46 @@ void make_device_json_column(device_span input, } else { CUDF_FAIL("Unexpected parent column category"); } + return std::pair{name, parent_col_id}; + }; + + // Prune columns that are not required to be parsed. + if (options.is_enabled_prune_columns()) { + for (auto const this_col_id : unique_col_ids) { + if (column_categories[this_col_id] == NC_ERR || column_categories[this_col_id] == NC_FN) { + continue; + } + // Struct, List, String, Value + auto [name, parent_col_id] = name_and_parent_index(this_col_id); + // get path of this column, and get its dtype if present in options + auto const nt = tree_path.get_path(this_col_id); + std::optional const user_dtype = get_path_data_type(nt, options); + if (!user_dtype.has_value() and parent_col_id != parent_node_sentinel) { + is_pruned[this_col_id] = 1; + continue; + } else { + // make sure all its parents are not pruned. + while (parent_col_id != parent_node_sentinel and is_pruned[parent_col_id] == 1) { + is_pruned[parent_col_id] = 0; + parent_col_id = column_parent_ids[parent_col_id]; + } + } + } + } + + // Build the column tree, also, handles mixed types. + for (auto const this_col_id : unique_col_ids) { + if (column_categories[this_col_id] == NC_ERR || column_categories[this_col_id] == NC_FN) { + continue; + } + // Struct, List, String, Value + auto [name, parent_col_id] = name_and_parent_index(this_col_id); - if (parent_col_id != parent_node_sentinel && is_mixed_type_column[parent_col_id] == 1) { - // if parent is mixed type column, ignore this column. 
- is_mixed_type_column[this_col_id] = 1; - ignore_vals[this_col_id] = 1; + // if parent is mixed type column or this column is pruned, ignore this column. + if (parent_col_id != parent_node_sentinel && + (is_mixed_type_column[parent_col_id] || is_pruned[this_col_id])) { + ignore_vals[this_col_id] = 1; + if (is_mixed_type_column[parent_col_id]) { is_mixed_type_column[this_col_id] = 1; } continue; } @@ -714,12 +750,13 @@ void make_device_json_column(device_span input, "A mix of lists and structs within the same column is not supported"); } } + if (is_enabled_mixed_types_as_string) { // get path of this column, check if it is a struct forced as string, and enforce it - auto nt = tree_path.get_path(this_col_id); - std::optional user_dt = get_path_data_type(nt, options); - if (column_categories[this_col_id] == NC_STRUCT and user_dt.has_value() and - user_dt.value().id() == type_id::STRING) { + auto const nt = tree_path.get_path(this_col_id); + std::optional const user_dtype = get_path_data_type(nt, options); + if (column_categories[this_col_id] == NC_STRUCT and user_dtype.has_value() and + user_dtype.value().id() == type_id::STRING) { is_mixed_type_column[this_col_id] = 1; column_categories[this_col_id] = NC_STR; } @@ -873,25 +910,27 @@ void make_device_json_column(device_span input, for (auto& [id, col_ref] : columns) { auto& col = col_ref.get(); if (col.type == json_col_t::StringColumn) { - thrust::inclusive_scan(rmm::exec_policy(stream), + thrust::inclusive_scan(rmm::exec_policy_nosync(stream), col.string_offsets.begin(), col.string_offsets.end(), col.string_offsets.begin(), thrust::maximum{}); } else if (col.type == json_col_t::ListColumn) { - thrust::inclusive_scan(rmm::exec_policy(stream), + thrust::inclusive_scan(rmm::exec_policy_nosync(stream), col.child_offsets.begin(), col.child_offsets.end(), col.child_offsets.begin(), thrust::maximum{}); } } + stream.synchronize(); } std::pair, std::vector> device_json_column_to_cudf_column( device_json_column& json_col, device_span d_input, cudf::io::parse_options const& options, + bool prune_columns, std::optional schema, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) @@ -982,13 +1021,16 @@ std::pair, std::vector> device_json_co for (auto const& col_name : json_col.column_order) { auto const& col = json_col.child_columns.find(col_name); column_names.emplace_back(col->first); - auto& child_col = col->second; - auto [child_column, names] = device_json_column_to_cudf_column( - child_col, d_input, options, get_child_schema(col_name), stream, mr); - CUDF_EXPECTS(num_rows == child_column->size(), - "All children columns must have the same size"); - child_columns.push_back(std::move(child_column)); - column_names.back().children = names; + auto& child_col = col->second; + auto child_schema_element = get_child_schema(col_name); + if (!prune_columns or child_schema_element.has_value()) { + auto [child_column, names] = device_json_column_to_cudf_column( + child_col, d_input, options, prune_columns, child_schema_element, stream, mr); + CUDF_EXPECTS(num_rows == child_column->size(), + "All children columns must have the same size"); + child_columns.push_back(std::move(child_column)); + column_names.back().children = names; + } } auto [result_bitmask, null_count] = make_validity(json_col); // The null_mask is set after creation of struct column is to skip the superimpose_nulls and @@ -1011,8 +1053,11 @@ std::pair, std::vector> device_json_co rmm::device_buffer{}, 0); // Create children column + auto child_schema_element = 
json_col.child_columns.empty() + ? std::optional{} + : get_child_schema(json_col.child_columns.begin()->first); auto [child_column, names] = - json_col.child_columns.empty() + json_col.child_columns.empty() or (prune_columns and !child_schema_element.has_value()) ? std::pair, // EMPTY type could not used because gather throws exception on EMPTY type. std::vector>{std::make_unique( @@ -1022,13 +1067,13 @@ std::pair, std::vector> device_json_co rmm::device_buffer{}, 0), std::vector{}} - : device_json_column_to_cudf_column( - json_col.child_columns.begin()->second, - d_input, - options, - get_child_schema(json_col.child_columns.begin()->first), - stream, - mr); + : device_json_column_to_cudf_column(json_col.child_columns.begin()->second, + d_input, + options, + prune_columns, + child_schema_element, + stream, + mr); column_names.back().children = names; auto [result_bitmask, null_count] = make_validity(json_col); auto ret_col = make_lists_column(num_rows, @@ -1140,8 +1185,6 @@ table_with_metadata device_parse_nested_json(device_span d_input, size_type column_index = 0; for (auto const& col_name : root_struct_col.column_order) { auto& json_col = root_struct_col.child_columns.find(col_name)->second; - // Insert this columns name into the schema - out_column_names.emplace_back(col_name); std::optional child_schema_element = std::visit( cudf::detail::visitor_overload{ @@ -1184,18 +1227,28 @@ table_with_metadata device_parse_nested_json(device_span d_input, debug_schema_print(child_schema_element); #endif - // Get this JSON column's cudf column and schema info, (modifies json_col) - auto [cudf_col, col_name_info] = device_json_column_to_cudf_column( - json_col, d_input, parse_opt, child_schema_element, stream, mr); - // TODO: RangeIndex as DataFrame.columns names for array of arrays - // if (is_array_of_arrays) { - // col_name_info.back().name = ""; - // } - - out_column_names.back().children = std::move(col_name_info); - out_columns.emplace_back(std::move(cudf_col)); - - column_index++; + if (!options.is_enabled_prune_columns() or child_schema_element.has_value()) { + // Get this JSON column's cudf column and schema info, (modifies json_col) + auto [cudf_col, col_name_info] = + device_json_column_to_cudf_column(json_col, + d_input, + parse_opt, + options.is_enabled_prune_columns(), + child_schema_element, + stream, + mr); + // Insert this column's name into the schema + out_column_names.emplace_back(col_name); + // TODO: RangeIndex as DataFrame.columns names for array of arrays + // if (is_array_of_arrays) { + // col_name_info.back().name = ""; + // } + + out_column_names.back().children = std::move(col_name_info); + out_columns.emplace_back(std::move(cudf_col)); + + column_index++; + } } return table_with_metadata{std::make_unique(std::move(out_columns)), {out_column_names}}; diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index a302785cee8..52ea23c7f1c 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -319,7 +319,7 @@ table_with_metadata device_parse_nested_json(device_span input, * @return data type of the column if present */ std::optional get_path_data_type( - host_span> path, + host_span const> path, cudf::io::json_reader_options const& options); /** diff --git a/cpp/src/io/json/parser_features.cpp b/cpp/src/io/json/parser_features.cpp index 740b7523cc1..4caa5cd9e24 100644 --- a/cpp/src/io/json/parser_features.cpp +++ b/cpp/src/io/json/parser_features.cpp @@ -58,8 +58,15 @@ std::optional child_schema_element(std::string 
const& col_name, // "a": [ null] {"a", list}, {"element", str} // back() is root. // front() is leaf. +/** + * @brief Get the path data type of a column by path if present in input schema + * + * @param path path of the json column + * @param root root of input schema element + * @return data type of the column if present, otherwise std::nullopt + */ std::optional get_path_data_type( - host_span> path, schema_element const& root) + host_span const> path, schema_element const& root) { if (path.empty() || path.size() == 1) { return root.type; @@ -81,7 +88,7 @@ std::optional get_path_data_type( } std::optional get_path_data_type( - host_span> path, + host_span const> path, cudf::io::json_reader_options const& options) { if (path.empty()) return {}; @@ -98,11 +105,11 @@ std::optional get_path_data_type( std::vector path_from_tree::get_path(NodeIndexT this_col_id) { std::vector path; - // TODO Need to stop at row root. so, how to find row root? + // stops at root. while (this_col_id != parent_node_sentinel) { auto type = column_categories[this_col_id]; std::string name = ""; - // TODO make this ifelse into a separate lambda function, along with parent_col_id. + // code same as name_and_parent_index lambda. auto parent_col_id = column_parent_ids[this_col_id]; if (parent_col_id == parent_node_sentinel || column_categories[parent_col_id] == NC_LIST) { if (is_array_of_arrays && parent_col_id == row_array_parent_col_id) { diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index f0f72d4e794..b25822f6613 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -2233,9 +2233,6 @@ TEST_F(JsonReaderTest, MixedTypes) .lines(true); cudf::io::table_with_metadata result = cudf::io::read_json(in_options); - static int num_case = 0; - num_case++; - std::cout << "case:" << num_case << "\n"; CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), expected); }; // value + string (not mixed type case) @@ -2437,4 +2434,206 @@ TEST_F(JsonReaderTest, MapTypes) {type_id::LIST, type_id::STRING, type_id::STRING}); } +// Test case for dtype prune: +// all paths, only one. 
+// one present, another not present, nothing present +// nested, flat, not-jsonlines +TEST_F(JsonReaderTest, JsonNestedDtypeFilter) +{ + std::string json_stringl = R"( + {"a": 1, "b": {"0": "abc", "1": [-1.]}, "c": true} + {"a": 1, "b": {"0": "abc" }, "c": false} + {"a": 1, "b": {}} + {"a": 1, "c": null} + )"; + std::string json_string = R"([ + {"a": 1, "b": {"0": "abc", "1": [-1.]}, "c": true}, + {"a": 1, "b": {"0": "abc" }, "c": false}, + {"a": 1, "b": {}}, + {"a": 1, "c": null} + ])"; + for (auto& [json_string, lines] : {std::pair{json_stringl, true}, {json_string, false}}) { + cudf::io::json_reader_options in_options = + cudf::io::json_reader_options::builder( + cudf::io::source_info{json_string.data(), json_string.size()}) + .prune_columns(true) + .lines(lines); + + // include all columns + //// schema + { + std::map dtype_schema{ + {"b", + {data_type{cudf::type_id::STRUCT}, + {{"0", {data_type{cudf::type_id::STRING}}}, + {"1", {data_type{cudf::type_id::LIST}, {{"element", {dtype()}}}}}}}}, + {"a", {dtype()}}, + {"c", {dtype()}}, + }; + in_options.set_dtypes(dtype_schema); + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + // Make sure we have columns "a", "b" and "c" + ASSERT_EQ(result.tbl->num_columns(), 3); + ASSERT_EQ(result.metadata.schema_info.size(), 3); + EXPECT_EQ(result.metadata.schema_info[0].name, "a"); + EXPECT_EQ(result.metadata.schema_info[1].name, "b"); + EXPECT_EQ(result.metadata.schema_info[2].name, "c"); + // "b" children checks + ASSERT_EQ(result.metadata.schema_info[1].children.size(), 2); + EXPECT_EQ(result.metadata.schema_info[1].children[0].name, "0"); + EXPECT_EQ(result.metadata.schema_info[1].children[1].name, "1"); + ASSERT_EQ(result.metadata.schema_info[1].children[1].children.size(), 2); + EXPECT_EQ(result.metadata.schema_info[1].children[1].children[0].name, "offsets"); + EXPECT_EQ(result.metadata.schema_info[1].children[1].children[1].name, "element"); + // types + EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::INT32); + EXPECT_EQ(result.tbl->get_column(1).type().id(), cudf::type_id::STRUCT); + EXPECT_EQ(result.tbl->get_column(2).type().id(), cudf::type_id::BOOL8); + EXPECT_EQ(result.tbl->get_column(1).child(0).type().id(), cudf::type_id::STRING); + EXPECT_EQ(result.tbl->get_column(1).child(1).type().id(), cudf::type_id::LIST); + EXPECT_EQ(result.tbl->get_column(1).child(1).child(0).type().id(), cudf::type_id::INT32); + EXPECT_EQ(result.tbl->get_column(1).child(1).child(1).type().id(), cudf::type_id::FLOAT32); + } + //// vector + { + std::vector types{ + {dtype()}, data_type{cudf::type_id::STRUCT}, {dtype()}}; + in_options.set_dtypes(types); + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + // Make sure we have columns "a", "b" and "c" + ASSERT_EQ(result.tbl->num_columns(), 3); + ASSERT_EQ(result.metadata.schema_info.size(), 3); + EXPECT_EQ(result.metadata.schema_info[0].name, "a"); + EXPECT_EQ(result.metadata.schema_info[1].name, "b"); + EXPECT_EQ(result.metadata.schema_info[2].name, "c"); + } + //// map + { + std::map dtype_map{ + {"b", + { + data_type{cudf::type_id::STRUCT}, + }}, + {"a", {dtype()}}, + {"c", {dtype()}}, + }; + in_options.set_dtypes(dtype_map); + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + // Make sure we have columns "a", "b" and "c" + ASSERT_EQ(result.tbl->num_columns(), 3); + ASSERT_EQ(result.metadata.schema_info.size(), 3); + EXPECT_EQ(result.metadata.schema_info[0].name, "a"); + EXPECT_EQ(result.metadata.schema_info[1].name, "b"); + 
EXPECT_EQ(result.metadata.schema_info[2].name, "c"); + } + + // include only one column + //// schema + { + std::map dtype_schema{ + {"a", {dtype()}}, + }; + in_options.set_dtypes(dtype_schema); + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + // Make sure we have column "a" + ASSERT_EQ(result.tbl->num_columns(), 1); + ASSERT_EQ(result.metadata.schema_info.size(), 1); + EXPECT_EQ(result.metadata.schema_info[0].name, "a"); + } + //// vector + { + std::vector types{{dtype()}}; + in_options.set_dtypes(types); + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + // Make sure we have column "a" + ASSERT_EQ(result.tbl->num_columns(), 1); + ASSERT_EQ(result.metadata.schema_info.size(), 1); + EXPECT_EQ(result.metadata.schema_info[0].name, "a"); + } + //// map + { + std::map dtype_map{ + {"a", {dtype()}}, + }; + in_options.set_dtypes(dtype_map); + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + // Make sure we have column "a" + ASSERT_EQ(result.tbl->num_columns(), 1); + ASSERT_EQ(result.metadata.schema_info.size(), 1); + EXPECT_EQ(result.metadata.schema_info[0].name, "a"); + } + + // include only one column (nested) + { + std::map dtype_schema{ + {"b", + {data_type{cudf::type_id::STRUCT}, + {{"1", {data_type{cudf::type_id::LIST}, {{"element", {dtype()}}}}}}}}, + }; + in_options.set_dtypes(dtype_schema); + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + // Make sure we have column "b":"1":[float] + ASSERT_EQ(result.tbl->num_columns(), 1); + ASSERT_EQ(result.metadata.schema_info.size(), 1); + EXPECT_EQ(result.metadata.schema_info[0].name, "b"); + ASSERT_EQ(result.metadata.schema_info[0].children.size(), 1); + EXPECT_EQ(result.metadata.schema_info[0].children[0].name, "1"); + ASSERT_EQ(result.metadata.schema_info[0].children[0].children.size(), 2); + EXPECT_EQ(result.metadata.schema_info[0].children[0].children[0].name, "offsets"); + EXPECT_EQ(result.metadata.schema_info[0].children[0].children[1].name, "element"); + EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::STRUCT); + EXPECT_EQ(result.tbl->get_column(0).child(0).type().id(), cudf::type_id::LIST); + EXPECT_EQ(result.tbl->get_column(0).child(0).child(0).type().id(), cudf::type_id::INT32); + EXPECT_EQ(result.tbl->get_column(0).child(0).child(1).type().id(), cudf::type_id::FLOAT32); + } + // multiple - all present + { + std::map dtype_schema{ + {"a", {dtype()}}, + {"c", {dtype()}}, + }; + in_options.set_dtypes(dtype_schema); + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + // Make sure we have columns "a", and "c" + ASSERT_EQ(result.tbl->num_columns(), 2); + ASSERT_EQ(result.metadata.schema_info.size(), 2); + EXPECT_EQ(result.metadata.schema_info[0].name, "a"); + EXPECT_EQ(result.metadata.schema_info[1].name, "c"); + } + // multiple - not all present + { + std::map dtype_schema{ + {"a", {dtype()}}, + {"d", {dtype()}}, + }; + in_options.set_dtypes(dtype_schema); + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + // Make sure we have column "a" + ASSERT_EQ(result.tbl->num_columns(), 1); + ASSERT_EQ(result.metadata.schema_info.size(), 1); + EXPECT_EQ(result.metadata.schema_info[0].name, "a"); + } + // multiple - not all present nested + { + std::map dtype_schema{ + + {"b", + {data_type{cudf::type_id::STRUCT}, + { + {"2", {data_type{cudf::type_id::STRING}}}, + }}}, + {"c", {dtype()}}, + }; + in_options.set_dtypes(dtype_schema); + cudf::io::table_with_metadata result = 
cudf::io::read_json(in_options); + // Make sure we have columns "b" (empty struct) and "c" + ASSERT_EQ(result.tbl->num_columns(), 2); + ASSERT_EQ(result.metadata.schema_info.size(), 2); + EXPECT_EQ(result.metadata.schema_info[0].name, "b"); + ASSERT_EQ(result.metadata.schema_info[0].children.size(), 0); + EXPECT_EQ(result.metadata.schema_info[1].name, "c"); + } + } +} + CUDF_TEST_PROGRAM_MAIN() diff --git a/python/cudf/cudf/_lib/cpp/io/json.pxd b/python/cudf/cudf/_lib/cpp/io/json.pxd index b916c2b7ad9..1e1057beede 100644 --- a/python/cudf/cudf/_lib/cpp/io/json.pxd +++ b/python/cudf/cudf/_lib/cpp/io/json.pxd @@ -28,6 +28,7 @@ cdef extern from "cudf/io/json.hpp" \ size_type get_byte_range_size() except + bool is_enabled_lines() except + bool is_enabled_mixed_types_as_string() except + + bool is_enabled_prune_columns() except + bool is_enabled_dayfirst() except + bool is_enabled_experimental() except + @@ -41,6 +42,7 @@ cdef extern from "cudf/io/json.hpp" \ void set_byte_range_size(size_type size) except + void enable_lines(bool val) except + void enable_mixed_types_as_string(bool val) except + + void enable_prune_columns(bool val) except + void enable_dayfirst(bool val) except + void enable_experimental(bool val) except + void enable_keep_quotes(bool val) except + @@ -79,6 +81,9 @@ cdef extern from "cudf/io/json.hpp" \ json_reader_options_builder& mixed_types_as_string( bool val ) except + + json_reader_options_builder& prune_columns( + bool val + ) except + json_reader_options_builder& dayfirst( bool val ) except + diff --git a/python/cudf/cudf/_lib/json.pyx b/python/cudf/cudf/_lib/json.pyx index f2e03391f08..cef71ed24a5 100644 --- a/python/cudf/cudf/_lib/json.pyx +++ b/python/cudf/cudf/_lib/json.pyx @@ -49,7 +49,8 @@ cpdef read_json(object filepaths_or_buffers, object byte_range, bool legacy, bool keep_quotes, - bool mixed_types_as_string): + bool mixed_types_as_string, + bool prune_columns): """ Cython function to call into libcudf API, see `read_json`. @@ -128,6 +129,7 @@ cpdef read_json(object filepaths_or_buffers, opts.enable_keep_quotes(keep_quotes) opts.enable_mixed_types_as_string(mixed_types_as_string) + opts.enable_prune_columns(prune_columns) # Read JSON cdef cudf_io_types.table_with_metadata c_result diff --git a/python/cudf/cudf/io/json.py b/python/cudf/cudf/io/json.py index 5ef25a99590..03d07fc3a50 100644 --- a/python/cudf/cudf/io/json.py +++ b/python/cudf/cudf/io/json.py @@ -26,6 +26,7 @@ def read_json( keep_quotes=False, storage_options=None, mixed_types_as_string=False, + prune_columns=False, *args, **kwargs, ): @@ -101,6 +102,7 @@ def read_json( False, keep_quotes, mixed_types_as_string, + prune_columns, ) else: warnings.warn( diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 66e14f4b9de..6bd7558d322 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -692,7 +692,6 @@ This parameter is only supported with ``engine='cudf'``. - This parameter is only supported in ``cudf`` engine. If `True`, any string values are read literally (and wrapped in an additional set of quotes). If `False` string values are parsed into Python strings. @@ -703,7 +702,22 @@ For other URLs (e.g. starting with "s3://", and "gcs://") the key-value pairs are forwarded to ``fsspec.open``. Please see ``fsspec`` and ``urllib`` for more details. +mixed_types_as_string : bool, default False + .. admonition:: GPU-accelerated feature + + This parameter is only supported with ``engine='cudf'``. 
+ + If True, mixed type columns are returned as string columns. + If `False`, parsing mixed type columns will throw an error. +prune_columns : bool, default False + + .. admonition:: GPU-accelerated feature + + This parameter is only supported with ``engine='cudf'``. + + If True, only return those columns mentioned in the dtype argument. + If `False`, the dtype argument is used as a type inference suggestion. Returns ------- result : Series or DataFrame, depending on the value of `typ`.
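For reference, the new option can be exercised end to end as follows. This is a minimal sketch rather than part of the patch: it mirrors the schema-based case from the JsonNestedDtypeFilter test above, and the inline JSON payload, the function name read_pruned, and the particular dtypes are illustrative only.

#include <cudf/io/json.hpp>
#include <cudf/types.hpp>

#include <map>
#include <string>

// Sketch: read two JSON-lines records, keeping only the columns named in the schema.
void read_pruned()
{
  std::string const data = R"(
    {"a": 1, "b": {"0": "abc", "1": [-1.0]}, "c": true}
    {"a": 2, "b": {"0": "def"}, "c": false}
  )";

  // Nested schema: request "a" and the list "b.1"; "c" and "b.0" are deliberately omitted.
  std::map<std::string, cudf::io::schema_element> dtype_schema{
    {"a", {cudf::data_type{cudf::type_id::INT32}}},
    {"b",
     {cudf::data_type{cudf::type_id::STRUCT},
      {{"1",
        {cudf::data_type{cudf::type_id::LIST},
         {{"element", {cudf::data_type{cudf::type_id::FLOAT32}}}}}}}}},
  };

  cudf::io::json_reader_options in_options =
    cudf::io::json_reader_options::builder(cudf::io::source_info{data.data(), data.size()})
      .lines(true)
      .prune_columns(true);
  in_options.set_dtypes(dtype_schema);

  // With pruning enabled, the result holds exactly two top-level columns: "a" (INT32) and
  // "b" (a STRUCT whose only child is the LIST column "1").
  cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
}

With prune_columns(true), only the columns named in the schema ("a" and the nested list "b.1") are returned; with the default of false, the same schema acts purely as a type hint and "c" and "b.0" would be read as well. From Python, the equivalent is cudf.read_json(..., prune_columns=True) together with a dtype argument, as wired up in the cudf/_lib and cudf/io changes above.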