Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support casting of Map type to string in JSON reader #14936

Merged
merged 28 commits into from
Mar 11, 2024
Merged
Show file tree
Hide file tree
Changes from 17 commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
febcfff
map type as string in JSON reader (prototype)
karthikeyann Jan 30, 2024
11aa95b
fix nulls in mixed types instead of string
karthikeyann Jan 31, 2024
a3cbc4f
move parsing_options decl to header
karthikeyann Feb 1, 2024
ecf4e13
fix is_mixed_type condition
karthikeyann Feb 1, 2024
56585c5
add all testcases of mixed types (exhaustive)
karthikeyann Feb 1, 2024
e083b4a
Merge branch 'branch-24.04' into fix-mixed_nulls_json
karthikeyann Feb 1, 2024
9e07e0d
address review comments
karthikeyann Feb 5, 2024
56dacca
Merge branch 'branch-24.04' into fix-mixed_nulls_json
karthikeyann Feb 5, 2024
4cb8673
Merge branch 'branch-24.04' into json_map_as_string
karthikeyann Feb 5, 2024
55ef545
cleanup row array parent
karthikeyann Feb 6, 2024
9ce7875
move new functions to parser_features.cpp
karthikeyann Feb 6, 2024
ce16a53
cleanup debug prints
karthikeyann Feb 6, 2024
70dbc31
Merge branch 'fix-mixed_nulls_json' of github.com:karthikeyann/cudf i…
karthikeyann Feb 6, 2024
2ffdc8a
Fix a bug with map type null literal
karthikeyann Feb 7, 2024
0535d21
Doc update on options
karthikeyann Feb 7, 2024
4a432cd
Merge branch 'branch-24.04' into json_map_as_string
karthikeyann Feb 7, 2024
4cd6150
address review comments
karthikeyann Feb 20, 2024
6add1c7
Merge branch 'branch-24.04' into fix-mixed_nulls_json
karthikeyann Feb 20, 2024
53cba69
Merge branch 'branch-24.04' into json_map_as_string
ttnghia Feb 22, 2024
3cc7403
address review comments
karthikeyann Feb 29, 2024
b44f52c
Merge branch 'branch-24.04' into fix-mixed_nulls_json
karthikeyann Feb 29, 2024
3df4c41
add doc to is_all_nulls_str_column
karthikeyann Feb 29, 2024
2f0a2e0
name change to is_all_nulls_each_column
karthikeyann Feb 29, 2024
3c394e8
Merge branch 'branch-24.04' into fix-mixed_nulls_json
karthikeyann Mar 6, 2024
9e7abd7
address review comments
karthikeyann Mar 6, 2024
e600fb5
Merge branch 'fix-mixed_nulls_json' of github.com:karthikeyann/cudf i…
karthikeyann Mar 6, 2024
afbdcb8
Merge branch 'branch-24.04' into json_map_as_string
karthikeyann Mar 6, 2024
107259a
Merge branch 'branch-24.04' into json_map_as_string
karthikeyann Mar 7, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -382,6 +382,7 @@ add_library(
src/io/json/read_json.cu
src/io/json/legacy/json_gpu.cu
src/io/json/legacy/reader_impl.cu
src/io/json/parser_features.cpp
src/io/json/write_json.cu
src/io/orc/aggregate_orc_metadata.cpp
src/io/orc/dict_enc.cu
Expand Down
2 changes: 2 additions & 0 deletions cpp/include/cudf/io/json.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -323,6 +323,7 @@ class json_reader_options {

/**
* @brief Set whether to parse mixed types as a string column.
* Also allows forcing a struct to be read as a string column via the schema.
*
* @param val Boolean value to enable/disable parsing mixed types as a string column
*/
Expand Down Expand Up @@ -473,6 +474,7 @@ class json_reader_options_builder {

/**
* @brief Set whether to parse mixed types as a string column.
* Also allows forcing a struct to be read as a string column via the schema.
*
* @param val Boolean value to enable/disable parsing mixed types as a string column
* @return this for chaining
Expand Down
124 changes: 92 additions & 32 deletions cpp/src/io/json/json_column.cu
Original file line number Diff line number Diff line change
Expand Up @@ -392,6 +392,41 @@ std::vector<std::string> copy_strings_to_host(device_span<SymbolT const> input,
return to_host(d_column_names->view());
}

/**
 * @brief Computes, for every column of the column tree, whether all string/value nodes mapped to
 * it are null literals.
 *
 * Every flag starts as true. A flag is cleared when a node whose column category is `NC_STR` or
 * `NC_VAL` has text that is not found in the `trie_na` trie of the parse options. Columns of any
 * other category are never cleared and therefore stay true.
 *
 * @param input Characters that the node ranges of `tree` index into
 * @param d_column_tree Column tree; only its per-column `node_categories` are read
 * @param tree Node tree; only its per-node `node_range_begin`/`node_range_end` are read
 * @param col_ids Column id of each node
 * @param options Reader options from which the parse options (and `trie_na`) are built
 * @param stream CUDA stream used for the allocation and the thrust calls
 * @return One `uint8_t` flag per column of `d_column_tree`
 */
rmm::device_uvector<uint8_t> is_all_nulls_str_column(device_span<SymbolT const> input,
                                                     tree_meta_t const& d_column_tree,
                                                     tree_meta_t const& tree,
                                                     device_span<NodeIndexT> col_ids,
                                                     cudf::io::json_reader_options const& options,
                                                     rmm::cuda_stream_view stream)
{
  // Start from "all nulls" for every column; the kernel below only ever clears flags.
  auto const column_count = d_column_tree.node_categories.size();
  rmm::device_uvector<uint8_t> all_nulls_flags(column_count, stream);
  thrust::fill(rmm::exec_policy(stream), all_nulls_flags.begin(), all_nulls_flags.end(), true);

  // parse_opt must stay alive while the device lambda uses its view.
  auto parse_opt = parsing_options(options, stream);
  thrust::for_each_n(
    rmm::exec_policy(stream),
    thrust::counting_iterator<size_type>(0),
    col_ids.size(),
    [parse_view   = parse_opt.view(),
     chars        = input.data(),
     categories   = d_column_tree.node_categories.begin(),
     node_col_ids = col_ids.begin(),
     begins       = tree.node_range_begin.begin(),
     ends         = tree.node_range_end.begin(),
     flags        = all_nulls_flags.begin()] __device__(size_type node_idx) {
      auto const col_id   = node_col_ids[node_idx];
      auto const category = categories[col_id];
      // Only string/value nodes can carry a null literal.
      if (category != NC_STR and category != NC_VAL) { return; }

      auto const token_begin = begins[node_idx];
      auto const token_size  = static_cast<size_t>(ends[node_idx] - token_begin);
      // Several nodes may write the same flag; they all write `false`.
      if (!serialized_trie_contains(parse_view.trie_na, {chars + token_begin, token_size})) {
        flags[col_id] = false;
      }
    });
  return all_nulls_flags;
}

/**
* @brief Holds member data pointers of `d_json_column`
*
Expand Down Expand Up @@ -429,11 +464,12 @@ void make_device_json_column(device_span<SymbolT const> input,
bool is_array_of_arrays,
bool is_enabled_lines,
bool is_enabled_mixed_types_as_string,
cudf::io::json_reader_options const& options,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
auto num_nodes = col_ids.size();
auto const num_nodes = col_ids.size();
rmm::device_uvector<NodeIndexT> sorted_col_ids(col_ids.size(), stream); // make a copy
thrust::copy(rmm::exec_policy(stream), col_ids.begin(), col_ids.end(), sorted_col_ids.begin());

Expand All @@ -444,15 +480,16 @@ void make_device_json_column(device_span<SymbolT const> input,
rmm::exec_policy(stream), sorted_col_ids.begin(), sorted_col_ids.end(), node_ids.begin());

NodeIndexT const row_array_parent_col_id = [&]() {
if (!is_array_of_arrays) return parent_node_sentinel;
auto const list_node_index = is_enabled_lines ? 0 : 1;
NodeIndexT value;
CUDF_CUDA_TRY(cudaMemcpyAsync(&value,
col_ids.data() + list_node_index,
sizeof(NodeIndexT),
cudaMemcpyDefault,
stream.value()));
stream.synchronize();
NodeIndexT value = parent_node_sentinel;
if (!col_ids.empty()) {
auto const list_node_index = is_enabled_lines ? 0 : 1;
CUDF_CUDA_TRY(cudaMemcpyAsync(&value,
col_ids.data() + list_node_index,
sizeof(NodeIndexT),
cudaMemcpyDefault,
stream.value()));
stream.synchronize();
}
return value;
}();

Expand Down Expand Up @@ -540,6 +577,12 @@ void make_device_json_column(device_span<SymbolT const> input,
col.column_order.clear();
};

path_from_tree tree_path{column_categories,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we need to know the tree path only for mixed types, can we create the object only when the option is enabled?

Copy link
Contributor Author

@karthikeyann karthikeyann Mar 6, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The object is lightweight: it holds spans and a couple of primitives. So it should not matter much if the suggestion is aimed at reducing runtime or memory.
I added the struct because in the future it could hold a memoizer.

column_parent_ids,
column_names,
is_array_of_arrays,
row_array_parent_col_id};

// 2. generate nested columns tree and its device_memory
// reorder unique_col_ids w.r.t. column_range_begin for order of column to be in field order.
auto h_range_col_id_it =
Expand All @@ -548,6 +591,12 @@ void make_device_json_column(device_span<SymbolT const> input,
return thrust::get<0>(a) < thrust::get<0>(b);
});

std::vector<uint8_t> is_str_column_all_nulls{};
if (is_enabled_mixed_types_as_string) {
is_str_column_all_nulls = cudf::detail::make_std_vector_async(
is_all_nulls_str_column(input, d_column_tree, tree, col_ids, options, stream), stream);
}

// use hash map because we may skip field name's col_ids
std::unordered_map<NodeIndexT, std::reference_wrapper<device_json_column>> columns;
// map{parent_col_id, child_col_name}> = child_col_id, used for null value column tracking
Expand Down Expand Up @@ -584,6 +633,7 @@ void make_device_json_column(device_span<SymbolT const> input,
ignore_vals[this_col_id] = 1;
continue;
}

// If the child is already found,
// replace if this column is a nested column and the existing was a value column
// ignore this column if this column is a value column and the existing was a nested column
Expand All @@ -592,29 +642,36 @@ void make_device_json_column(device_span<SymbolT const> input,
auto& parent_col = it->second.get();
bool replaced = false;
if (mapped_columns.count({parent_col_id, name}) > 0) {
auto const old_col_id = mapped_columns[{parent_col_id, name}];
// If mixed type as string is enabled, make both of them strings and merge them.
// All child columns will be ignored when parsing.
if (is_enabled_mixed_types_as_string) {
// VAL/STR or STRUCT or LIST
auto old_col_id = mapped_columns[{parent_col_id, name}];

is_mixed_type_column[this_col_id] = 1;
is_mixed_type_column[old_col_id] = 1;
// if old col type (not cat) is list or struct, replace with string.
auto& col = columns.at(old_col_id).get();
if (col.type == json_col_t::ListColumn or col.type == json_col_t::StructColumn) {
reinitialize_as_string(old_col_id, col);
// all its children (which are already inserted) are ignored later.
bool is_mixed_type = true;
// If new or old is STR and they are all not null, make it mixed type, else ignore.
if (column_categories[this_col_id] == NC_VAL || column_categories[this_col_id] == NC_STR) {
if (is_str_column_all_nulls[this_col_id]) is_mixed_type = false;
}
if (column_categories[old_col_id] == NC_VAL || column_categories[old_col_id] == NC_STR) {
if (is_str_column_all_nulls[old_col_id]) is_mixed_type = false;
}
karthikeyann marked this conversation as resolved.
Show resolved Hide resolved
if (is_mixed_type) {
is_mixed_type_column[this_col_id] = 1;
is_mixed_type_column[old_col_id] = 1;
// if old col type (not cat) is list or struct, replace with string.
auto& col = columns.at(old_col_id).get();
if (col.type == json_col_t::ListColumn or col.type == json_col_t::StructColumn) {
reinitialize_as_string(old_col_id, col);
// all its children (which are already inserted) are ignored later.
}
columns.try_emplace(this_col_id, columns.at(old_col_id));
continue;
}
columns.try_emplace(this_col_id, columns.at(old_col_id));
continue;
}

if (column_categories[this_col_id] == NC_VAL || column_categories[this_col_id] == NC_STR) {
ignore_vals[this_col_id] = 1;
continue;
}
auto old_col_id = mapped_columns[{parent_col_id, name}];
if (column_categories[old_col_id] == NC_VAL || column_categories[old_col_id] == NC_STR) {
// remap
ignore_vals[old_col_id] = 1;
Expand All @@ -632,6 +689,17 @@ void make_device_json_column(device_span<SymbolT const> input,
"A mix of lists and structs within the same column is not supported");
}
}
if (is_enabled_mixed_types_as_string) {
// get path of this column, check if it is a struct forced as string, and enforce it
auto nt = tree_path.get_path(this_col_id);
std::optional<data_type> user_dt = get_path_data_type(nt, options);
if (column_categories[this_col_id] == NC_STRUCT and user_dt.has_value() and
user_dt.value().id() == type_id::STRING) {
is_mixed_type_column[this_col_id] = 1;
column_categories[this_col_id] = NC_STR;
}
}

CUDF_EXPECTS(parent_col.child_columns.count(name) == 0, "duplicate column name: " + name);
// move into parent
device_json_column col(stream, mr);
Expand Down Expand Up @@ -795,15 +863,6 @@ void make_device_json_column(device_span<SymbolT const> input,
}
}

/**
* @brief Retrieves the parse_options to be used for type inference and type casting
*
* @param options The reader options to influence the relevant type inference and type casting
* options
*/
cudf::io::parse_options parsing_options(cudf::io::json_reader_options const& options,
rmm::cuda_stream_view stream);

std::pair<std::unique_ptr<column>, std::vector<column_name_info>> device_json_column_to_cudf_column(
device_json_column& json_col,
device_span<SymbolT const> d_input,
Expand Down Expand Up @@ -1023,6 +1082,7 @@ table_with_metadata device_parse_nested_json(device_span<SymbolT const> d_input,
is_array_of_arrays,
options.is_enabled_lines(),
options.is_enabled_mixed_types_as_string(),
options,
stream,
mr);

Expand Down
39 changes: 39 additions & 0 deletions cpp/src/io/json/nested_json.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,10 @@
#include <map>
#include <vector>

// Forward declaration of parse_options from parsing_utils.cuh
namespace cudf::io {
struct parse_options;
}
namespace cudf::io::json {

/**
Expand Down Expand Up @@ -284,6 +288,15 @@ reduce_to_column_tree(tree_meta_t& tree,
device_span<size_type> row_offsets,
rmm::cuda_stream_view stream);

/**
* @brief Retrieves the parse_options to be used for type inference and type casting
*
* @param options The reader options to influence the relevant type inference and type casting
* options
* @param stream The CUDA stream to which kernels are dispatched
* @return The parse_options to be used for type inference and type casting
*/
cudf::io::parse_options parsing_options(cudf::io::json_reader_options const& options,
rmm::cuda_stream_view stream);

/** @copydoc host_parse_nested_json
* All processing is done in device memory.
*
Expand All @@ -293,6 +306,32 @@ table_with_metadata device_parse_nested_json(device_span<SymbolT const> input,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr);

/**
* @brief Get the path data type of a column by path if present in input schema
*
* @param path path of the column
* @param options json reader options which holds schema
* @return data type of the column if present
*/
std::optional<data_type> get_path_data_type(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we have this as a member of the path_from_tree struct?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The path_from_tree struct operates on the column tree, whereas this function checks whether a column path is present in the input schema of the JSON reader options. They need not be combined, as get_path_data_type doesn't use any data from the path_from_tree struct.

host_span<std::pair<std::string, cudf::io::json::NodeT>> path,
cudf::io::json_reader_options const& options);

/**
* @brief Helper class to get path of a column by column id from reduced column tree
*
* Aggregate of host spans plus two scalars. `get_path` is only declared here; its
* definition lives elsewhere.
*/
struct path_from_tree {
// Column categories -- presumably indexed by column id; confirm against get_path().
host_span<NodeT const> column_categories;
// Parent column ids -- presumably indexed by column id; confirm against get_path().
host_span<NodeIndexT const> column_parent_ids;
// Column names -- presumably indexed by column id; confirm against get_path().
host_span<std::string const> column_names;
// NOTE(review): name suggests the input is an array of arrays -- confirm how get_path() uses it.
bool is_array_of_arrays;
// NOTE(review): name suggests the column id of the parent of row arrays -- confirm with caller.
NodeIndexT const row_array_parent_col_id;

// One path element: a string paired with a node type (presumably column name and category).
using path_rep = std::pair<std::string, cudf::io::json::NodeT>;
// Returns the path of column `this_col_id` as a sequence of path_rep elements.
// NOTE(review): element order (root-first vs leaf-first) is not visible here -- confirm.
std::vector<path_rep> get_path(NodeIndexT this_col_id);
};

/**
* @brief Parses the given JSON string and generates table from the given input.
*
Expand Down
3 changes: 2 additions & 1 deletion cpp/src/io/json/nested_json_gpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -2042,7 +2042,8 @@ void make_json_column(json_column& root_column,
* options
* @param stream The CUDA stream to which kernels are dispatched
*/
auto parsing_options(cudf::io::json_reader_options const& options, rmm::cuda_stream_view stream)
cudf::io::parse_options parsing_options(cudf::io::json_reader_options const& options,
rmm::cuda_stream_view stream)
{
auto parse_opts = cudf::io::parse_options{',', '\n', '\"', '.'};

Expand Down
Loading
Loading