Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

JSON - Parse mixed types as string in JSON reader #14572

Merged
Merged
Show file tree
Hide file tree
Changes from 27 commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
49a73eb
Add mixed_types_as_string reader option
karthikeyann Dec 5, 2023
6bc0819
Extract correct Struct, List node range end
karthikeyann Dec 5, 2023
aa03a95
Force mixed types as string
karthikeyann Dec 5, 2023
3341109
Add simple mixed type testcase
karthikeyann Dec 5, 2023
7670cb0
Merge branch 'branch-24.02' into fea-json_mixed_type_as_string
karthikeyann Dec 7, 2023
12040a5
add is_strict_nested_boundaries
karthikeyann Dec 7, 2023
f8521f6
add more test cases for MixedTypes
karthikeyann Dec 7, 2023
377ac3d
bug fix for categeroy update of old col_id
karthikeyann Dec 8, 2023
53d25cc
Merge branch 'branch-24.02' into fea-json_mixed_type_as_string
karthikeyann Dec 11, 2023
d779c07
Java bindings for mixed types as strings (@andygrove)
andygrove Dec 13, 2023
c00a1a2
Merge branch 'branch-24.02' into fea-json_mixed_type_as_string
karthikeyann Dec 13, 2023
ed64288
newline at eof style fix.
karthikeyann Dec 14, 2023
cdbaac3
Merge branch 'branch-24.02' into fea-json_mixed_type_as_string
karthikeyann Jan 2, 2024
7b537e4
copyright year
karthikeyann Jan 4, 2024
20f586d
Merge branch 'branch-24.02' into fea-json_mixed_type_as_string
karthikeyann Jan 4, 2024
0dbc9f5
undo mixed type code
karthikeyann Jan 9, 2024
d3d509c
Merge branch 'branch-24.02' into fea-json_mixed_type_as_string
karthikeyann Jan 9, 2024
c09b776
remove debug prints
karthikeyann Jan 10, 2024
5837ca3
cleanup code and comments
karthikeyann Jan 10, 2024
2ba5e9b
testcase when the MixedTypesAsStrings feature is disabled
karthikeyann Jan 11, 2024
a3c1fe2
update mixed string, enable test for data source json
karthikeyann Jan 11, 2024
6a17262
Merge branch 'branch-24.02' into fea-json_mixed_type_as_string
karthikeyann Jan 11, 2024
e4da81e
add line to separate tests
karthikeyann Jan 11, 2024
12c4146
Merge branch 'branch-24.02' into fea-json_mixed_type_as_string
karthikeyann Jan 12, 2024
df4eb7d
addressed review comments (@elstehle)
karthikeyann Jan 12, 2024
02c3687
Merge branch 'branch-24.02' into fea-json_mixed_type_as_string
karthikeyann Jan 14, 2024
91bdad1
Merge branch 'branch-24.02' into fea-json_mixed_type_as_string
karthikeyann Jan 16, 2024
6ddac34
Apply suggestions from code review
karthikeyann Jan 16, 2024
390d941
Merge branch 'branch-24.02' into fea-json_mixed_type_as_string
karthikeyann Jan 17, 2024
ea40c55
address review comments (bdice), add test cases for max row offset test
karthikeyann Jan 18, 2024
b650848
Merge branch 'branch-24.02' into fea-json_mixed_type_as_string
karthikeyann Jan 18, 2024
4905808
renaming arguments
karthikeyann Jan 19, 2024
d2e0691
address review comments, rename, fix reinit condition
karthikeyann Jan 22, 2024
e7584ba
Merge branch 'branch-24.02' into fea-json_mixed_type_as_string
karthikeyann Jan 22, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 29 additions & 1 deletion cpp/include/cudf/io/json.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2023, NVIDIA CORPORATION.
* Copyright (c) 2020-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -98,6 +98,8 @@ class json_reader_options {

// Read the file as a json object per line
bool _lines = false;
// Read columns that contain mixed types as string columns
karthikeyann marked this conversation as resolved.
Show resolved Hide resolved
bool _mixed_types_as_string = false;

// Bytes to skip from the start
size_t _byte_range_offset = 0;
Expand Down Expand Up @@ -225,6 +227,13 @@ class json_reader_options {
*/
bool is_enabled_lines() const { return _lines; }

/**
 * @brief Reports whether mixed types are read as a string column.
 *
 * @return `true` if mixed types are read as a string column
 */
bool is_enabled_mixed_types_as_string() const
{
  return _mixed_types_as_string;
}

/**
* @brief Whether to parse dates as DD/MM versus MM/DD.
*
Expand Down Expand Up @@ -302,6 +311,13 @@ class json_reader_options {
*/
void enable_lines(bool val) { _lines = val; }

/**
 * @brief Enables or disables reading mixed types as a string column.
 *
 * @param val `true` to read mixed types as a string column, `false` to disable the option
 */
void enable_mixed_types_as_string(bool val)
{
  _mixed_types_as_string = val;
}

/**
* @brief Set whether to parse dates as DD/MM versus MM/DD.
*
Expand Down Expand Up @@ -437,6 +453,18 @@ class json_reader_options_builder {
return *this;
}

/**
 * @brief Enables or disables reading mixed types as a string column.
 *
 * @param val `true` to read mixed types as a string column, `false` to disable the option
 * @return this for chaining
 */
json_reader_options_builder& mixed_types_as_string(bool val)
{
  // Route through the public setter so the option is written in exactly one place.
  options.enable_mixed_types_as_string(val);
  return *this;
}

/**
* @brief Set whether to parse dates as DD/MM versus MM/DD.
*
Expand Down
112 changes: 98 additions & 14 deletions cpp/src/io/json/json_column.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2022-2023, NVIDIA CORPORATION.
* Copyright (c) 2022-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -277,6 +277,16 @@ reduce_to_column_tree(tree_meta_t& tree,
return is_non_list_parent(parent_col_id);
});

// For Struct and List columns, set the column range end to range begin + 1
// (to avoid copying entire strings when mixed types as string is enabled)
thrust::transform_if(
rmm::exec_policy(stream),
col_range_begin.begin(),
col_range_begin.end(),
column_categories.begin(),
col_range_end.begin(),
[] __device__(auto i) { return i + 1; },
[] __device__(NodeT type) { return type == NC_STRUCT || type == NC_LIST; });

return std::tuple{tree_meta_t{std::move(column_categories),
std::move(parent_col_ids),
std::move(column_levels),
Expand Down Expand Up @@ -418,6 +428,7 @@ void make_device_json_column(device_span<SymbolT const> input,
device_json_column& root,
bool is_array_of_arrays,
bool is_enabled_lines,
bool is_mixed_type_as_string_enabled,
karthikeyann marked this conversation as resolved.
Show resolved Hide resolved
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
Expand Down Expand Up @@ -516,6 +527,19 @@ void make_device_json_column(device_span<SymbolT const> input,
col.type = to_json_col_type(column_categories[i]);
};

// Turns `col` into an all-null string column with `max_row_offsets[i] + 1` rows and
// discards any nested children it previously held.
auto reinitialize_as_string = [&](auto i, auto& col) {
  auto const row_count = max_row_offsets[i] + 1;

  // Size the per-row offset/length buffers, then pass them through init_to_zero.
  col.string_offsets.resize(row_count, stream);
  col.string_lengths.resize(row_count, stream);
  init_to_zero(col.string_offsets);
  init_to_zero(col.string_lengths);

  col.num_rows = row_count;
  // Every row starts out null.
  col.validity =
    cudf::detail::create_null_mask(col.num_rows, cudf::mask_state::ALL_NULL, stream, mr);
  col.type = json_col_t::StringColumn;

  // A string column has no nested children; drop them and their ordering.
  // NOTE(review): references to these children held elsewhere should be removed as well.
  col.child_columns.clear();
  col.column_order.clear();
};

// 2. generate nested columns tree and its device_memory
// reorder unique_col_ids w.r.t. column_range_begin for order of column to be in field order.
auto h_range_col_id_it =
Expand All @@ -530,6 +554,7 @@ void make_device_json_column(device_span<SymbolT const> input,
std::map<std::pair<NodeIndexT, std::string>, NodeIndexT> mapped_columns;
// find column_ids which are values, but should be ignored in validity
std::vector<uint8_t> ignore_vals(num_columns, 0);
std::vector<uint8_t> is_mixed_string_column(num_columns, 0);
karthikeyann marked this conversation as resolved.
Show resolved Hide resolved
columns.try_emplace(parent_node_sentinel, std::ref(root));

for (auto const this_col_id : unique_col_ids) {
Expand All @@ -552,6 +577,13 @@ void make_device_json_column(device_span<SymbolT const> input,
} else {
CUDF_FAIL("Unexpected parent column category");
}

if (parent_col_id != parent_node_sentinel && is_mixed_string_column[parent_col_id] == 1) {
// if parent is mixed string column, ignore this column.
karthikeyann marked this conversation as resolved.
Show resolved Hide resolved
is_mixed_string_column[this_col_id] = 1;
ignore_vals[this_col_id] = 1;
continue;
}
// If the child is already found,
// replace if this column is a nested column and the existing was a value column
// ignore this column if this column is a value column and the existing was a nested column
Expand All @@ -560,6 +592,25 @@ void make_device_json_column(device_span<SymbolT const> input,
auto& parent_col = it->second.get();
bool replaced = false;
if (mapped_columns.count({parent_col_id, name}) > 0) {
// If mixed types as string is enabled, treat both the existing and the new column as one
// string column; all of its child columns are ignored during parsing.
karthikeyann marked this conversation as resolved.
Show resolved Hide resolved
if (is_mixed_type_as_string_enabled) {
// VAL/STR or STRUCT or LIST
auto old_col_id = mapped_columns[{parent_col_id, name}];

is_mixed_string_column[this_col_id] = 1;
is_mixed_string_column[old_col_id] = 1;
// If the existing column's type (not its category) is not already a string column,
// reinitialize it as a string column.
karthikeyann marked this conversation as resolved.
Show resolved Hide resolved
auto& col = columns.at(old_col_id).get();
if (col.type != json_col_t::StringColumn) {
// TODO: should this use old_col_id or this_col_id? The choice affects max_row_offsets;
// needs more tests.
karthikeyann marked this conversation as resolved.
Show resolved Hide resolved
reinitialize_as_string(old_col_id, col);
// all its children (which are already inserted) are ignored later.
}
columns.try_emplace(this_col_id, columns.at(old_col_id));
continue;
}

if (column_categories[this_col_id] == NC_VAL || column_categories[this_col_id] == NC_STR) {
ignore_vals[this_col_id] = 1;
continue;
Expand Down Expand Up @@ -592,6 +643,28 @@ void make_device_json_column(device_span<SymbolT const> input,
columns.try_emplace(this_col_id, std::ref(parent_col.child_columns.at(name)));
mapped_columns.try_emplace(std::make_pair(parent_col_id, name), this_col_id);
}

if (is_mixed_type_as_string_enabled) {
  // Post-pass over all columns:
  //  - a column whose parent is a mixed-type string column is itself flagged as mixed, has its
  //    values ignored, and is removed from `columns`;
  //  - a flagged column whose parent is NOT flagged is the mixed-type column itself: its category
  //    becomes NC_STR so it is handled as a string (so to copy), unlike its children.
  for (auto const this_col_id : unique_col_ids) {
    auto const parent_col_id = column_parent_ids[this_col_id];
    // Columns without a parent are neither erased nor re-categorized here.
    if (parent_col_id == parent_node_sentinel) { continue; }

    if (is_mixed_string_column[parent_col_id] == 1) {
      is_mixed_string_column[this_col_id] = 1;
      ignore_vals[this_col_id]            = 1;
      columns.erase(this_col_id);
    }
    // Checked separately (not as an else) to mirror the original evaluation order.
    if (is_mixed_string_column[parent_col_id] == 0 and is_mixed_string_column[this_col_id] == 1) {
      column_categories[this_col_id] = NC_STR;
    }
  }

  // Write the updated categories back into the column tree's category buffer.
  // The CUDA return code is checked so that a failed copy is reported instead of being dropped.
  CUDF_CUDA_TRY(cudaMemcpyAsync(d_column_tree.node_categories.begin(),
                                column_categories.data(),
                                column_categories.size() * sizeof(column_categories[0]),
                                cudaMemcpyDefault,
                                stream.value()));
}

// restore unique_col_ids order
std::sort(h_range_col_id_it, h_range_col_id_it + num_columns, [](auto const& a, auto const& b) {
return thrust::get<1>(a) < thrust::get<1>(b);
Expand All @@ -617,14 +690,16 @@ void make_device_json_column(device_span<SymbolT const> input,
rmm::exec_policy(stream),
thrust::counting_iterator<size_type>(0),
num_nodes,
[node_categories = tree.node_categories.begin(),
col_ids = col_ids.begin(),
row_offsets = row_offsets.begin(),
range_begin = tree.node_range_begin.begin(),
range_end = tree.node_range_end.begin(),
d_ignore_vals = d_ignore_vals.begin(),
d_columns_data = d_columns_data.begin()] __device__(size_type i) {
switch (node_categories[i]) {
[column_categories = d_column_tree.node_categories.begin(),
col_ids = col_ids.begin(),
row_offsets = row_offsets.begin(),
range_begin = tree.node_range_begin.begin(),
range_end = tree.node_range_end.begin(),
d_ignore_vals = d_ignore_vals.begin(),
d_columns_data = d_columns_data.begin()] __device__(size_type i) {
if (d_ignore_vals[col_ids[i]]) return;
auto const node_category = column_categories[col_ids[i]];
switch (node_category) {
case NC_STRUCT: set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); break;
case NC_LIST: set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); break;
case NC_STR: [[fallthrough]];
Expand Down Expand Up @@ -662,10 +737,15 @@ void make_device_json_column(device_span<SymbolT const> input,
num_nodes,
thrust::make_counting_iterator<size_type>(0),
thrust::make_zip_iterator(node_ids.begin(), parent_col_ids.begin()),
[node_categories = tree.node_categories.begin(),
parent_node_ids = tree.parent_node_ids.begin()] __device__(size_type node_id) {
[d_ignore_vals = d_ignore_vals.begin(),
parent_node_ids = tree.parent_node_ids.begin(),
column_categories = d_column_tree.node_categories.begin(),
col_ids = col_ids.begin()] __device__(size_type node_id) {
auto parent_node_id = parent_node_ids[node_id];
return parent_node_id != parent_node_sentinel and node_categories[parent_node_id] == NC_LIST;
return parent_node_id != parent_node_sentinel and
column_categories[col_ids[parent_node_id]] == NC_LIST and
(!d_ignore_vals[col_ids[parent_node_id]]);
// node_categories[parent_node_id] == NC_LIST;
karthikeyann marked this conversation as resolved.
Show resolved Hide resolved
});

auto const num_list_children =
Expand Down Expand Up @@ -896,8 +976,11 @@ table_with_metadata device_parse_nested_json(device_span<SymbolT const> d_input,
const auto [tokens_gpu, token_indices_gpu] =
get_token_stream(d_input, options, stream, rmm::mr::get_current_device_resource());
// gpu tree generation
return get_tree_representation(
tokens_gpu, token_indices_gpu, stream, rmm::mr::get_current_device_resource());
return get_tree_representation(tokens_gpu,
token_indices_gpu,
options.is_enabled_mixed_types_as_string(),
stream,
rmm::mr::get_current_device_resource());
}(); // IILE used to free memory of token data.
#ifdef NJP_DEBUG_PRINT
auto h_input = cudf::detail::make_host_vector_async(d_input, stream);
Expand Down Expand Up @@ -941,6 +1024,7 @@ table_with_metadata device_parse_nested_json(device_span<SymbolT const> d_input,
root_column,
is_array_of_arrays,
options.is_enabled_lines(),
options.is_enabled_mixed_types_as_string(),
stream,
mr);

Expand Down
Loading
Loading