Skip to content

Commit

Permalink
Adds checks to make sure json reader won't overflow (#13115)
Browse files Browse the repository at this point in the history
The JSON reader currently uses 32-bit offsets to index into the input's characters, both to lower the memory footprint and for performance reasons. Consequently, if an input larger than `UINT_MAX` bytes is read, the parser may return incorrect data.

This PR adds a check that fails for inputs large enough to overflow those offsets.

The longer term plan is to make the finite-state transducer stage reentrant and split up inputs larger than `UINT_MAX` into smaller chunks.

Authors:
  - Elias Stehle (https://github.com/elstehle)
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - Vukasin Milovanovic (https://github.com/vuule)
  - Karthikeyan (https://github.com/karthikeyann)

URL: #13115
  • Loading branch information
elstehle authored Apr 13, 2023
1 parent cae6132 commit 3069f1e
Showing 1 changed file with 21 additions and 2 deletions.
23 changes: 21 additions & 2 deletions cpp/src/io/json/nested_json_gpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
#include <thrust/iterator/transform_iterator.h>
#include <thrust/transform.h>

#include <limits>
#include <stack>

// Debug print flag
Expand Down Expand Up @@ -69,6 +70,20 @@ struct tree_node {

std::size_t num_children = 0;
};

/**
 * @brief Guards against JSON inputs whose character offsets cannot be represented by
 * `SymbolOffsetT`, which would otherwise lead to corrupted data.
 *
 * An empty input is always accepted. For a non-empty input, the last offset
 * (`input_size - 1`) must not exceed the maximum value of `SymbolOffsetT`; otherwise the
 * `CUDF_EXPECTS` check fails with the message "Given JSON input is too large".
 *
 * @param input_size The size of the JSON input in bytes
 */
void check_input_size(std::size_t input_size)
{
  // Largest offset that can be stored in a SymbolOffsetT
  auto constexpr max_symbol_offset = std::numeric_limits<cudf::io::json::SymbolOffsetT>::max();

  // Evaluated first so that `input_size - 1` below is never computed on a zero size
  bool const is_empty_input = (input_size == 0);

  // Transduce() writes symbol offsets that may be as large as input_size-1
  CUDF_EXPECTS(is_empty_input || (input_size - 1) <= max_symbol_offset,
               "Given JSON input is too large");
}
} // namespace

namespace cudf::io::json {
Expand Down Expand Up @@ -1017,6 +1032,8 @@ void get_stack_context(device_span<SymbolT const> json_in,
SymbolT* d_top_of_stack,
rmm::cuda_stream_view stream)
{
check_input_size(json_in.size());

// Range of encapsulating function that comprises:
// -> DFA simulation for filtering out brackets and braces inside of quotes
// -> Logical stack to infer the stack context
Expand Down Expand Up @@ -1076,20 +1093,22 @@ std::pair<rmm::device_uvector<PdaTokenT>, rmm::device_uvector<SymbolOffsetT>> ge
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
check_input_size(json_in.size());

// Range of encapsulating function that parses to internal columnar data representation
CUDF_FUNC_RANGE();

auto const new_line_delimited_json = options.is_enabled_lines();

// Prepare for PDA transducer pass, merging input symbols with stack symbols
rmm::device_uvector<PdaSymbolGroupIdT> pda_sgids = [json_in, stream]() {
rmm::device_uvector<PdaSymbolGroupIdT> pda_sgids{json_in.size(), stream};
// Memory holding the top-of-stack stack context for the input
rmm::device_uvector<StackSymbolT> stack_op_indices{json_in.size(), stream};

// Identify what is the stack context for each input character (JSON-root, struct, or list)
get_stack_context(json_in, stack_op_indices.data(), stream);

rmm::device_uvector<PdaSymbolGroupIdT> pda_sgids{json_in.size(), stream};
auto zip_in = thrust::make_zip_iterator(json_in.data(), stack_op_indices.data());
thrust::transform(rmm::exec_policy(stream),
zip_in,
Expand Down Expand Up @@ -1125,7 +1144,7 @@ std::pair<rmm::device_uvector<PdaTokenT>, rmm::device_uvector<SymbolOffsetT>> ge
std::size_t constexpr max_tokens_per_struct = 6;
auto const max_token_out_count =
cudf::util::div_rounding_up_safe(json_in.size(), min_chars_per_struct) * max_tokens_per_struct;
rmm::device_scalar<SymbolOffsetT> num_written_tokens{stream};
rmm::device_scalar<std::size_t> num_written_tokens{stream};
rmm::device_uvector<PdaTokenT> tokens{max_token_out_count, stream, mr};
rmm::device_uvector<SymbolOffsetT> tokens_indices{max_token_out_count, stream, mr};

Expand Down

0 comments on commit 3069f1e

Please sign in to comment.