Adds checks to make sure json reader won't overflow (#13115)

The JSON reader is currently using 32-bit offsets to index into the input's characters to lower memory footprint and for performance reasons. Hence, currently, if an input larger than `UINT_MAX` is read, the parser may return incorrect data. This PR adds a check that fails for inputs that could overflow. The longer term plan is to make the finite-state transducer stage reentrant and split up inputs larger than `UINT_MAX` into smaller chunks. Authors: - Elias Stehle (https://github.com/elstehle) - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Karthikeyan (https://github.com/karthikeyann) URL: #13115
rapidsai · Apr 13, 2023 · 3069f1e · 3069f1e
1 parent cae6132
commit 3069f1e
Showing 1 changed file with 21 additions and 2 deletions.
diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu
@@ -42,6 +42,7 @@
 #include <thrust/iterator/transform_iterator.h>
 #include <thrust/transform.h>
 
+#include <limits>
 #include <stack>
 
 // Debug print flag
@@ -69,6 +70,20 @@ struct tree_node {
 
   std::size_t num_children = 0;
 };
+
+/**
+ * @brief Verifies that the JSON input can be handled without corrupted data due to offset
+ * overflows. An input of size zero is always accepted.
+ *
+ * @param input_size The JSON input's size in bytes
+ */
+void check_input_size(std::size_t input_size)
+{
+  // Transduce() writes symbol offsets that may be as large as input_size-1
+  CUDF_EXPECTS(input_size == 0 ||
+                 (input_size - 1) <= std::numeric_limits<cudf::io::json::SymbolOffsetT>::max(),
+               "Given JSON input is too large");
+}
 }  // namespace
 
 namespace cudf::io::json {
@@ -1017,6 +1032,8 @@ void get_stack_context(device_span<SymbolT const> json_in,
                        SymbolT* d_top_of_stack,
                        rmm::cuda_stream_view stream)
 {
+  check_input_size(json_in.size());
+
   // Range of encapsulating function that comprises:
   // -> DFA simulation for filtering out brackets and braces inside of quotes
   // -> Logical stack to infer the stack context
@@ -1076,20 +1093,22 @@ std::pair<rmm::device_uvector<PdaTokenT>, rmm::device_uvector<SymbolOffsetT>> ge
   rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource* mr)
 {
+  check_input_size(json_in.size());
+
   // Range of encapsulating function that parses to internal columnar data representation
   CUDF_FUNC_RANGE();
 
   auto const new_line_delimited_json = options.is_enabled_lines();
 
   // Prepare for PDA transducer pass, merging input symbols with stack symbols
   rmm::device_uvector<PdaSymbolGroupIdT> pda_sgids = [json_in, stream]() {
-    rmm::device_uvector<PdaSymbolGroupIdT> pda_sgids{json_in.size(), stream};
     // Memory holding the top-of-stack stack context for the input
     rmm::device_uvector<StackSymbolT> stack_op_indices{json_in.size(), stream};
 
     // Identify what is the stack context for each input character (JSON-root, struct, or list)
     get_stack_context(json_in, stack_op_indices.data(), stream);
 
+    rmm::device_uvector<PdaSymbolGroupIdT> pda_sgids{json_in.size(), stream};
     auto zip_in = thrust::make_zip_iterator(json_in.data(), stack_op_indices.data());
     thrust::transform(rmm::exec_policy(stream),
                       zip_in,
@@ -1125,7 +1144,7 @@ std::pair<rmm::device_uvector<PdaTokenT>, rmm::device_uvector<SymbolOffsetT>> ge
   std::size_t constexpr max_tokens_per_struct = 6;
   auto const max_token_out_count =
     cudf::util::div_rounding_up_safe(json_in.size(), min_chars_per_struct) * max_tokens_per_struct;
-  rmm::device_scalar<SymbolOffsetT> num_written_tokens{stream};
+  rmm::device_scalar<std::size_t> num_written_tokens{stream};
   rmm::device_uvector<PdaTokenT> tokens{max_token_out_count, stream, mr};
   rmm::device_uvector<SymbolOffsetT> tokens_indices{max_token_out_count, stream, mr};