Performance improvement in JSON Tree traversal (#11919)

This PR improves the performance of JSON tree traversal, mainly in the creation of column ids:
- Replaced per-level processing with a two-level hash algorithm.
- Reduced the memory usage of the hash map (reduced oversubscription).

Other changes:
- Tree generation now fails if the token stream contains an error token.
- Added a `device_span` overload of `device_parse_nested_json`.

Reaches 2 GB/s on a GV100 for a 128 MB JSON input.

Authors: - Karthikeyan (https://github.com/karthikeyann) Approvers: - Tobias Ribizel (https://github.com/upsj) - Nghia Truong (https://github.com/ttnghia) URL: #11919
rapidsai · Oct 28, 2022 · aaf251d · aaf251d
1 parent c915523
commit aaf251d
Show file tree

Hide file tree

Showing 4 changed files with 296 additions and 315 deletions.
diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu
@@ -722,24 +722,22 @@ std::pair<std::unique_ptr<column>, std::vector<column_name_info>> device_json_co
   }
 }
 
-table_with_metadata device_parse_nested_json(host_span<SymbolT const> input,
+table_with_metadata device_parse_nested_json(device_span<SymbolT const> d_input,
                                              cudf::io::json_reader_options const& options,
                                              rmm::cuda_stream_view stream,
                                              rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
 
-  // Allocate device memory for the JSON input & copy over to device
-  rmm::device_uvector<SymbolT> d_input = cudf::detail::make_device_uvector_async(input, stream);
-
   auto gpu_tree = [&]() {
     // Parse the JSON and get the token stream
     const auto [tokens_gpu, token_indices_gpu] = get_token_stream(d_input, options, stream);
     // gpu tree generation
     return get_tree_representation(tokens_gpu, token_indices_gpu, stream);
   }();  // IILE used to free memory of token data.
 #ifdef NJP_DEBUG_PRINT
-  print_tree(input, gpu_tree, stream);
+  auto h_input = cudf::detail::make_host_vector_async(d_input, stream);
+  print_tree(h_input, gpu_tree, stream);
 #endif
 
   auto [gpu_col_id, gpu_row_offsets] = records_orient_tree_traversal(d_input, gpu_tree, stream);
@@ -841,5 +839,17 @@ table_with_metadata device_parse_nested_json(host_span<SymbolT const> input,
                              {{}, out_column_names}};
 }
 
+table_with_metadata device_parse_nested_json(host_span<SymbolT const> input,
+                                             cudf::io::json_reader_options const& options,
+                                             rmm::cuda_stream_view stream,
+                                             rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+
+  // Allocate device memory for the JSON input & copy over to device
+  rmm::device_uvector<SymbolT> d_input = cudf::detail::make_device_uvector_async(input, stream);
+
+  return device_parse_nested_json(device_span<SymbolT const>{d_input}, options, stream, mr);
+}
 }  // namespace detail
 }  // namespace cudf::io::json