From 204a09cb9de58485df2c931db9a56d19a6eda356 Mon Sep 17 00:00:00 2001
From: Elias Stehle <3958403+elstehle@users.noreply.github.com>
Date: Thu, 22 Sep 2022 21:47:36 +0200
Subject: [PATCH] Reduces memory requirements in JSON parser and adds bytes/s
 and peak memory usage to benchmarks (#11732)

This PR reduces memory requirements in the new nested JSON parser and adds
`bytes_per_second` and `peak_memory_usage` metrics to its benchmarks.

Authors:
  - Elias Stehle (https://github.com/elstehle)

Approvers:
  - Tobias Ribizel (https://github.com/upsj)
  - Karthikeyan (https://github.com/karthikeyann)
  - Yunsong Wang (https://github.com/PointKernel)

URL: https://github.com/rapidsai/cudf/pull/11732
---
 cpp/benchmarks/io/json/nested_json.cpp |  9 +++++-
 cpp/src/io/json/nested_json_gpu.cu     | 38 ++++++++++++++------------
 2 files changed, 28 insertions(+), 19 deletions(-)

diff --git a/cpp/benchmarks/io/json/nested_json.cpp b/cpp/benchmarks/io/json/nested_json.cpp
index 1e84d0ee7c5..e2d4c3b77d8 100644
--- a/cpp/benchmarks/io/json/nested_json.cpp
+++ b/cpp/benchmarks/io/json/nested_json.cpp
@@ -15,6 +15,7 @@
  */
 
 #include <benchmarks/fixture/rmm_pool_raii.hpp>
+#include <benchmarks/fixture/benchmark_fixture.hpp>
 #include <nvbench/nvbench.cuh>
 
 #include <io/json/nested_json.hpp>
@@ -71,15 +72,21 @@ void BM_NESTED_JSON(nvbench::state& state)
   state.add_element_count(input.size());
 
   // Run algorithm
+  auto const mem_stats_logger = cudf::memory_stats_logger();
   state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::default_stream_value.value()));
   state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
     // Allocate device-side temporary storage & run algorithm
     cudf::io::json::detail::parse_nested_json(input, default_options, cudf::default_stream_value);
   });
+
+  auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
+  state.add_element_count(static_cast<double>(string_size) / time, "bytes_per_second");
+  state.add_buffer_size(
+    mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage");
 }
 
 NVBENCH_BENCH(BM_NESTED_JSON)
   .set_name("nested_json_gpu_parser")
-  .add_int64_power_of_two_axis("string_size", nvbench::range(20, 31, 1));
+  .add_int64_power_of_two_axis("string_size", nvbench::range(20, 30, 1));
 
 }  // namespace cudf
diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu
index 7e567aae9fe..552cd1e6167 100644
--- a/cpp/src/io/json/nested_json_gpu.cu
+++ b/cpp/src/io/json/nested_json_gpu.cu
@@ -1079,26 +1079,25 @@ std::pair<rmm::device_uvector<PdaTokenT>, rmm::device_uvector<SymbolOffsetT>> ge
   // Range of encapsulating function that parses to internal columnar data representation
   CUDF_FUNC_RANGE();
 
-  rmm::device_uvector<PdaTokenT> tokens{json_in.size(), stream, mr};
-  rmm::device_uvector<SymbolOffsetT> tokens_indices{json_in.size(), stream, mr};
-  rmm::device_scalar<SymbolOffsetT> num_written_tokens{stream, mr};
-
   auto const new_line_delimited_json = options.is_enabled_lines();
 
-  // Memory holding the top-of-stack stack context for the input
-  rmm::device_uvector<StackSymbolT> stack_op_indices{json_in.size(), stream};
-
-  // Identify what is the stack context for each input character (is it: JSON-root, struct, or list)
-  get_stack_context(json_in, stack_op_indices.data(), stream);
-
   // Prepare for PDA transducer pass, merging input symbols with stack symbols
-  rmm::device_uvector<PdaSymbolGroupIdT> pda_sgids{json_in.size(), stream};
-  auto zip_in = thrust::make_zip_iterator(json_in.data(), stack_op_indices.data());
-  thrust::transform(rmm::exec_policy(stream),
-                    zip_in,
-                    zip_in + json_in.size(),
-                    pda_sgids.data(),
-                    tokenizer_pda::PdaSymbolToSymbolGroupId{});
+  rmm::device_uvector<PdaSymbolGroupIdT> pda_sgids = [json_in, stream]() {
+    rmm::device_uvector<PdaSymbolGroupIdT> pda_sgids{json_in.size(), stream};
+    // Memory holding the top-of-stack stack context for the input
+    rmm::device_uvector<StackSymbolT> stack_op_indices{json_in.size(), stream};
+
+    // Identify what is the stack context for each input character (JSON-root, struct, or list)
+    get_stack_context(json_in, stack_op_indices.data(), stream);
+
+    auto zip_in = thrust::make_zip_iterator(json_in.data(), stack_op_indices.data());
+    thrust::transform(rmm::exec_policy(stream),
+                      zip_in,
+                      zip_in + json_in.size(),
+                      pda_sgids.data(),
+                      tokenizer_pda::PdaSymbolToSymbolGroupId{});
+    return pda_sgids;
+  }();
 
   // PDA transducer alias
   using ToTokenStreamFstT =
@@ -1118,6 +1117,9 @@ std::pair<rmm::device_uvector<PdaTokenT>, rmm::device_uvector<SymbolOffsetT>> ge
                                        stream};
 
   // Perform a PDA-transducer pass
+  rmm::device_scalar<SymbolOffsetT> num_written_tokens{stream};
+  rmm::device_uvector<PdaTokenT> tokens{json_in.size(), stream, mr};
+  rmm::device_uvector<SymbolOffsetT> tokens_indices{json_in.size(), stream, mr};
   json_to_tokens_fst.Transduce(pda_sgids.begin(),
                                static_cast<SymbolOffsetT>(json_in.size()),
                                tokens.data(),
@@ -1126,7 +1128,7 @@ std::pair<rmm::device_uvector<PdaTokenT>, rmm::device_uvector<SymbolOffsetT>> ge
                                tokenizer_pda::start_state,
                                stream);
 
-  auto num_total_tokens = num_written_tokens.value(stream);
+  auto const num_total_tokens = num_written_tokens.value(stream);
   tokens.resize(num_total_tokens, stream);
   tokens_indices.resize(num_total_tokens, stream);
 
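
Note on the parser change: the reduction in peak device memory comes from a
scoping pattern. The scratch buffers needed only to compute `pda_sgids`
(notably `stack_op_indices`) are confined to an immediately-invoked lambda,
so they are freed before the `tokens`/`tokens_indices` output buffers are
allocated. Below is a minimal host-side sketch of that idiom, using
`std::vector` in place of `rmm::device_uvector`; all names in it are
illustrative stand-ins, not cudf API.

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    std::vector<int32_t> make_symbol_group_ids(std::vector<uint8_t> const& input)
    {
      // Immediately-invoked lambda: `scratch` lives only inside it, so peak
      // memory is max(scratch, later outputs) rather than their sum.
      std::vector<int32_t> sgids = [&input]() {
        std::vector<int32_t> out(input.size());
        std::vector<uint8_t> scratch(input.size());  // stand-in for stack_op_indices
        for (std::size_t i = 0; i < input.size(); ++i) {
          scratch[i] = input[i] % 3;                      // stand-in for get_stack_context
          out[i]     = static_cast<int32_t>(scratch[i]);  // stand-in for the zip/transform
        }
        return out;  // moved out of the lambda; the result is not copied
      }();
      // Any buffers allocated from here on no longer coexist with `scratch`.
      return sgids;
    }

On the benchmark side, throughput is derived from nvbench's measured mean GPU
time (the `nv/cold/time/gpu/mean` summary) rather than a manual timer, and peak
memory is reported by cudf's benchmark helper `memory_stats_logger`, which
tracks allocations made through the current device memory resource.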