
Nested JSON depth benchmark (#12371)
Adds a benchmark with a depth axis for nested JSON.
It generates JSON with list-of-lists and struct-of-structs nesting for the given depth, with element types int, float, string, bool, and the null literal.
The JSON string is generated on the host, so benchmark data generation will be slow for higher depth values.
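
For illustration, a minimal standalone sketch (not part of this change) of the shapes the generators produce for a small depth; the returned strings were traced from generate_struct_of_structs and generate_list_of_lists in the diff below, while the variable names and the main() wrapper exist only for this sketch:

#include <iostream>
#include <string>

int main()
{
  // What generate_struct_of_structs(4, 2, "1") in this change returns:
  std::string const struct_row = R"({"a0": {"a1": 1, "a1_0": 1}})";
  // What generate_list_of_lists(4, 2, "1") in this change returns:
  std::string const list_row = R"([[[[1, 1]]]])";
  std::cout << struct_row << '\n' << list_row << '\n';
  return 0;
}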

Depends on #12314

Authors:
  - Karthikeyan (https://github.com/karthikeyann)
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Vukasin Milovanovic (https://github.com/vuule)
  - Nghia Truong (https://github.com/ttnghia)

URL: #12371
karthikeyann authored Jan 11, 2023
1 parent 8d8d0ee commit 6a59b7e
Showing 1 changed file with 134 additions and 6 deletions.
140 changes: 134 additions & 6 deletions cpp/benchmarks/io/json/nested_json.cpp
@@ -28,11 +28,106 @@
#include <cudf/strings/repeat_strings.hpp>
#include <cudf/types.hpp>

#include <cstdlib>
#include <string>
#include <vector>

namespace cudf {
namespace {
auto make_test_json_data(size_type string_size, rmm::cuda_stream_view stream)

// pre-generate all the number strings
std::vector<std::string> _num_to_string;
std::string num_to_string(int32_t num) { return _num_to_string.at(num); }

// List of List nested.
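// e.g. generate_list_of_lists(3, 2, "1") returns "[[[1, 1]]]".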
std::string generate_list_of_lists(int32_t max_depth, int32_t max_rows, std::string elem)
{
std::string json = "[";
if (max_depth > 1) json += std::string(max_depth - 1, '[');
for (int32_t row = 0; row < max_rows; ++row) {
json += elem;
if (row < max_rows - 1) { json += ", "; }
}
if (max_depth > 1) json += std::string(max_depth - 1, ']');
json += "]";
return json;
}

// Struct of Struct nested.
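// e.g. generate_struct_of_structs(4, 2, "1") returns {"a0": {"a1": 1, "a1_0": 1}}.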
std::string generate_struct_of_structs(int32_t max_depth, int32_t max_rows, std::string elem)
{
if (max_depth <= 0) return "{}";
std::string json;
for (int32_t depth = 0; depth < max_depth / 2; ++depth) {
json += R"({"a)" + num_to_string(depth) + R"(": )";
}
if (max_rows == 0) json += "{}";

for (int32_t row = 0; row < max_rows; ++row) {
json += elem;
if (row < max_rows - 1) {
json += R"(, "a)" + num_to_string(max_depth / 2 - 1) + "_" + num_to_string(row) + R"(": )";
}
}
if (max_depth > 0) json += std::string(max_depth / 2, '}');
return json;
}

// Memoize the generated rows so we don't have to regenerate them.
std::map<std::tuple<int, int, int, int>, std::string> _row_cache;

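// Builds one row object: even-indexed columns hold struct-of-structs, odd-indexed columns hold
// list-of-lists, with leaf values cycling through a fixed set of literals. Generation stops early
// once the row exceeds max_bytes.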
std::string generate_row(
int num_columns, int max_depth, int max_list_size, int max_struct_size, size_t max_bytes)
{
std::string s = "{";
const std::vector<std::string> elems{
R"(1)", R"(-2)", R"(3.4)", R"("5")", R"("abcdefghij")", R"(true)", R"(null)"};
for (int i = 0; i < num_columns; i++) {
s += R"("col)" + num_to_string(i) + R"(": )";
if (auto it = _row_cache.find({i % 2, max_depth - 2, max_struct_size, i % elems.size()});
it != _row_cache.end()) {
s += it->second;
} else {
auto r =
(i % 2 == 0)
? generate_struct_of_structs(max_depth - 2, max_struct_size, elems[i % elems.size()])
: generate_list_of_lists(max_depth - 2, max_struct_size, elems[i % elems.size()]);
_row_cache[{i % 2, max_depth - 2, max_struct_size, i % elems.size()}] = r;
s += r;
}
if (s.length() > max_bytes) break;
if (i < num_columns - 1) s += ", ";
}
s += "}";
return s;
}

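// Builds a JSON array of up to num_rows row objects, stopping early once the output exceeds
// max_json_bytes; each row is generated with max_depth - 2, leaving two levels for the enclosing
// array and row object.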
std::string generate_json(int num_rows,
int num_columns,
int max_depth,
int max_list_size,
int max_struct_size,
size_t max_json_bytes)
{
// std::to_string is slow, so we pre-generate all number strings we need.
_num_to_string.clear();
auto max_num_str =
std::max(std::max(num_columns, max_depth), std::max(max_list_size, max_struct_size));
for (int i = 0; i < max_num_str; i++)
_num_to_string.emplace_back(std::to_string(i));
_row_cache.clear();

std::string s = "[\n";
s.reserve(max_json_bytes + 1024);
for (int i = 0; i < num_rows; i++) {
s += generate_row(
num_columns, max_depth - 2, max_list_size, max_struct_size, max_json_bytes - s.length());
if (s.length() > max_json_bytes) break;
if (i != num_rows - 1) s += ",\n";
}
s += "\n]";
return s;
}

auto make_test_json_data(cudf::size_type string_size, rmm::cuda_stream_view stream)
{
// Test input
std::string input = R"(
@@ -46,7 +141,7 @@ auto make_test_json_data(size_type string_size, rmm::cuda_stream_view stream)
{"a":1,"b":Infinity,"c":[null], "d": {"year":-600,"author": "Kaniyan"}},
{"a": 1, "b": 8.0, "d": { "author": "Jean-Jacques Rousseau"}},)";

const size_type repeat_times = string_size / input.size();
const cudf::size_type repeat_times = string_size / input.size();

auto d_input_scalar = cudf::make_string_scalar(input, stream);
auto& d_string_scalar = static_cast<cudf::string_scalar&>(*d_input_scalar);
@@ -65,7 +160,7 @@ void BM_NESTED_JSON(nvbench::state& state)
// TODO: to be replaced by nvbench fixture once it's ready
cudf::rmm_pool_raii rmm_pool;

auto const string_size{size_type(state.get_int64("string_size"))};
auto const string_size{cudf::size_type(state.get_int64("string_size"))};
auto const default_options = cudf::io::json_reader_options{};

auto input = make_test_json_data(string_size, cudf::get_default_stream());
@@ -92,4 +187,37 @@ NVBENCH_BENCH(BM_NESTED_JSON)
.set_name("nested_json_gpu_parser")
.add_int64_power_of_two_axis("string_size", nvbench::range(20, 30, 1));

} // namespace cudf
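// Benchmarks device_parse_nested_json on generated nested input; nesting depth comes from the
// "depth" axis, and row generation stops once the output reaches string_size bytes.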
void BM_NESTED_JSON_DEPTH(nvbench::state& state)
{
// TODO: to be replaced by nvbench fixture once it's ready
cudf::rmm_pool_raii rmm_pool;

auto const string_size{cudf::size_type(state.get_int64("string_size"))};
auto const depth{cudf::size_type(state.get_int64("depth"))};

auto d_scalar = cudf::string_scalar(
generate_json(100'000'000, 10, depth, 10, 10, string_size), true, cudf::get_default_stream());
auto input = cudf::device_span<const char>(d_scalar.data(), d_scalar.size());

state.add_element_count(input.size());
auto const default_options = cudf::io::json_reader_options{};

// Run algorithm
auto const mem_stats_logger = cudf::memory_stats_logger();
state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
// Allocate device-side temporary storage & run algorithm
cudf::io::json::detail::device_parse_nested_json(
input, default_options, cudf::get_default_stream());
});

auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
state.add_element_count(static_cast<double>(string_size) / time, "bytes_per_second");
state.add_buffer_size(
mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage");
}

NVBENCH_BENCH(BM_NESTED_JSON_DEPTH)
.set_name("nested_json_gpu_parser_depth")
.add_int64_power_of_two_axis("depth", nvbench::range(1, 4, 1))
.add_int64_power_of_two_axis("string_size", nvbench::range(20, 30, 2));
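// With these power-of-two axes, depth takes the values 2, 4, 8, and 16, and string_size ranges
// from 1 MiB to 1 GiB in 4x steps.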
