Skip to content

Commit

Permalink
Performance optimization of JSON validation (#16996)
Browse files Browse the repository at this point in the history
As part of JSON validation, field, value, and string tokens are validated. Right now the code does this inside a single transform_inclusive_scan. Because the transform functor is a heavy operation, it slows down the entire scan drastically.
This PR splits the transform and the scan into separate steps in validation. The runtime of validation went from 200 ms to 20 ms.

Also, a few hardcoded string comparisons are moved to a trie.

Authors:
  - Karthikeyan (https://github.com/karthikeyann)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Vukasin Milovanovic (https://github.com/vuule)
  - Robert (Bobby) Evans (https://github.com/revans2)

URL: #16996
  • Loading branch information
karthikeyann authored Oct 8, 2024
1 parent cc23474 commit 553d8ec
Showing 1 changed file with 50 additions and 38 deletions.
88 changes: 50 additions & 38 deletions cpp/src/io/json/process_tokens.cu
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/detail/utilities/vector_factories.hpp>
#include <cudf/io/detail/tokenize_json.hpp>
#include <cudf/utilities/memory_resource.hpp>

#include <rmm/device_uvector.hpp>
#include <rmm/exec_policy.hpp>
Expand Down Expand Up @@ -87,38 +88,41 @@ void validate_token_stream(device_span<char const> d_input,
{
CUDF_FUNC_RANGE();
if (!options.is_strict_validation()) { return; }

rmm::device_uvector<bool> d_invalid = cudf::detail::make_zeroed_device_uvector_async<bool>(
tokens.size(), stream, cudf::get_current_device_resource_ref());

using token_t = cudf::io::json::token_t;
cudf::detail::optional_trie trie_na =
cudf::detail::create_serialized_trie(options.get_na_values(), stream);
auto trie_na_view = cudf::detail::make_trie_view(trie_na);
auto literals = options.get_na_values();
literals.emplace_back("null"); // added these too to single trie
literals.emplace_back("true");
literals.emplace_back("false");

cudf::detail::optional_trie trie_literals =
cudf::detail::create_serialized_trie(literals, stream);
cudf::detail::optional_trie trie_nonnumeric = cudf::detail::create_serialized_trie(
{"NaN", "Infinity", "+INF", "+Infinity", "-INF", "-Infinity"}, stream);

auto validate_values = cuda::proclaim_return_type<bool>(
[data = d_input.data(),
trie_na = trie_na_view,
trie_literals = cudf::detail::make_trie_view(trie_literals),
trie_nonnumeric = cudf::detail::make_trie_view(trie_nonnumeric),
allow_numeric_leading_zeros = options.is_allowed_numeric_leading_zeros(),
allow_nonnumeric =
options.is_allowed_nonnumeric_numbers()] __device__(SymbolOffsetT start,
SymbolOffsetT end) -> bool {
// This validates an unquoted value. A value must match https://www.json.org/json-en.html
// but the leading and trailing whitespace should already have been removed, and is not
// a string
auto c = data[start];
auto is_null_literal = serialized_trie_contains(trie_na, {data + start, end - start});
if (is_null_literal) {
return true;
} else if ('n' == c) {
return substr_eq(data, start, end, 4, "null");
} else if ('t' == c) {
return substr_eq(data, start, end, 4, "true");
} else if ('f' == c) {
return substr_eq(data, start, end, 5, "false");
} else if (allow_nonnumeric && c == 'N') {
return substr_eq(data, start, end, 3, "NaN");
} else if (allow_nonnumeric && c == 'I') {
return substr_eq(data, start, end, 8, "Infinity");
} else if (allow_nonnumeric && c == '+') {
return substr_eq(data, start, end, 4, "+INF") ||
substr_eq(data, start, end, 9, "+Infinity");
} else if ('-' == c || c <= '9' && 'c' >= '0') {
auto const is_literal = serialized_trie_contains(trie_literals, {data + start, end - start});
if (is_literal) { return true; }
if (allow_nonnumeric) {
auto const is_nonnumeric =
serialized_trie_contains(trie_nonnumeric, {data + start, end - start});
if (is_nonnumeric) { return true; }
}
auto c = data[start];
if ('-' == c || c <= '9' && 'c' >= '0') {
// number
auto num_state = number_state::START;
for (auto at = start; at < end; at++) {
Expand All @@ -140,9 +144,6 @@ void validate_token_stream(device_span<char const> d_input,
num_state = number_state::LEADING_ZERO;
} else if (c >= '1' && c <= '9') {
num_state = number_state::WHOLE;
} else if (allow_nonnumeric && 'I' == c) {
return substr_eq(data, start, end, 4, "-INF") ||
substr_eq(data, start, end, 9, "-Infinity");
} else {
return false;
}
Expand Down Expand Up @@ -273,33 +274,44 @@ void validate_token_stream(device_span<char const> d_input,

auto num_tokens = tokens.size();
auto count_it = thrust::make_counting_iterator(0);
auto predicate = [tokens = tokens.begin(),
token_indices = token_indices.begin(),
validate_values,
validate_strings] __device__(auto i) -> bool {
auto predicate = cuda::proclaim_return_type<bool>([tokens = tokens.begin(),
token_indices = token_indices.begin(),
validate_values,
validate_strings] __device__(auto i) -> bool {
if (tokens[i] == token_t::ValueEnd) {
return !validate_values(token_indices[i - 1], token_indices[i]);
} else if (tokens[i] == token_t::FieldNameEnd || tokens[i] == token_t::StringEnd) {
return !validate_strings(token_indices[i - 1], token_indices[i]);
}
return false;
};
});

auto conditional_invalidout_it =
cudf::detail::make_tabulate_output_iterator(cuda::proclaim_return_type<void>(
[d_invalid = d_invalid.begin()] __device__(size_type i, bool x) -> void {
if (x) { d_invalid[i] = true; }
}));
thrust::transform(rmm::exec_policy_nosync(stream),
count_it,
count_it + num_tokens,
conditional_invalidout_it,
predicate);

using scan_type = write_if::scan_type;
auto conditional_write = write_if{tokens.begin(), num_tokens};
auto conditional_output_it = cudf::detail::make_tabulate_output_iterator(conditional_write);
auto transform_op = cuda::proclaim_return_type<scan_type>(
[predicate, tokens = tokens.begin()] __device__(auto i) -> scan_type {
if (predicate(i)) return {token_t::ErrorBegin, tokens[i] == token_t::LineEnd};
return {static_cast<token_t>(tokens[i]), tokens[i] == token_t::LineEnd};
});
auto binary_op = cuda::proclaim_return_type<scan_type>(
auto binary_op = cuda::proclaim_return_type<scan_type>(
[] __device__(scan_type prev, scan_type curr) -> scan_type {
auto op_result = (prev.first == token_t::ErrorBegin ? prev.first : curr.first);
return scan_type((curr.second ? curr.first : op_result), prev.second | curr.second);
return {(curr.second ? curr.first : op_result), prev.second | curr.second};
});
auto transform_op = cuda::proclaim_return_type<scan_type>(
[d_invalid = d_invalid.begin(), tokens = tokens.begin()] __device__(auto i) -> scan_type {
if (d_invalid[i]) return {token_t::ErrorBegin, tokens[i] == token_t::LineEnd};
return {static_cast<token_t>(tokens[i]), tokens[i] == token_t::LineEnd};
});

thrust::transform_inclusive_scan(rmm::exec_policy(stream),
thrust::transform_inclusive_scan(rmm::exec_policy_nosync(stream),
count_it,
count_it + num_tokens,
conditional_output_it,
Expand Down

0 comments on commit 553d8ec

Please sign in to comment.