Skip to content

Commit

Permalink
JSON reader validation of values (#15968)
Browse files Browse the repository at this point in the history
Addresses part of #15222
This change adds a validation stage to the JSON reader at the token level. If any validation fails in a row, the entire row is made null.

- [x] validation functor - implement spark validation rules. (@revans2 implemented all validation rules)
- [x] move output iterator to thrust. (already merged by NVIDIA/cccl#2282)
- [x] Fix failing tests and infer data type for Float.

Authors:
  - Karthikeyan (https://github.com/karthikeyann)
  - Robert (Bobby) Evans (https://github.com/revans2)
  - Nghia Truong (https://github.com/ttnghia)

Approvers:
  - Robert (Bobby) Evans (https://github.com/revans2)
  - Bradley Dice (https://github.com/bdice)
  - MithunR (https://github.com/mythrocks)
  - Nghia Truong (https://github.com/ttnghia)

URL: #15968
  • Loading branch information
karthikeyann authored Sep 11, 2024
1 parent 750adca commit 9acbaf8
Show file tree
Hide file tree
Showing 12 changed files with 1,113 additions and 28 deletions.
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -384,6 +384,7 @@ add_library(
src/io/json/nested_json_gpu.cu
src/io/json/read_json.cu
src/io/json/parser_features.cpp
src/io/json/process_tokens.cu
src/io/json/write_json.cu
src/io/orc/aggregate_orc_metadata.cpp
src/io/orc/dict_enc.cu
Expand Down
190 changes: 190 additions & 0 deletions cpp/include/cudf/io/json.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@

#include <cudf/table/table_view.hpp>
#include <cudf/types.hpp>
#include <cudf/utilities/error.hpp>
#include <cudf/utilities/memory_resource.hpp>

#include <map>
Expand Down Expand Up @@ -128,6 +129,19 @@ class json_reader_options {
// Whether to recover after an invalid JSON line
json_recovery_mode_t _recovery_mode = json_recovery_mode_t::FAIL;

// Validation checks for Spark.
// Whether JSON validation is strict or not (off by default).
// Note: strict validation enforces the JSON specification https://www.json.org/json-en.html
bool _strict_validation = false;
// Allow leading zeros in numeric values (allowed by default).
bool _allow_numeric_leading_zeros = true;
// Allow non-numeric numbers: NaN, +INF, -INF, +Infinity, Infinity, -Infinity (allowed by default).
bool _allow_nonnumeric_numbers = true;
// Allow unquoted control characters, i.e. characters >= 0 and < 32 appearing in a quoted string
// without some form of escaping (allowed by default).
bool _allow_unquoted_control_chars = true;
// Additional values to recognize as null values (empty by default).
std::vector<std::string> _na_values;

/**
* @brief Constructor from source info.
*
Expand Down Expand Up @@ -298,6 +312,55 @@ class json_reader_options {
*/
[[nodiscard]] json_recovery_mode_t recovery_mode() const { return _recovery_mode; }

/**
* @brief Whether JSON validation should be enforced strictly or not.
*
* @return true if strict validation is enabled, false otherwise
*/
[[nodiscard]] bool is_strict_validation() const { return _strict_validation; }

/**
* @brief Whether leading zeros are allowed in numeric values.
*
* @note This validation is enforced only if strict validation is enabled.
*
* @return true if leading zeros are allowed in numeric values, false otherwise
*/
[[nodiscard]] bool is_allowed_numeric_leading_zeros() const
{
return _allow_numeric_leading_zeros;
}

/**
* @brief Whether the unquoted number values NaN, +INF, -INF, +Infinity, Infinity, and
* -Infinity should be allowed.
*
* @note This validation is enforced only if strict validation is enabled.
*
* @return true if unquoted non-numeric number values are allowed, false otherwise
*/
[[nodiscard]] bool is_allowed_nonnumeric_numbers() const { return _allow_nonnumeric_numbers; }

/**
* @brief Whether characters greater than or equal to 0 and less than 32 are allowed in a
* quoted string without some form of escaping.
*
* @note This validation is enforced only if strict validation is enabled.
*
* @return true if unquoted control chars are allowed, false otherwise
*/
[[nodiscard]] bool is_allowed_unquoted_control_chars() const
{
return _allow_unquoted_control_chars;
}

/**
* @brief Returns additional values to recognize as null values.
*
* @return Const reference to the vector of additional values to recognize as null values
*/
[[nodiscard]] std::vector<std::string> const& get_na_values() const { return _na_values; }

/**
* @brief Set data types for columns to be read.
*
Expand Down Expand Up @@ -427,6 +490,63 @@ class json_reader_options {
* @param val An enum value to indicate the JSON reader's behavior on invalid JSON lines.
*/
void set_recovery_mode(json_recovery_mode_t val) { _recovery_mode = val; }

/**
* @brief Set whether strict validation is enabled or not.
*
* @note `allow_numeric_leading_zeros`, `allow_nonnumeric_numbers` and
* `allow_unquoted_control_chars` throw unless strict validation has already been enabled,
* so enable it with this setter before calling them.
*
* @param val Boolean value to indicate whether strict validation is enabled.
*/
void set_strict_validation(bool val) { _strict_validation = val; }

/**
* @brief Set whether leading zeros are allowed in numeric values. Strict validation
* must be enabled for this to work.
*
* @throw cudf::logic_error if `strict_validation` is not enabled before setting this option.
*
* @param val Boolean value to indicate whether leading zeros are allowed in numeric values
*/
void allow_numeric_leading_zeros(bool val)
{
// Reject the call up front: this flag is only enforced under strict validation.
CUDF_EXPECTS(_strict_validation, "Strict validation must be enabled for this to work.");
_allow_numeric_leading_zeros = val;
}

/**
* @brief Set whether the unquoted number values NaN, +INF, -INF, +Infinity, Infinity, and
* -Infinity should be allowed. Strict validation must be enabled for this to work.
*
* @throw cudf::logic_error if `strict_validation` is not enabled before setting this option.
*
* @param val Boolean value to indicate whether unquoted non-numeric number values are allowed
*/
void allow_nonnumeric_numbers(bool val)
{
// Reject the call up front: this flag is only enforced under strict validation.
CUDF_EXPECTS(_strict_validation, "Strict validation must be enabled for this to work.");
_allow_nonnumeric_numbers = val;
}

/**
* @brief Set whether characters greater than or equal to 0 and less than 32 are allowed
* in a quoted string without some form of escaping. Strict validation must be enabled for
* this to work.
*
* @throw cudf::logic_error if `strict_validation` is not enabled before setting this option.
*
* @param val Boolean value to indicate whether unquoted control chars are allowed.
*/
void allow_unquoted_control_chars(bool val)
{
// Reject the call up front: this flag is only enforced under strict validation.
CUDF_EXPECTS(_strict_validation, "Strict validation must be enabled for this to work.");
_allow_unquoted_control_chars = val;
}

/**
* @brief Sets additional values to recognize as null values.
*
* Replaces any previously set values; the argument is taken by value and moved into place.
*
* @param vals Vector of values to be considered to be null
*/
void set_na_values(std::vector<std::string> vals) { _na_values = std::move(vals); }
};

/**
Expand Down Expand Up @@ -638,6 +758,76 @@ class json_reader_options_builder {
return *this;
}

/**
* @brief Set whether JSON validation should be strict or not.
*
* @note `numeric_leading_zeros`, `nonnumeric_numbers` and `unquoted_control_chars` throw
* unless strict validation has already been enabled, so call this first in the chain.
*
* @param val Boolean value to indicate whether JSON validation should be strict or not.
* @return this for chaining
*/
json_reader_options_builder& strict_validation(bool val)
{
options.set_strict_validation(val);
return *this;
}

/**
* @brief Set whether leading zeros are allowed in numeric values. Strict validation must
* be enabled for this to have any effect.
*
* @throw cudf::logic_error if `strict_validation` is not enabled before setting this option.
*
* @param val Boolean value to indicate whether leading zeros are allowed in numeric values
* @return this for chaining
*/
json_reader_options_builder& numeric_leading_zeros(bool val)
{
options.allow_numeric_leading_zeros(val);
return *this;
}

/**
* @brief Set whether specific unquoted number values are valid JSON. The values are NaN,
* +INF, -INF, +Infinity, Infinity, and -Infinity.
* Strict validation must be enabled for this to have any effect.
*
* @throw cudf::logic_error if `strict_validation` is not enabled before setting this option.
*
* @param val Boolean value to indicate if unquoted non-numeric values are valid JSON or not.
* @return this for chaining
*/
json_reader_options_builder& nonnumeric_numbers(bool val)
{
options.allow_nonnumeric_numbers(val);
return *this;
}

/**
* @brief Set whether characters >= 0 and < 32 are allowed in a quoted string without
* some form of escaping. Strict validation must be enabled for this to have any effect.
*
* @throw cudf::logic_error if `strict_validation` is not enabled before setting this option.
*
* @param val Boolean value to indicate if unquoted control chars are allowed or not.
* @return this for chaining
*/
json_reader_options_builder& unquoted_control_chars(bool val)
{
options.allow_unquoted_control_chars(val);
return *this;
}

/**
* @brief Sets additional values to recognize as null values.
*
* The vector is taken by value and moved into the options being built.
*
* @param vals Vector of values to be considered to be null
* @return this for chaining
*/
json_reader_options_builder& na_values(std::vector<std::string> vals)
{
options.set_na_values(std::move(vals));
return *this;
}

/**
* @brief move json_reader_options member once it's built.
*/
Expand Down
3 changes: 3 additions & 0 deletions cpp/src/io/json/json_normalization.cu
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

#include "io/fst/lookup_tables.cuh"

#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/io/detail/json.hpp>
#include <cudf/types.hpp>
#include <cudf/utilities/memory_resource.hpp>
Expand Down Expand Up @@ -302,6 +303,7 @@ void normalize_single_quotes(datasource::owning_buffer<rmm::device_buffer>& inda
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
CUDF_FUNC_RANGE();
static constexpr std::int32_t min_out = 0;
static constexpr std::int32_t max_out = 2;
auto parser =
Expand Down Expand Up @@ -330,6 +332,7 @@ void normalize_whitespace(datasource::owning_buffer<rmm::device_buffer>& indata,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
CUDF_FUNC_RANGE();
static constexpr std::int32_t min_out = 0;
static constexpr std::int32_t max_out = 2;
auto parser =
Expand Down
15 changes: 15 additions & 0 deletions cpp/src/io/json/nested_json.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,21 @@ std::pair<rmm::device_uvector<PdaTokenT>, rmm::device_uvector<SymbolOffsetT>> pr
device_span<SymbolOffsetT const> token_indices,
rmm::cuda_stream_view stream);

/**
* @brief Validate that the tokens conform to the behavior given in options.
*
* NOTE(review): `tokens` and `token_indices` are passed as non-const spans, so the token
* stream is presumably modified in place when validation fails; confirm against the
* implementation.
*
* @param d_input The string of input characters
* @param tokens The tokens to be validated
* @param token_indices The tokens' corresponding indices
* @param options Reader options specifying the validation behaviour
* @param stream The CUDA stream to dispatch GPU kernels to
*/
void validate_token_stream(device_span<char const> d_input,
device_span<PdaTokenT> tokens,
device_span<SymbolOffsetT> token_indices,
cudf::io::json_reader_options const& options,
rmm::cuda_stream_view stream);

/**
* @brief Parses the given JSON string and generates a tree representation of the given input.
*
Expand Down
5 changes: 4 additions & 1 deletion cpp/src/io/json/nested_json_gpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -1660,6 +1660,7 @@ std::pair<rmm::device_uvector<PdaTokenT>, rmm::device_uvector<SymbolOffsetT>> ge

if (delimiter_offset == 1) {
tokens.set_element(0, token_t::LineEnd, stream);
validate_token_stream(json_in, tokens, tokens_indices, options, stream);
auto [filtered_tokens, filtered_tokens_indices] =
process_token_stream(tokens, tokens_indices, stream);
tokens = std::move(filtered_tokens);
Expand Down Expand Up @@ -2082,7 +2083,9 @@ cudf::io::parse_options parsing_options(cudf::io::json_reader_options const& opt
parse_opts.keepquotes = options.is_enabled_keep_quotes();
parse_opts.trie_true = cudf::detail::create_serialized_trie({"true"}, stream);
parse_opts.trie_false = cudf::detail::create_serialized_trie({"false"}, stream);
parse_opts.trie_na = cudf::detail::create_serialized_trie({"", "null"}, stream);
std::vector<std::string> na_values{"", "null"};
na_values.insert(na_values.end(), options.get_na_values().begin(), options.get_na_values().end());
parse_opts.trie_na = cudf::detail::create_serialized_trie(na_values, stream);
return parse_opts;
}

Expand Down
Loading

0 comments on commit 9acbaf8

Please sign in to comment.