diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index 3d0d263e2bb..ad9f9a4aa5c 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -131,10 +131,14 @@ class json_reader_options { json_recovery_mode_t _recovery_mode = json_recovery_mode_t::FAIL; // Validation checks for spark + // Should the json validation be strict of not + bool _strict_validation = false; // Allow leading zeros for numeric values. bool _allow_numeric_leading_zeros = true; + // Allow nonnumeric numbers. NaN/Inf + bool _allow_nonnumeric_numbers = true; // Allow unquoted control characters - bool allowUnquotedControlChars = true; + bool _allow_unquoted_control_chars = true; // Additional values to recognize as null values std::vector _na_values; @@ -309,7 +313,15 @@ class json_reader_options { [[nodiscard]] json_recovery_mode_t recovery_mode() const { return _recovery_mode; } /** - * @brief Whether leading zeros are allowed in numeric values. + * @brief Whether json validation should be enforced strictly or not. + * + * @return true if it should be. + */ + [[nodiscard]] bool is_strict_validation() const { return _strict_validation; } + + /** + * @brief Whether leading zeros are allowed in numeric values. strict validation + * must be enabled for this to work. * * @return true if leading zeros are allowed in numeric values */ @@ -318,6 +330,22 @@ class json_reader_options { return _allow_numeric_leading_zeros; } + /** + * @brief Whether unquoted number values should be allowed NaN, +INF, -INF, +Infinity, Infinity, and + * -Infinity. strict validation must be enabled for this to work. + * + * @return true if leading zeros are allowed in numeric values + */ + [[nodiscard]] bool is_allowed_nonnumeric_numbers() const { return _allow_nonnumeric_numbers; } + + /** + * @brief Whether in a quoted string should characters greater than or equal to 0 and less than 32 be allowed + * without some form of escaping. Strict validation must be enabled for this to work. + * + * @return true if unquoted control chars are allowed. + */ + [[nodiscard]] bool is_allowed_unquoted_control_chars() const { return _allow_unquoted_control_chars; } + /** * @brief Returns additional values to recognize as null values. * @@ -456,12 +484,37 @@ class json_reader_options { void set_recovery_mode(json_recovery_mode_t val) { _recovery_mode = val; } /** - * @brief Set Whether leading zeros are allowed in numeric values. + * @brief Set whether strict validation is enabled or not. + * + * @param val Boolean value to indicate whether strict validation is enabled. + */ + void set_strict_validation(bool val) { _strict_validation = val; } + + /** + * @brief Set whether leading zeros are allowed in numeric values. strict validation + * must be enabled for this to work. * * @param val Boolean value to indicate whether leading zeros are allowed in numeric values */ void allow_numeric_leading_zeros(bool val) { _allow_numeric_leading_zeros = val; } + /** + * @brief Set whether unquoted number values should be allowed NaN, +INF, -INF, +Infinity, + * Infinity, and -Infinity. strict validation must be enabled for this to work. + * + * @param val Boolean value to indicate whether leading zeros are allowed in numeric values + */ + void allow_nonnumeric_numbers(bool val) { _allow_nonnumeric_numbers = val; } + + /** + * @brief Set whether in a quoted string should characters greater than or equal to 0 + * and less than 32 be allowed without some form of escaping. Strict validation must + * be enabled for this to work. + * + * @param val true to indicate wether unquoted control chars are allowed. + */ + void allow_unquoted_control_chars(bool val) { _allow_unquoted_control_chars = val; } + /** * @brief Sets additional values to recognize as null values. * @@ -680,7 +733,19 @@ class json_reader_options_builder { } /** - * @brief Set Whether leading zeros are allowed in numeric values. + * @brief Set whether json validation should be strict or not. + * + * @param val Boolean value to indicate whether json validation should be strict or not. + */ + json_reader_options_builder& strict_validation(bool val) + { + options.set_strict_validation(val); + return *this; + } + + /** + * @brief Set Whether leading zeros are allowed in numeric values. strict validation must + * be enabled for this to have any effect. * * @param val Boolean value to indicate whether leading zeros are allowed in numeric values * @return this for chaining @@ -691,6 +756,32 @@ class json_reader_options_builder { return *this; } + /** + * @brief Set whether specific unquoted number values are valid JSON. The values are NaN, + * +INF, -INF, +Infinity, Infinity, and -Infinity. + * strict validation must be enabled for this to have any effect. + * + * @param val Boolean value to indicate if unquoted nonnumeric values are + * valid json or not. + */ + json_reader_options_builder& nonnumeric_numbers(bool val) + { + options.allow_nonnumeric_numbers(val); + return *this; + } + + /** + * @brief Set whether chars >= 0 and < 32 are allowed in a quoted string without + * some form of escaping. strict validation must be enabled for this to have any effect. + * + * @param val Boolean value to indicate if unquoted control chars are allowed or not. + */ + json_reader_options_builder& unquoted_control_chars(bool val) + { + options.allow_unquoted_control_chars(val); + return *this; + } + /** * @brief Sets additional values to recognize as null values. * diff --git a/cpp/src/io/json/process_tokens.cu b/cpp/src/io/json/process_tokens.cu index 96ae4f7817b..aee22e3fdf0 100644 --- a/cpp/src/io/json/process_tokens.cu +++ b/cpp/src/io/json/process_tokens.cu @@ -52,30 +52,246 @@ struct write_if { } }; +enum class number_state { + start = 0, + saw_neg, // not a complete state + leading_zero, + whole, + saw_radix, // not a complete state + fraction, + start_exponent, // not a complete state + after_sign_exponent, // not a complete state + exponent +}; + +enum class string_state { + normal = 0, + escaped, // not a complete state + escaped_u // not a complete state +}; + +__device__ inline bool substr_eq(const char * data, + SymbolOffsetT const start, + SymbolOffsetT const end, + SymbolOffsetT const expected_len, + const char * expected) { + if (end - start != expected_len) { + return false; + } else { + for (auto idx = 0; idx < expected_len; idx++) { + if (data[start + idx] != expected[idx]) { + return false; + } + } + } + return true; +} + void validate_token_stream(device_span d_input, device_span tokens, device_span token_indices, cudf::io::json_reader_options const& options, rmm::cuda_stream_view stream) { - if (getenv("SPARK_JSON")) { + if (options.is_strict_validation()) { using token_t = cudf::io::json::token_t; - auto validate_tokens = + auto validate_values = [data = d_input.data(), allow_numeric_leading_zeros = - options.is_allowed_numeric_leading_zeros()] __device__(SymbolOffsetT start, - SymbolOffsetT end) -> bool { - // Leading zeros. - if (!allow_numeric_leading_zeros and data[start] == '0') return false; - return true; + options.is_allowed_numeric_leading_zeros(), + allow_nonnumeric = + options.is_allowed_nonnumeric_numbers()] __device__(SymbolOffsetT start, + SymbolOffsetT end) -> bool { + // This validates an unquoted value. A value must match https://www.json.org/json-en.html + // but the leading and training whitespace should already have been removed, and is not + // a string + auto c = data[start]; + if ('n' == c) { + return substr_eq(data, start, end, 4, "null"); + } else if ('t' == c) { + return substr_eq(data, start, end, 4, "true"); + } else if ('f' == c) { + return substr_eq(data, start, end, 5, "false"); + } else if (allow_nonnumeric && c == 'N') { + return substr_eq(data, start, end, 3, "NaN"); + } else if (allow_nonnumeric && c == 'I') { + return substr_eq(data, start, end, 8, "Infinity"); + } else if (allow_nonnumeric && c == '+') { + return substr_eq(data, start, end, 4, "+INF") || substr_eq(data, start, end, 9, "+Infinity"); + } else if ('-' == c || c <= '9' && 'c' >= '0') { + // number + auto num_state = number_state::start; + for (auto at = start; at < end; at++) { + c = data[at]; + switch (num_state) { + case number_state::start: + if ('-' == c) { + num_state = number_state::saw_neg; + } else if ('0' == c) { + num_state = number_state::leading_zero; + } else if (c >= '1' && c <= '9') { + num_state = number_state::whole; + } else { + return false; + } + break; + case number_state::saw_neg: + if ('0' == c) { + num_state = number_state::leading_zero; + } else if (c >= '1' && c <= '9') { + num_state = number_state::whole; + } else if (allow_nonnumeric && 'I' == c) { + return substr_eq(data, start, end, 4, "-INF") || substr_eq(data, start, end, 9, "-Infinity"); + } else { + return false; + } + break; + case number_state::leading_zero: + if (allow_numeric_leading_zeros && c >= '0' && c <= '9') { + num_state = number_state::whole; + } else if ('.' == c) { + num_state = number_state::saw_radix; + } else if ('e' == c || 'E' == c) { + num_state = number_state::start_exponent; + } else { + return false; + } + break; + case number_state::whole: + if (c >= '0' && c <= '9') { + num_state = number_state::whole; + } else if ('.' == c) { + num_state = number_state::saw_radix; + } else if ('e' == c || 'E' == c) { + num_state = number_state::start_exponent; + } else { + return false; + } + break; + case number_state::saw_radix: + if (c >= '0' && c <= '9') { + num_state = number_state::fraction; + } else if ('e' == c || 'E' == c) { + num_state = number_state::start_exponent; + } else { + return false; + } + break; + case number_state::fraction: + if (c >= '0' && c <= '9') { + num_state = number_state::fraction; + } else if ('e' == c || 'E' == c) { + num_state = number_state::start_exponent; + } else { + return false; + } + break; + case number_state::start_exponent: + if ('+' == c || '-' == c) { + num_state = number_state::after_sign_exponent; + } else if (c >= '0' && c <= '9') { + num_state = number_state::exponent; + } else { + return false; + } + break; + case number_state::after_sign_exponent: + if (c >= '0' && c <= '9') { + num_state = number_state::exponent; + } else { + return false; + } + break; + case number_state::exponent: + if (c >= '0' && c <= '9') { + num_state = number_state::exponent; + } else { + return false; + } + break; + } + } + return num_state != number_state::after_sign_exponent && + num_state != number_state::start_exponent && + num_state != number_state::saw_neg && + num_state != number_state::saw_radix; + } else { + return false; + } + }; + + auto validate_strings = + [data = d_input.data(), + allow_unquoted_control_chars = + options.is_allowed_unquoted_control_chars()] __device__(SymbolOffsetT start, + SymbolOffsetT end) -> bool { + // This validates a quoted string. A string must match https://www.json.org/json-en.html + // but we already know that it has a starting and ending " and all white space has been + // stripped out. Also the base CUDF validation makes sure escaped chars are correct + // so we only need to worry about unquoted control chars + + auto state = string_state::normal; + auto u_count = 0; + for (SymbolOffsetT idx = start + 1; idx < end; idx++) { + auto c = data[idx]; + if (!allow_unquoted_control_chars && c >= 0 && c < 32) { + return false; + } + + switch (state) { + case string_state::normal: + if (c == '\\') { + state = string_state::escaped; + } + break; + case string_state::escaped: + // in Spark you can allow any char to be escaped, but CUDF + // validates it in some cases so we need to also validate it. + if (c == 'u') { + state = string_state::escaped_u; + u_count = 0; + } else if (c == '"' || + c == '\\' || + c == '/' || + c == 'b' || + c == 'f' || + c == 'n' || + c == 'r' || + c == 't') { + state = string_state::normal; + } else { + return false; + } + break; + case string_state::escaped_u: + if ((c >= '0' && c <= '9') || + (c >= 'a' && c <= 'f') || + (c >= 'A' && c <= 'F')) { + u_count++; + if (u_count == 4) { + state = string_state::normal; + u_count = 0; + } + } else { + return false; + } + break; + } + } + return string_state::normal == state; }; + auto num_tokens = tokens.size(); auto count_it = thrust::make_counting_iterator(0); auto predicate = [tokens = tokens.begin(), token_indices = token_indices.begin(), - validate_tokens] __device__(auto i) -> bool { + validate_values, + validate_strings] __device__(auto i) -> bool { if (tokens[i] == token_t::ValueEnd) { - return !validate_tokens(token_indices[i - 1], token_indices[i]); + return !validate_values(token_indices[i - 1], token_indices[i]); + } else if (tokens[i] == token_t::FieldNameEnd || + tokens[i] == token_t::StringEnd) { + return !validate_strings(token_indices[i - 1], token_indices[i]); } return false; }; @@ -102,10 +318,10 @@ void validate_token_stream(device_span d_input, count_it + num_tokens, error.begin(), predicate); // in-place scan - printf("error:"); - for (auto tk : cudf::detail::make_std_vector_sync(error, stream)) - printf("%d ", tk); - printf("\n"); + //printf("error:"); + //for (auto tk : cudf::detail::make_std_vector_sync(error, stream)) + // printf("%d ", tk); + //printf("\n"); thrust::transform_inclusive_scan(rmm::exec_policy(stream), count_it, @@ -114,10 +330,10 @@ void validate_token_stream(device_span d_input, transform_op, binary_op); // in-place scan } - printf("pre_process_token:"); - for (auto tk : cudf::detail::make_std_vector_sync(device_span(tokens), stream)) - printf("%d ", tk); - printf("\n"); + //printf("pre_process_token:"); + //for (auto tk : cudf::detail::make_std_vector_sync(device_span(tokens), stream)) + // printf("%d ", tk); + //printf("\n"); // LE SB FB FE VB VE SE LE SB ER LE SB LB VB VE SE LE LE // 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 1 diff --git a/java/src/main/java/ai/rapids/cudf/JSONOptions.java b/java/src/main/java/ai/rapids/cudf/JSONOptions.java index b37d0d88ec9..50cce4590c1 100644 --- a/java/src/main/java/ai/rapids/cudf/JSONOptions.java +++ b/java/src/main/java/ai/rapids/cudf/JSONOptions.java @@ -34,6 +34,10 @@ public final class JSONOptions extends ColumnFilterOptions { private final boolean normalizeWhitespace; private final boolean mixedTypesAsStrings; private final boolean keepStringQuotes; + private final boolean allowLeadingZeros; + private final boolean strictValidation; + private final boolean allowNonNumericNumbers; + private final boolean allowUnquotedControlChars; private JSONOptions(Builder builder) { super(builder); @@ -44,6 +48,10 @@ private JSONOptions(Builder builder) { normalizeWhitespace = builder.normalizeWhitespace; mixedTypesAsStrings = builder.mixedTypesAsStrings; keepStringQuotes = builder.keepQuotes; + strictValidation = builder.strictValidation; + allowLeadingZeros = builder.allowLeadingZeros; + allowNonNumericNumbers = builder.allowNonNumericNumbers; + allowUnquotedControlChars = builder.allowUnquotedControlChars; } public boolean isDayFirst() { @@ -75,6 +83,22 @@ public boolean keepStringQuotes() { return keepStringQuotes; } + public boolean strictValidation() { + return strictValidation; + } + + public boolean leadingZerosAllowed() { + return allowLeadingZeros; + } + + public boolean nonNumericNumbersAllowed() { + return allowNonNumericNumbers; + } + + public boolean unquotedControlChars() { + return allowUnquotedControlChars; + } + @Override String[] getIncludeColumnNames() { throw new UnsupportedOperationException("JSON reader didn't support column prune"); @@ -85,6 +109,10 @@ public static Builder builder() { } public static final class Builder extends ColumnFilterOptions.Builder { + private boolean strictValidation = false; + private boolean allowUnquotedControlChars = true; + private boolean allowNonNumericNumbers = false; + private boolean allowLeadingZeros = false; private boolean dayFirst = false; private boolean lines = true; @@ -95,10 +123,47 @@ public static final class Builder extends ColumnFilterOptions.Builder(normalize_single_quotes)) .normalize_whitespace(static_cast(normalize_whitespace)) .mixed_types_as_string(mixed_types_as_string) + .strict_validation(strict_validation) + .numeric_leading_zeros(allow_leading_zeros) + .nonnumeric_numbers(allow_nonnumeric_numbers) + .unquoted_control_chars(allow_unquoted_control) .keep_quotes(keep_quotes); auto result = @@ -1649,7 +1657,11 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSON(JNIEnv* env, jboolean normalize_single_quotes, jboolean normalize_whitespace, jboolean mixed_types_as_string, - jboolean keep_quotes) + jboolean keep_quotes, + jboolean strict_validation, + jboolean allow_leading_zeros, + jboolean allow_nonnumeric_numbers, + jboolean allow_unquoted_control) { JNI_NULL_CHECK(env, buffer, "buffer cannot be null", 0); if (buffer_length <= 0) { @@ -1671,6 +1683,10 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSON(JNIEnv* env, .recovery_mode(recovery_mode) .normalize_single_quotes(static_cast(normalize_single_quotes)) .normalize_whitespace(static_cast(normalize_whitespace)) + .strict_validation(strict_validation) + .numeric_leading_zeros(allow_leading_zeros) + .nonnumeric_numbers(allow_nonnumeric_numbers) + .unquoted_control_chars(allow_unquoted_control) .mixed_types_as_string(mixed_types_as_string) .keep_quotes(keep_quotes); @@ -1777,6 +1793,10 @@ Java_ai_rapids_cudf_Table_readJSONFromDataSource(JNIEnv* env, jboolean normalize_whitespace, jboolean mixed_types_as_string, jboolean keep_quotes, + jboolean strict_validation, + jboolean allow_leading_zeros, + jboolean allow_nonnumeric_numbers, + jboolean allow_unquoted_control, jlong ds_handle) { JNI_NULL_CHECK(env, ds_handle, "no data source handle given", 0); @@ -1811,6 +1831,10 @@ Java_ai_rapids_cudf_Table_readJSONFromDataSource(JNIEnv* env, .normalize_single_quotes(static_cast(normalize_single_quotes)) .normalize_whitespace(static_cast(normalize_whitespace)) .mixed_types_as_string(mixed_types_as_string) + .strict_validation(strict_validation) + .numeric_leading_zeros(allow_leading_zeros) + .nonnumeric_numbers(allow_nonnumeric_numbers) + .unquoted_control_chars(allow_unquoted_control) .keep_quotes(keep_quotes); if (!n_types.is_null()) { @@ -1861,7 +1885,11 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(JNIEnv* env, jboolean normalize_single_quotes, jboolean normalize_whitespace, jboolean mixed_types_as_string, - jboolean keep_quotes) + jboolean keep_quotes, + jboolean strict_validation, + jboolean allow_leading_zeros, + jboolean allow_nonnumeric_numbers, + jboolean allow_unquoted_control) { bool read_buffer = true; if (buffer == 0) { @@ -1910,6 +1938,10 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(JNIEnv* env, .normalize_single_quotes(static_cast(normalize_single_quotes)) .normalize_whitespace(static_cast(normalize_whitespace)) .mixed_types_as_string(mixed_types_as_string) + .strict_validation(strict_validation) + .numeric_leading_zeros(allow_leading_zeros) + .nonnumeric_numbers(allow_nonnumeric_numbers) + .unquoted_control_chars(allow_unquoted_control) .keep_quotes(keep_quotes); if (!n_types.is_null()) { diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index 050bcbb268f..56fe63598d9 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -437,6 +437,7 @@ void testReadWhitespacesJSONFile() throws IOException { } } + @Test void testReadSingleQuotesJSONFileKeepQuotes() throws IOException { Schema schema = Schema.builder() .column(DType.STRING, "A") @@ -455,6 +456,206 @@ void testReadSingleQuotesJSONFileKeepQuotes() throws IOException { } } + private static final byte[] JSON_VALIDATION_BUFFER = ( + "{\"a\":true}\n" + + "{\"a\":false}\n" + + "{\"a\":null}\n" + + "{\"a\":true, \"b\":truee}\n" + + "{\"a\":true, \"b\":\"nulll\"}\n" + + "{\"a\": 1}\n" + + "{\"a\": 0}\n" + + "{\"a\": -}\n" + + "{\"a\": -0}\n" + + "{\"a\": -01}\n" + + + "{\"a\": 01}\n" + + "{\"a\": -0.1}\n" + + "{\"a\": -00.1}\n" + + "{\"a\": NaN}\n" + + "{\"a\": INF}\n" + + "{\"a\": +INF}\n" + + "{\"a\": -INF}\n" + + "{\"a\": +Infinity}\n" + + "{\"a\": Infinity}\n" + + "{\"a\": -Infinity}\n" + + + "{\"a\": INFinity}\n" + + "{\"a\":\"3710-11-10T02:46:58.732Z\"}\n" + + "{\"a\":12.}\n" + + "{\"a\": -3.4e+38}\n" + + "{\"a\": -3.4e-38}\n" + + "{\"a\": 1.4e38}\n" + + "{\"a\": -3.4E+38}\n" + + "{\"a\": -3.4E-38}\n" + + "{\"a\": 1.4E38}\n" + + "{\"a\": -3.4E+}\n" + + + "{\"a\": -3.4E-}\n" + + "{\"a\": \"A\u0000B\"}\n" + + "{\"a\": \"A\\u0000B\"}\n" + + "{\"a\": \"A\u0001B\"}\n" + + "{\"a\": \"A\\u0001B\"}\n" + + "{\"a\": \"A\u001FB\"}\n" + + "{\"a\": \"A\\u001FB\"}\n" + + "{\"a\": \"A\u0020B\"}\n" + + "{\"a\": \"A\\u0020B\"}\n" + + "{\"a\": \"\\u12\"}\n" + + + "{\"a\": \"\\z\"}\n" + + "{\"a\": \"\\r\"}\n" + + "{\"a\": \"something\", \"b\": \"\\z\"}\n" + ).getBytes(StandardCharsets.UTF_8); + + @Test + void testJSONValidationNoStrict() { + Schema schema = Schema.builder() + .column(DType.STRING, "a") + .build(); + JSONOptions opts = JSONOptions.builder() + .withRecoverWithNull(true) + .withMixedTypesAsStrings(true) + .withNormalizeWhitespace(true) + .withKeepQuotes(true) + .withNormalizeSingleQuotes(true) + .withStrictValidation(false) + .withLeadingZeros(false) + .withNonNumericNumbers(false) + .withUnquotedControlChars(true) + .build(); + try (Table expected = new Table.TestBuilder() + .column( + "true", "false", null, "true", "true", "1", "0", "-", "-0", "-01", + "01", "-0.1", "-00.1", "NaN", "INF", "+INF", "-INF", "+Infinity", "Infinity", "-Infinity", + "INFinity", "\"3710-11-10T02:46:58.732Z\"", "12.", "-3.4e+38", "-3.4e-38", "1.4e38", "-3.4E+38", "-3.4E-38", "1.4E38", "-3.4E+", + "-3.4E-", "\"A\u0000B\"", "\"A\u0000B\"", "\"A\u0001B\"", "\"A\u0001B\"", "\"A\u001FB\"", "\"A\u001FB\"", "\"A B\"", "\"A B\"", null, + null, "\"\r\"", "\"something\"") + .build(); + MultiBufferDataSource source = sourceFrom(JSON_VALIDATION_BUFFER); + Table table = Table.readJSON(schema, opts, source, (int)expected.getRowCount())) { + assertTablesAreEqual(expected, table); + } + } + + @Test + void testJSONValidation() { + Schema schema = Schema.builder() + .column(DType.STRING, "a") + .build(); + JSONOptions opts = JSONOptions.builder() + .withRecoverWithNull(true) + .withMixedTypesAsStrings(true) + .withNormalizeWhitespace(true) + .withKeepQuotes(true) + .withNormalizeSingleQuotes(true) + .withStrictValidation(true) + .withLeadingZeros(false) + .withNonNumericNumbers(false) + .withUnquotedControlChars(true) + .build(); + try (Table expected = new Table.TestBuilder() + .column( + "true", "false", null, null, "true", "1", "0", null, "-0", null, + null, "-0.1", null, null, null, null, null, null, null, null, + null, "\"3710-11-10T02:46:58.732Z\"", null, "-3.4e+38", "-3.4e-38", "1.4e38", "-3.4E+38", "-3.4E-38", "1.4E38", null, + null, "\"A\u0000B\"", "\"A\u0000B\"", "\"A\u0001B\"", "\"A\u0001B\"", "\"A\u001FB\"", "\"A\u001FB\"", "\"A B\"", "\"A B\"", null, + null, "\"\r\"", null) + .build(); + MultiBufferDataSource source = sourceFrom(JSON_VALIDATION_BUFFER); + Table table = Table.readJSON(schema, opts, source, (int)expected.getRowCount())) { + assertTablesAreEqual(expected, table); + } + } + + @Test + void testJSONValidationLeadingZeros() { + Schema schema = Schema.builder() + .column(DType.STRING, "a") + .build(); + JSONOptions opts = JSONOptions.builder() + .withRecoverWithNull(true) + .withMixedTypesAsStrings(true) + .withNormalizeWhitespace(true) + .withKeepQuotes(true) + .withNormalizeSingleQuotes(true) + .withStrictValidation(true) + .withLeadingZeros(true) + .withNonNumericNumbers(false) + .withUnquotedControlChars(true) + .build(); + try (Table expected = new Table.TestBuilder() + .column( + "true", "false", null, null, "true", "1", "0", null, "-0", "-01", + "01", "-0.1", "-00.1", null, null, null, null, null, null, null, + null, "\"3710-11-10T02:46:58.732Z\"", null, "-3.4e+38", "-3.4e-38", "1.4e38", "-3.4E+38", "-3.4E-38", "1.4E38", null, + null, "\"A\u0000B\"", "\"A\u0000B\"", "\"A\u0001B\"", "\"A\u0001B\"", "\"A\u001FB\"", "\"A\u001FB\"", "\"A B\"", "\"A B\"", null, + null, "\"\r\"", null) + .build(); + MultiBufferDataSource source = sourceFrom(JSON_VALIDATION_BUFFER); + Table table = Table.readJSON(schema, opts, source, (int)expected.getRowCount())) { + assertTablesAreEqual(expected, table); + } + } + + @Test + void testJSONValidationNonNumeric() { + Schema schema = Schema.builder() + .column(DType.STRING, "a") + .build(); + JSONOptions opts = JSONOptions.builder() + .withRecoverWithNull(true) + .withMixedTypesAsStrings(true) + .withNormalizeWhitespace(true) + .withKeepQuotes(true) + .withNormalizeSingleQuotes(true) + .withStrictValidation(true) + .withLeadingZeros(false) + .withNonNumericNumbers(true) + .withUnquotedControlChars(true) + .build(); + try (Table expected = new Table.TestBuilder() + .column( + "true", "false", null, null, "true", "1", "0", null, "-0", null, + null, "-0.1", null, "NaN", null, "+INF", "-INF", "+Infinity", "Infinity", "-Infinity", + null, "\"3710-11-10T02:46:58.732Z\"", null, "-3.4e+38", "-3.4e-38", "1.4e38", "-3.4E+38", "-3.4E-38", "1.4E38", null, + null, "\"A\u0000B\"", "\"A\u0000B\"", "\"A\u0001B\"", "\"A\u0001B\"", "\"A\u001FB\"", "\"A\u001FB\"", "\"A B\"", "\"A B\"", null, + null, "\"\r\"", null) + .build(); + MultiBufferDataSource source = sourceFrom(JSON_VALIDATION_BUFFER); + Table table = Table.readJSON(schema, opts, source, (int)expected.getRowCount())) { + assertTablesAreEqual(expected, table); + } + } + + @Test + void testJSONValidationUnquotedControl() { + Schema schema = Schema.builder() + .column(DType.STRING, "a") + .build(); + JSONOptions opts = JSONOptions.builder() + .withRecoverWithNull(true) + .withMixedTypesAsStrings(true) + .withNormalizeWhitespace(true) + .withKeepQuotes(true) + .withNormalizeSingleQuotes(true) + .withStrictValidation(true) + .withLeadingZeros(false) + .withNonNumericNumbers(false) + .withUnquotedControlChars(false) + .build(); + try (Table expected = new Table.TestBuilder() + .column( + "true", "false", null, null, "true", "1", "0", null, "-0", null, + null, "-0.1", null, null, null, null, null, null, null, null, + null, "\"3710-11-10T02:46:58.732Z\"", null, "-3.4e+38", "-3.4e-38", "1.4e38", "-3.4E+38", "-3.4E-38", "1.4E38", null, + null, null, "\"A\u0000B\"", null, "\"A\u0001B\"", null, "\"A\u001FB\"", "\"A B\"", "\"A B\"", null, + null, "\"\r\"", null) + .build(); + MultiBufferDataSource source = sourceFrom(JSON_VALIDATION_BUFFER); + Table table = Table.readJSON(schema, opts, source, (int)expected.getRowCount())) { + assertTablesAreEqual(expected, table); + } + } + private static final byte[] NESTED_JSON_DATA_BUFFER = ("{\"a\":{\"c\":\"C1\"}}\n" + "{\"a\":{\"c\":\"C2\", \"b\":\"B2\"}}\n" + "{\"d\":[1,2,3]}\n" +