From a21cb8947677b76fc656c4bfa7b021a76a6d4962 Mon Sep 17 00:00:00 2001 From: "Robert (Bobby) Evans" Date: Fri, 14 Jun 2024 12:56:42 -0500 Subject: [PATCH 1/9] Add in some initial configs so I can test --- .../main/java/ai/rapids/cudf/JSONOptions.java | 19 ++++++++++++++- java/src/main/java/ai/rapids/cudf/Table.java | 23 ++++++++++++++----- java/src/main/native/src/TableJni.cpp | 12 ++++++++-- 3 files changed, 45 insertions(+), 9 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/JSONOptions.java b/java/src/main/java/ai/rapids/cudf/JSONOptions.java index b37d0d88ec9..9f83dbdc8d1 100644 --- a/java/src/main/java/ai/rapids/cudf/JSONOptions.java +++ b/java/src/main/java/ai/rapids/cudf/JSONOptions.java @@ -34,6 +34,7 @@ public final class JSONOptions extends ColumnFilterOptions { private final boolean normalizeWhitespace; private final boolean mixedTypesAsStrings; private final boolean keepStringQuotes; + private final boolean allowLeadingZeros; private JSONOptions(Builder builder) { super(builder); @@ -44,6 +45,7 @@ private JSONOptions(Builder builder) { normalizeWhitespace = builder.normalizeWhitespace; mixedTypesAsStrings = builder.mixedTypesAsStrings; keepStringQuotes = builder.keepQuotes; + allowLeadingZeros = builder.allowLeadingZeros; } public boolean isDayFirst() { @@ -75,6 +77,10 @@ public boolean keepStringQuotes() { return keepStringQuotes; } + public boolean leadingZerosAllowed() { + return allowLeadingZeros; + } + @Override String[] getIncludeColumnNames() { throw new UnsupportedOperationException("JSON reader didn't support column prune"); @@ -85,6 +91,7 @@ public static Builder builder() { } public static final class Builder extends ColumnFilterOptions.Builder { + public boolean allowLeadingZeros = false; private boolean dayFirst = false; private boolean lines = true; @@ -95,10 +102,20 @@ public static final class Builder extends ColumnFilterOptions.Builder(normalize_single_quotes)) .normalize_whitespace(static_cast(normalize_whitespace)) 
.mixed_types_as_string(mixed_types_as_string) + .numeric_leading_zeros(allow_leading_zeros) .keep_quotes(keep_quotes); auto result = @@ -1649,7 +1651,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSON(JNIEnv* env, jboolean normalize_single_quotes, jboolean normalize_whitespace, jboolean mixed_types_as_string, - jboolean keep_quotes) + jboolean keep_quotes, + jboolean allow_leading_zeros) { JNI_NULL_CHECK(env, buffer, "buffer cannot be null", 0); if (buffer_length <= 0) { @@ -1671,6 +1674,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSON(JNIEnv* env, .recovery_mode(recovery_mode) .normalize_single_quotes(static_cast(normalize_single_quotes)) .normalize_whitespace(static_cast(normalize_whitespace)) + .numeric_leading_zeros(allow_leading_zeros) .mixed_types_as_string(mixed_types_as_string) .keep_quotes(keep_quotes); @@ -1777,6 +1781,7 @@ Java_ai_rapids_cudf_Table_readJSONFromDataSource(JNIEnv* env, jboolean normalize_whitespace, jboolean mixed_types_as_string, jboolean keep_quotes, + jboolean allow_leading_zeros, jlong ds_handle) { JNI_NULL_CHECK(env, ds_handle, "no data source handle given", 0); @@ -1811,6 +1816,7 @@ Java_ai_rapids_cudf_Table_readJSONFromDataSource(JNIEnv* env, .normalize_single_quotes(static_cast(normalize_single_quotes)) .normalize_whitespace(static_cast(normalize_whitespace)) .mixed_types_as_string(mixed_types_as_string) + .numeric_leading_zeros(allow_leading_zeros) .keep_quotes(keep_quotes); if (!n_types.is_null()) { @@ -1861,7 +1867,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(JNIEnv* env, jboolean normalize_single_quotes, jboolean normalize_whitespace, jboolean mixed_types_as_string, - jboolean keep_quotes) + jboolean keep_quotes, + jboolean allow_leading_zeros) { bool read_buffer = true; if (buffer == 0) { @@ -1910,6 +1917,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(JNIEnv* env, .normalize_single_quotes(static_cast(normalize_single_quotes)) 
.normalize_whitespace(static_cast(normalize_whitespace)) .mixed_types_as_string(mixed_types_as_string) + .numeric_leading_zeros(allow_leading_zeros) .keep_quotes(keep_quotes); if (!n_types.is_null()) { From dc5e50b11537d800a5c04aecb84b62b1ec836fe5 Mon Sep 17 00:00:00 2001 From: "Robert (Bobby) Evans" Date: Fri, 14 Jun 2024 16:34:14 -0500 Subject: [PATCH 2/9] Some of the validation is working, but more needs to be done --- cpp/src/io/json/process_tokens.cu | 129 +++++++++++++++++- java/src/main/java/ai/rapids/cudf/Table.java | 6 +- .../test/java/ai/rapids/cudf/TableTest.java | 37 +++++ 3 files changed, 162 insertions(+), 10 deletions(-) diff --git a/cpp/src/io/json/process_tokens.cu b/cpp/src/io/json/process_tokens.cu index d1ed02052dc..a715dbc05cd 100644 --- a/cpp/src/io/json/process_tokens.cu +++ b/cpp/src/io/json/process_tokens.cu @@ -52,22 +52,137 @@ struct write_if { } }; +enum class number_state { + start = 0, + saw_neg, // not a complete state + leading_zero, + whole, + fraction, + start_exponent, // not a complete state + after_sign_exponent, // not a complete state + exponent +}; + void validate_token_stream(device_span d_input, device_span tokens, device_span token_indices, cudf::io::json_reader_options const& options, rmm::cuda_stream_view stream) { - if (getenv("SPARK_JSON")) { + //if (getenv("SPARK_JSON")) { + if (true) { using token_t = cudf::io::json::token_t; auto validate_tokens = [data = d_input.data(), allow_numeric_leading_zeros = - options.is_allowed_numeric_leading_zeros()] __device__(SymbolOffsetT start, + options.is_allowed_numeric_leading_zeros()] __device__(int32_t i, + SymbolOffsetT start, SymbolOffsetT end) -> bool { - // Leading zeros. - if (!allow_numeric_leading_zeros and data[start] == '0') return false; - return true; + // This validates an unquoted value. 
A value must match https://www.json.org/json-en.html + // but the leading and training whitespace should already have been removed, and is not + // a string + for (SymbolOffsetT idx = start; idx < end; idx++) { + printf("%i/%i VALUE CHAR %i/%i => '%c'\n", threadIdx.x, i, idx, end, data[idx]); + } + + // TODO do I need to worry about an empty value??? + auto len = end - start; + auto c = data[start]; + if ('n' == c) { + return (4 == len) && data[start + 1] == 'u' && data[start + 2] == 'l' && data[start + 3] == 'l'; + } else if ('t' == c) { + return (4 == len) && data[start + 1] == 'r' && data[start + 2] == 'u' && data[start + 3] == 'e'; + } else if ('f' == c) { + return (5 == len) && data[start + 1] == 'a' && data[start + 2] == 'l' && data[start + 3] == 's' && data[start + 4] == 'e'; + } else if ('-' == c || c <= '9' && 'c' >= '0') { + // number + auto num_state = number_state::start; + for (auto at = start; at < end; at++) { + c = data[at]; + switch (num_state) { + case number_state::start: + if ('-' == c) { + num_state = number_state::saw_neg; + } else if ('0' == c) { + num_state = number_state::leading_zero; + } else if (c >= '1' && c <= '9') { + num_state = number_state::whole; + } else { + return false; + } + break; + case number_state::saw_neg: + if ('0' == c) { + num_state = number_state::leading_zero; + } else if (c >= '1' && c <= '9') { + num_state = number_state::whole; + } else { + return false; + } + break; + case number_state::leading_zero: + if (allow_numeric_leading_zeros && c >= '0' && c <= '9') { + num_state = number_state::whole; + } else if ('.' == c) { + num_state = number_state::fraction; + } else if ('e' == c || 'E' == c) { + num_state = number_state::start_exponent; + } else { + return false; + } + break; + case number_state::whole: + if (c >= '0' && c <= '9') { + num_state = number_state::whole; + } else if ('.' 
== c) { + num_state = number_state::fraction; + } else if ('e' == c || 'E' == c) { + num_state = number_state::start_exponent; + } else { + return false; + } + break; + case number_state::fraction: + if (c >= '0' && c <= '9') { + num_state = number_state::fraction; + } else if ('e' == c || 'E' == c) { + num_state = number_state::start_exponent; + } else { + return false; + } + break; + case number_state::start_exponent: + if ('-' == c || '-' == c) { + num_state = number_state::after_sign_exponent; + } else if (c >= '0' && c <= '9') { + num_state = number_state::exponent; + } else { + return false; + } + break; + case number_state::after_sign_exponent: + if (c >= '0' && c <= '9') { + num_state = number_state::exponent; + } else { + return false; + } + break; + case number_state::exponent: + if (c >= '0' && c <= '9') { + num_state = number_state::exponent; + } else { + return false; + } + break; + } + } + return num_state != number_state::after_sign_exponent && + num_state != number_state::start_exponent && + num_state != number_state::saw_neg; + } else { + printf("%i/%i OTHER\n", threadIdx.x, i); + return false; + } }; auto num_tokens = tokens.size(); auto count_it = thrust::make_counting_iterator(0); @@ -75,7 +190,7 @@ void validate_token_stream(device_span d_input, token_indices = token_indices.begin(), validate_tokens] __device__(auto i) -> bool { if (tokens[i] == token_t::ValueEnd) { - return !validate_tokens(token_indices[i - 1], token_indices[i]); + return !validate_tokens(i, token_indices[i - 1], token_indices[i]); } return false; }; @@ -158,4 +273,4 @@ void validate_token_stream(device_span d_input, // (if not error row & not lineEnd token) -> decoupled look back for output indices, // CopyIf (if not error row & not lineEnd token) write to output. 
} // namespace detail -} // namespace cudf::io::json \ No newline at end of file +} // namespace cudf::io::json diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index 3603ef7bdf9..d4ac91835dc 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -1443,10 +1443,10 @@ public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds) { * @param schema the schema of the data. You may use Schema.INFERRED to infer the schema. * @param opts various JSON parsing options. * @param ds the DataSource to read from. - * @param emtpyRowCount the number of rows to return if no columns were read. + * @param emptyRowCount the number of rows to return if no columns were read. * @return the data parsed as a table on the GPU. */ - public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds, int emtpyRowCount) { + public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds, int emptyRowCount) { long dsHandle = DataSourceHelper.createWrapperDataSource(ds); try (TableWithMeta twm = new TableWithMeta(readJSONFromDataSource(schema.getFlattenedNumChildren(), schema.getFlattenedColumnNames(), schema.getFlattenedTypeIds(), schema.getFlattenedTypeScales(), opts.isDayFirst(), @@ -1454,7 +1454,7 @@ public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds, int opts.isNormalizeWhitespace(), opts.isMixedTypesAsStrings(), opts.keepStringQuotes(), opts.leadingZerosAllowed(), dsHandle))) { - return gatherJSONColumns(schema, twm, emtpyRowCount); + return gatherJSONColumns(schema, twm, emptyRowCount); } finally { DataSourceHelper.destroyWrapperDataSource(dsHandle); } diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index dc6eb55fc6a..0af87dbeb2c 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ 
-437,6 +437,7 @@ void testReadWhitespacesJSONFile() throws IOException { } } + @Test void testReadSingleQuotesJSONFileKeepQuotes() throws IOException { Schema schema = Schema.builder() .column(DType.STRING, "A") @@ -455,6 +456,42 @@ void testReadSingleQuotesJSONFileKeepQuotes() throws IOException { } } + private static final byte[] JSON_VALIDATION_BUFFER = ( + "{\"a\":true}\n" + + "{\"a\":false}\n" + + "{\"a\":null}\n" + + "{\"a\":true, \"b\":truee}\n" + + "{\"a\":true, \"b\":\"nulll\"}\n" + + "{\"a\": 1}\n" + + "{\"a\": 0}\n" + + "{\"a\": -}\n" + + "{\"a\": -0}\n" + + "{\"a\": -01}\n" + + "{\"a\": 01}\n" + + "{\"a\": -0.1}\n" + ).getBytes(StandardCharsets.UTF_8); + + @Test + void testJSONValidation() throws IOException { + Schema schema = Schema.builder() + .column(DType.STRING, "a") + .build(); + JSONOptions opts = JSONOptions.builder() + .withLines(true) + .withMixedTypesAsStrings(true) + .withNormalizeWhitespace(true) + .withLeadingZeros(false) + .withRecoverWithNull(true) + .build(); + try (Table expected = new Table.TestBuilder() + .column("true", "false", null, null, "true", "1", "0", null, "-0", null, null, "-0.1") + .build(); + MultiBufferDataSource source = sourceFrom(JSON_VALIDATION_BUFFER); + Table table = Table.readJSON(schema, opts, source, (int)expected.getRowCount())) { + assertTablesAreEqual(expected, table); + } + } + private static final byte[] NESTED_JSON_DATA_BUFFER = ("{\"a\":{\"c\":\"C1\"}}\n" + "{\"a\":{\"c\":\"C2\", \"b\":\"B2\"}}\n" + "{\"d\":[1,2,3]}\n" + From 8a3d4ede77f86cb5f2eec5781b2a68195cd3cc44 Mon Sep 17 00:00:00 2001 From: "Robert (Bobby) Evans" Date: Mon, 17 Jun 2024 09:33:10 -0500 Subject: [PATCH 3/9] Added in some more validation configs --- cpp/include/cudf/io/json.hpp | 70 +++++++++++++++++-- cpp/src/io/json/process_tokens.cu | 31 ++++---- .../main/java/ai/rapids/cudf/JSONOptions.java | 40 +++++++++-- java/src/main/java/ai/rapids/cudf/Table.java | 51 ++++++++++---- java/src/main/native/src/TableJni.cpp | 20 +++++- 
.../test/java/ai/rapids/cudf/TableTest.java | 57 ++++++++++++++- 6 files changed, 230 insertions(+), 39 deletions(-) diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index 2017b72ca28..7d7f93b9ed4 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -131,10 +131,14 @@ class json_reader_options { json_recovery_mode_t _recovery_mode = json_recovery_mode_t::FAIL; // Validation checks for spark + // Should the json validation be strict or not + bool _strict_validation = false; // Allow leading zeros for numeric values. bool _allow_numeric_leading_zeros = true; + // Allow nonnumeric numbers. NaN/Inf + bool _allow_nonnumeric_numbers = true; // Allow unquoted control characters - bool allowUnquotedControlChars = true; + bool _allow_unquoted_control_chars = true; // Additional values to recognize as null values std::vector _na_values; @@ -309,12 +313,28 @@ class json_reader_options { [[nodiscard]] json_recovery_mode_t recovery_mode() const { return _recovery_mode; } /** - * @brief Whether leading zeros are allowed in numeric values. + * @brief Whether json validation should be enforced strictly or not. + * + * @return true if it should be. + */ + [[nodiscard]] bool is_strict_validation() const { return _strict_validation; } + + /** + * @brief Whether leading zeros are allowed in numeric values. strict validation + * must be enabled for this to work. * * @return true if leading zeros are allowed in numeric values */ [[nodiscard]] bool is_allowed_numeric_leading_zeros() const { return _allow_numeric_leading_zeros; } + /** + * @brief Whether the unquoted nonnumeric number values NaN, +INF, -INF, +Infinity, Infinity, and + * -Infinity are allowed. strict validation must be enabled for this to work. + * + * @return true if unquoted nonnumeric number values are allowed + */ + [[nodiscard]] bool is_allowed_nonnumeric_numbers() const { return _allow_nonnumeric_numbers; } + /** * @brief Returns additional values to recognize as null values. 
* @@ -453,12 +473,28 @@ class json_reader_options { void set_recovery_mode(json_recovery_mode_t val) { _recovery_mode = val; } /** - * @brief Set Whether leading zeros are allowed in numeric values. + * @brief Set Whether strict validation is enabled or not. + * + * @param val Boolean value to indicate whether strict validation is enabled. + */ + void set_strict_validation(bool val) { _strict_validation = val; } + + /** + * @brief Set Whether leading zeros are allowed in numeric values. strict validation + * must be enabled for this to work. * * @param val Boolean value to indicate whether leading zeros are allowed in numeric values */ void allow_numeric_leading_zeros(bool val) { _allow_numeric_leading_zeros = val; } + /** + * @brief Set whether the unquoted nonnumeric number values NaN, +INF, -INF, +Infinity, + * Infinity, and -Infinity are allowed. strict validation must be enabled for this to work. + * + * @param val Boolean value to indicate whether unquoted nonnumeric number values are allowed + */ + void allow_nonnumeric_numbers(bool val) { _allow_nonnumeric_numbers = val; } + /** * @brief Sets additional values to recognize as null values. * @@ -677,7 +713,19 @@ class json_reader_options_builder { } /** - * @brief Set Whether leading zeros are allowed in numeric values. + * @brief Set whether json validation should be strict or not. + * + * @param val Boolean value to indicate whether json validation should be strict or not. + */ + json_reader_options_builder& strict_validation(bool val) + { + options.set_strict_validation(val); + return *this; + } + + /** + * @brief Set Whether leading zeros are allowed in numeric values. strict validation must + * be enabled for this to have any effect. * * @param val Boolean value to indicate whether leading zeros are allowed in numeric values */ @@ -687,6 +735,20 @@ class json_reader_options_builder { return *this; } + /** + * @brief Set whether unquoted number values are valid JSON. 
The values are NaN, + * +INF, -INF, +Infinity, Infinity, and -Infinity. + * strict validation must be enabled for this to have any effect. + * + * @param val Boolean value to indicate if unquoted nonnumeric values are + * valid json or not. + */ + json_reader_options_builder& nonnumeric_numbers(bool val) + { + options.allow_nonnumeric_numbers(val); + return *this; + } + /** * @brief Sets additional values to recognize as null values. * diff --git a/cpp/src/io/json/process_tokens.cu b/cpp/src/io/json/process_tokens.cu index a715dbc05cd..86cc40e1382 100644 --- a/cpp/src/io/json/process_tokens.cu +++ b/cpp/src/io/json/process_tokens.cu @@ -60,7 +60,9 @@ enum class number_state { fraction, start_exponent, // not a complete state after_sign_exponent, // not a complete state - exponent + exponent, + infinity_partial, // not a complete state and includes a counter + infinity }; void validate_token_stream(device_span d_input, @@ -69,8 +71,7 @@ void validate_token_stream(device_span d_input, cudf::io::json_reader_options const& options, rmm::cuda_stream_view stream) { - //if (getenv("SPARK_JSON")) { - if (true) { + if (options.is_strict_validation()) { using token_t = cudf::io::json::token_t; auto validate_tokens = [data = d_input.data(), @@ -81,9 +82,9 @@ void validate_token_stream(device_span d_input, // This validates an unquoted value. A value must match https://www.json.org/json-en.html // but the leading and training whitespace should already have been removed, and is not // a string - for (SymbolOffsetT idx = start; idx < end; idx++) { - printf("%i/%i VALUE CHAR %i/%i => '%c'\n", threadIdx.x, i, idx, end, data[idx]); - } + //for (SymbolOffsetT idx = start; idx < end; idx++) { + // printf("%i/%i VALUE CHAR %i/%i => '%c'\n", threadIdx.x, i, idx, end, data[idx]); + //} // TODO do I need to worry about an empty value??? 
auto len = end - start; @@ -180,7 +181,7 @@ void validate_token_stream(device_span d_input, num_state != number_state::start_exponent && num_state != number_state::saw_neg; } else { - printf("%i/%i OTHER\n", threadIdx.x, i); + //printf("%i/%i OTHER\n", threadIdx.x, i); return false; } }; @@ -217,10 +218,10 @@ void validate_token_stream(device_span d_input, count_it + num_tokens, error.begin(), predicate); // in-place scan - printf("error:"); - for (auto tk : cudf::detail::make_std_vector_sync(error, stream)) - printf("%d ", tk); - printf("\n"); + //printf("error:"); + //for (auto tk : cudf::detail::make_std_vector_sync(error, stream)) + // printf("%d ", tk); + //printf("\n"); thrust::transform_inclusive_scan(rmm::exec_policy(stream), count_it, @@ -229,10 +230,10 @@ void validate_token_stream(device_span d_input, transform_op, binary_op); // in-place scan } - printf("pre_process_token:"); - for (auto tk : cudf::detail::make_std_vector_sync(device_span(tokens), stream)) - printf("%d ", tk); - printf("\n"); + //printf("pre_process_token:"); + //for (auto tk : cudf::detail::make_std_vector_sync(device_span(tokens), stream)) + // printf("%d ", tk); + //printf("\n"); // LE SB FB FE VB VE SE LE SB ER LE SB LB VB VE SE LE LE // 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 1 diff --git a/java/src/main/java/ai/rapids/cudf/JSONOptions.java b/java/src/main/java/ai/rapids/cudf/JSONOptions.java index 9f83dbdc8d1..d3f00514fc9 100644 --- a/java/src/main/java/ai/rapids/cudf/JSONOptions.java +++ b/java/src/main/java/ai/rapids/cudf/JSONOptions.java @@ -35,6 +35,8 @@ public final class JSONOptions extends ColumnFilterOptions { private final boolean mixedTypesAsStrings; private final boolean keepStringQuotes; private final boolean allowLeadingZeros; + private final boolean strictValidation; + private final boolean allowNonNumericNumbers; private JSONOptions(Builder builder) { super(builder); @@ -45,7 +47,9 @@ private JSONOptions(Builder builder) { normalizeWhitespace = 
builder.normalizeWhitespace; mixedTypesAsStrings = builder.mixedTypesAsStrings; keepStringQuotes = builder.keepQuotes; + strictValidation = builder.strictValidation; allowLeadingZeros = builder.allowLeadingZeros; + allowNonNumericNumbers = builder.allowNonNumericNumbers; } public boolean isDayFirst() { @@ -77,10 +81,18 @@ public boolean keepStringQuotes() { return keepStringQuotes; } + public boolean strictValidation() { + return strictValidation; + } + public boolean leadingZerosAllowed() { return allowLeadingZeros; } + public boolean nonNumericNumbersAllowed() { + return allowNonNumericNumbers; + } + @Override String[] getIncludeColumnNames() { throw new UnsupportedOperationException("JSON reader didn't support column prune"); @@ -91,7 +103,9 @@ public static Builder builder() { } public static final class Builder extends ColumnFilterOptions.Builder { - public boolean allowLeadingZeros = false; + private boolean strictValidation = false; + private boolean allowNonNumericNumbers = false; + private boolean allowLeadingZeros = false; private boolean dayFirst = false; private boolean lines = true; @@ -103,10 +117,28 @@ public static final class Builder extends ColumnFilterOptions.Builder(normalize_single_quotes)) .normalize_whitespace(static_cast(normalize_whitespace)) .mixed_types_as_string(mixed_types_as_string) + .strict_validation(strict_validation) .numeric_leading_zeros(allow_leading_zeros) + .nonnumeric_numbers(allow_nonnumeric_numbers) .keep_quotes(keep_quotes); auto result = @@ -1652,7 +1656,9 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSON(JNIEnv* env, jboolean normalize_whitespace, jboolean mixed_types_as_string, jboolean keep_quotes, - jboolean allow_leading_zeros) + jboolean strict_validation, + jboolean allow_leading_zeros, + jboolean allow_nonnumeric_numbers) { JNI_NULL_CHECK(env, buffer, "buffer cannot be null", 0); if (buffer_length <= 0) { @@ -1674,7 +1680,9 @@ JNIEXPORT jlong JNICALL 
Java_ai_rapids_cudf_Table_readAndInferJSON(JNIEnv* env, .recovery_mode(recovery_mode) .normalize_single_quotes(static_cast(normalize_single_quotes)) .normalize_whitespace(static_cast(normalize_whitespace)) + .strict_validation(strict_validation) .numeric_leading_zeros(allow_leading_zeros) + .nonnumeric_numbers(allow_nonnumeric_numbers) .mixed_types_as_string(mixed_types_as_string) .keep_quotes(keep_quotes); @@ -1781,7 +1789,9 @@ Java_ai_rapids_cudf_Table_readJSONFromDataSource(JNIEnv* env, jboolean normalize_whitespace, jboolean mixed_types_as_string, jboolean keep_quotes, + jboolean strict_validation, jboolean allow_leading_zeros, + jboolean allow_nonnumeric_numbers, jlong ds_handle) { JNI_NULL_CHECK(env, ds_handle, "no data source handle given", 0); @@ -1816,7 +1826,9 @@ Java_ai_rapids_cudf_Table_readJSONFromDataSource(JNIEnv* env, .normalize_single_quotes(static_cast(normalize_single_quotes)) .normalize_whitespace(static_cast(normalize_whitespace)) .mixed_types_as_string(mixed_types_as_string) + .strict_validation(strict_validation) .numeric_leading_zeros(allow_leading_zeros) + .nonnumeric_numbers(allow_nonnumeric_numbers) .keep_quotes(keep_quotes); if (!n_types.is_null()) { @@ -1868,7 +1880,9 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(JNIEnv* env, jboolean normalize_whitespace, jboolean mixed_types_as_string, jboolean keep_quotes, - jboolean allow_leading_zeros) + jboolean strict_validation, + jboolean allow_leading_zeros, + jboolean allow_nonnumeric_numbers) { bool read_buffer = true; if (buffer == 0) { @@ -1917,7 +1931,9 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(JNIEnv* env, .normalize_single_quotes(static_cast(normalize_single_quotes)) .normalize_whitespace(static_cast(normalize_whitespace)) .mixed_types_as_string(mixed_types_as_string) + .strict_validation(strict_validation) .numeric_leading_zeros(allow_leading_zeros) + .nonnumeric_numbers(allow_nonnumeric_numbers) .keep_quotes(keep_quotes); if (!n_types.is_null()) { 
diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index 0af87dbeb2c..f88926cff8b 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -467,10 +467,36 @@ void testReadSingleQuotesJSONFileKeepQuotes() throws IOException { "{\"a\": -}\n" + "{\"a\": -0}\n" + "{\"a\": -01}\n" + + "{\"a\": 01}\n" + - "{\"a\": -0.1}\n" + "{\"a\": -0.1}\n" + + "{\"a\": -00.1}\n" ).getBytes(StandardCharsets.UTF_8); + @Test + void testJSONValidationNoStrict() throws IOException { + Schema schema = Schema.builder() + .column(DType.STRING, "a") + .build(); + JSONOptions opts = JSONOptions.builder() + .withLines(true) + .withMixedTypesAsStrings(true) + .withNormalizeWhitespace(true) + .withRecoverWithNull(true) + .withStrictValidation(false) + .withLeadingZeros(false) + .withNonNumericNumbers(false) + .build(); + try (Table expected = new Table.TestBuilder() + .column("true", "false", null, "true", "true", "1", "0", "-", "-0", "-01", + "01", "-0.1", "-00.1") + .build(); + MultiBufferDataSource source = sourceFrom(JSON_VALIDATION_BUFFER); + Table table = Table.readJSON(schema, opts, source, (int)expected.getRowCount())) { + assertTablesAreEqual(expected, table); + } + } + @Test void testJSONValidation() throws IOException { Schema schema = Schema.builder() @@ -480,11 +506,38 @@ void testJSONValidation() throws IOException { .withLines(true) .withMixedTypesAsStrings(true) .withNormalizeWhitespace(true) + .withRecoverWithNull(true) + .withStrictValidation(true) .withLeadingZeros(false) + .withNonNumericNumbers(false) + .build(); + try (Table expected = new Table.TestBuilder() + .column("true", "false", null, null, "true", "1", "0", null, "-0", null, + null, "-0.1", null) + .build(); + MultiBufferDataSource source = sourceFrom(JSON_VALIDATION_BUFFER); + Table table = Table.readJSON(schema, opts, source, (int)expected.getRowCount())) { + assertTablesAreEqual(expected, 
table); + } + } + + @Test + void testJSONValidationLeadingZeros() throws IOException { + Schema schema = Schema.builder() + .column(DType.STRING, "a") + .build(); + JSONOptions opts = JSONOptions.builder() + .withLines(true) + .withMixedTypesAsStrings(true) + .withNormalizeWhitespace(true) .withRecoverWithNull(true) + .withStrictValidation(true) + .withLeadingZeros(true) + .withNonNumericNumbers(false) .build(); try (Table expected = new Table.TestBuilder() - .column("true", "false", null, null, "true", "1", "0", null, "-0", null, null, "-0.1") + .column("true", "false", null, null, "true", "1", "0", null, "-0", "-01", + "01", "-0.1", "-00.1") .build(); MultiBufferDataSource source = sourceFrom(JSON_VALIDATION_BUFFER); Table table = Table.readJSON(schema, opts, source, (int)expected.getRowCount())) { From 22147c88aed29c099128573e5cdaa17b5caf8248 Mon Sep 17 00:00:00 2001 From: "Robert (Bobby) Evans" Date: Tue, 18 Jun 2024 08:43:57 -0500 Subject: [PATCH 4/9] WIP --- cpp/src/io/json/process_tokens.cu | 53 ++++++++++---- .../test/java/ai/rapids/cudf/TableTest.java | 71 +++++++++++++++---- 2 files changed, 97 insertions(+), 27 deletions(-) diff --git a/cpp/src/io/json/process_tokens.cu b/cpp/src/io/json/process_tokens.cu index 86cc40e1382..3ea2ab2103f 100644 --- a/cpp/src/io/json/process_tokens.cu +++ b/cpp/src/io/json/process_tokens.cu @@ -60,11 +60,26 @@ enum class number_state { fraction, start_exponent, // not a complete state after_sign_exponent, // not a complete state - exponent, - infinity_partial, // not a complete state and includes a counter - infinity + exponent }; +__device__ inline bool substr_eq(const char * data, + SymbolOffsetT const start, + SymbolOffsetT const end, + SymbolOffsetT const expected_len, + const char * expected) { + if (end - start != expected_len) { + return false; + } else { + for (auto idx = 0; idx < expected_len; idx++) { + if (data[start + idx] != expected[idx]) { + return false; + } + } + } + return true; +} + void 
validate_token_stream(device_span d_input, device_span tokens, device_span token_indices, @@ -76,25 +91,33 @@ void validate_token_stream(device_span d_input, auto validate_tokens = [data = d_input.data(), allow_numeric_leading_zeros = - options.is_allowed_numeric_leading_zeros()] __device__(int32_t i, - SymbolOffsetT start, - SymbolOffsetT end) -> bool { + options.is_allowed_numeric_leading_zeros(), + allow_nonnumeric = + options.is_allowed_nonnumeric_numbers()] __device__(int32_t i, + SymbolOffsetT start, + SymbolOffsetT end) -> bool { // This validates an unquoted value. A value must match https://www.json.org/json-en.html // but the leading and training whitespace should already have been removed, and is not // a string - //for (SymbolOffsetT idx = start; idx < end; idx++) { - // printf("%i/%i VALUE CHAR %i/%i => '%c'\n", threadIdx.x, i, idx, end, data[idx]); - //} + for (SymbolOffsetT idx = start; idx < end; idx++) { + printf("%i VALUE CHAR %i/%i => '%c'\n", i, idx, end, data[idx]); + } + printf("\t%i VALUE CHAR END\n", i); // TODO do I need to worry about an empty value??? 
- auto len = end - start; auto c = data[start]; if ('n' == c) { - return (4 == len) && data[start + 1] == 'u' && data[start + 2] == 'l' && data[start + 3] == 'l'; + return substr_eq(data, start, end, 4, "null"); } else if ('t' == c) { - return (4 == len) && data[start + 1] == 'r' && data[start + 2] == 'u' && data[start + 3] == 'e'; + return substr_eq(data, start, end, 4, "true"); } else if ('f' == c) { - return (5 == len) && data[start + 1] == 'a' && data[start + 2] == 'l' && data[start + 3] == 's' && data[start + 4] == 'e'; + return substr_eq(data, start, end, 5, "false"); + } else if (allow_nonnumeric && c == 'N') { + return substr_eq(data, start, end, 3, "NaN"); + } else if (allow_nonnumeric && c == 'I') { + return substr_eq(data, start, end, 3, "INF") || substr_eq(data, start, end, 8, "Infinity"); + } else if (allow_nonnumeric && c == '+') { + return substr_eq(data, start, end, 9, "+Infinity"); } else if ('-' == c || c <= '9' && 'c' >= '0') { // number auto num_state = number_state::start; @@ -117,6 +140,8 @@ void validate_token_stream(device_span d_input, num_state = number_state::leading_zero; } else if (c >= '1' && c <= '9') { num_state = number_state::whole; + } else if (allow_nonnumeric && 'I' == c) { + return substr_eq(data, start, end, 4, "-INF") || substr_eq(data, start, end, 9, "-Infinity"); } else { return false; } @@ -181,7 +206,7 @@ void validate_token_stream(device_span d_input, num_state != number_state::start_exponent && num_state != number_state::saw_neg; } else { - //printf("%i/%i OTHER\n", threadIdx.x, i); + printf("%i OTHER %c\n", i, c); return false; } }; diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index f88926cff8b..4f167805c25 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -470,7 +470,16 @@ void testReadSingleQuotesJSONFileKeepQuotes() throws IOException { "{\"a\": 01}\n" + "{\"a\": -0.1}\n" + - 
"{\"a\": -00.1}\n" + "{\"a\": -00.1}\n" + + "{\"a\": NaN}\n" + + "{\"a\": INF}\n" + + "{\"a\": -INF}\n" + + "{\"a\": +Infinity}\n" + + "{\"a\": Infinity}\n" + + "{\"a\": -Infinity}\n" + + "{\"a\": INFinity}\n" + + + "{\"a\":\"3710-11-10T02:46:58.732Z\"}\n" ).getBytes(StandardCharsets.UTF_8); @Test @@ -479,17 +488,20 @@ void testJSONValidationNoStrict() throws IOException { .column(DType.STRING, "a") .build(); JSONOptions opts = JSONOptions.builder() - .withLines(true) + .withRecoverWithNull(true) .withMixedTypesAsStrings(true) .withNormalizeWhitespace(true) - .withRecoverWithNull(true) + .withKeepQuotes(true) + .withNormalizeSingleQuotes(true) .withStrictValidation(false) .withLeadingZeros(false) .withNonNumericNumbers(false) .build(); try (Table expected = new Table.TestBuilder() - .column("true", "false", null, "true", "true", "1", "0", "-", "-0", "-01", - "01", "-0.1", "-00.1") + .column( + "true", "false", null, "true", "true", "1", "0", "-", "-0", "-01", + "01", "-0.1", "-00.1", "NaN", "INF", "-INF", "+Infinity", "Infinity", "-Infinity", "INFinity", + "\"3710-11-10T02:46:58.732Z\"") .build(); MultiBufferDataSource source = sourceFrom(JSON_VALIDATION_BUFFER); Table table = Table.readJSON(schema, opts, source, (int)expected.getRowCount())) { @@ -503,17 +515,20 @@ void testJSONValidation() throws IOException { .column(DType.STRING, "a") .build(); JSONOptions opts = JSONOptions.builder() - .withLines(true) + .withRecoverWithNull(true) .withMixedTypesAsStrings(true) .withNormalizeWhitespace(true) - .withRecoverWithNull(true) + .withKeepQuotes(true) + .withNormalizeSingleQuotes(true) .withStrictValidation(true) .withLeadingZeros(false) .withNonNumericNumbers(false) .build(); try (Table expected = new Table.TestBuilder() - .column("true", "false", null, null, "true", "1", "0", null, "-0", null, - null, "-0.1", null) + .column( + "true", "false", null, null, "true", "1", "0", null, "-0", null, + null, "-0.1", null, null, null, null, null, null, null, null, + 
"\"3710-11-10T02:46:58.732Z\"") .build(); MultiBufferDataSource source = sourceFrom(JSON_VALIDATION_BUFFER); Table table = Table.readJSON(schema, opts, source, (int)expected.getRowCount())) { @@ -527,17 +542,47 @@ void testJSONValidationLeadingZeros() throws IOException { .column(DType.STRING, "a") .build(); JSONOptions opts = JSONOptions.builder() - .withLines(true) + .withRecoverWithNull(true) .withMixedTypesAsStrings(true) .withNormalizeWhitespace(true) - .withRecoverWithNull(true) + .withKeepQuotes(true) + .withNormalizeSingleQuotes(true) .withStrictValidation(true) .withLeadingZeros(true) .withNonNumericNumbers(false) .build(); try (Table expected = new Table.TestBuilder() - .column("true", "false", null, null, "true", "1", "0", null, "-0", "-01", - "01", "-0.1", "-00.1") + .column( + "true", "false", null, null, "true", "1", "0", null, "-0", "-01", + "01", "-0.1", "-00.1", null, null, null, null, null, null, null, + "\"3710-11-10T02:46:58.732Z\"") + .build(); + MultiBufferDataSource source = sourceFrom(JSON_VALIDATION_BUFFER); + Table table = Table.readJSON(schema, opts, source, (int)expected.getRowCount())) { + assertTablesAreEqual(expected, table); + } + } + + @Test + void testJSONValidationNonNumeric() throws IOException { + Schema schema = Schema.builder() + .column(DType.STRING, "a") + .build(); + JSONOptions opts = JSONOptions.builder() + .withRecoverWithNull(true) + .withMixedTypesAsStrings(true) + .withNormalizeWhitespace(true) + .withKeepQuotes(true) + .withNormalizeSingleQuotes(true) + .withStrictValidation(true) + .withLeadingZeros(false) + .withNonNumericNumbers(true) + .build(); + try (Table expected = new Table.TestBuilder() + .column( + "true", "false", null, null, "true", "1", "0", null, "-0", null, + null, "-0.1", null, "NaN", "INF", "-INF", "+Infinity", "Infinity", "-Infinity", null, + "\"3710-11-10T02:46:58.732Z\"") .build(); MultiBufferDataSource source = sourceFrom(JSON_VALIDATION_BUFFER); Table table = Table.readJSON(schema, opts, 
source, (int)expected.getRowCount())) { From 0459fadb6f5da4a7d3dc6322f581d5ea2847e10f Mon Sep 17 00:00:00 2001 From: "Robert (Bobby) Evans" Date: Mon, 24 Jun 2024 15:49:27 -0500 Subject: [PATCH 5/9] Fix for +INF --- cpp/src/io/json/process_tokens.cu | 4 ++-- java/src/test/java/ai/rapids/cudf/TableTest.java | 15 ++++++++------- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/cpp/src/io/json/process_tokens.cu b/cpp/src/io/json/process_tokens.cu index 3ea2ab2103f..12b56ed6886 100644 --- a/cpp/src/io/json/process_tokens.cu +++ b/cpp/src/io/json/process_tokens.cu @@ -115,9 +115,9 @@ void validate_token_stream(device_span d_input, } else if (allow_nonnumeric && c == 'N') { return substr_eq(data, start, end, 3, "NaN"); } else if (allow_nonnumeric && c == 'I') { - return substr_eq(data, start, end, 3, "INF") || substr_eq(data, start, end, 8, "Infinity"); + return substr_eq(data, start, end, 8, "Infinity"); } else if (allow_nonnumeric && c == '+') { - return substr_eq(data, start, end, 9, "+Infinity"); + return substr_eq(data, start, end, 4, "+INF") || substr_eq(data, start, end, 9, "+Infinity"); } else if ('-' == c || c <= '9' && 'c' >= '0') { // number auto num_state = number_state::start; diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index 016f333c036..aca97910e48 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -473,12 +473,13 @@ void testReadSingleQuotesJSONFileKeepQuotes() throws IOException { "{\"a\": -00.1}\n" + "{\"a\": NaN}\n" + "{\"a\": INF}\n" + + "{\"a\": +INF}\n" + "{\"a\": -INF}\n" + "{\"a\": +Infinity}\n" + "{\"a\": Infinity}\n" + "{\"a\": -Infinity}\n" + - "{\"a\": INFinity}\n" + + "{\"a\": INFinity}\n" + "{\"a\":\"3710-11-10T02:46:58.732Z\"}\n" ).getBytes(StandardCharsets.UTF_8); @@ -500,8 +501,8 @@ void testJSONValidationNoStrict() throws IOException { try (Table expected = new Table.TestBuilder() 
.column( "true", "false", null, "true", "true", "1", "0", "-", "-0", "-01", - "01", "-0.1", "-00.1", "NaN", "INF", "-INF", "+Infinity", "Infinity", "-Infinity", "INFinity", - "\"3710-11-10T02:46:58.732Z\"") + "01", "-0.1", "-00.1", "NaN", "INF", "+INF", "-INF", "+Infinity", "Infinity", "-Infinity", + "INFinity", "\"3710-11-10T02:46:58.732Z\"") .build(); MultiBufferDataSource source = sourceFrom(JSON_VALIDATION_BUFFER); Table table = Table.readJSON(schema, opts, source, (int)expected.getRowCount())) { @@ -528,7 +529,7 @@ void testJSONValidation() throws IOException { .column( "true", "false", null, null, "true", "1", "0", null, "-0", null, null, "-0.1", null, null, null, null, null, null, null, null, - "\"3710-11-10T02:46:58.732Z\"") + null, "\"3710-11-10T02:46:58.732Z\"") .build(); MultiBufferDataSource source = sourceFrom(JSON_VALIDATION_BUFFER); Table table = Table.readJSON(schema, opts, source, (int)expected.getRowCount())) { @@ -555,7 +556,7 @@ void testJSONValidationLeadingZeros() throws IOException { .column( "true", "false", null, null, "true", "1", "0", null, "-0", "-01", "01", "-0.1", "-00.1", null, null, null, null, null, null, null, - "\"3710-11-10T02:46:58.732Z\"") + null, "\"3710-11-10T02:46:58.732Z\"") .build(); MultiBufferDataSource source = sourceFrom(JSON_VALIDATION_BUFFER); Table table = Table.readJSON(schema, opts, source, (int)expected.getRowCount())) { @@ -581,8 +582,8 @@ void testJSONValidationNonNumeric() throws IOException { try (Table expected = new Table.TestBuilder() .column( "true", "false", null, null, "true", "1", "0", null, "-0", null, - null, "-0.1", null, "NaN", "INF", "-INF", "+Infinity", "Infinity", "-Infinity", null, - "\"3710-11-10T02:46:58.732Z\"") + null, "-0.1", null, "NaN", null, "+INF", "-INF", "+Infinity", "Infinity", "-Infinity", + null, "\"3710-11-10T02:46:58.732Z\"") .build(); MultiBufferDataSource source = sourceFrom(JSON_VALIDATION_BUFFER); Table table = Table.readJSON(schema, opts, source, 
(int)expected.getRowCount())) { From 52b038919d97d6c320d8c8b93aa314cf88f38626 Mon Sep 17 00:00:00 2001 From: "Robert (Bobby) Evans" Date: Tue, 25 Jun 2024 09:38:05 -0500 Subject: [PATCH 6/9] Fixed a few issues with number formats --- cpp/src/io/json/process_tokens.cu | 27 +++++++++++----- .../test/java/ai/rapids/cudf/TableTest.java | 32 +++++++++++++------ 2 files changed, 42 insertions(+), 17 deletions(-) diff --git a/cpp/src/io/json/process_tokens.cu b/cpp/src/io/json/process_tokens.cu index 12b56ed6886..74d546b399d 100644 --- a/cpp/src/io/json/process_tokens.cu +++ b/cpp/src/io/json/process_tokens.cu @@ -57,6 +57,7 @@ enum class number_state { saw_neg, // not a complete state leading_zero, whole, + saw_radix, // not a complete state fraction, start_exponent, // not a complete state after_sign_exponent, // not a complete state @@ -99,10 +100,10 @@ void validate_token_stream(device_span d_input, // This validates an unquoted value. A value must match https://www.json.org/json-en.html // but the leading and training whitespace should already have been removed, and is not // a string - for (SymbolOffsetT idx = start; idx < end; idx++) { - printf("%i VALUE CHAR %i/%i => '%c'\n", i, idx, end, data[idx]); - } - printf("\t%i VALUE CHAR END\n", i); + //for (SymbolOffsetT idx = start; idx < end; idx++) { + // printf("%i VALUE CHAR %i/%i => '%c'\n", i, idx, end, data[idx]); + //} + //printf("\t%i VALUE CHAR END\n", i); // TODO do I need to worry about an empty value??? auto c = data[start]; @@ -150,7 +151,7 @@ void validate_token_stream(device_span d_input, if (allow_numeric_leading_zeros && c >= '0' && c <= '9') { num_state = number_state::whole; } else if ('.' == c) { - num_state = number_state::fraction; + num_state = number_state::saw_radix; } else if ('e' == c || 'E' == c) { num_state = number_state::start_exponent; } else { @@ -161,6 +162,15 @@ void validate_token_stream(device_span d_input, if (c >= '0' && c <= '9') { num_state = number_state::whole; } else if ('.' 
== c) { + num_state = number_state::saw_radix; + } else if ('e' == c || 'E' == c) { + num_state = number_state::start_exponent; + } else { + return false; + } + break; + case number_state::saw_radix: + if (c >= '0' && c <= '9') { num_state = number_state::fraction; } else if ('e' == c || 'E' == c) { num_state = number_state::start_exponent; @@ -178,7 +188,7 @@ void validate_token_stream(device_span d_input, } break; case number_state::start_exponent: - if ('-' == c || '-' == c) { + if ('+' == c || '-' == c) { num_state = number_state::after_sign_exponent; } else if (c >= '0' && c <= '9') { num_state = number_state::exponent; @@ -204,9 +214,10 @@ void validate_token_stream(device_span d_input, } return num_state != number_state::after_sign_exponent && num_state != number_state::start_exponent && - num_state != number_state::saw_neg; + num_state != number_state::saw_neg && + num_state != number_state::saw_radix; } else { - printf("%i OTHER %c\n", i, c); + //printf("%i OTHER %c\n", i, c); return false; } }; diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index aca97910e48..e90437d3a12 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -480,11 +480,21 @@ void testReadSingleQuotesJSONFileKeepQuotes() throws IOException { "{\"a\": -Infinity}\n" + "{\"a\": INFinity}\n" + - "{\"a\":\"3710-11-10T02:46:58.732Z\"}\n" + "{\"a\":\"3710-11-10T02:46:58.732Z\"}\n" + + "{\"a\":12.}\n" + + "{\"a\": -3.4e+38}\n" + + "{\"a\": -3.4e-38}\n" + + "{\"a\": 1.4e38}\n" + + "{\"a\": -3.4E+38}\n" + + "{\"a\": -3.4E-38}\n" + + "{\"a\": 1.4E38}\n" + + "{\"a\": -3.4E+}\n" + + + "{\"a\": -3.4E-}\n" ).getBytes(StandardCharsets.UTF_8); @Test - void testJSONValidationNoStrict() throws IOException { + void testJSONValidationNoStrict() { Schema schema = Schema.builder() .column(DType.STRING, "a") .build(); @@ -502,7 +512,8 @@ void testJSONValidationNoStrict() throws 
IOException { .column( "true", "false", null, "true", "true", "1", "0", "-", "-0", "-01", "01", "-0.1", "-00.1", "NaN", "INF", "+INF", "-INF", "+Infinity", "Infinity", "-Infinity", - "INFinity", "\"3710-11-10T02:46:58.732Z\"") + "INFinity", "\"3710-11-10T02:46:58.732Z\"", "12.", "-3.4e+38", "-3.4e-38", "1.4e38", "-3.4E+38", "-3.4E-38", "1.4E38", "-3.4E+", + "-3.4E-") .build(); MultiBufferDataSource source = sourceFrom(JSON_VALIDATION_BUFFER); Table table = Table.readJSON(schema, opts, source, (int)expected.getRowCount())) { @@ -511,7 +522,7 @@ void testJSONValidationNoStrict() throws IOException { } @Test - void testJSONValidation() throws IOException { + void testJSONValidation() { Schema schema = Schema.builder() .column(DType.STRING, "a") .build(); @@ -529,7 +540,8 @@ void testJSONValidation() throws IOException { .column( "true", "false", null, null, "true", "1", "0", null, "-0", null, null, "-0.1", null, null, null, null, null, null, null, null, - null, "\"3710-11-10T02:46:58.732Z\"") + null, "\"3710-11-10T02:46:58.732Z\"", null, "-3.4e+38", "-3.4e-38", "1.4e38", "-3.4E+38", "-3.4E-38", "1.4E38", null, + null) .build(); MultiBufferDataSource source = sourceFrom(JSON_VALIDATION_BUFFER); Table table = Table.readJSON(schema, opts, source, (int)expected.getRowCount())) { @@ -538,7 +550,7 @@ void testJSONValidation() throws IOException { } @Test - void testJSONValidationLeadingZeros() throws IOException { + void testJSONValidationLeadingZeros() { Schema schema = Schema.builder() .column(DType.STRING, "a") .build(); @@ -556,7 +568,8 @@ void testJSONValidationLeadingZeros() throws IOException { .column( "true", "false", null, null, "true", "1", "0", null, "-0", "-01", "01", "-0.1", "-00.1", null, null, null, null, null, null, null, - null, "\"3710-11-10T02:46:58.732Z\"") + null, "\"3710-11-10T02:46:58.732Z\"", null, "-3.4e+38", "-3.4e-38", "1.4e38", "-3.4E+38", "-3.4E-38", "1.4E38", null, + null) .build(); MultiBufferDataSource source = 
sourceFrom(JSON_VALIDATION_BUFFER); Table table = Table.readJSON(schema, opts, source, (int)expected.getRowCount())) { @@ -565,7 +578,7 @@ void testJSONValidationLeadingZeros() throws IOException { } @Test - void testJSONValidationNonNumeric() throws IOException { + void testJSONValidationNonNumeric() { Schema schema = Schema.builder() .column(DType.STRING, "a") .build(); @@ -583,7 +596,8 @@ void testJSONValidationNonNumeric() throws IOException { .column( "true", "false", null, null, "true", "1", "0", null, "-0", null, null, "-0.1", null, "NaN", null, "+INF", "-INF", "+Infinity", "Infinity", "-Infinity", - null, "\"3710-11-10T02:46:58.732Z\"") + null, "\"3710-11-10T02:46:58.732Z\"", null, "-3.4e+38", "-3.4e-38", "1.4e38", "-3.4E+38", "-3.4E-38", "1.4E38", null, + null) .build(); MultiBufferDataSource source = sourceFrom(JSON_VALIDATION_BUFFER); Table table = Table.readJSON(schema, opts, source, (int)expected.getRowCount())) { From 4716288b069212b9634bea15e027f3b1025a1c0a Mon Sep 17 00:00:00 2001 From: "Robert (Bobby) Evans" Date: Wed, 26 Jun 2024 11:59:27 -0500 Subject: [PATCH 7/9] Added support for unquoted control chars --- cpp/include/cudf/io/json.hpp | 35 +++++++++++-- cpp/src/io/json/process_tokens.cu | 39 ++++++++++++-- .../main/java/ai/rapids/cudf/JSONOptions.java | 18 ++++++- java/src/main/java/ai/rapids/cudf/Table.java | 19 +++++-- java/src/main/native/src/TableJni.cpp | 12 ++++- .../test/java/ai/rapids/cudf/TableTest.java | 51 +++++++++++++++++-- 6 files changed, 155 insertions(+), 19 deletions(-) diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index 7d7f93b9ed4..b739b7d7292 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -335,6 +335,14 @@ class json_reader_options { */ [[nodiscard]] bool is_allowed_nonnumeric_numbers() const { return _allow_nonnumeric_numbers; } + /** + * @brief Whether in a quoted string should characters greater than or equal to 0 and less than 32 be allowed + * without some 
form of escaping. Strict validation must be enabled for this to work. + * + * @return true if unquoted control chars are allowed. + */ + [[nodiscard]] bool is_allowed_unquoted_control_chars() const { return _allow_unquoted_control_chars; } + /** * @brief Returns additional values to recognize as null values. * @@ -473,14 +481,14 @@ class json_reader_options { void set_recovery_mode(json_recovery_mode_t val) { _recovery_mode = val; } /** - * @brief Set Whether strict validation is enabled or not. + * @brief Set whether strict validation is enabled or not. * * @param val Boolean value to indicate whether strict validation is enabled. */ void set_strict_validation(bool val) { _strict_validation = val; } /** - * @brief Set Whether leading zeros are allowed in numeric values. strict validation + * @brief Set whether leading zeros are allowed in numeric values. strict validation * must be enabled for this to work. * * @param val Boolean value to indicate whether leading zeros are allowed in numeric values @@ -495,6 +503,15 @@ class json_reader_options { */ void allow_nonnumeric_numbers(bool val) { _allow_nonnumeric_numbers = val; } + /** + * @brief Set whether in a quoted string should characters greater than or equal to 0 + * and less than 32 be allowed without some form of escaping. Strict validation must + * be enabled for this to work. + * + * @param val true to indicate whether unquoted control chars are allowed. + */ + void allow_unquoted_control_chars(bool val) { _allow_unquoted_control_chars = val; } + /** * @brief Sets additional values to recognize as null values. * @@ -736,7 +753,7 @@ class json_reader_options_builder { } /** - * @brief Set whether unquoted number values are valid JSON. The values are NaN, + * @brief Set whether specific unquoted number values are valid JSON. The values are NaN, * +INF, -INF, +Infinity, Infinity, and -Infinity. * strict validation must be enabled for this to have any effect.
* @@ -749,6 +766,18 @@ class json_reader_options_builder { return *this; } + /** + * @brief Set whether chars >= 0 and < 32 are allowed in a quoted string without + * some form of escaping. strict validation must be enabled for this to have any effect. + * + * @param val Boolean value to indicate if unquoted control chars are allowed or not. + */ + json_reader_options_builder& unquoted_control_chars(bool val) + { + options.allow_unquoted_control_chars(val); + return *this; + } + /** * @brief Sets additional values to recognize as null values. * diff --git a/cpp/src/io/json/process_tokens.cu b/cpp/src/io/json/process_tokens.cu index 74d546b399d..49f8503351d 100644 --- a/cpp/src/io/json/process_tokens.cu +++ b/cpp/src/io/json/process_tokens.cu @@ -89,7 +89,7 @@ void validate_token_stream(device_span d_input, { if (options.is_strict_validation()) { using token_t = cudf::io::json::token_t; - auto validate_tokens = + auto validate_values = [data = d_input.data(), allow_numeric_leading_zeros = options.is_allowed_numeric_leading_zeros(), @@ -221,13 +221,46 @@ void validate_token_stream(device_span d_input, return false; } }; + + auto validate_strings = + [data = d_input.data(), + allow_unquoted_control_chars = + options.is_allowed_unquoted_control_chars()] __device__(int32_t i, + SymbolOffsetT start, + SymbolOffsetT end) -> bool { + // This validates a quoted string. A string must match https://www.json.org/json-en.html + // but we already know that it has a starting and ending " and all white space has been + // stripped out. 
+ //for (SymbolOffsetT idx = start + 1; idx < end; idx++) { + // printf("%i STR CHAR %i/%i => '%c'\n", i, idx, end, data[idx]); + //} + //printf("\t%i STR CHAR END\n", i); + + for (SymbolOffsetT idx = start + 1; idx < end; idx++) { + auto c = data[idx]; + if (!allow_unquoted_control_chars && c >= 0 && c < 32) { + //printf("%i FOUND INVALID CHAR AT %i %i\n", i, idx, c); + return false; + //} else { + // printf("%i FOUND GOOD CHAR AT %i '%c'\n", i, idx, c); + } + } + //printf("\t%i STR CHAR END\n", i); + + return true; + }; + auto num_tokens = tokens.size(); auto count_it = thrust::make_counting_iterator(0); auto predicate = [tokens = tokens.begin(), token_indices = token_indices.begin(), - validate_tokens] __device__(auto i) -> bool { + validate_values, + validate_strings] __device__(auto i) -> bool { if (tokens[i] == token_t::ValueEnd) { - return !validate_tokens(i, token_indices[i - 1], token_indices[i]); + return !validate_values(i, token_indices[i - 1], token_indices[i]); + } else if (tokens[i] == token_t::FieldNameEnd || + tokens[i] == token_t::StringEnd) { + return !validate_strings(i, token_indices[i - 1], token_indices[i]); } return false; }; diff --git a/java/src/main/java/ai/rapids/cudf/JSONOptions.java b/java/src/main/java/ai/rapids/cudf/JSONOptions.java index d3f00514fc9..50cce4590c1 100644 --- a/java/src/main/java/ai/rapids/cudf/JSONOptions.java +++ b/java/src/main/java/ai/rapids/cudf/JSONOptions.java @@ -37,6 +37,7 @@ public final class JSONOptions extends ColumnFilterOptions { private final boolean allowLeadingZeros; private final boolean strictValidation; private final boolean allowNonNumericNumbers; + private final boolean allowUnquotedControlChars; private JSONOptions(Builder builder) { super(builder); @@ -50,6 +51,7 @@ private JSONOptions(Builder builder) { strictValidation = builder.strictValidation; allowLeadingZeros = builder.allowLeadingZeros; allowNonNumericNumbers = builder.allowNonNumericNumbers; + allowUnquotedControlChars = 
builder.allowUnquotedControlChars; } public boolean isDayFirst() { @@ -93,6 +95,10 @@ public boolean nonNumericNumbersAllowed() { return allowNonNumericNumbers; } + public boolean unquotedControlChars() { + return allowUnquotedControlChars; + } + @Override String[] getIncludeColumnNames() { throw new UnsupportedOperationException("JSON reader didn't support column prune"); @@ -104,6 +110,7 @@ public static Builder builder() { public static final class Builder extends ColumnFilterOptions.Builder { private boolean strictValidation = false; + private boolean allowUnquotedControlChars = true; private boolean allowNonNumericNumbers = false; private boolean allowLeadingZeros = false; private boolean dayFirst = false; @@ -134,7 +141,7 @@ public Builder withLeadingZeros(boolean isAllowed) { } /** - * Should non-numeric numbers be allowed or not Strict validation + * Should non-numeric numbers be allowed or not. Strict validation * must be enabled for this to have any effect. */ public Builder withNonNumericNumbers(boolean isAllowed) { @@ -142,6 +149,15 @@ public Builder withNonNumericNumbers(boolean isAllowed) { return this; } + /** + * Should unquoted control chars be allowed in strings. Strict validation + * must be enabled for this to have any effect. + */ + public Builder withUnquotedControlChars(boolean isAllowed) { + allowUnquotedControlChars = isAllowed; + return this; + } + // TODO need to finish this for other configs... 
/** diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index e53032b564e..2ac3ed01d3a 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -257,7 +257,8 @@ private static native long readJSON(int[] numChildren, String[] columnNames, boolean keepStringQuotes, boolean strictValidation, boolean allowLeadingZeros, - boolean allowNonNumericNumbers) throws CudfException; + boolean allowNonNumericNumbers, + boolean allowUnquotedControl) throws CudfException; private static native long readJSONFromDataSource(int[] numChildren, String[] columnNames, int[] dTypeIds, int[] dTypeScales, @@ -270,6 +271,7 @@ private static native long readJSONFromDataSource(int[] numChildren, String[] co boolean strictValidation, boolean allowLeadingZeros, boolean allowNonNumericNumbers, + boolean allowUnquotedControl, long dsHandle) throws CudfException; private static native long readAndInferJSONFromDataSource(boolean dayFirst, boolean lines, @@ -281,6 +283,7 @@ private static native long readAndInferJSONFromDataSource(boolean dayFirst, bool boolean strictValidation, boolean allowLeadingZeros, boolean allowNonNumericNumbers, + boolean allowUnquotedControl, long dsHandle) throws CudfException; private static native long readAndInferJSON(long address, long length, @@ -293,7 +296,8 @@ private static native long readAndInferJSON(long address, long length, boolean keepStringQuotes, boolean strictValidation, boolean allowLeadingZeros, - boolean allowNonNumericNumbers) throws CudfException; + boolean allowNonNumericNumbers, + boolean allowUnquotedControl) throws CudfException; /** * Read in Parquet formatted data. 
@@ -1287,7 +1291,8 @@ public static Table readJSON(Schema schema, JSONOptions opts, File path) { opts.keepStringQuotes(), opts.strictValidation(), opts.leadingZerosAllowed(), - opts.nonNumericNumbersAllowed()))) { + opts.nonNumericNumbersAllowed(), + opts.unquotedControlChars()))) { return gatherJSONColumns(schema, twm, -1); } @@ -1369,7 +1374,8 @@ public static TableWithMeta readJSON(JSONOptions opts, HostMemoryBuffer buffer, opts.keepStringQuotes(), opts.strictValidation(), opts.leadingZerosAllowed(), - opts.nonNumericNumbersAllowed())); + opts.nonNumericNumbersAllowed(), + opts.unquotedControlChars())); } /** @@ -1390,6 +1396,7 @@ public static TableWithMeta readAndInferJSON(JSONOptions opts, DataSource ds) { opts.strictValidation(), opts.leadingZerosAllowed(), opts.nonNumericNumbersAllowed(), + opts.unquotedControlChars(), dsHandle)); return twm; } finally { @@ -1442,7 +1449,8 @@ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer b opts.keepStringQuotes(), opts.strictValidation(), opts.leadingZerosAllowed(), - opts.nonNumericNumbersAllowed()))) { + opts.nonNumericNumbersAllowed(), + opts.unquotedControlChars()))) { return gatherJSONColumns(schema, twm, emptyRowCount); } } @@ -1480,6 +1488,7 @@ public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds, int opts.strictValidation(), opts.leadingZerosAllowed(), opts.nonNumericNumbersAllowed(), + opts.unquotedControlChars(), dsHandle))) { return gatherJSONColumns(schema, twm, emptyRowCount); } finally { diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 39638727569..424ace416e8 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -1613,6 +1613,7 @@ Java_ai_rapids_cudf_Table_readAndInferJSONFromDataSource(JNIEnv* env, jboolean strict_validation, jboolean allow_leading_zeros, jboolean allow_nonnumeric_numbers, + jboolean allow_unquoted_control, jlong ds_handle) { JNI_NULL_CHECK(env, 
ds_handle, "no data source handle given", 0); @@ -1635,6 +1636,7 @@ Java_ai_rapids_cudf_Table_readAndInferJSONFromDataSource(JNIEnv* env, .strict_validation(strict_validation) .numeric_leading_zeros(allow_leading_zeros) .nonnumeric_numbers(allow_nonnumeric_numbers) + .unquoted_control_chars(allow_unquoted_control) .keep_quotes(keep_quotes); auto result = @@ -1658,7 +1660,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSON(JNIEnv* env, jboolean keep_quotes, jboolean strict_validation, jboolean allow_leading_zeros, - jboolean allow_nonnumeric_numbers) + jboolean allow_nonnumeric_numbers, + jboolean allow_unquoted_control) { JNI_NULL_CHECK(env, buffer, "buffer cannot be null", 0); if (buffer_length <= 0) { @@ -1683,6 +1686,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSON(JNIEnv* env, .strict_validation(strict_validation) .numeric_leading_zeros(allow_leading_zeros) .nonnumeric_numbers(allow_nonnumeric_numbers) + .unquoted_control_chars(allow_unquoted_control) .mixed_types_as_string(mixed_types_as_string) .keep_quotes(keep_quotes); @@ -1792,6 +1796,7 @@ Java_ai_rapids_cudf_Table_readJSONFromDataSource(JNIEnv* env, jboolean strict_validation, jboolean allow_leading_zeros, jboolean allow_nonnumeric_numbers, + jboolean allow_unquoted_control, jlong ds_handle) { JNI_NULL_CHECK(env, ds_handle, "no data source handle given", 0); @@ -1829,6 +1834,7 @@ Java_ai_rapids_cudf_Table_readJSONFromDataSource(JNIEnv* env, .strict_validation(strict_validation) .numeric_leading_zeros(allow_leading_zeros) .nonnumeric_numbers(allow_nonnumeric_numbers) + .unquoted_control_chars(allow_unquoted_control) .keep_quotes(keep_quotes); if (!n_types.is_null()) { @@ -1882,7 +1888,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(JNIEnv* env, jboolean keep_quotes, jboolean strict_validation, jboolean allow_leading_zeros, - jboolean allow_nonnumeric_numbers) + jboolean allow_nonnumeric_numbers, + jboolean allow_unquoted_control) { bool read_buffer 
= true; if (buffer == 0) { @@ -1934,6 +1941,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(JNIEnv* env, .strict_validation(strict_validation) .numeric_leading_zeros(allow_leading_zeros) .nonnumeric_numbers(allow_nonnumeric_numbers) + .unquoted_control_chars(allow_unquoted_control) .keep_quotes(keep_quotes); if (!n_types.is_null()) { diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index e90437d3a12..10db8ff12b2 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -490,7 +490,15 @@ void testReadSingleQuotesJSONFileKeepQuotes() throws IOException { "{\"a\": 1.4E38}\n" + "{\"a\": -3.4E+}\n" + - "{\"a\": -3.4E-}\n" + "{\"a\": -3.4E-}\n" + + "{\"a\": \"A\u0000B\"}\n" + + "{\"a\": \"A\\u0000B\"}\n" + + "{\"a\": \"A\u0001B\"}\n" + + "{\"a\": \"A\\u0001B\"}\n" + + "{\"a\": \"A\u001FB\"}\n" + + "{\"a\": \"A\\u001FB\"}\n" + + "{\"a\": \"A\u0020B\"}\n" + + "{\"a\": \"A\\u0020B\"}\n" ).getBytes(StandardCharsets.UTF_8); @Test @@ -507,13 +515,14 @@ void testJSONValidationNoStrict() { .withStrictValidation(false) .withLeadingZeros(false) .withNonNumericNumbers(false) + .withUnquotedControlChars(true) .build(); try (Table expected = new Table.TestBuilder() .column( "true", "false", null, "true", "true", "1", "0", "-", "-0", "-01", "01", "-0.1", "-00.1", "NaN", "INF", "+INF", "-INF", "+Infinity", "Infinity", "-Infinity", "INFinity", "\"3710-11-10T02:46:58.732Z\"", "12.", "-3.4e+38", "-3.4e-38", "1.4e38", "-3.4E+38", "-3.4E-38", "1.4E38", "-3.4E+", - "-3.4E-") + "-3.4E-", "\"A\u0000B\"", "\"A\u0000B\"", "\"A\u0001B\"", "\"A\u0001B\"", "\"A\u001FB\"", "\"A\u001FB\"", "\"A B\"", "\"A B\"") .build(); MultiBufferDataSource source = sourceFrom(JSON_VALIDATION_BUFFER); Table table = Table.readJSON(schema, opts, source, (int)expected.getRowCount())) { @@ -535,13 +544,14 @@ void testJSONValidation() { .withStrictValidation(true) 
.withLeadingZeros(false) .withNonNumericNumbers(false) + .withUnquotedControlChars(true) .build(); try (Table expected = new Table.TestBuilder() .column( "true", "false", null, null, "true", "1", "0", null, "-0", null, null, "-0.1", null, null, null, null, null, null, null, null, null, "\"3710-11-10T02:46:58.732Z\"", null, "-3.4e+38", "-3.4e-38", "1.4e38", "-3.4E+38", "-3.4E-38", "1.4E38", null, - null) + null, "\"A\u0000B\"", "\"A\u0000B\"", "\"A\u0001B\"", "\"A\u0001B\"", "\"A\u001FB\"", "\"A\u001FB\"", "\"A B\"", "\"A B\"") .build(); MultiBufferDataSource source = sourceFrom(JSON_VALIDATION_BUFFER); Table table = Table.readJSON(schema, opts, source, (int)expected.getRowCount())) { @@ -563,13 +573,14 @@ void testJSONValidationLeadingZeros() { .withStrictValidation(true) .withLeadingZeros(true) .withNonNumericNumbers(false) + .withUnquotedControlChars(true) .build(); try (Table expected = new Table.TestBuilder() .column( "true", "false", null, null, "true", "1", "0", null, "-0", "-01", "01", "-0.1", "-00.1", null, null, null, null, null, null, null, null, "\"3710-11-10T02:46:58.732Z\"", null, "-3.4e+38", "-3.4e-38", "1.4e38", "-3.4E+38", "-3.4E-38", "1.4E38", null, - null) + null, "\"A\u0000B\"", "\"A\u0000B\"", "\"A\u0001B\"", "\"A\u0001B\"", "\"A\u001FB\"", "\"A\u001FB\"", "\"A B\"", "\"A B\"") .build(); MultiBufferDataSource source = sourceFrom(JSON_VALIDATION_BUFFER); Table table = Table.readJSON(schema, opts, source, (int)expected.getRowCount())) { @@ -591,13 +602,43 @@ void testJSONValidationNonNumeric() { .withStrictValidation(true) .withLeadingZeros(false) .withNonNumericNumbers(true) + .withUnquotedControlChars(true) .build(); try (Table expected = new Table.TestBuilder() .column( "true", "false", null, null, "true", "1", "0", null, "-0", null, null, "-0.1", null, "NaN", null, "+INF", "-INF", "+Infinity", "Infinity", "-Infinity", null, "\"3710-11-10T02:46:58.732Z\"", null, "-3.4e+38", "-3.4e-38", "1.4e38", "-3.4E+38", "-3.4E-38", "1.4E38", null, - null) + 
null, "\"A\u0000B\"", "\"A\u0000B\"", "\"A\u0001B\"", "\"A\u0001B\"", "\"A\u001FB\"", "\"A\u001FB\"", "\"A B\"", "\"A B\"") + .build(); + MultiBufferDataSource source = sourceFrom(JSON_VALIDATION_BUFFER); + Table table = Table.readJSON(schema, opts, source, (int)expected.getRowCount())) { + assertTablesAreEqual(expected, table); + } + } + + @Test + void testJSONValidationUnquotedControl() { + Schema schema = Schema.builder() + .column(DType.STRING, "a") + .build(); + JSONOptions opts = JSONOptions.builder() + .withRecoverWithNull(true) + .withMixedTypesAsStrings(true) + .withNormalizeWhitespace(true) + .withKeepQuotes(true) + .withNormalizeSingleQuotes(true) + .withStrictValidation(true) + .withLeadingZeros(false) + .withNonNumericNumbers(false) + .withUnquotedControlChars(false) + .build(); + try (Table expected = new Table.TestBuilder() + .column( + "true", "false", null, null, "true", "1", "0", null, "-0", null, + null, "-0.1", null, null, null, null, null, null, null, null, + null, "\"3710-11-10T02:46:58.732Z\"", null, "-3.4e+38", "-3.4e-38", "1.4e38", "-3.4E+38", "-3.4E-38", "1.4E38", null, + null, null, "\"A\u0000B\"", null, "\"A\u0001B\"", null, "\"A\u001FB\"", "\"A B\"", "\"A B\"") .build(); MultiBufferDataSource source = sourceFrom(JSON_VALIDATION_BUFFER); Table table = Table.readJSON(schema, opts, source, (int)expected.getRowCount())) { From 5640d4ca5a9fe3a17355adb201c863bbd161e724 Mon Sep 17 00:00:00 2001 From: "Robert (Bobby) Evans" Date: Wed, 26 Jun 2024 14:22:14 -0500 Subject: [PATCH 8/9] Some code cleanup --- cpp/src/io/json/process_tokens.cu | 41 ++++++------------- .../test/java/ai/rapids/cudf/TableTest.java | 21 +++++++--- 2 files changed, 28 insertions(+), 34 deletions(-) diff --git a/cpp/src/io/json/process_tokens.cu b/cpp/src/io/json/process_tokens.cu index 49f8503351d..3dbabb6944b 100644 --- a/cpp/src/io/json/process_tokens.cu +++ b/cpp/src/io/json/process_tokens.cu @@ -93,19 +93,12 @@ void validate_token_stream(device_span d_input, [data = 
d_input.data(), allow_numeric_leading_zeros = options.is_allowed_numeric_leading_zeros(), - allow_nonnumeric = - options.is_allowed_nonnumeric_numbers()] __device__(int32_t i, - SymbolOffsetT start, + allow_nonnumeric = + options.is_allowed_nonnumeric_numbers()] __device__(SymbolOffsetT start, SymbolOffsetT end) -> bool { // This validates an unquoted value. A value must match https://www.json.org/json-en.html // but the leading and training whitespace should already have been removed, and is not // a string - //for (SymbolOffsetT idx = start; idx < end; idx++) { - // printf("%i VALUE CHAR %i/%i => '%c'\n", i, idx, end, data[idx]); - //} - //printf("\t%i VALUE CHAR END\n", i); - - // TODO do I need to worry about an empty value??? auto c = data[start]; if ('n' == c) { return substr_eq(data, start, end, 4, "null"); @@ -217,7 +210,6 @@ void validate_token_stream(device_span d_input, num_state != number_state::saw_neg && num_state != number_state::saw_radix; } else { - //printf("%i OTHER %c\n", i, c); return false; } }; @@ -225,28 +217,21 @@ void validate_token_stream(device_span d_input, auto validate_strings = [data = d_input.data(), allow_unquoted_control_chars = - options.is_allowed_unquoted_control_chars()] __device__(int32_t i, - SymbolOffsetT start, + options.is_allowed_unquoted_control_chars()] __device__(SymbolOffsetT start, SymbolOffsetT end) -> bool { // This validates a quoted string. A string must match https://www.json.org/json-en.html // but we already know that it has a starting and ending " and all white space has been - // stripped out. - //for (SymbolOffsetT idx = start + 1; idx < end; idx++) { - // printf("%i STR CHAR %i/%i => '%c'\n", i, idx, end, data[idx]); - //} - //printf("\t%i STR CHAR END\n", i); + // stripped out. 
Also the base CUDF validation makes sure escaped chars are correct + // so we only need to worry about unquoted control chars - for (SymbolOffsetT idx = start + 1; idx < end; idx++) { - auto c = data[idx]; - if (!allow_unquoted_control_chars && c >= 0 && c < 32) { - //printf("%i FOUND INVALID CHAR AT %i %i\n", i, idx, c); - return false; - //} else { - // printf("%i FOUND GOOD CHAR AT %i '%c'\n", i, idx, c); + if (!allow_unquoted_control_chars) { + for (SymbolOffsetT idx = start + 1; idx < end; idx++) { + auto c = data[idx]; + if (c >= 0 && c < 32) { + return false; + } } } - //printf("\t%i STR CHAR END\n", i); - return true; }; @@ -257,10 +242,10 @@ void validate_token_stream(device_span d_input, validate_values, validate_strings] __device__(auto i) -> bool { if (tokens[i] == token_t::ValueEnd) { - return !validate_values(i, token_indices[i - 1], token_indices[i]); + return !validate_values(token_indices[i - 1], token_indices[i]); } else if (tokens[i] == token_t::FieldNameEnd || tokens[i] == token_t::StringEnd) { - return !validate_strings(i, token_indices[i - 1], token_indices[i]); + return !validate_strings(token_indices[i - 1], token_indices[i]); } return false; }; diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index 10db8ff12b2..285dc5c644c 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -498,7 +498,11 @@ void testReadSingleQuotesJSONFileKeepQuotes() throws IOException { "{\"a\": \"A\u001FB\"}\n" + "{\"a\": \"A\\u001FB\"}\n" + "{\"a\": \"A\u0020B\"}\n" + - "{\"a\": \"A\\u0020B\"}\n" + "{\"a\": \"A\\u0020B\"}\n" + + "{\"a\": \"\\u12\"}\n" + + + "{\"a\": \"\\z\"}\n" + + "{\"a\": \"\\r\"}\n" ).getBytes(StandardCharsets.UTF_8); @Test @@ -522,7 +526,8 @@ void testJSONValidationNoStrict() { "true", "false", null, "true", "true", "1", "0", "-", "-0", "-01", "01", "-0.1", "-00.1", "NaN", "INF", "+INF", "-INF", "+Infinity", 
"Infinity", "-Infinity", "INFinity", "\"3710-11-10T02:46:58.732Z\"", "12.", "-3.4e+38", "-3.4e-38", "1.4e38", "-3.4E+38", "-3.4E-38", "1.4E38", "-3.4E+", - "-3.4E-", "\"A\u0000B\"", "\"A\u0000B\"", "\"A\u0001B\"", "\"A\u0001B\"", "\"A\u001FB\"", "\"A\u001FB\"", "\"A B\"", "\"A B\"") + "-3.4E-", "\"A\u0000B\"", "\"A\u0000B\"", "\"A\u0001B\"", "\"A\u0001B\"", "\"A\u001FB\"", "\"A\u001FB\"", "\"A B\"", "\"A B\"", null, + null, "\"\r\"") .build(); MultiBufferDataSource source = sourceFrom(JSON_VALIDATION_BUFFER); Table table = Table.readJSON(schema, opts, source, (int)expected.getRowCount())) { @@ -551,7 +556,8 @@ void testJSONValidation() { "true", "false", null, null, "true", "1", "0", null, "-0", null, null, "-0.1", null, null, null, null, null, null, null, null, null, "\"3710-11-10T02:46:58.732Z\"", null, "-3.4e+38", "-3.4e-38", "1.4e38", "-3.4E+38", "-3.4E-38", "1.4E38", null, - null, "\"A\u0000B\"", "\"A\u0000B\"", "\"A\u0001B\"", "\"A\u0001B\"", "\"A\u001FB\"", "\"A\u001FB\"", "\"A B\"", "\"A B\"") + null, "\"A\u0000B\"", "\"A\u0000B\"", "\"A\u0001B\"", "\"A\u0001B\"", "\"A\u001FB\"", "\"A\u001FB\"", "\"A B\"", "\"A B\"", null, + null, "\"\r\"") .build(); MultiBufferDataSource source = sourceFrom(JSON_VALIDATION_BUFFER); Table table = Table.readJSON(schema, opts, source, (int)expected.getRowCount())) { @@ -580,7 +586,8 @@ void testJSONValidationLeadingZeros() { "true", "false", null, null, "true", "1", "0", null, "-0", "-01", "01", "-0.1", "-00.1", null, null, null, null, null, null, null, null, "\"3710-11-10T02:46:58.732Z\"", null, "-3.4e+38", "-3.4e-38", "1.4e38", "-3.4E+38", "-3.4E-38", "1.4E38", null, - null, "\"A\u0000B\"", "\"A\u0000B\"", "\"A\u0001B\"", "\"A\u0001B\"", "\"A\u001FB\"", "\"A\u001FB\"", "\"A B\"", "\"A B\"") + null, "\"A\u0000B\"", "\"A\u0000B\"", "\"A\u0001B\"", "\"A\u0001B\"", "\"A\u001FB\"", "\"A\u001FB\"", "\"A B\"", "\"A B\"", null, + null, "\"\r\"") .build(); MultiBufferDataSource source = sourceFrom(JSON_VALIDATION_BUFFER); Table 
table = Table.readJSON(schema, opts, source, (int)expected.getRowCount())) { @@ -609,7 +616,8 @@ void testJSONValidationNonNumeric() { "true", "false", null, null, "true", "1", "0", null, "-0", null, null, "-0.1", null, "NaN", null, "+INF", "-INF", "+Infinity", "Infinity", "-Infinity", null, "\"3710-11-10T02:46:58.732Z\"", null, "-3.4e+38", "-3.4e-38", "1.4e38", "-3.4E+38", "-3.4E-38", "1.4E38", null, - null, "\"A\u0000B\"", "\"A\u0000B\"", "\"A\u0001B\"", "\"A\u0001B\"", "\"A\u001FB\"", "\"A\u001FB\"", "\"A B\"", "\"A B\"") + null, "\"A\u0000B\"", "\"A\u0000B\"", "\"A\u0001B\"", "\"A\u0001B\"", "\"A\u001FB\"", "\"A\u001FB\"", "\"A B\"", "\"A B\"", null, + null, "\"\r\"") .build(); MultiBufferDataSource source = sourceFrom(JSON_VALIDATION_BUFFER); Table table = Table.readJSON(schema, opts, source, (int)expected.getRowCount())) { @@ -638,7 +646,8 @@ void testJSONValidationUnquotedControl() { "true", "false", null, null, "true", "1", "0", null, "-0", null, null, "-0.1", null, null, null, null, null, null, null, null, null, "\"3710-11-10T02:46:58.732Z\"", null, "-3.4e+38", "-3.4e-38", "1.4e38", "-3.4E+38", "-3.4E-38", "1.4E38", null, - null, null, "\"A\u0000B\"", null, "\"A\u0001B\"", null, "\"A\u001FB\"", "\"A B\"", "\"A B\"") + null, null, "\"A\u0000B\"", null, "\"A\u0001B\"", null, "\"A\u001FB\"", "\"A B\"", "\"A B\"", null, + null, "\"\r\"") .build(); MultiBufferDataSource source = sourceFrom(JSON_VALIDATION_BUFFER); Table table = Table.readJSON(schema, opts, source, (int)expected.getRowCount())) { From 277d14cd986ba15076d727c34b9b0ac59f9bcb21 Mon Sep 17 00:00:00 2001 From: "Robert (Bobby) Evans" Date: Fri, 28 Jun 2024 08:29:48 -0500 Subject: [PATCH 9/9] More fixes --- cpp/src/io/json/process_tokens.cu | 60 ++++++++++++++++--- .../test/java/ai/rapids/cudf/TableTest.java | 13 ++-- 2 files changed, 60 insertions(+), 13 deletions(-) diff --git a/cpp/src/io/json/process_tokens.cu b/cpp/src/io/json/process_tokens.cu index 3dbabb6944b..aee22e3fdf0 100644 --- 
a/cpp/src/io/json/process_tokens.cu +++ b/cpp/src/io/json/process_tokens.cu @@ -64,6 +64,12 @@ enum class number_state { exponent }; +enum class string_state { + normal = 0, + escaped, // not a complete state + escaped_u // not a complete state +}; + __device__ inline bool substr_eq(const char * data, SymbolOffsetT const start, SymbolOffsetT const end, @@ -224,15 +230,55 @@ void validate_token_stream(device_span d_input, // stripped out. Also the base CUDF validation makes sure escaped chars are correct // so we only need to worry about unquoted control chars - if (!allow_unquoted_control_chars) { - for (SymbolOffsetT idx = start + 1; idx < end; idx++) { - auto c = data[idx]; - if (c >= 0 && c < 32) { - return false; - } + auto state = string_state::normal; + auto u_count = 0; + for (SymbolOffsetT idx = start + 1; idx < end; idx++) { + auto c = data[idx]; + if (!allow_unquoted_control_chars && c >= 0 && c < 32) { + return false; + } + + switch (state) { + case string_state::normal: + if (c == '\\') { + state = string_state::escaped; + } + break; + case string_state::escaped: + // in Spark you can allow any char to be escaped, but CUDF + // validates it in some cases so we need to also validate it. 
+ if (c == 'u') { + state = string_state::escaped_u; + u_count = 0; + } else if (c == '"' || + c == '\\' || + c == '/' || + c == 'b' || + c == 'f' || + c == 'n' || + c == 'r' || + c == 't') { + state = string_state::normal; + } else { + return false; + } + break; + case string_state::escaped_u: + if ((c >= '0' && c <= '9') || + (c >= 'a' && c <= 'f') || + (c >= 'A' && c <= 'F')) { + u_count++; + if (u_count == 4) { + state = string_state::normal; + u_count = 0; + } + } else { + return false; + } + break; } } - return true; + return string_state::normal == state; }; auto num_tokens = tokens.size(); diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index 285dc5c644c..56fe63598d9 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -502,7 +502,8 @@ void testReadSingleQuotesJSONFileKeepQuotes() throws IOException { "{\"a\": \"\\u12\"}\n" + "{\"a\": \"\\z\"}\n" + - "{\"a\": \"\\r\"}\n" + "{\"a\": \"\\r\"}\n" + + "{\"a\": \"something\", \"b\": \"\\z\"}\n" ).getBytes(StandardCharsets.UTF_8); @Test @@ -527,7 +528,7 @@ void testJSONValidationNoStrict() { "01", "-0.1", "-00.1", "NaN", "INF", "+INF", "-INF", "+Infinity", "Infinity", "-Infinity", "INFinity", "\"3710-11-10T02:46:58.732Z\"", "12.", "-3.4e+38", "-3.4e-38", "1.4e38", "-3.4E+38", "-3.4E-38", "1.4E38", "-3.4E+", "-3.4E-", "\"A\u0000B\"", "\"A\u0000B\"", "\"A\u0001B\"", "\"A\u0001B\"", "\"A\u001FB\"", "\"A\u001FB\"", "\"A B\"", "\"A B\"", null, - null, "\"\r\"") + null, "\"\r\"", "\"something\"") .build(); MultiBufferDataSource source = sourceFrom(JSON_VALIDATION_BUFFER); Table table = Table.readJSON(schema, opts, source, (int)expected.getRowCount())) { @@ -557,7 +558,7 @@ void testJSONValidation() { null, "-0.1", null, null, null, null, null, null, null, null, null, "\"3710-11-10T02:46:58.732Z\"", null, "-3.4e+38", "-3.4e-38", "1.4e38", "-3.4E+38", "-3.4E-38", "1.4E38", null, null, 
"\"A\u0000B\"", "\"A\u0000B\"", "\"A\u0001B\"", "\"A\u0001B\"", "\"A\u001FB\"", "\"A\u001FB\"", "\"A B\"", "\"A B\"", null, - null, "\"\r\"") + null, "\"\r\"", null) .build(); MultiBufferDataSource source = sourceFrom(JSON_VALIDATION_BUFFER); Table table = Table.readJSON(schema, opts, source, (int)expected.getRowCount())) { @@ -587,7 +588,7 @@ void testJSONValidationLeadingZeros() { "01", "-0.1", "-00.1", null, null, null, null, null, null, null, null, "\"3710-11-10T02:46:58.732Z\"", null, "-3.4e+38", "-3.4e-38", "1.4e38", "-3.4E+38", "-3.4E-38", "1.4E38", null, null, "\"A\u0000B\"", "\"A\u0000B\"", "\"A\u0001B\"", "\"A\u0001B\"", "\"A\u001FB\"", "\"A\u001FB\"", "\"A B\"", "\"A B\"", null, - null, "\"\r\"") + null, "\"\r\"", null) .build(); MultiBufferDataSource source = sourceFrom(JSON_VALIDATION_BUFFER); Table table = Table.readJSON(schema, opts, source, (int)expected.getRowCount())) { @@ -617,7 +618,7 @@ void testJSONValidationNonNumeric() { null, "-0.1", null, "NaN", null, "+INF", "-INF", "+Infinity", "Infinity", "-Infinity", null, "\"3710-11-10T02:46:58.732Z\"", null, "-3.4e+38", "-3.4e-38", "1.4e38", "-3.4E+38", "-3.4E-38", "1.4E38", null, null, "\"A\u0000B\"", "\"A\u0000B\"", "\"A\u0001B\"", "\"A\u0001B\"", "\"A\u001FB\"", "\"A\u001FB\"", "\"A B\"", "\"A B\"", null, - null, "\"\r\"") + null, "\"\r\"", null) .build(); MultiBufferDataSource source = sourceFrom(JSON_VALIDATION_BUFFER); Table table = Table.readJSON(schema, opts, source, (int)expected.getRowCount())) { @@ -647,7 +648,7 @@ void testJSONValidationUnquotedControl() { null, "-0.1", null, null, null, null, null, null, null, null, null, "\"3710-11-10T02:46:58.732Z\"", null, "-3.4e+38", "-3.4e-38", "1.4e38", "-3.4E+38", "-3.4E-38", "1.4E38", null, null, null, "\"A\u0000B\"", null, "\"A\u0001B\"", null, "\"A\u001FB\"", "\"A B\"", "\"A B\"", null, - null, "\"\r\"") + null, "\"\r\"", null) .build(); MultiBufferDataSource source = sourceFrom(JSON_VALIDATION_BUFFER); Table table = Table.readJSON(schema, 
opts, source, (int)expected.getRowCount())) {