diff --git a/cpp/src/io/json/process_tokens.cu b/cpp/src/io/json/process_tokens.cu index 49f8503351d..3dbabb6944b 100644 --- a/cpp/src/io/json/process_tokens.cu +++ b/cpp/src/io/json/process_tokens.cu @@ -93,19 +93,12 @@ void validate_token_stream(device_span d_input, [data = d_input.data(), allow_numeric_leading_zeros = options.is_allowed_numeric_leading_zeros(), - allow_nonnumeric = - options.is_allowed_nonnumeric_numbers()] __device__(int32_t i, - SymbolOffsetT start, + allow_nonnumeric = + options.is_allowed_nonnumeric_numbers()] __device__(SymbolOffsetT start, SymbolOffsetT end) -> bool { // This validates an unquoted value. A value must match https://www.json.org/json-en.html // but the leading and training whitespace should already have been removed, and is not // a string - //for (SymbolOffsetT idx = start; idx < end; idx++) { - // printf("%i VALUE CHAR %i/%i => '%c'\n", i, idx, end, data[idx]); - //} - //printf("\t%i VALUE CHAR END\n", i); - - // TODO do I need to worry about an empty value??? auto c = data[start]; if ('n' == c) { return substr_eq(data, start, end, 4, "null"); @@ -217,7 +210,6 @@ void validate_token_stream(device_span d_input, num_state != number_state::saw_neg && num_state != number_state::saw_radix; } else { - //printf("%i OTHER %c\n", i, c); return false; } }; @@ -225,28 +217,21 @@ void validate_token_stream(device_span d_input, auto validate_strings = [data = d_input.data(), allow_unquoted_control_chars = - options.is_allowed_unquoted_control_chars()] __device__(int32_t i, - SymbolOffsetT start, + options.is_allowed_unquoted_control_chars()] __device__(SymbolOffsetT start, SymbolOffsetT end) -> bool { // This validates a quoted string. A string must match https://www.json.org/json-en.html // but we already know that it has a starting and ending " and all white space has been - // stripped out. - //for (SymbolOffsetT idx = start + 1; idx < end; idx++) { - // printf("%i STR CHAR %i/%i => '%c'\n", i, idx, end, data[idx]); - //} - //printf("\t%i STR CHAR END\n", i); + // stripped out. Also the base CUDF validation makes sure escaped chars are correct + // so we only need to worry about unquoted control chars - for (SymbolOffsetT idx = start + 1; idx < end; idx++) { - auto c = data[idx]; - if (!allow_unquoted_control_chars && c >= 0 && c < 32) { - //printf("%i FOUND INVALID CHAR AT %i %i\n", i, idx, c); - return false; - //} else { - // printf("%i FOUND GOOD CHAR AT %i '%c'\n", i, idx, c); + if (!allow_unquoted_control_chars) { + for (SymbolOffsetT idx = start + 1; idx < end; idx++) { + auto c = data[idx]; + if (c >= 0 && c < 32) { + return false; + } } } - //printf("\t%i STR CHAR END\n", i); - return true; }; @@ -257,10 +242,10 @@ void validate_token_stream(device_span d_input, validate_values, validate_strings] __device__(auto i) -> bool { if (tokens[i] == token_t::ValueEnd) { - return !validate_values(i, token_indices[i - 1], token_indices[i]); + return !validate_values(token_indices[i - 1], token_indices[i]); } else if (tokens[i] == token_t::FieldNameEnd || tokens[i] == token_t::StringEnd) { - return !validate_strings(i, token_indices[i - 1], token_indices[i]); + return !validate_strings(token_indices[i - 1], token_indices[i]); } return false; }; diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index 10db8ff12b2..285dc5c644c 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -498,7 +498,11 @@ void testReadSingleQuotesJSONFileKeepQuotes() throws IOException { "{\"a\": \"A\u001FB\"}\n" + "{\"a\": \"A\\u001FB\"}\n" + "{\"a\": \"A\u0020B\"}\n" + - "{\"a\": \"A\\u0020B\"}\n" + "{\"a\": \"A\\u0020B\"}\n" + + "{\"a\": \"\\u12\"}\n" + + + "{\"a\": \"\\z\"}\n" + + "{\"a\": \"\\r\"}\n" ).getBytes(StandardCharsets.UTF_8); @Test @@ -522,7 +526,8 @@ void testJSONValidationNoStrict() { "true", "false", null, "true", "true", "1", "0", "-", "-0", "-01", "01", "-0.1", "-00.1", "NaN", "INF", "+INF", "-INF", "+Infinity", "Infinity", "-Infinity", "INFinity", "\"3710-11-10T02:46:58.732Z\"", "12.", "-3.4e+38", "-3.4e-38", "1.4e38", "-3.4E+38", "-3.4E-38", "1.4E38", "-3.4E+", - "-3.4E-", "\"A\u0000B\"", "\"A\u0000B\"", "\"A\u0001B\"", "\"A\u0001B\"", "\"A\u001FB\"", "\"A\u001FB\"", "\"A B\"", "\"A B\"") + "-3.4E-", "\"A\u0000B\"", "\"A\u0000B\"", "\"A\u0001B\"", "\"A\u0001B\"", "\"A\u001FB\"", "\"A\u001FB\"", "\"A B\"", "\"A B\"", null, + null, "\"\r\"") .build(); MultiBufferDataSource source = sourceFrom(JSON_VALIDATION_BUFFER); Table table = Table.readJSON(schema, opts, source, (int)expected.getRowCount())) { @@ -551,7 +556,8 @@ void testJSONValidation() { "true", "false", null, null, "true", "1", "0", null, "-0", null, null, "-0.1", null, null, null, null, null, null, null, null, null, "\"3710-11-10T02:46:58.732Z\"", null, "-3.4e+38", "-3.4e-38", "1.4e38", "-3.4E+38", "-3.4E-38", "1.4E38", null, - null, "\"A\u0000B\"", "\"A\u0000B\"", "\"A\u0001B\"", "\"A\u0001B\"", "\"A\u001FB\"", "\"A\u001FB\"", "\"A B\"", "\"A B\"") + null, "\"A\u0000B\"", "\"A\u0000B\"", "\"A\u0001B\"", "\"A\u0001B\"", "\"A\u001FB\"", "\"A\u001FB\"", "\"A B\"", "\"A B\"", null, + null, "\"\r\"") .build(); MultiBufferDataSource source = sourceFrom(JSON_VALIDATION_BUFFER); Table table = Table.readJSON(schema, opts, source, (int)expected.getRowCount())) { @@ -580,7 +586,8 @@ void testJSONValidationLeadingZeros() { "true", "false", null, null, "true", "1", "0", null, "-0", "-01", "01", "-0.1", "-00.1", null, null, null, null, null, null, null, null, "\"3710-11-10T02:46:58.732Z\"", null, "-3.4e+38", "-3.4e-38", "1.4e38", "-3.4E+38", "-3.4E-38", "1.4E38", null, - null, "\"A\u0000B\"", "\"A\u0000B\"", "\"A\u0001B\"", "\"A\u0001B\"", "\"A\u001FB\"", "\"A\u001FB\"", "\"A B\"", "\"A B\"") + null, "\"A\u0000B\"", "\"A\u0000B\"", "\"A\u0001B\"", "\"A\u0001B\"", "\"A\u001FB\"", "\"A\u001FB\"", "\"A B\"", "\"A B\"", null, + null, "\"\r\"") .build(); MultiBufferDataSource source = sourceFrom(JSON_VALIDATION_BUFFER); Table table = Table.readJSON(schema, opts, source, (int)expected.getRowCount())) { @@ -609,7 +616,8 @@ void testJSONValidationNonNumeric() { "true", "false", null, null, "true", "1", "0", null, "-0", null, null, "-0.1", null, "NaN", null, "+INF", "-INF", "+Infinity", "Infinity", "-Infinity", null, "\"3710-11-10T02:46:58.732Z\"", null, "-3.4e+38", "-3.4e-38", "1.4e38", "-3.4E+38", "-3.4E-38", "1.4E38", null, - null, "\"A\u0000B\"", "\"A\u0000B\"", "\"A\u0001B\"", "\"A\u0001B\"", "\"A\u001FB\"", "\"A\u001FB\"", "\"A B\"", "\"A B\"") + null, "\"A\u0000B\"", "\"A\u0000B\"", "\"A\u0001B\"", "\"A\u0001B\"", "\"A\u001FB\"", "\"A\u001FB\"", "\"A B\"", "\"A B\"", null, + null, "\"\r\"") .build(); MultiBufferDataSource source = sourceFrom(JSON_VALIDATION_BUFFER); Table table = Table.readJSON(schema, opts, source, (int)expected.getRowCount())) { @@ -638,7 +646,8 @@ void testJSONValidationUnquotedControl() { "true", "false", null, null, "true", "1", "0", null, "-0", null, null, "-0.1", null, null, null, null, null, null, null, null, null, "\"3710-11-10T02:46:58.732Z\"", null, "-3.4e+38", "-3.4e-38", "1.4e38", "-3.4E+38", "-3.4E-38", "1.4E38", null, - null, null, "\"A\u0000B\"", null, "\"A\u0001B\"", null, "\"A\u001FB\"", "\"A B\"", "\"A B\"") + null, null, "\"A\u0000B\"", null, "\"A\u0001B\"", null, "\"A\u001FB\"", "\"A B\"", "\"A B\"", null, + null, "\"\r\"") .build(); MultiBufferDataSource source = sourceFrom(JSON_VALIDATION_BUFFER); Table table = Table.readJSON(schema, opts, source, (int)expected.getRowCount())) {