Skip to content

Commit

Permalink
Some code cleanup
Browse files Browse the repository at this point in the history
  • Loading branch information
revans2 committed Jun 26, 2024
1 parent 4716288 commit 5640d4c
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 34 deletions.
41 changes: 13 additions & 28 deletions cpp/src/io/json/process_tokens.cu
Original file line number Diff line number Diff line change
Expand Up @@ -93,19 +93,12 @@ void validate_token_stream(device_span<char const> d_input,
[data = d_input.data(),
allow_numeric_leading_zeros =
options.is_allowed_numeric_leading_zeros(),
allow_nonnumeric =
options.is_allowed_nonnumeric_numbers()] __device__(int32_t i,
SymbolOffsetT start,
allow_nonnumeric =
options.is_allowed_nonnumeric_numbers()] __device__(SymbolOffsetT start,
SymbolOffsetT end) -> bool {
// This validates an unquoted value. A value must match https://www.json.org/json-en.html
// but the leading and trailing whitespace should already have been removed, and it is not
// a string
//for (SymbolOffsetT idx = start; idx < end; idx++) {
// printf("%i VALUE CHAR %i/%i => '%c'\n", i, idx, end, data[idx]);
//}
//printf("\t%i VALUE CHAR END\n", i);

// TODO do I need to worry about an empty value???
auto c = data[start];
if ('n' == c) {
return substr_eq(data, start, end, 4, "null");
Expand Down Expand Up @@ -217,36 +210,28 @@ void validate_token_stream(device_span<char const> d_input,
num_state != number_state::saw_neg &&
num_state != number_state::saw_radix;
} else {
//printf("%i OTHER %c\n", i, c);
return false;
}
};

auto validate_strings =
[data = d_input.data(),
allow_unquoted_control_chars =
options.is_allowed_unquoted_control_chars()] __device__(int32_t i,
SymbolOffsetT start,
options.is_allowed_unquoted_control_chars()] __device__(SymbolOffsetT start,
SymbolOffsetT end) -> bool {
// This validates a quoted string. A string must match https://www.json.org/json-en.html
// but we already know that it has a starting and ending " and all white space has been
// stripped out.
//for (SymbolOffsetT idx = start + 1; idx < end; idx++) {
// printf("%i STR CHAR %i/%i => '%c'\n", i, idx, end, data[idx]);
//}
//printf("\t%i STR CHAR END\n", i);
// stripped out. Also the base CUDF validation makes sure escaped chars are correct
// so we only need to worry about unquoted control chars

for (SymbolOffsetT idx = start + 1; idx < end; idx++) {
auto c = data[idx];
if (!allow_unquoted_control_chars && c >= 0 && c < 32) {
//printf("%i FOUND INVALID CHAR AT %i %i\n", i, idx, c);
return false;
//} else {
// printf("%i FOUND GOOD CHAR AT %i '%c'\n", i, idx, c);
if (!allow_unquoted_control_chars) {
for (SymbolOffsetT idx = start + 1; idx < end; idx++) {
auto c = data[idx];
if (c >= 0 && c < 32) {
return false;
}
}
}
//printf("\t%i STR CHAR END\n", i);

return true;
};

Expand All @@ -257,10 +242,10 @@ void validate_token_stream(device_span<char const> d_input,
validate_values,
validate_strings] __device__(auto i) -> bool {
if (tokens[i] == token_t::ValueEnd) {
return !validate_values(i, token_indices[i - 1], token_indices[i]);
return !validate_values(token_indices[i - 1], token_indices[i]);
} else if (tokens[i] == token_t::FieldNameEnd ||
tokens[i] == token_t::StringEnd) {
return !validate_strings(i, token_indices[i - 1], token_indices[i]);
return !validate_strings(token_indices[i - 1], token_indices[i]);
}
return false;
};
Expand Down
21 changes: 15 additions & 6 deletions java/src/test/java/ai/rapids/cudf/TableTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -498,7 +498,11 @@ void testReadSingleQuotesJSONFileKeepQuotes() throws IOException {
"{\"a\": \"A\u001FB\"}\n" +
"{\"a\": \"A\\u001FB\"}\n" +
"{\"a\": \"A\u0020B\"}\n" +
"{\"a\": \"A\\u0020B\"}\n"
"{\"a\": \"A\\u0020B\"}\n" +
"{\"a\": \"\\u12\"}\n" +

"{\"a\": \"\\z\"}\n" +
"{\"a\": \"\\r\"}\n"
).getBytes(StandardCharsets.UTF_8);

@Test
Expand All @@ -522,7 +526,8 @@ void testJSONValidationNoStrict() {
"true", "false", null, "true", "true", "1", "0", "-", "-0", "-01",
"01", "-0.1", "-00.1", "NaN", "INF", "+INF", "-INF", "+Infinity", "Infinity", "-Infinity",
"INFinity", "\"3710-11-10T02:46:58.732Z\"", "12.", "-3.4e+38", "-3.4e-38", "1.4e38", "-3.4E+38", "-3.4E-38", "1.4E38", "-3.4E+",
"-3.4E-", "\"A\u0000B\"", "\"A\u0000B\"", "\"A\u0001B\"", "\"A\u0001B\"", "\"A\u001FB\"", "\"A\u001FB\"", "\"A B\"", "\"A B\"")
"-3.4E-", "\"A\u0000B\"", "\"A\u0000B\"", "\"A\u0001B\"", "\"A\u0001B\"", "\"A\u001FB\"", "\"A\u001FB\"", "\"A B\"", "\"A B\"", null,
null, "\"\r\"")
.build();
MultiBufferDataSource source = sourceFrom(JSON_VALIDATION_BUFFER);
Table table = Table.readJSON(schema, opts, source, (int)expected.getRowCount())) {
Expand Down Expand Up @@ -551,7 +556,8 @@ void testJSONValidation() {
"true", "false", null, null, "true", "1", "0", null, "-0", null,
null, "-0.1", null, null, null, null, null, null, null, null,
null, "\"3710-11-10T02:46:58.732Z\"", null, "-3.4e+38", "-3.4e-38", "1.4e38", "-3.4E+38", "-3.4E-38", "1.4E38", null,
null, "\"A\u0000B\"", "\"A\u0000B\"", "\"A\u0001B\"", "\"A\u0001B\"", "\"A\u001FB\"", "\"A\u001FB\"", "\"A B\"", "\"A B\"")
null, "\"A\u0000B\"", "\"A\u0000B\"", "\"A\u0001B\"", "\"A\u0001B\"", "\"A\u001FB\"", "\"A\u001FB\"", "\"A B\"", "\"A B\"", null,
null, "\"\r\"")
.build();
MultiBufferDataSource source = sourceFrom(JSON_VALIDATION_BUFFER);
Table table = Table.readJSON(schema, opts, source, (int)expected.getRowCount())) {
Expand Down Expand Up @@ -580,7 +586,8 @@ void testJSONValidationLeadingZeros() {
"true", "false", null, null, "true", "1", "0", null, "-0", "-01",
"01", "-0.1", "-00.1", null, null, null, null, null, null, null,
null, "\"3710-11-10T02:46:58.732Z\"", null, "-3.4e+38", "-3.4e-38", "1.4e38", "-3.4E+38", "-3.4E-38", "1.4E38", null,
null, "\"A\u0000B\"", "\"A\u0000B\"", "\"A\u0001B\"", "\"A\u0001B\"", "\"A\u001FB\"", "\"A\u001FB\"", "\"A B\"", "\"A B\"")
null, "\"A\u0000B\"", "\"A\u0000B\"", "\"A\u0001B\"", "\"A\u0001B\"", "\"A\u001FB\"", "\"A\u001FB\"", "\"A B\"", "\"A B\"", null,
null, "\"\r\"")
.build();
MultiBufferDataSource source = sourceFrom(JSON_VALIDATION_BUFFER);
Table table = Table.readJSON(schema, opts, source, (int)expected.getRowCount())) {
Expand Down Expand Up @@ -609,7 +616,8 @@ void testJSONValidationNonNumeric() {
"true", "false", null, null, "true", "1", "0", null, "-0", null,
null, "-0.1", null, "NaN", null, "+INF", "-INF", "+Infinity", "Infinity", "-Infinity",
null, "\"3710-11-10T02:46:58.732Z\"", null, "-3.4e+38", "-3.4e-38", "1.4e38", "-3.4E+38", "-3.4E-38", "1.4E38", null,
null, "\"A\u0000B\"", "\"A\u0000B\"", "\"A\u0001B\"", "\"A\u0001B\"", "\"A\u001FB\"", "\"A\u001FB\"", "\"A B\"", "\"A B\"")
null, "\"A\u0000B\"", "\"A\u0000B\"", "\"A\u0001B\"", "\"A\u0001B\"", "\"A\u001FB\"", "\"A\u001FB\"", "\"A B\"", "\"A B\"", null,
null, "\"\r\"")
.build();
MultiBufferDataSource source = sourceFrom(JSON_VALIDATION_BUFFER);
Table table = Table.readJSON(schema, opts, source, (int)expected.getRowCount())) {
Expand Down Expand Up @@ -638,7 +646,8 @@ void testJSONValidationUnquotedControl() {
"true", "false", null, null, "true", "1", "0", null, "-0", null,
null, "-0.1", null, null, null, null, null, null, null, null,
null, "\"3710-11-10T02:46:58.732Z\"", null, "-3.4e+38", "-3.4e-38", "1.4e38", "-3.4E+38", "-3.4E-38", "1.4E38", null,
null, null, "\"A\u0000B\"", null, "\"A\u0001B\"", null, "\"A\u001FB\"", "\"A B\"", "\"A B\"")
null, null, "\"A\u0000B\"", null, "\"A\u0001B\"", null, "\"A\u001FB\"", "\"A B\"", "\"A B\"", null,
null, "\"\r\"")
.build();
MultiBufferDataSource source = sourceFrom(JSON_VALIDATION_BUFFER);
Table table = Table.readJSON(schema, opts, source, (int)expected.getRowCount())) {
Expand Down

0 comments on commit 5640d4c

Please sign in to comment.