Skip to content

Commit

Permalink
Added support for unquoted control chars
Browse files Browse the repository at this point in the history
  • Loading branch information
revans2 committed Jun 26, 2024
1 parent 52b0389 commit 4716288
Show file tree
Hide file tree
Showing 6 changed files with 155 additions and 19 deletions.
35 changes: 32 additions & 3 deletions cpp/include/cudf/io/json.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -335,6 +335,14 @@ class json_reader_options {
*/
[[nodiscard]] bool is_allowed_nonnumeric_numbers() const { return _allow_nonnumeric_numbers; }

/**
* @brief Whether characters with a value greater than or equal to 0 and less than 32
* (control characters) are allowed inside a quoted string without some form of escaping.
* Strict validation must be enabled for this to have any effect.
*
* @return true if unescaped control chars are allowed inside quoted strings.
*/
[[nodiscard]] bool is_allowed_unquoted_control_chars() const { return _allow_unquoted_control_chars; }

/**
* @brief Returns additional values to recognize as null values.
*
Expand Down Expand Up @@ -473,14 +481,14 @@ class json_reader_options {
void set_recovery_mode(json_recovery_mode_t val) { _recovery_mode = val; }

/**
* @brief Set Whether strict validation is enabled or not.
* @brief Set whether strict validation is enabled or not.
*
* @param val Boolean value to indicate whether strict validation is enabled.
*/
void set_strict_validation(bool val) { _strict_validation = val; }

/**
* @brief Set Whether leading zeros are allowed in numeric values. strict validation
* @brief Set whether leading zeros are allowed in numeric values. strict validation
* must be enabled for this to work.
*
* @param val Boolean value to indicate whether leading zeros are allowed in numeric values
Expand All @@ -495,6 +503,15 @@ class json_reader_options {
*/
void allow_nonnumeric_numbers(bool val) { _allow_nonnumeric_numbers = val; }

/**
* @brief Set whether characters with a value greater than or equal to 0 and less than 32
* are allowed inside a quoted string without some form of escaping. Strict validation must
* be enabled for this to have any effect.
*
* @param val Boolean value to indicate whether unescaped control chars are allowed.
*/
void allow_unquoted_control_chars(bool val) { _allow_unquoted_control_chars = val; }

/**
* @brief Sets additional values to recognize as null values.
*
Expand Down Expand Up @@ -736,7 +753,7 @@ class json_reader_options_builder {
}

/**
* @brief Set whether unquoted number values are valid JSON. The values are NaN,
* @brief Set whether specific unquoted number values are valid JSON. The values are NaN,
* +INF, -INF, +Infinity, Infinity, and -Infinity.
* strict validation must be enabled for this to have any effect.
*
Expand All @@ -749,6 +766,18 @@ class json_reader_options_builder {
return *this;
}

/**
* @brief Set whether chars >= 0 and < 32 are allowed in a quoted string without
* some form of escaping. Strict validation must be enabled for this to have any effect.
*
* @param val Boolean value to indicate if unquoted control chars are allowed or not.
* @return this for chaining
*/
json_reader_options_builder& unquoted_control_chars(bool val)
{
options.allow_unquoted_control_chars(val);
return *this;
}

/**
* @brief Sets additional values to recognize as null values.
*
Expand Down
39 changes: 36 additions & 3 deletions cpp/src/io/json/process_tokens.cu
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ void validate_token_stream(device_span<char const> d_input,
{
if (options.is_strict_validation()) {
using token_t = cudf::io::json::token_t;
auto validate_tokens =
auto validate_values =
[data = d_input.data(),
allow_numeric_leading_zeros =
options.is_allowed_numeric_leading_zeros(),
Expand Down Expand Up @@ -221,13 +221,46 @@ void validate_token_stream(device_span<char const> d_input,
return false;
}
};

// Validates the contents of a quoted string (a field name or a string value) against
// https://www.json.org/json-en.html. The caller passes the offsets of the token that
// begins the string (`start`) and the token that ends it (`end`); the surrounding
// quotes are already known to be present, so only the characters strictly between
// `start` and `end` are inspected.
//
// The first argument is the token index. It is accepted so the call sites stay
// uniform with the value validator, but it is not needed for the check itself.
//
// Returns true when the string is acceptable, false when it contains a raw control
// character (a byte in the range [0, 32)) while such characters are not allowed.
auto validate_strings =
  [data = d_input.data(),
   allow_unquoted_control_chars =
     options.is_allowed_unquoted_control_chars()] __device__(int32_t /*i*/,
                                                             SymbolOffsetT start,
                                                             SymbolOffsetT end) -> bool {
  // Raw control characters are the only thing rejected here, so when they are
  // allowed there is nothing to scan for and the string is accepted as-is.
  if (allow_unquoted_control_chars) { return true; }

  for (SymbolOffsetT idx = start + 1; idx < end; idx++) {
    // Compare as unsigned: bytes >= 0x80 (multi-byte UTF-8 sequences) must never be
    // treated as control characters, regardless of whether plain `char` is signed.
    if (static_cast<unsigned char>(data[idx]) < 32) { return false; }
  }

  return true;
};

auto num_tokens = tokens.size();
auto count_it = thrust::make_counting_iterator(0);
auto predicate = [tokens = tokens.begin(),
token_indices = token_indices.begin(),
validate_tokens] __device__(auto i) -> bool {
validate_values,
validate_strings] __device__(auto i) -> bool {
if (tokens[i] == token_t::ValueEnd) {
return !validate_tokens(i, token_indices[i - 1], token_indices[i]);
return !validate_values(i, token_indices[i - 1], token_indices[i]);
} else if (tokens[i] == token_t::FieldNameEnd ||
tokens[i] == token_t::StringEnd) {
return !validate_strings(i, token_indices[i - 1], token_indices[i]);
}
return false;
};
Expand Down
18 changes: 17 additions & 1 deletion java/src/main/java/ai/rapids/cudf/JSONOptions.java
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ public final class JSONOptions extends ColumnFilterOptions {
private final boolean allowLeadingZeros;
private final boolean strictValidation;
private final boolean allowNonNumericNumbers;
private final boolean allowUnquotedControlChars;

private JSONOptions(Builder builder) {
super(builder);
Expand All @@ -50,6 +51,7 @@ private JSONOptions(Builder builder) {
strictValidation = builder.strictValidation;
allowLeadingZeros = builder.allowLeadingZeros;
allowNonNumericNumbers = builder.allowNonNumericNumbers;
allowUnquotedControlChars = builder.allowUnquotedControlChars;
}

public boolean isDayFirst() {
Expand Down Expand Up @@ -93,6 +95,10 @@ public boolean nonNumericNumbersAllowed() {
return allowNonNumericNumbers;
}

/**
* Are unquoted (unescaped) control chars allowed in strings. Strict validation
* must be enabled for this to have any effect.
*/
public boolean unquotedControlChars() {
return allowUnquotedControlChars;
}

@Override
String[] getIncludeColumnNames() {
throw new UnsupportedOperationException("JSON reader didn't support column prune");
Expand All @@ -104,6 +110,7 @@ public static Builder builder() {

public static final class Builder extends ColumnFilterOptions.Builder<JSONOptions.Builder> {
private boolean strictValidation = false;
private boolean allowUnquotedControlChars = true;
private boolean allowNonNumericNumbers = false;
private boolean allowLeadingZeros = false;
private boolean dayFirst = false;
Expand Down Expand Up @@ -134,14 +141,23 @@ public Builder withLeadingZeros(boolean isAllowed) {
}

/**
* Should non-numeric numbers be allowed or not Strict validation
* Should non-numeric numbers be allowed or not. Strict validation
* must be enabled for this to have any effect.
*/
public Builder withNonNumericNumbers(boolean isAllowed) {
allowNonNumericNumbers = isAllowed;
return this;
}

/**
* Should unquoted control chars be allowed in strings. Strict validation
* must be enabled for this to have any effect. The builder's default is true.
* @param isAllowed true if unquoted control chars are allowed in strings, else false.
* @return this for chaining.
*/
public Builder withUnquotedControlChars(boolean isAllowed) {
allowUnquotedControlChars = isAllowed;
return this;
}

// TODO need to finish this for other configs...

/**
Expand Down
19 changes: 14 additions & 5 deletions java/src/main/java/ai/rapids/cudf/Table.java
Original file line number Diff line number Diff line change
Expand Up @@ -257,7 +257,8 @@ private static native long readJSON(int[] numChildren, String[] columnNames,
boolean keepStringQuotes,
boolean strictValidation,
boolean allowLeadingZeros,
boolean allowNonNumericNumbers) throws CudfException;
boolean allowNonNumericNumbers,
boolean allowUnquotedControl) throws CudfException;

private static native long readJSONFromDataSource(int[] numChildren, String[] columnNames,
int[] dTypeIds, int[] dTypeScales,
Expand All @@ -270,6 +271,7 @@ private static native long readJSONFromDataSource(int[] numChildren, String[] co
boolean strictValidation,
boolean allowLeadingZeros,
boolean allowNonNumericNumbers,
boolean allowUnquotedControl,
long dsHandle) throws CudfException;

private static native long readAndInferJSONFromDataSource(boolean dayFirst, boolean lines,
Expand All @@ -281,6 +283,7 @@ private static native long readAndInferJSONFromDataSource(boolean dayFirst, bool
boolean strictValidation,
boolean allowLeadingZeros,
boolean allowNonNumericNumbers,
boolean allowUnquotedControl,
long dsHandle) throws CudfException;

private static native long readAndInferJSON(long address, long length,
Expand All @@ -293,7 +296,8 @@ private static native long readAndInferJSON(long address, long length,
boolean keepStringQuotes,
boolean strictValidation,
boolean allowLeadingZeros,
boolean allowNonNumericNumbers) throws CudfException;
boolean allowNonNumericNumbers,
boolean allowUnquotedControl) throws CudfException;

/**
* Read in Parquet formatted data.
Expand Down Expand Up @@ -1287,7 +1291,8 @@ public static Table readJSON(Schema schema, JSONOptions opts, File path) {
opts.keepStringQuotes(),
opts.strictValidation(),
opts.leadingZerosAllowed(),
opts.nonNumericNumbersAllowed()))) {
opts.nonNumericNumbersAllowed(),
opts.unquotedControlChars()))) {

return gatherJSONColumns(schema, twm, -1);
}
Expand Down Expand Up @@ -1369,7 +1374,8 @@ public static TableWithMeta readJSON(JSONOptions opts, HostMemoryBuffer buffer,
opts.keepStringQuotes(),
opts.strictValidation(),
opts.leadingZerosAllowed(),
opts.nonNumericNumbersAllowed()));
opts.nonNumericNumbersAllowed(),
opts.unquotedControlChars()));
}

/**
Expand All @@ -1390,6 +1396,7 @@ public static TableWithMeta readAndInferJSON(JSONOptions opts, DataSource ds) {
opts.strictValidation(),
opts.leadingZerosAllowed(),
opts.nonNumericNumbersAllowed(),
opts.unquotedControlChars(),
dsHandle));
return twm;
} finally {
Expand Down Expand Up @@ -1442,7 +1449,8 @@ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer b
opts.keepStringQuotes(),
opts.strictValidation(),
opts.leadingZerosAllowed(),
opts.nonNumericNumbersAllowed()))) {
opts.nonNumericNumbersAllowed(),
opts.unquotedControlChars()))) {
return gatherJSONColumns(schema, twm, emptyRowCount);
}
}
Expand Down Expand Up @@ -1480,6 +1488,7 @@ public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds, int
opts.strictValidation(),
opts.leadingZerosAllowed(),
opts.nonNumericNumbersAllowed(),
opts.unquotedControlChars(),
dsHandle))) {
return gatherJSONColumns(schema, twm, emptyRowCount);
} finally {
Expand Down
12 changes: 10 additions & 2 deletions java/src/main/native/src/TableJni.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1613,6 +1613,7 @@ Java_ai_rapids_cudf_Table_readAndInferJSONFromDataSource(JNIEnv* env,
jboolean strict_validation,
jboolean allow_leading_zeros,
jboolean allow_nonnumeric_numbers,
jboolean allow_unquoted_control,
jlong ds_handle)
{
JNI_NULL_CHECK(env, ds_handle, "no data source handle given", 0);
Expand All @@ -1635,6 +1636,7 @@ Java_ai_rapids_cudf_Table_readAndInferJSONFromDataSource(JNIEnv* env,
.strict_validation(strict_validation)
.numeric_leading_zeros(allow_leading_zeros)
.nonnumeric_numbers(allow_nonnumeric_numbers)
.unquoted_control_chars(allow_unquoted_control)
.keep_quotes(keep_quotes);

auto result =
Expand All @@ -1658,7 +1660,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSON(JNIEnv* env,
jboolean keep_quotes,
jboolean strict_validation,
jboolean allow_leading_zeros,
jboolean allow_nonnumeric_numbers)
jboolean allow_nonnumeric_numbers,
jboolean allow_unquoted_control)
{
JNI_NULL_CHECK(env, buffer, "buffer cannot be null", 0);
if (buffer_length <= 0) {
Expand All @@ -1683,6 +1686,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSON(JNIEnv* env,
.strict_validation(strict_validation)
.numeric_leading_zeros(allow_leading_zeros)
.nonnumeric_numbers(allow_nonnumeric_numbers)
.unquoted_control_chars(allow_unquoted_control)
.mixed_types_as_string(mixed_types_as_string)
.keep_quotes(keep_quotes);

Expand Down Expand Up @@ -1792,6 +1796,7 @@ Java_ai_rapids_cudf_Table_readJSONFromDataSource(JNIEnv* env,
jboolean strict_validation,
jboolean allow_leading_zeros,
jboolean allow_nonnumeric_numbers,
jboolean allow_unquoted_control,
jlong ds_handle)
{
JNI_NULL_CHECK(env, ds_handle, "no data source handle given", 0);
Expand Down Expand Up @@ -1829,6 +1834,7 @@ Java_ai_rapids_cudf_Table_readJSONFromDataSource(JNIEnv* env,
.strict_validation(strict_validation)
.numeric_leading_zeros(allow_leading_zeros)
.nonnumeric_numbers(allow_nonnumeric_numbers)
.unquoted_control_chars(allow_unquoted_control)
.keep_quotes(keep_quotes);

if (!n_types.is_null()) {
Expand Down Expand Up @@ -1882,7 +1888,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(JNIEnv* env,
jboolean keep_quotes,
jboolean strict_validation,
jboolean allow_leading_zeros,
jboolean allow_nonnumeric_numbers)
jboolean allow_nonnumeric_numbers,
jboolean allow_unquoted_control)
{
bool read_buffer = true;
if (buffer == 0) {
Expand Down Expand Up @@ -1934,6 +1941,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(JNIEnv* env,
.strict_validation(strict_validation)
.numeric_leading_zeros(allow_leading_zeros)
.nonnumeric_numbers(allow_nonnumeric_numbers)
.unquoted_control_chars(allow_unquoted_control)
.keep_quotes(keep_quotes);

if (!n_types.is_null()) {
Expand Down
Loading

0 comments on commit 4716288

Please sign in to comment.