Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cleanup common parsing code in JSON, CSV reader #12022

Merged
Merged
Show file tree
Hide file tree
Changes from 49 commits
Commits
Show all changes
50 commits
Select commit Hold shift + click to select a range
4423d20
trim quotes for non-string values in nested json parsing
karthikeyann Sep 30, 2022
1d69a69
add pytest compare cudf, cudf_experimental on values with quotes
karthikeyann Sep 30, 2022
b680e3f
remove unnecessary quote removal in timestamp and duration decode_value
karthikeyann Sep 30, 2022
49bdbb6
Merge branch 'branch-22.12' of github.com:rapidsai/cudf into fix-quot…
karthikeyann Oct 20, 2022
46663cf
null check before removing quotes
karthikeyann Oct 21, 2022
d2b76e3
return std::optional from parse_numeric, for validating parsing
karthikeyann Oct 21, 2022
c953410
add unit tests of corner cases
karthikeyann Oct 21, 2022
1646bcc
fix old json reader to treat "null" as string.
karthikeyann Oct 21, 2022
f0c468b
cleanup decode_digit
karthikeyann Oct 21, 2022
cad413c
fix doc
karthikeyann Oct 21, 2022
bbb434a
add TODO invalid input for decimal, duration, timestamp
karthikeyann Oct 21, 2022
0910f2e
fix bug in csv_reader_options construction in cython
karthikeyann Oct 26, 2022
31636ea
add pytest for breaking behaviour in read_csv
karthikeyann Oct 26, 2022
d2757dc
fix bug in csv_reader_options construction in cython
karthikeyann Oct 26, 2022
e26b3d8
add pytest for breaking behaviour in read_csv
karthikeyann Oct 26, 2022
27c8059
Merge branch 'branch-22.12' of github.com:rapidsai/cudf into fix-quot…
karthikeyann Oct 28, 2022
e119fac
parsing error in numeric types should make that element null
karthikeyann Oct 28, 2022
3f8eb52
revert parsing error changes
karthikeyann Oct 28, 2022
e0f6e3d
unit tests for string quotes
karthikeyann Oct 28, 2022
1317be3
remove debug prints
karthikeyann Oct 28, 2022
ce863b3
replace @param[in] with @param
karthikeyann Oct 28, 2022
d9fc33b
nan fix moved to PR #12022
karthikeyann Oct 28, 2022
1a6b10d
Merge branch 'fix-csv_reader_options-cython-bug' of github.com:karthi…
karthikeyann Oct 28, 2022
2ec4bd8
null for parsing error in bool - update pytest
karthikeyann Oct 28, 2022
ff77d5f
doc update
karthikeyann Oct 28, 2022
ba36b86
skip writing NAN for floating type for nulls
karthikeyann Oct 28, 2022
3e71482
fix dereferencing {begin - 1} in json reader
karthikeyann Oct 28, 2022
9b64cbd
doc update
karthikeyann Oct 28, 2022
90cdf3a
ignore data for nulls in CsvReaderTest.InvalidFloatingPoint test
karthikeyann Oct 28, 2022
6710b47
add pandas test for custom bool literals
karthikeyann Oct 28, 2022
99eedc0
Update python/cudf/cudf/tests/test_csv.py
karthikeyann Oct 31, 2022
0415252
static_cast<std::ptrdiff_t> for is_quoted in pointer arithmetic
karthikeyann Oct 31, 2022
8f9cca4
Merge branch 'pull-request/11898' of github.com:rapidsai/cudf into fe…
karthikeyann Oct 31, 2022
da4b16f
skip nan check for null elements in test
karthikeyann Oct 31, 2022
0c695b8
Merge branch 'pull-request/12021' of github.com:rapidsai/cudf into fe…
karthikeyann Oct 31, 2022
53e0f2b
Apply suggestions from code review
karthikeyann Nov 1, 2022
dd6a0ca
Update cpp/src/io/utilities/parsing_utils.cuh
karthikeyann Nov 1, 2022
cc2a348
review comments, style fix
karthikeyann Nov 2, 2022
dc228e5
pandas vs cudf csv parser differences
karthikeyann Nov 2, 2022
72a78be
Merge branch 'branch-22.12' of github.com:rapidsai/cudf into fix-csv_…
karthikeyann Nov 2, 2022
d86a5a0
Merge branch 'pull-request/11898' of github.com:rapidsai/cudf into fe…
karthikeyann Nov 2, 2022
4486193
Merge branch 'pull-request/12021' of github.com:rapidsai/cudf into fe…
karthikeyann Nov 2, 2022
efe6619
Merge branch 'branch-22.12' into fea-json-parsing-error-null
karthikeyann Nov 2, 2022
6fb3f96
Merge branch 'branch-22.12' of github.com:rapidsai/cudf into fea-json…
karthikeyann Nov 3, 2022
7e231ac
Apply suggestions from code review
karthikeyann Nov 3, 2022
d7fb88c
address review comments
karthikeyann Nov 14, 2022
6fbd8ac
Merge branch 'branch-22.12' of github.com:rapidsai/cudf into fea-json…
karthikeyann Nov 14, 2022
bea7fc3
replace std::nullopt
karthikeyann Nov 14, 2022
0e45d13
remove breaking behavior on nulls
karthikeyann Nov 14, 2022
df6844c
TODO clarify
karthikeyann Nov 15, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
154 changes: 50 additions & 104 deletions cpp/src/io/utilities/parsing_utils.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -117,8 +117,9 @@ struct parse_options {
};

/**
* @brief Returns the numeric value of an ASCII/UTF-8 character. Specialization
* for integral types. Handles hexadecimal digits, both uppercase and lowercase.
* @brief Returns the numeric value of an ASCII/UTF-8 character.
* Handles hexadecimal digits, both uppercase and lowercase
* for integral types and only decimal digits for floating point types.
* If the character is not a valid numeric digit then `0` is returned and
* valid_flag is set to false.
*
Expand All @@ -127,31 +128,14 @@ struct parse_options {
*
* @return uint8_t Numeric value of the character, or `0`
*/
template <typename T, CUDF_ENABLE_IF(std::is_integral_v<T>)>
constexpr uint8_t decode_digit(char c, bool* valid_flag)
{
if (c >= '0' && c <= '9') return c - '0';
if (c >= 'a' && c <= 'f') return c - 'a' + 10;
if (c >= 'A' && c <= 'F') return c - 'A' + 10;

*valid_flag = false;
return 0;
}

/**
* @brief Returns the numeric value of an ASCII/UTF-8 character. Specialization
* for non-integral types. Handles only decimal digits. If the character is not
* a valid numeric digit then `0` is returned and valid_flag is set to false.
*
* @param c ASCII or UTF-8 character
* @param valid_flag Set to false if input is not valid. Unchanged otherwise.
*
* @return uint8_t Numeric value of the character, or `0`
*/
template <typename T, CUDF_ENABLE_IF(!std::is_integral_v<T>)>
template <typename T, bool as_hex = false>
constexpr uint8_t decode_digit(char c, bool* valid_flag)
{
if (c >= '0' && c <= '9') return c - '0';
if constexpr (as_hex and std::is_integral_v<T>) {
if (c >= 'a' && c <= 'f') return c - 'a' + 10;
if (c >= 'A' && c <= 'F') return c - 'A' + 10;
}

*valid_flag = false;
return 0;
Expand Down Expand Up @@ -194,13 +178,13 @@ constexpr bool is_infinity(char const* begin, char const* end)
* @return The parsed and converted value
*/
template <typename T, int base = 10>
constexpr T parse_numeric(char const* begin,
char const* end,
parse_options_view const& opts,
T error_result = std::numeric_limits<T>::quiet_NaN())
__host__ __device__ std::optional<T> parse_numeric(char const* begin,
char const* end,
parse_options_view const& opts)
{
T value{};
bool all_digits_valid = true;
constexpr bool as_hex = (base == 16);

// Handle negative values if necessary
int32_t sign = (*begin == '-') ? -1 : 1;
Expand All @@ -223,7 +207,7 @@ constexpr T parse_numeric(char const* begin,
} else if (base == 10 && (*begin == 'e' || *begin == 'E')) {
break;
} else if (*begin != opts.thousands && *begin != '+') {
value = (value * base) + decode_digit<T>(*begin, &all_digits_valid);
value = (value * base) + decode_digit<T, as_hex>(*begin, &all_digits_valid);
}
++begin;
}
Expand All @@ -237,7 +221,7 @@ constexpr T parse_numeric(char const* begin,
break;
} else if (*begin != opts.thousands && *begin != '+') {
divisor /= base;
value += decode_digit<T>(*begin, &all_digits_valid) * divisor;
value += decode_digit<T, as_hex>(*begin, &all_digits_valid) * divisor;
}
++begin;
}
Expand All @@ -248,12 +232,12 @@ constexpr T parse_numeric(char const* begin,
if (*begin == '-' || *begin == '+') { ++begin; }
int32_t exponent = 0;
while (begin < end) {
exponent = (exponent * 10) + decode_digit<T>(*(begin++), &all_digits_valid);
exponent = (exponent * 10) + decode_digit<T, as_hex>(*(begin++), &all_digits_valid);
}
if (exponent != 0) { value *= exp10(double(exponent * exponent_sign)); }
}
}
if (!all_digits_valid) { return error_result; }
if (!all_digits_valid) { return std::optional<T>{}; }

return value * sign;
}
Expand Down Expand Up @@ -485,7 +469,7 @@ cudf::size_type count_all_from_set(host_span<char const> data,
/**
* @brief Checks whether the given character is a whitespace character.
*
* @param[in] ch The character to check
* @param ch The character to check
*
* @return True if the input is whitespace, False otherwise
*/
Expand Down Expand Up @@ -567,65 +551,6 @@ __inline__ __device__ std::pair<char const*, char const*> trim_quotes(char const
return {begin, end};
}

/**
* @brief Decodes a numeric value base on templated cudf type T with specified
* base.
*
* @param[in] begin Beginning of the character string
* @param[in] end End of the character string
* @param opts The global parsing behavior options
*
* @return The parsed numeric value
*/
template <typename T, int base>
__inline__ __device__ T decode_value(char const* begin,
char const* end,
parse_options_view const& opts)
{
return cudf::io::parse_numeric<T, base>(begin, end, opts);
}

/**
* @brief Decodes a numeric value base on templated cudf type T
*
* @param[in] begin Beginning of the character string
* @param[in] end End of the character string
* @param opts The global parsing behavior options
*
* @return The parsed numeric value
*/
template <typename T, CUDF_ENABLE_IF(!cudf::is_timestamp<T>() and !cudf::is_duration<T>())>
__inline__ __device__ T decode_value(char const* begin,
char const* end,
parse_options_view const& opts)
{
return cudf::io::parse_numeric<T>(begin, end, opts);
}

template <typename T, CUDF_ENABLE_IF(cudf::is_timestamp<T>())>
__inline__ __device__ T decode_value(char const* begin,
char const* end,
parse_options_view const& opts)
{
// If this is a string value, remove quotes
if ((thrust::distance(begin, end) >= 2 && *begin == '\"' && *thrust::prev(end) == '\"')) {
thrust::advance(begin, 1);
thrust::advance(end, -1);
}
return to_timestamp<T>(begin, end, opts.dayfirst);
}

template <typename T, CUDF_ENABLE_IF(cudf::is_duration<T>())>
__inline__ __device__ T decode_value(char const* begin, char const* end, parse_options_view const&)
{
// If this is a string value, remove quotes
if ((thrust::distance(begin, end) >= 2 && *begin == '\"' && *thrust::prev(end) == '\"')) {
thrust::advance(begin, 1);
thrust::advance(end, -1);
}
return to_duration<T>(begin, end);
}

struct ConvertFunctor {
/**
* @brief Dispatch for numeric types whose values can be convertible to
Expand All @@ -645,13 +570,15 @@ struct ConvertFunctor {
parse_options_view const& opts,
bool as_hex = false)
{
static_cast<T*>(out_buffer)[row] = [as_hex, &opts, begin, end]() -> T {
auto const value = [as_hex, &opts, begin, end]() -> std::optional<T> {
// Check for user-specified true/false values
auto const field_len = static_cast<size_t>(end - begin);
if (serialized_trie_contains(opts.trie_true, {begin, field_len})) { return 1; }
if (serialized_trie_contains(opts.trie_false, {begin, field_len})) { return 0; }
return as_hex ? decode_value<T, 16>(begin, end, opts) : decode_value<T>(begin, end, opts);
return as_hex ? cudf::io::parse_numeric<T, 16>(begin, end, opts)
: cudf::io::parse_numeric<T>(begin, end, opts);
}();
static_cast<T*>(out_buffer)[row] = value.value_or(std::numeric_limits<T>::quiet_NaN());

return true;
}
Expand All @@ -670,6 +597,7 @@ struct ConvertFunctor {
parse_options_view const& opts,
bool as_hex)
{
// TODO what's invalid input
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you please make these TODOs clearer if we're planning to merge them?
I looked into the code and don't see that invalid input is handled in a robust way in parse_decimal and to_timestamp. Is that what this TODO needs to remind us of?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes. this TODO is a reminder for future and also to remind reviewers to discuss and decide what is invalid input for this type.

static_cast<device_storage_type_t<T>*>(out_buffer)[row] =
[&opts, output_type, begin, end]() -> device_storage_type_t<T> {
return strings::detail::parse_decimal<device_storage_type_t<T>>(
Expand All @@ -691,13 +619,18 @@ struct ConvertFunctor {
parse_options_view const& opts,
bool as_hex)
{
static_cast<T*>(out_buffer)[row] = [&opts, begin, end]() {
auto const value = [&opts, begin, end]() -> std::optional<T> {
// Check for user-specified true/false values
auto const field_len = static_cast<size_t>(end - begin);
if (serialized_trie_contains(opts.trie_true, {begin, field_len})) { return true; }
if (serialized_trie_contains(opts.trie_false, {begin, field_len})) { return false; }
return decode_value<T>(begin, end, opts);
if (serialized_trie_contains(opts.trie_true, {begin, field_len})) {
return static_cast<T>(true);
}
if (serialized_trie_contains(opts.trie_false, {begin, field_len})) {
return static_cast<T>(false);
}
return cudf::io::parse_numeric<T>(begin, end, opts);
}();
static_cast<T*>(out_buffer)[row] = value.value_or(std::numeric_limits<T>::quiet_NaN());

return true;
}
Expand All @@ -715,10 +648,20 @@ struct ConvertFunctor {
parse_options_view const& opts,
bool as_hex)
{
T const value = decode_value<T>(begin, end, opts);
static_cast<T*>(out_buffer)[row] = value;
auto const value = [&opts, begin, end]() -> std::optional<T> {
// Check for user-specified true/false values
auto const field_len = static_cast<size_t>(end - begin);
if (serialized_trie_contains(opts.trie_true, {begin, field_len})) {
return static_cast<T>(true);
}
if (serialized_trie_contains(opts.trie_false, {begin, field_len})) {
return static_cast<T>(false);
}
return cudf::io::parse_numeric<T>(begin, end, opts);
}();
static_cast<T*>(out_buffer)[row] = value.value_or(std::numeric_limits<T>::quiet_NaN());

return !std::isnan(value);
return value.has_value() and !std::isnan(*value);
}

/**
Expand All @@ -735,12 +678,15 @@ struct ConvertFunctor {
parse_options_view const& opts,
bool as_hex)
{
if constexpr (cudf::is_timestamp<T>() or cudf::is_duration<T>()) {
static_cast<T*>(out_buffer)[row] = decode_value<T>(begin, end, opts);
return true;
// TODO what's invalid input
if constexpr (cudf::is_timestamp<T>()) {
static_cast<T*>(out_buffer)[row] = to_timestamp<T>(begin, end, opts.dayfirst);
} else if constexpr (cudf::is_duration<T>()) {
static_cast<T*>(out_buffer)[row] = to_duration<T>(begin, end);
} else {
return false;
}
return true;
}
};

Expand Down
7 changes: 4 additions & 3 deletions cpp/src/strings/json/json_path.cu
Original file line number Diff line number Diff line change
Expand Up @@ -570,9 +570,10 @@ class path_state : private parser {
op.type = path_operator_type::CHILD;
op.expected_type = OBJECT;
} else {
op.type = path_operator_type::CHILD_INDEX;
op.index = cudf::io::parse_numeric<int>(
op.name.data(), op.name.data() + op.name.size_bytes(), json_opts, -1);
op.type = path_operator_type::CHILD_INDEX;
auto const value = cudf::io::parse_numeric<int>(
op.name.data(), op.name.data() + op.name.size_bytes(), json_opts);
op.index = value.value_or(-1);
CUDF_EXPECTS(op.index >= 0, "Invalid numeric index specified in JSONPath");
vyasr marked this conversation as resolved.
Show resolved Hide resolved
op.expected_type = ARRAY;
}
Expand Down
Loading