From fd488cd82f26761f76a13bfa807eae5b10093a28 Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Tue, 15 Nov 2022 21:51:23 +0530 Subject: [PATCH] Cleanup common parsing code in JSON, CSV reader (#12022) This PR will cleanup nested json reader and csv reader's common parsing code. - Uses `std::optional` for indicating parsing failure in `parse_numeric` - Cleanup - Removed `decode_value` as it only gives only specialization for timestamp and duration types, rest of types are passthrough. - Unified `decode_digit` Depends on #11898 and #12021 Authors: - Karthikeyan (https://github.com/karthikeyann) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - GALI PREM SAGAR (https://github.com/galipremsagar) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/12022 --- cpp/src/io/utilities/parsing_utils.cuh | 154 ++++++++----------------- cpp/src/strings/json/json_path.cu | 7 +- cpp/tests/io/json_test.cpp | 112 ++++++++++++++++++ 3 files changed, 166 insertions(+), 107 deletions(-) diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh index cbd417c2b5b..89806956ae5 100644 --- a/cpp/src/io/utilities/parsing_utils.cuh +++ b/cpp/src/io/utilities/parsing_utils.cuh @@ -117,8 +117,9 @@ struct parse_options { }; /** - * @brief Returns the numeric value of an ASCII/UTF-8 character. Specialization - * for integral types. Handles hexadecimal digits, both uppercase and lowercase. + * @brief Returns the numeric value of an ASCII/UTF-8 character. + * Handles hexadecimal digits, both uppercase and lowercase + * for integral types and only decimal digits for floating point types. * If the character is not a valid numeric digit then `0` is returned and * valid_flag is set to false. * @@ -127,31 +128,14 @@ struct parse_options { * * @return uint8_t Numeric value of the character, or `0` */ -template )> -constexpr uint8_t decode_digit(char c, bool* valid_flag) -{ - if (c >= '0' && c <= '9') return c - '0'; - if (c >= 'a' && c <= 'f') return c - 'a' + 10; - if (c >= 'A' && c <= 'F') return c - 'A' + 10; - - *valid_flag = false; - return 0; -} - -/** - * @brief Returns the numeric value of an ASCII/UTF-8 character. Specialization - * for non-integral types. Handles only decimal digits. If the character is not - * a valid numeric digit then `0` is returned and valid_flag is set to false. - * - * @param c ASCII or UTF-8 character - * @param valid_flag Set to false if input is not valid. Unchanged otherwise. - * - * @return uint8_t Numeric value of the character, or `0` - */ -template )> +template constexpr uint8_t decode_digit(char c, bool* valid_flag) { if (c >= '0' && c <= '9') return c - '0'; + if constexpr (as_hex and std::is_integral_v) { + if (c >= 'a' && c <= 'f') return c - 'a' + 10; + if (c >= 'A' && c <= 'F') return c - 'A' + 10; + } *valid_flag = false; return 0; @@ -194,13 +178,13 @@ constexpr bool is_infinity(char const* begin, char const* end) * @return The parsed and converted value */ template -constexpr T parse_numeric(char const* begin, - char const* end, - parse_options_view const& opts, - T error_result = std::numeric_limits::quiet_NaN()) +__host__ __device__ std::optional parse_numeric(char const* begin, + char const* end, + parse_options_view const& opts) { T value{}; bool all_digits_valid = true; + constexpr bool as_hex = (base == 16); // Handle negative values if necessary int32_t sign = (*begin == '-') ? -1 : 1; @@ -223,7 +207,7 @@ constexpr T parse_numeric(char const* begin, } else if (base == 10 && (*begin == 'e' || *begin == 'E')) { break; } else if (*begin != opts.thousands && *begin != '+') { - value = (value * base) + decode_digit(*begin, &all_digits_valid); + value = (value * base) + decode_digit(*begin, &all_digits_valid); } ++begin; } @@ -237,7 +221,7 @@ constexpr T parse_numeric(char const* begin, break; } else if (*begin != opts.thousands && *begin != '+') { divisor /= base; - value += decode_digit(*begin, &all_digits_valid) * divisor; + value += decode_digit(*begin, &all_digits_valid) * divisor; } ++begin; } @@ -248,12 +232,12 @@ constexpr T parse_numeric(char const* begin, if (*begin == '-' || *begin == '+') { ++begin; } int32_t exponent = 0; while (begin < end) { - exponent = (exponent * 10) + decode_digit(*(begin++), &all_digits_valid); + exponent = (exponent * 10) + decode_digit(*(begin++), &all_digits_valid); } if (exponent != 0) { value *= exp10(double(exponent * exponent_sign)); } } } - if (!all_digits_valid) { return error_result; } + if (!all_digits_valid) { return std::optional{}; } return value * sign; } @@ -485,7 +469,7 @@ cudf::size_type count_all_from_set(host_span data, /** * @brief Checks whether the given character is a whitespace character. * - * @param[in] ch The character to check + * @param ch The character to check * * @return True if the input is whitespace, False otherwise */ @@ -567,65 +551,6 @@ __inline__ __device__ std::pair trim_quotes(char const return {begin, end}; } -/** - * @brief Decodes a numeric value base on templated cudf type T with specified - * base. - * - * @param[in] begin Beginning of the character string - * @param[in] end End of the character string - * @param opts The global parsing behavior options - * - * @return The parsed numeric value - */ -template -__inline__ __device__ T decode_value(char const* begin, - char const* end, - parse_options_view const& opts) -{ - return cudf::io::parse_numeric(begin, end, opts); -} - -/** - * @brief Decodes a numeric value base on templated cudf type T - * - * @param[in] begin Beginning of the character string - * @param[in] end End of the character string - * @param opts The global parsing behavior options - * - * @return The parsed numeric value - */ -template () and !cudf::is_duration())> -__inline__ __device__ T decode_value(char const* begin, - char const* end, - parse_options_view const& opts) -{ - return cudf::io::parse_numeric(begin, end, opts); -} - -template ())> -__inline__ __device__ T decode_value(char const* begin, - char const* end, - parse_options_view const& opts) -{ - // If this is a string value, remove quotes - if ((thrust::distance(begin, end) >= 2 && *begin == '\"' && *thrust::prev(end) == '\"')) { - thrust::advance(begin, 1); - thrust::advance(end, -1); - } - return to_timestamp(begin, end, opts.dayfirst); -} - -template ())> -__inline__ __device__ T decode_value(char const* begin, char const* end, parse_options_view const&) -{ - // If this is a string value, remove quotes - if ((thrust::distance(begin, end) >= 2 && *begin == '\"' && *thrust::prev(end) == '\"')) { - thrust::advance(begin, 1); - thrust::advance(end, -1); - } - return to_duration(begin, end); -} - struct ConvertFunctor { /** * @brief Dispatch for numeric types whose values can be convertible to @@ -645,13 +570,15 @@ struct ConvertFunctor { parse_options_view const& opts, bool as_hex = false) { - static_cast(out_buffer)[row] = [as_hex, &opts, begin, end]() -> T { + auto const value = [as_hex, &opts, begin, end]() -> std::optional { // Check for user-specified true/false values auto const field_len = static_cast(end - begin); if (serialized_trie_contains(opts.trie_true, {begin, field_len})) { return 1; } if (serialized_trie_contains(opts.trie_false, {begin, field_len})) { return 0; } - return as_hex ? decode_value(begin, end, opts) : decode_value(begin, end, opts); + return as_hex ? cudf::io::parse_numeric(begin, end, opts) + : cudf::io::parse_numeric(begin, end, opts); }(); + static_cast(out_buffer)[row] = value.value_or(std::numeric_limits::quiet_NaN()); return true; } @@ -670,6 +597,7 @@ struct ConvertFunctor { parse_options_view const& opts, bool as_hex) { + // TODO decide what's invalid input and update parsing functions static_cast*>(out_buffer)[row] = [&opts, output_type, begin, end]() -> device_storage_type_t { return strings::detail::parse_decimal>( @@ -691,13 +619,18 @@ struct ConvertFunctor { parse_options_view const& opts, bool as_hex) { - static_cast(out_buffer)[row] = [&opts, begin, end]() { + auto const value = [&opts, begin, end]() -> std::optional { // Check for user-specified true/false values auto const field_len = static_cast(end - begin); - if (serialized_trie_contains(opts.trie_true, {begin, field_len})) { return true; } - if (serialized_trie_contains(opts.trie_false, {begin, field_len})) { return false; } - return decode_value(begin, end, opts); + if (serialized_trie_contains(opts.trie_true, {begin, field_len})) { + return static_cast(true); + } + if (serialized_trie_contains(opts.trie_false, {begin, field_len})) { + return static_cast(false); + } + return cudf::io::parse_numeric(begin, end, opts); }(); + static_cast(out_buffer)[row] = value.value_or(std::numeric_limits::quiet_NaN()); return true; } @@ -715,10 +648,20 @@ struct ConvertFunctor { parse_options_view const& opts, bool as_hex) { - T const value = decode_value(begin, end, opts); - static_cast(out_buffer)[row] = value; + auto const value = [&opts, begin, end]() -> std::optional { + // Check for user-specified true/false values + auto const field_len = static_cast(end - begin); + if (serialized_trie_contains(opts.trie_true, {begin, field_len})) { + return static_cast(true); + } + if (serialized_trie_contains(opts.trie_false, {begin, field_len})) { + return static_cast(false); + } + return cudf::io::parse_numeric(begin, end, opts); + }(); + static_cast(out_buffer)[row] = value.value_or(std::numeric_limits::quiet_NaN()); - return !std::isnan(value); + return value.has_value() and !std::isnan(*value); } /** @@ -735,12 +678,15 @@ struct ConvertFunctor { parse_options_view const& opts, bool as_hex) { - if constexpr (cudf::is_timestamp() or cudf::is_duration()) { - static_cast(out_buffer)[row] = decode_value(begin, end, opts); - return true; + // TODO decide what's invalid input and update parsing functions + if constexpr (cudf::is_timestamp()) { + static_cast(out_buffer)[row] = to_timestamp(begin, end, opts.dayfirst); + } else if constexpr (cudf::is_duration()) { + static_cast(out_buffer)[row] = to_duration(begin, end); } else { return false; } + return true; } }; diff --git a/cpp/src/strings/json/json_path.cu b/cpp/src/strings/json/json_path.cu index 303c35ea7fb..5ce00eeee4a 100644 --- a/cpp/src/strings/json/json_path.cu +++ b/cpp/src/strings/json/json_path.cu @@ -570,9 +570,10 @@ class path_state : private parser { op.type = path_operator_type::CHILD; op.expected_type = OBJECT; } else { - op.type = path_operator_type::CHILD_INDEX; - op.index = cudf::io::parse_numeric( - op.name.data(), op.name.data() + op.name.size_bytes(), json_opts, -1); + op.type = path_operator_type::CHILD_INDEX; + auto const value = cudf::io::parse_numeric( + op.name.data(), op.name.data() + op.name.size_bytes(), json_opts); + op.index = value.value_or(-1); CUDF_EXPECTS(op.index >= 0, "Invalid numeric index specified in JSONPath"); op.expected_type = ARRAY; } diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index b8cd4622484..f7b21008f70 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -29,6 +30,7 @@ #include #include +#include #include #include @@ -1451,4 +1453,114 @@ TEST_F(JsonReaderTest, JsonNestedDtypeSchema) int_wrapper{{1, 1, 2}, {true, true, true}}); } +TEST_P(JsonReaderParamTest, JsonDtypeParsing) +{ + auto const test_opt = GetParam(); + bool const test_experimental = (test_opt == json_test_t::json_experimental_record_orient); + // All corner cases of dtype parsing + // 0, "0", " 0", 1, "1", " 1", "a", "z", null, true, false, "null", "true", "false", nan, "nan" + // Test for dtypes: bool, int, float, str, duration, timestamp + std::string row_orient = + "[0]\n[\"0\"]\n[\" 0\"]\n[1]\n[\"1\"]\n[\" 1\"]\n[\"a\"]\n[\"z\"]\n" + "[null]\n[true]\n[false]\n[\"null\"]\n[\"true\"]\n[\"false\"]\n[nan]\n[\"nan\"]\n"; + std::string record_orient = to_records_orient({{{"0", "0"}}, + {{"0", "\"0\""}}, + {{"0", "\" 0\""}}, + {{"0", "1"}}, + {{"0", "\"1\""}}, + {{"0", "\" 1\""}}, + {{"0", "\"a\""}}, + {{"0", "\"z\""}}, + {{"0", "null"}}, + {{"0", "true"}}, + {{"0", "false"}}, + {{"0", "\"null\""}}, + {{"0", "\"true\""}}, + {{"0", "\"false\""}}, + {{"0", "nan"}}, + {{"0", "\"nan\""}}}, + "\n"); + + std::string data = (test_opt == json_test_t::json_lines_row_orient) ? row_orient : record_orient; + + auto make_validity = [](std::vector const& validity) { + return cudf::detail::make_counting_transform_iterator( + 0, [=](auto i) -> bool { return static_cast(validity[i]); }); + }; + + constexpr int int_NA = 0; + constexpr double double_NA = std::numeric_limits::quiet_NaN(); + constexpr bool bool_NA = false; + + std::vector const validity = {1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0}; + + auto int_col = int_wrapper{ + {0, 0, int_NA, 1, 1, int_NA, int_NA, int_NA, int_NA, 1, 0, int_NA, 1, 0, int_NA, int_NA}, + cudf::test::iterators::nulls_at(std::vector{8})}; + auto float_col = float_wrapper{{0.0, + 0.0, + double_NA, + 1.0, + 1.0, + double_NA, + double_NA, + double_NA, + double_NA, + 1.0, + 0.0, + double_NA, + 1.0, + 0.0, + double_NA, + double_NA}, + make_validity(validity)}; + auto str_col = + cudf::test::strings_column_wrapper{// clang-format off + {"0", "0", " 0", "1", "1", " 1", "a", "z", "", "true", "false", "null", "true", "false", "nan", "nan"}, + cudf::test::iterators::nulls_at(std::vector{8})}; + // clang-format on + auto bool_col = bool_wrapper{{false, + false, + bool_NA, + true, + true, + bool_NA, + bool_NA, + bool_NA, + bool_NA, + true, + false, + bool_NA, + true, + false, + bool_NA, + bool_NA}, + cudf::test::iterators::nulls_at(std::vector{8})}; + + // Types to test + const std::vector dtypes = { + dtype(), dtype(), dtype(), dtype()}; + const std::vector cols{cudf::column_view(int_col), + cudf::column_view(float_col), + cudf::column_view(str_col), + cudf::column_view(bool_col)}; + for (size_t col_type = 0; col_type < cols.size(); col_type++) { + std::map dtype_schema{{"0", {dtypes[col_type]}}}; + cudf::io::json_reader_options in_options = + cudf::io::json_reader_options::builder(cudf::io::source_info{data.data(), data.size()}) + .dtypes(dtype_schema) + .lines(true) + .experimental(test_experimental); + + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + + EXPECT_EQ(result.tbl->num_columns(), 1); + EXPECT_EQ(result.tbl->num_rows(), 16); + EXPECT_EQ(result.metadata.schema_info[0].name, "0"); + + EXPECT_EQ(result.tbl->get_column(0).type().id(), dtypes[col_type].id()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), cols[col_type]); + } +} + CUDF_TEST_PROGRAM_MAIN()