From 4423d201a3871aa7bb51e415306274890ac5bc82 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Fri, 30 Sep 2022 16:03:37 +0530 Subject: [PATCH 01/39] trim quotes for non-string values in nested json parsing --- cpp/include/cudf/io/detail/data_casting.cuh | 12 +++++++----- cpp/src/io/utilities/parsing_utils.cuh | 21 +++++++++++++++++++++ 2 files changed, 28 insertions(+), 5 deletions(-) diff --git a/cpp/include/cudf/io/detail/data_casting.cuh b/cpp/include/cudf/io/detail/data_casting.cuh index 628c00ad603..4b8b016f7f5 100644 --- a/cpp/include/cudf/io/detail/data_casting.cuh +++ b/cpp/include/cudf/io/detail/data_casting.cuh @@ -381,10 +381,12 @@ std::unique_ptr parse_data(str_tuple_it str_tuples, col_size, [str_tuples, col = *output_dv_ptr, options, col_type] __device__(size_type row) { if (col.is_null(row)) { return; } - auto const in = str_tuples[row]; + // If this is a string value, remove quotes + auto [in_begin, in_end] = trim_quotes( + str_tuples[row].first, str_tuples[row].first + str_tuples[row].second, options.quotechar); - auto const is_null_literal = - serialized_trie_contains(options.trie_na, {in.first, static_cast(in.second)}); + auto const is_null_literal = serialized_trie_contains( + options.trie_na, {in_begin, static_cast(in_end - in_begin)}); if (is_null_literal) { col.set_null(row); @@ -393,8 +395,8 @@ std::unique_ptr parse_data(str_tuple_it str_tuples, auto const is_parsed = cudf::type_dispatcher(col_type, ConvertFunctor{}, - in.first, - in.first + in.second, + in_begin, + in_end, col.data(), row, col_type, diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh index 388c9b28001..7964f43b783 100644 --- a/cpp/src/io/utilities/parsing_utils.cuh +++ b/cpp/src/io/utilities/parsing_utils.cuh @@ -523,6 +523,27 @@ __inline__ __device__ std::pair trim_whitespaces_quote return {skip_character(trim_begin, quotechar), skip_character(trim_end, quotechar).base()}; } +/** + * @brief Adjusts the range to ignore starting/trailing quotation characters. + * + * @param[in] begin Pointer to the first character in the parsing range + * @param[in] end pointer to the first character after the parsing range + * @param[in] quotechar The character used to denote quotes; '\0' if none + * + * @return Trimmed range + */ +__inline__ __device__ std::pair trim_quotes(char const* begin, + char const* end, + char quotechar = '\0') +{ + if ((thrust::distance(begin, end) >= 2 && *begin == quotechar && + *thrust::prev(end) == quotechar)) { + thrust::advance(begin, 1); + thrust::advance(end, -1); + } + return {begin, end}; +} + /** * @brief Decodes a numeric value base on templated cudf type T with specified * base. From 1d69a69a863bbb26da274b04005a07d132ac34df Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Fri, 30 Sep 2022 17:18:09 +0530 Subject: [PATCH 02/39] add pytest compare cudf, cudf_experimental on values with quotes --- python/cudf/cudf/tests/test_json.py | 52 +++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py index 1fdef44546a..8e3c6fc6bda 100644 --- a/python/cudf/cudf/tests/test_json.py +++ b/python/cudf/cudf/tests/test_json.py @@ -669,6 +669,58 @@ def test_json_types_data(): assert df.to_arrow().equals(pa_table_pdf) +@pytest.mark.parametrize( + "col_type,json_str", + [ + # without quotes + ("int", '[{"k": 1}, {"k": 2}, {"k": 3}, {"k": 4}]'), + # with quotes + ("int", '[{"k": "1"}, {"k": "2"}]'), + # with quotes, mixed + ("int", '[{"k": "1"}, {"k": "2"}, {"k": 3}, {"k": 4}]'), + # with quotes, null, mixed + ("int", '[{"k": "1"}, {"k": "2"}, {"k": null}, {"k": 4}]'), + # without quotes, null + ("int", '[{"k": 1}, {"k": 2}, {"k": null}, {"k": 4}]'), + # without quotes + ("float", '[{"k": 1}, {"k": 2}, {"k": 3}, {"k": 4}]'), + # with quotes + ("float", '[{"k": "1"}, {"k": "2"}]'), + # with quotes, mixed + ("float", '[{"k": "1"}, {"k": "2"}, {"k": 3}, {"k": 4}]'), + # with quotes, null, mixed + ("float", '[{"k": "1"}, {"k": "2"}, {"k": null}, {"k": 4}]'), + # with quotes, NAN + ("float", '[{"k": "1"}, {"k": "2"}, {"k": NaN}, {"k": "4"}]'), + # without quotes + ("str", '[{"k": 1}, {"k": 2}, {"k": 3}, {"k": 4}]'), + # with quotes + ("str", '[{"k": "1"}, {"k": "2"}]'), + # with quotes, mixed + ("str", '[{"k": "1"}, {"k": "2"}, {"k": 3}, {"k": 4}]'), + # with quotes, null, mixed + ("str", '[{"k": "1"}, {"k": "2"}, {"k": null}, {"k": 4}]'), + # without quotes, null + ("str", '[{"k": 1}, {"k": 2}, {"k": null}, {"k": 4}]'), + ], +) +def test_json_quoted_values_with_schema(col_type, json_str): + experimental_df = cudf.read_json( + StringIO(json_str), + engine="cudf_experimental", + orient="records", + dtype={"k": col_type}, + ) + cudf_df = cudf.read_json( + StringIO(json_str.replace(",", "\n")[1:-1]), + engine="cudf", + orient="records", + lines=True, + dtype={"k": col_type}, + ) + assert_eq(cudf_df, experimental_df) + + @pytest.mark.parametrize( "keep_quotes,result", [ From b680e3f4cd652310cb012eb6afa9d15503854801 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Fri, 30 Sep 2022 19:03:37 +0530 Subject: [PATCH 03/39] remove unnecessary quote removal in timestamp and duration decode_value --- cpp/src/io/utilities/parsing_utils.cuh | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh index 7964f43b783..a075fa18531 100644 --- a/cpp/src/io/utilities/parsing_utils.cuh +++ b/cpp/src/io/utilities/parsing_utils.cuh @@ -584,22 +584,12 @@ __inline__ __device__ T decode_value(char const* begin, char const* end, parse_options_view const& opts) { - // If this is a string value, remove quotes - if ((thrust::distance(begin, end) >= 2 && *begin == '\"' && *thrust::prev(end) == '\"')) { - thrust::advance(begin, 1); - thrust::advance(end, -1); - } return to_timestamp(begin, end, opts.dayfirst); } template ())> __inline__ __device__ T decode_value(char const* begin, char const* end, parse_options_view const&) { - // If this is a string value, remove quotes - if ((thrust::distance(begin, end) >= 2 && *begin == '\"' && *thrust::prev(end) == '\"')) { - thrust::advance(begin, 1); - thrust::advance(end, -1); - } return to_duration(begin, end); } From 46663cfc79107b97351718a088c24365de066a54 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Sat, 22 Oct 2022 03:51:20 +0530 Subject: [PATCH 04/39] null check before removing quotes --- cpp/include/cudf/io/detail/data_casting.cuh | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/cpp/include/cudf/io/detail/data_casting.cuh b/cpp/include/cudf/io/detail/data_casting.cuh index 4b8b016f7f5..aba9ec07bc6 100644 --- a/cpp/include/cudf/io/detail/data_casting.cuh +++ b/cpp/include/cudf/io/detail/data_casting.cuh @@ -381,18 +381,19 @@ std::unique_ptr parse_data(str_tuple_it str_tuples, col_size, [str_tuples, col = *output_dv_ptr, options, col_type] __device__(size_type row) { if (col.is_null(row)) { return; } - // If this is a string value, remove quotes - auto [in_begin, in_end] = trim_quotes( - str_tuples[row].first, str_tuples[row].first + str_tuples[row].second, options.quotechar); + auto const in = str_tuples[row]; - auto const is_null_literal = serialized_trie_contains( - options.trie_na, {in_begin, static_cast(in_end - in_begin)}); + auto const is_null_literal = + serialized_trie_contains(options.trie_na, {in.first, static_cast(in.second)}); if (is_null_literal) { col.set_null(row); return; } + // If this is a string value, remove quotes + auto [in_begin, in_end] = trim_quotes(in.first, in.first + in.second, options.quotechar); + auto const is_parsed = cudf::type_dispatcher(col_type, ConvertFunctor{}, in_begin, From d2b76e356073d3e9e781d6166e23f0fd76e1debb Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Sat, 22 Oct 2022 03:52:20 +0530 Subject: [PATCH 05/39] return std::optional from parse_numeric, for validating parsing --- cpp/src/io/utilities/parsing_utils.cuh | 115 +++++++++---------------- cpp/src/strings/json/json_path.cu | 7 +- 2 files changed, 44 insertions(+), 78 deletions(-) diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh index a075fa18531..b37ab9d2b55 100644 --- a/cpp/src/io/utilities/parsing_utils.cuh +++ b/cpp/src/io/utilities/parsing_utils.cuh @@ -127,12 +127,14 @@ struct parse_options { * * @return uint8_t Numeric value of the character, or `0` */ -template )> +template )> constexpr uint8_t decode_digit(char c, bool* valid_flag) { if (c >= '0' && c <= '9') return c - '0'; - if (c >= 'a' && c <= 'f') return c - 'a' + 10; - if (c >= 'A' && c <= 'F') return c - 'A' + 10; + if constexpr (as_hex) { + if (c >= 'a' && c <= 'f') return c - 'a' + 10; + if (c >= 'A' && c <= 'F') return c - 'A' + 10; + } *valid_flag = false; return 0; @@ -148,7 +150,7 @@ constexpr uint8_t decode_digit(char c, bool* valid_flag) * * @return uint8_t Numeric value of the character, or `0` */ -template )> +template )> constexpr uint8_t decode_digit(char c, bool* valid_flag) { if (c >= '0' && c <= '9') return c - '0'; @@ -194,13 +196,13 @@ constexpr bool is_infinity(char const* begin, char const* end) * @return The parsed and converted value */ template -constexpr T parse_numeric(char const* begin, - char const* end, - parse_options_view const& opts, - T error_result = std::numeric_limits::quiet_NaN()) +__host__ __device__ std::optional parse_numeric(char const* begin, + char const* end, + parse_options_view const& opts) { T value{}; bool all_digits_valid = true; + constexpr bool as_hex = (base == 16); // Handle negative values if necessary int32_t sign = (*begin == '-') ? -1 : 1; @@ -223,7 +225,7 @@ constexpr T parse_numeric(char const* begin, } else if (base == 10 && (*begin == 'e' || *begin == 'E')) { break; } else if (*begin != opts.thousands && *begin != '+') { - value = (value * base) + decode_digit(*begin, &all_digits_valid); + value = (value * base) + decode_digit(*begin, &all_digits_valid); } ++begin; } @@ -237,7 +239,7 @@ constexpr T parse_numeric(char const* begin, break; } else if (*begin != opts.thousands && *begin != '+') { divisor /= base; - value += decode_digit(*begin, &all_digits_valid) * divisor; + value += decode_digit(*begin, &all_digits_valid) * divisor; } ++begin; } @@ -248,12 +250,12 @@ constexpr T parse_numeric(char const* begin, if (*begin == '-' || *begin == '+') { ++begin; } int32_t exponent = 0; while (begin < end) { - exponent = (exponent * 10) + decode_digit(*(begin++), &all_digits_valid); + exponent = (exponent * 10) + decode_digit(*(begin++), &all_digits_valid); } if (exponent != 0) { value *= exp10(double(exponent * exponent_sign)); } } } - if (!all_digits_valid) { return error_result; } + if (!all_digits_valid) { return {}; } return value * sign; } @@ -544,55 +546,6 @@ __inline__ __device__ std::pair trim_quotes(char const return {begin, end}; } -/** - * @brief Decodes a numeric value base on templated cudf type T with specified - * base. - * - * @param[in] begin Beginning of the character string - * @param[in] end End of the character string - * @param opts The global parsing behavior options - * - * @return The parsed numeric value - */ -template -__inline__ __device__ T decode_value(char const* begin, - char const* end, - parse_options_view const& opts) -{ - return cudf::io::parse_numeric(begin, end, opts); -} - -/** - * @brief Decodes a numeric value base on templated cudf type T - * - * @param[in] begin Beginning of the character string - * @param[in] end End of the character string - * @param opts The global parsing behavior options - * - * @return The parsed numeric value - */ -template () and !cudf::is_duration())> -__inline__ __device__ T decode_value(char const* begin, - char const* end, - parse_options_view const& opts) -{ - return cudf::io::parse_numeric(begin, end, opts); -} - -template ())> -__inline__ __device__ T decode_value(char const* begin, - char const* end, - parse_options_view const& opts) -{ - return to_timestamp(begin, end, opts.dayfirst); -} - -template ())> -__inline__ __device__ T decode_value(char const* begin, char const* end, parse_options_view const&) -{ - return to_duration(begin, end); -} - struct ConvertFunctor { /** * @brief Dispatch for numeric types whose values can be convertible to @@ -612,15 +565,17 @@ struct ConvertFunctor { parse_options_view const& opts, bool as_hex = false) { - static_cast(out_buffer)[row] = [as_hex, &opts, begin, end]() -> T { + auto const value = [as_hex, &opts, begin, end]() -> std::optional { // Check for user-specified true/false values auto const field_len = static_cast(end - begin); if (serialized_trie_contains(opts.trie_true, {begin, field_len})) { return 1; } if (serialized_trie_contains(opts.trie_false, {begin, field_len})) { return 0; } - return as_hex ? decode_value(begin, end, opts) : decode_value(begin, end, opts); + return as_hex ? cudf::io::parse_numeric(begin, end, opts) + : cudf::io::parse_numeric(begin, end, opts); }(); + if (value.has_value()) { static_cast(out_buffer)[row] = *value; } - return true; + return value.has_value(); } /** @@ -637,13 +592,14 @@ struct ConvertFunctor { parse_options_view const& opts, bool as_hex) { + bool all_digits_valid = true; static_cast*>(out_buffer)[row] = - [&opts, output_type, begin, end]() -> device_storage_type_t { + [&opts, &all_digits_valid, output_type, begin, end]() -> device_storage_type_t { return strings::detail::parse_decimal>( begin, end, output_type.scale()); }(); - return true; + return all_digits_valid; } /** @@ -658,15 +614,16 @@ struct ConvertFunctor { parse_options_view const& opts, bool as_hex) { - static_cast(out_buffer)[row] = [&opts, begin, end]() { + auto const value = [&opts, begin, end]() -> std::optional { // Check for user-specified true/false values auto const field_len = static_cast(end - begin); if (serialized_trie_contains(opts.trie_true, {begin, field_len})) { return true; } if (serialized_trie_contains(opts.trie_false, {begin, field_len})) { return false; } - return decode_value(begin, end, opts); + return cudf::io::parse_numeric(begin, end, opts); }(); + if (value.has_value()) { static_cast(out_buffer)[row] = *value; } - return true; + return value.has_value(); } /** @@ -682,10 +639,16 @@ struct ConvertFunctor { parse_options_view const& opts, bool as_hex) { - T const value = decode_value(begin, end, opts); - static_cast(out_buffer)[row] = value; + auto const value = [&opts, begin, end]() -> std::optional { + // Check for user-specified true/false values + auto const field_len = static_cast(end - begin); + if (serialized_trie_contains(opts.trie_true, {begin, field_len})) { return true; } + if (serialized_trie_contains(opts.trie_false, {begin, field_len})) { return false; } + return cudf::io::parse_numeric(begin, end, opts); + }(); + static_cast(out_buffer)[row] = value.value_or(std::numeric_limits::quiet_NaN()); - return !std::isnan(value); + return value.has_value() and !std::isnan(*value); } /** @@ -702,12 +665,14 @@ struct ConvertFunctor { parse_options_view const& opts, bool as_hex) { - if constexpr (cudf::is_timestamp() or cudf::is_duration()) { - static_cast(out_buffer)[row] = decode_value(begin, end, opts); - return true; + if constexpr (cudf::is_timestamp()) { + static_cast(out_buffer)[row] = to_timestamp(begin, end, opts.dayfirst); + } else if constexpr (cudf::is_duration()) { + static_cast(out_buffer)[row] = to_duration(begin, end); } else { return false; } + return true; } }; diff --git a/cpp/src/strings/json/json_path.cu b/cpp/src/strings/json/json_path.cu index 9ec1ec248e5..528c62e7875 100644 --- a/cpp/src/strings/json/json_path.cu +++ b/cpp/src/strings/json/json_path.cu @@ -570,9 +570,10 @@ class path_state : private parser { op.type = path_operator_type::CHILD; op.expected_type = OBJECT; } else { - op.type = path_operator_type::CHILD_INDEX; - op.index = cudf::io::parse_numeric( - op.name.data(), op.name.data() + op.name.size_bytes(), json_opts, -1); + op.type = path_operator_type::CHILD_INDEX; + auto const value = cudf::io::parse_numeric( + op.name.data(), op.name.data() + op.name.size_bytes(), json_opts); + op.index = value.value_or(-1); CUDF_EXPECTS(op.index >= 0, "Invalid numeric index specified in JSONPath"); op.expected_type = ARRAY; } From c953410bc8f29becce9f594c2bcdc708f4611fa1 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Sat, 22 Oct 2022 03:53:21 +0530 Subject: [PATCH 06/39] add unit tests of corner cases --- cpp/tests/io/json_test.cpp | 90 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index b8cd4622484..a6b477e2c42 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -29,6 +30,7 @@ #include #include +#include #include #include @@ -1451,4 +1453,92 @@ TEST_F(JsonReaderTest, JsonNestedDtypeSchema) int_wrapper{{1, 1, 2}, {true, true, true}}); } +TEST_P(JsonReaderParamTest, JsonDtypeParsing) +{ + auto const test_opt = GetParam(); + bool const test_experimental = (test_opt == json_test_t::json_experimental_record_orient); + // All corner cases of dtype parsing + // 0, "0", " 0", 1, "1", " 1", "a", "z", null, true, false, "null", "true", "false", nan, "nan" + // Test for dtypes: bool, int, float, str, duration, timestamp + std::string row_orient = + "[0]\n[\"0\"]\n[\" 0\"]\n[1]\n[\"1\"]\n[\" 1\"]\n[\"a\"]\n[\"z\"]\n" + "[null]\n[true]\n[false]\n[\"null\"]\n[\"true\"]\n[\"false\"]\n[nan]\n[\"nan\"]\n"; + std::string record_orient = to_records_orient({{{"0", "0"}}, + {{"0", "\"0\""}}, + {{"0", "\" 0\""}}, + {{"0", "1"}}, + {{"0", "\"1\""}}, + {{"0", "\" 1\""}}, + {{"0", "\"a\""}}, + {{"0", "\"z\""}}, + {{"0", "null"}}, + {{"0", "true"}}, + {{"0", "false"}}, + {{"0", "\"null\""}}, + {{"0", "\"true\""}}, + {{"0", "\"false\""}}, + {{"0", "nan"}}, + {{"0", "\"nan\""}}}, + "\n"); + + std::string data = (test_opt == json_test_t::json_lines_row_orient) ? row_orient : record_orient; + + auto make_validity = [](std::vector const& validity) { + return cudf::detail::make_counting_transform_iterator( + 0, [=](auto i) -> bool { return static_cast(validity[i]); }); + }; + + constexpr int iNA = -1; + constexpr double dNA = std::numeric_limits::quiet_NaN(); + constexpr bool bNA = false; + + std::vector const validity_int = {1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0}; + std::vector const validity_float = {1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0}; + std::vector const validity_bool = {1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0}; + + auto int_col = int_wrapper{{0, 0, iNA, 1, 1, iNA, iNA, iNA, iNA, 1, 0, iNA, 1, 0, iNA, iNA}, + make_validity(validity_int)}; + auto float_col = + float_wrapper{{0.0, 0.0, dNA, 1.0, 1.0, dNA, dNA, dNA, dNA, 1.0, 0.0, dNA, 1.0, 0.0, dNA, dNA}, + make_validity(validity_float)}; + auto str_col = + cudf::test::strings_column_wrapper{// clang-format off + {"0", "0", " 0", "1", "1", " 1", "a", "z", "", "true", "false", "null", "true", "false", "nan", "nan"}, + cudf::test::iterators::nulls_at(std::vector{8, 11})}; + // clang-format on + auto bool_col = bool_wrapper{ + {false, false, bNA, true, true, bNA, bNA, bNA, bNA, true, false, bNA, true, false, bNA, bNA}, + make_validity(validity_bool)}; + + // Types to test + const std::vector dtypes = { + dtype(), dtype(), dtype(), dtype()}; + const std::vector cols{cudf::column_view(int_col), + cudf::column_view(float_col), + cudf::column_view(str_col), + cudf::column_view(bool_col)}; + for (size_t col_type = 0; col_type < cols.size(); col_type++) { + std::map dtype_schema{{"0", {dtypes[col_type]}}}; + cudf::io::json_reader_options in_options = + cudf::io::json_reader_options::builder(cudf::io::source_info{data.data(), data.size()}) + .dtypes(dtype_schema) + .lines(true) + .experimental(test_experimental); + + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + + EXPECT_EQ(result.tbl->num_columns(), 1); + EXPECT_EQ(result.tbl->num_rows(), 16); + + EXPECT_EQ(result.tbl->get_column(0).type().id(), dtypes[col_type].id()); + + EXPECT_EQ(result.metadata.schema_info[0].name, "0"); + + std::cout << "Column: " << col_type << std::endl; + cudf::test::print(result.tbl->get_column(0)); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), cols[col_type]); + } +} + CUDF_TEST_PROGRAM_MAIN() From 1646bcc4e4462d123c5e461efac35be865cd9c17 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Sat, 22 Oct 2022 05:01:25 +0530 Subject: [PATCH 07/39] fix old json reader to treat "null" as string. --- cpp/src/io/json/json_gpu.cu | 5 ++++- cpp/tests/io/json_test.cpp | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/json/json_gpu.cu b/cpp/src/io/json/json_gpu.cu index dbfcca7d37a..e4fb7e2edf5 100644 --- a/cpp/src/io/json/json_gpu.cu +++ b/cpp/src/io/json/json_gpu.cu @@ -255,13 +255,16 @@ __global__ void convert_data_to_columns_kernel(parse_options_view opts, auto const desc = next_field_descriptor(current, row_data_range.second, opts, input_field_index, col_map); auto const value_len = static_cast(std::max(desc.value_end - desc.value_begin, 0L)); + const bool is_quoted = + *(desc.value_begin - 1) == opts.quotechar && *desc.value_end == opts.quotechar; current = desc.value_end + 1; using string_index_pair = thrust::pair; // Empty fields are not legal values - if (!serialized_trie_contains(opts.trie_na, {desc.value_begin, value_len})) { + if (!serialized_trie_contains(opts.trie_na, + {desc.value_begin - is_quoted, value_len + is_quoted * 2})) { // Type dispatcher does not handle strings if (column_types[desc.column].id() == type_id::STRING) { auto str_list = static_cast(output_columns[desc.column]); diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index a6b477e2c42..c7cd8bef9e3 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -1504,7 +1504,7 @@ TEST_P(JsonReaderParamTest, JsonDtypeParsing) auto str_col = cudf::test::strings_column_wrapper{// clang-format off {"0", "0", " 0", "1", "1", " 1", "a", "z", "", "true", "false", "null", "true", "false", "nan", "nan"}, - cudf::test::iterators::nulls_at(std::vector{8, 11})}; + cudf::test::iterators::nulls_at(std::vector{8})}; // clang-format on auto bool_col = bool_wrapper{ {false, false, bNA, true, true, bNA, bNA, bNA, bNA, true, false, bNA, true, false, bNA, bNA}, From f0c468bd071e878626bfbb069e67eb08b0bf9995 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Sat, 22 Oct 2022 05:01:55 +0530 Subject: [PATCH 08/39] cleanup decode_digit --- cpp/src/io/utilities/parsing_utils.cuh | 26 ++++---------------------- 1 file changed, 4 insertions(+), 22 deletions(-) diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh index b37ab9d2b55..daf73118fc1 100644 --- a/cpp/src/io/utilities/parsing_utils.cuh +++ b/cpp/src/io/utilities/parsing_utils.cuh @@ -118,7 +118,8 @@ struct parse_options { /** * @brief Returns the numeric value of an ASCII/UTF-8 character. Specialization - * for integral types. Handles hexadecimal digits, both uppercase and lowercase. + * for integral types. Handles hexadecimal digits, both uppercase and lowercase + * for integral types and only decimal digits for floating point types. * If the character is not a valid numeric digit then `0` is returned and * valid_flag is set to false. * @@ -127,11 +128,11 @@ struct parse_options { * * @return uint8_t Numeric value of the character, or `0` */ -template )> +template constexpr uint8_t decode_digit(char c, bool* valid_flag) { if (c >= '0' && c <= '9') return c - '0'; - if constexpr (as_hex) { + if constexpr (as_hex and std::is_integral_v) { if (c >= 'a' && c <= 'f') return c - 'a' + 10; if (c >= 'A' && c <= 'F') return c - 'A' + 10; } @@ -140,25 +141,6 @@ constexpr uint8_t decode_digit(char c, bool* valid_flag) return 0; } -/** - * @brief Returns the numeric value of an ASCII/UTF-8 character. Specialization - * for non-integral types. Handles only decimal digits. If the character is not - * a valid numeric digit then `0` is returned and valid_flag is set to false. - * - * @param c ASCII or UTF-8 character - * @param valid_flag Set to false if input is not valid. Unchanged otherwise. - * - * @return uint8_t Numeric value of the character, or `0` - */ -template )> -constexpr uint8_t decode_digit(char c, bool* valid_flag) -{ - if (c >= '0' && c <= '9') return c - '0'; - - *valid_flag = false; - return 0; -} - // Converts character to lowercase. constexpr char to_lower(char const c) { return c >= 'A' && c <= 'Z' ? c + ('a' - 'A') : c; } From cad413cf138caa36cfb4cafbf6fcb3f61f7faa72 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Sat, 22 Oct 2022 05:11:19 +0530 Subject: [PATCH 09/39] fix doc --- cpp/src/io/utilities/parsing_utils.cuh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh index daf73118fc1..dabb39f5e3a 100644 --- a/cpp/src/io/utilities/parsing_utils.cuh +++ b/cpp/src/io/utilities/parsing_utils.cuh @@ -117,8 +117,8 @@ struct parse_options { }; /** - * @brief Returns the numeric value of an ASCII/UTF-8 character. Specialization - * for integral types. Handles hexadecimal digits, both uppercase and lowercase + * @brief Returns the numeric value of an ASCII/UTF-8 character. + * Handles hexadecimal digits, both uppercase and lowercase * for integral types and only decimal digits for floating point types. * If the character is not a valid numeric digit then `0` is returned and * valid_flag is set to false. From bbb434a619b3170b0f38ddbfa1e0b697f5e1fa28 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Sat, 22 Oct 2022 05:13:53 +0530 Subject: [PATCH 10/39] add TODO invalid input for decimal, duration, timestamp --- cpp/src/io/utilities/parsing_utils.cuh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh index dabb39f5e3a..414375147f2 100644 --- a/cpp/src/io/utilities/parsing_utils.cuh +++ b/cpp/src/io/utilities/parsing_utils.cuh @@ -574,6 +574,7 @@ struct ConvertFunctor { parse_options_view const& opts, bool as_hex) { + // TODO what's invalid input bool all_digits_valid = true; static_cast*>(out_buffer)[row] = [&opts, &all_digits_valid, output_type, begin, end]() -> device_storage_type_t { @@ -647,6 +648,7 @@ struct ConvertFunctor { parse_options_view const& opts, bool as_hex) { + // TODO what's invalid input if constexpr (cudf::is_timestamp()) { static_cast(out_buffer)[row] = to_timestamp(begin, end, opts.dayfirst); } else if constexpr (cudf::is_duration()) { From 0910f2e7b17db2bfba7c821481660507474998df Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Wed, 26 Oct 2022 20:43:45 +0530 Subject: [PATCH 11/39] fix bug in csv_reader_options construction in cython --- python/cudf/cudf/_lib/csv.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx index f1a75baa951..52ef8d6c92d 100644 --- a/python/cudf/cudf/_lib/csv.pyx +++ b/python/cudf/cudf/_lib/csv.pyx @@ -304,7 +304,7 @@ cdef csv_reader_options make_csv_reader_options( if false_values is not None: c_false_values.reserve(len(false_values)) - for fv in c_false_values: + for fv in false_values: c_false_values.push_back(fv.encode()) csv_reader_options_c.set_false_values(c_false_values) From 31636ea0a80e55b71465fc3d160aad7744fbdabe Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Wed, 26 Oct 2022 20:55:41 +0530 Subject: [PATCH 12/39] add pytest for breaking behaviour in read_csv --- python/cudf/cudf/tests/test_csv.py | 37 ++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py index b91893d8991..580b5a2b2df 100644 --- a/python/cudf/cudf/tests/test_csv.py +++ b/python/cudf/cudf/tests/test_csv.py @@ -766,6 +766,43 @@ def test_csv_reader_bools(tmpdir, names, dtypes, data, trues, falses): assert_eq(df_out, out) +def test_csv_reader_bools_NA(): + names = ["text", "int"] + dtypes = ["str", "int"] + trues = ["foo"] + falses = ["bar"] + lines = [ + ",".join(names), + "true,true", + "false,false", + "foo,foo", + "bar,bar", + "qux,qux", + ] + + buffer = "\n".join(lines) + + df = read_csv( + StringIO(buffer), + names=names, + dtype=dtypes, + skiprows=1, + true_values=trues, + false_values=falses, + ) + assert len(df.columns) == 2 + assert df["text"].dtype == np.dtype("object") + assert df["int"].dtype == np.dtype("int64") + expected = pd.DataFrame( + { + "text": ["true", "false", "foo", "bar", "qux"], + "int": [1.0, 0.0, 1.0, 0.0, np.nan], + } + ) + # breaking behaviour is np.nan for qux + assert_eq(df, expected) + + def test_csv_quotednumbers(tmpdir): fname = tmpdir.mkdir("gdf_csv").join("tmp_csvreader_file12.csv") From d2757dc20343f25035da4c91653afc0ea8c27719 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Wed, 26 Oct 2022 20:43:45 +0530 Subject: [PATCH 13/39] fix bug in csv_reader_options construction in cython --- python/cudf/cudf/_lib/csv.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx index f1a75baa951..52ef8d6c92d 100644 --- a/python/cudf/cudf/_lib/csv.pyx +++ b/python/cudf/cudf/_lib/csv.pyx @@ -304,7 +304,7 @@ cdef csv_reader_options make_csv_reader_options( if false_values is not None: c_false_values.reserve(len(false_values)) - for fv in c_false_values: + for fv in false_values: c_false_values.push_back(fv.encode()) csv_reader_options_c.set_false_values(c_false_values) From e26b3d8ebbb554d1fe8b02aa4041e7d7f9699b15 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Wed, 26 Oct 2022 20:55:41 +0530 Subject: [PATCH 14/39] add pytest for breaking behaviour in read_csv --- python/cudf/cudf/tests/test_csv.py | 37 ++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py index b91893d8991..580b5a2b2df 100644 --- a/python/cudf/cudf/tests/test_csv.py +++ b/python/cudf/cudf/tests/test_csv.py @@ -766,6 +766,43 @@ def test_csv_reader_bools(tmpdir, names, dtypes, data, trues, falses): assert_eq(df_out, out) +def test_csv_reader_bools_NA(): + names = ["text", "int"] + dtypes = ["str", "int"] + trues = ["foo"] + falses = ["bar"] + lines = [ + ",".join(names), + "true,true", + "false,false", + "foo,foo", + "bar,bar", + "qux,qux", + ] + + buffer = "\n".join(lines) + + df = read_csv( + StringIO(buffer), + names=names, + dtype=dtypes, + skiprows=1, + true_values=trues, + false_values=falses, + ) + assert len(df.columns) == 2 + assert df["text"].dtype == np.dtype("object") + assert df["int"].dtype == np.dtype("int64") + expected = pd.DataFrame( + { + "text": ["true", "false", "foo", "bar", "qux"], + "int": [1.0, 0.0, 1.0, 0.0, np.nan], + } + ) + # breaking behaviour is np.nan for qux + assert_eq(df, expected) + + def test_csv_quotednumbers(tmpdir): fname = tmpdir.mkdir("gdf_csv").join("tmp_csvreader_file12.csv") From e119facfadf2481f40302e05981ec7e9b4489113 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Fri, 28 Oct 2022 15:47:00 +0530 Subject: [PATCH 15/39] parsing error in numeric types should make that element null --- cpp/src/io/utilities/parsing_utils.cuh | 150 +++++++++---------------- cpp/src/strings/json/json_path.cu | 7 +- cpp/tests/io/json_test.cpp | 90 +++++++++++++++ python/cudf/cudf/tests/test_json.py | 52 +++++++++ 4 files changed, 201 insertions(+), 98 deletions(-) diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh index 388c9b28001..414375147f2 100644 --- a/cpp/src/io/utilities/parsing_utils.cuh +++ b/cpp/src/io/utilities/parsing_utils.cuh @@ -117,8 +117,9 @@ struct parse_options { }; /** - * @brief Returns the numeric value of an ASCII/UTF-8 character. Specialization - * for integral types. Handles hexadecimal digits, both uppercase and lowercase. + * @brief Returns the numeric value of an ASCII/UTF-8 character. + * Handles hexadecimal digits, both uppercase and lowercase + * for integral types and only decimal digits for floating point types. * If the character is not a valid numeric digit then `0` is returned and * valid_flag is set to false. * @@ -127,31 +128,14 @@ struct parse_options { * * @return uint8_t Numeric value of the character, or `0` */ -template )> -constexpr uint8_t decode_digit(char c, bool* valid_flag) -{ - if (c >= '0' && c <= '9') return c - '0'; - if (c >= 'a' && c <= 'f') return c - 'a' + 10; - if (c >= 'A' && c <= 'F') return c - 'A' + 10; - - *valid_flag = false; - return 0; -} - -/** - * @brief Returns the numeric value of an ASCII/UTF-8 character. Specialization - * for non-integral types. Handles only decimal digits. If the character is not - * a valid numeric digit then `0` is returned and valid_flag is set to false. - * - * @param c ASCII or UTF-8 character - * @param valid_flag Set to false if input is not valid. Unchanged otherwise. - * - * @return uint8_t Numeric value of the character, or `0` - */ -template )> +template constexpr uint8_t decode_digit(char c, bool* valid_flag) { if (c >= '0' && c <= '9') return c - '0'; + if constexpr (as_hex and std::is_integral_v) { + if (c >= 'a' && c <= 'f') return c - 'a' + 10; + if (c >= 'A' && c <= 'F') return c - 'A' + 10; + } *valid_flag = false; return 0; @@ -194,13 +178,13 @@ constexpr bool is_infinity(char const* begin, char const* end) * @return The parsed and converted value */ template -constexpr T parse_numeric(char const* begin, - char const* end, - parse_options_view const& opts, - T error_result = std::numeric_limits::quiet_NaN()) +__host__ __device__ std::optional parse_numeric(char const* begin, + char const* end, + parse_options_view const& opts) { T value{}; bool all_digits_valid = true; + constexpr bool as_hex = (base == 16); // Handle negative values if necessary int32_t sign = (*begin == '-') ? -1 : 1; @@ -223,7 +207,7 @@ constexpr T parse_numeric(char const* begin, } else if (base == 10 && (*begin == 'e' || *begin == 'E')) { break; } else if (*begin != opts.thousands && *begin != '+') { - value = (value * base) + decode_digit(*begin, &all_digits_valid); + value = (value * base) + decode_digit(*begin, &all_digits_valid); } ++begin; } @@ -237,7 +221,7 @@ constexpr T parse_numeric(char const* begin, break; } else if (*begin != opts.thousands && *begin != '+') { divisor /= base; - value += decode_digit(*begin, &all_digits_valid) * divisor; + value += decode_digit(*begin, &all_digits_valid) * divisor; } ++begin; } @@ -248,12 +232,12 @@ constexpr T parse_numeric(char const* begin, if (*begin == '-' || *begin == '+') { ++begin; } int32_t exponent = 0; while (begin < end) { - exponent = (exponent * 10) + decode_digit(*(begin++), &all_digits_valid); + exponent = (exponent * 10) + decode_digit(*(begin++), &all_digits_valid); } if (exponent != 0) { value *= exp10(double(exponent * exponent_sign)); } } } - if (!all_digits_valid) { return error_result; } + if (!all_digits_valid) { return {}; } return value * sign; } @@ -524,62 +508,24 @@ __inline__ __device__ std::pair trim_whitespaces_quote } /** - * @brief Decodes a numeric value base on templated cudf type T with specified - * base. - * - * @param[in] begin Beginning of the character string - * @param[in] end End of the character string - * @param opts The global parsing behavior options - * - * @return The parsed numeric value - */ -template -__inline__ __device__ T decode_value(char const* begin, - char const* end, - parse_options_view const& opts) -{ - return cudf::io::parse_numeric(begin, end, opts); -} - -/** - * @brief Decodes a numeric value base on templated cudf type T + * @brief Adjusts the range to ignore starting/trailing quotation characters. * - * @param[in] begin Beginning of the character string - * @param[in] end End of the character string - * @param opts The global parsing behavior options + * @param[in] begin Pointer to the first character in the parsing range + * @param[in] end pointer to the first character after the parsing range + * @param[in] quotechar The character used to denote quotes; '\0' if none * - * @return The parsed numeric value + * @return Trimmed range */ -template () and !cudf::is_duration())> -__inline__ __device__ T decode_value(char const* begin, - char const* end, - parse_options_view const& opts) -{ - return cudf::io::parse_numeric(begin, end, opts); -} - -template ())> -__inline__ __device__ T decode_value(char const* begin, - char const* end, - parse_options_view const& opts) +__inline__ __device__ std::pair trim_quotes(char const* begin, + char const* end, + char quotechar = '\0') { - // If this is a string value, remove quotes - if ((thrust::distance(begin, end) >= 2 && *begin == '\"' && *thrust::prev(end) == '\"')) { + if ((thrust::distance(begin, end) >= 2 && *begin == quotechar && + *thrust::prev(end) == quotechar)) { thrust::advance(begin, 1); thrust::advance(end, -1); } - return to_timestamp(begin, end, opts.dayfirst); -} - -template ())> -__inline__ __device__ T decode_value(char const* begin, char const* end, parse_options_view const&) -{ - // If this is a string value, remove quotes - if ((thrust::distance(begin, end) >= 2 && *begin == '\"' && *thrust::prev(end) == '\"')) { - thrust::advance(begin, 1); - thrust::advance(end, -1); - } - return to_duration(begin, end); + return {begin, end}; } struct ConvertFunctor { @@ -601,15 +547,17 @@ struct ConvertFunctor { parse_options_view const& opts, bool as_hex = false) { - static_cast(out_buffer)[row] = [as_hex, &opts, begin, end]() -> T { + auto const value = [as_hex, &opts, begin, end]() -> std::optional { // Check for user-specified true/false values auto const field_len = static_cast(end - begin); if (serialized_trie_contains(opts.trie_true, {begin, field_len})) { return 1; } if (serialized_trie_contains(opts.trie_false, {begin, field_len})) { return 0; } - return as_hex ? decode_value(begin, end, opts) : decode_value(begin, end, opts); + return as_hex ? cudf::io::parse_numeric(begin, end, opts) + : cudf::io::parse_numeric(begin, end, opts); }(); + if (value.has_value()) { static_cast(out_buffer)[row] = *value; } - return true; + return value.has_value(); } /** @@ -626,13 +574,15 @@ struct ConvertFunctor { parse_options_view const& opts, bool as_hex) { + // TODO what's invalid input + bool all_digits_valid = true; static_cast*>(out_buffer)[row] = - [&opts, output_type, begin, end]() -> device_storage_type_t { + [&opts, &all_digits_valid, output_type, begin, end]() -> device_storage_type_t { return strings::detail::parse_decimal>( begin, end, output_type.scale()); }(); - return true; + return all_digits_valid; } /** @@ -647,15 +597,16 @@ struct ConvertFunctor { parse_options_view const& opts, bool as_hex) { - static_cast(out_buffer)[row] = [&opts, begin, end]() { + auto const value = [&opts, begin, end]() -> std::optional { // Check for user-specified true/false values auto const field_len = static_cast(end - begin); if (serialized_trie_contains(opts.trie_true, {begin, field_len})) { return true; } if (serialized_trie_contains(opts.trie_false, {begin, field_len})) { return false; } - return decode_value(begin, end, opts); + return cudf::io::parse_numeric(begin, end, opts); }(); + if (value.has_value()) { static_cast(out_buffer)[row] = *value; } - return true; + return value.has_value(); } /** @@ -671,10 +622,16 @@ struct ConvertFunctor { parse_options_view const& opts, bool as_hex) { - T const value = decode_value(begin, end, opts); - static_cast(out_buffer)[row] = value; + auto const value = [&opts, begin, end]() -> std::optional { + // Check for user-specified true/false values + auto const field_len = static_cast(end - begin); + if (serialized_trie_contains(opts.trie_true, {begin, field_len})) { return true; } + if (serialized_trie_contains(opts.trie_false, {begin, field_len})) { return false; } + return cudf::io::parse_numeric(begin, end, opts); + }(); + static_cast(out_buffer)[row] = value.value_or(std::numeric_limits::quiet_NaN()); - return !std::isnan(value); + return value.has_value() and !std::isnan(*value); } /** @@ -691,12 +648,15 @@ struct ConvertFunctor { parse_options_view const& opts, bool as_hex) { - if constexpr (cudf::is_timestamp() or cudf::is_duration()) { - static_cast(out_buffer)[row] = decode_value(begin, end, opts); - return true; + // TODO what's invalid input + if constexpr (cudf::is_timestamp()) { + static_cast(out_buffer)[row] = to_timestamp(begin, end, opts.dayfirst); + } else if constexpr (cudf::is_duration()) { + static_cast(out_buffer)[row] = to_duration(begin, end); } else { return false; } + return true; } }; diff --git a/cpp/src/strings/json/json_path.cu b/cpp/src/strings/json/json_path.cu index 303c35ea7fb..5ce00eeee4a 100644 --- a/cpp/src/strings/json/json_path.cu +++ b/cpp/src/strings/json/json_path.cu @@ -570,9 +570,10 @@ class path_state : private parser { op.type = path_operator_type::CHILD; op.expected_type = OBJECT; } else { - op.type = path_operator_type::CHILD_INDEX; - op.index = cudf::io::parse_numeric( - op.name.data(), op.name.data() + op.name.size_bytes(), json_opts, -1); + op.type = path_operator_type::CHILD_INDEX; + auto const value = cudf::io::parse_numeric( + op.name.data(), op.name.data() + op.name.size_bytes(), json_opts); + op.index = value.value_or(-1); CUDF_EXPECTS(op.index >= 0, "Invalid numeric index specified in JSONPath"); op.expected_type = ARRAY; } diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index b8cd4622484..c7cd8bef9e3 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -29,6 +30,7 @@ #include #include +#include #include #include @@ -1451,4 +1453,92 @@ TEST_F(JsonReaderTest, JsonNestedDtypeSchema) int_wrapper{{1, 1, 2}, {true, true, true}}); } +TEST_P(JsonReaderParamTest, JsonDtypeParsing) +{ + auto const test_opt = GetParam(); + bool const test_experimental = (test_opt == json_test_t::json_experimental_record_orient); + // All corner cases of dtype parsing + // 0, "0", " 0", 1, "1", " 1", "a", "z", null, true, false, "null", "true", "false", nan, "nan" + // Test for dtypes: bool, int, float, str, duration, timestamp + std::string row_orient = + "[0]\n[\"0\"]\n[\" 0\"]\n[1]\n[\"1\"]\n[\" 1\"]\n[\"a\"]\n[\"z\"]\n" + "[null]\n[true]\n[false]\n[\"null\"]\n[\"true\"]\n[\"false\"]\n[nan]\n[\"nan\"]\n"; + std::string record_orient = to_records_orient({{{"0", "0"}}, + {{"0", "\"0\""}}, + {{"0", "\" 0\""}}, + {{"0", "1"}}, + {{"0", "\"1\""}}, + {{"0", "\" 1\""}}, + {{"0", "\"a\""}}, + {{"0", "\"z\""}}, + {{"0", "null"}}, + {{"0", "true"}}, + {{"0", "false"}}, + {{"0", "\"null\""}}, + {{"0", "\"true\""}}, + {{"0", "\"false\""}}, + {{"0", "nan"}}, + {{"0", "\"nan\""}}}, + "\n"); + + std::string data = (test_opt == json_test_t::json_lines_row_orient) ? row_orient : record_orient; + + auto make_validity = [](std::vector const& validity) { + return cudf::detail::make_counting_transform_iterator( + 0, [=](auto i) -> bool { return static_cast(validity[i]); }); + }; + + constexpr int iNA = -1; + constexpr double dNA = std::numeric_limits::quiet_NaN(); + constexpr bool bNA = false; + + std::vector const validity_int = {1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0}; + std::vector const validity_float = {1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0}; + std::vector const validity_bool = {1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0}; + + auto int_col = int_wrapper{{0, 0, iNA, 1, 1, iNA, iNA, iNA, iNA, 1, 0, iNA, 1, 0, iNA, iNA}, + make_validity(validity_int)}; + auto float_col = + float_wrapper{{0.0, 0.0, dNA, 1.0, 1.0, dNA, dNA, dNA, dNA, 1.0, 0.0, dNA, 1.0, 0.0, dNA, dNA}, + make_validity(validity_float)}; + auto str_col = + cudf::test::strings_column_wrapper{// clang-format off + {"0", "0", " 0", "1", "1", " 1", "a", "z", "", "true", "false", "null", "true", "false", "nan", "nan"}, + cudf::test::iterators::nulls_at(std::vector{8})}; + // clang-format on + auto bool_col = bool_wrapper{ + {false, false, bNA, true, true, bNA, bNA, bNA, bNA, true, false, bNA, true, false, bNA, bNA}, + make_validity(validity_bool)}; + + // Types to test + const std::vector dtypes = { + dtype(), dtype(), dtype(), dtype()}; + const std::vector cols{cudf::column_view(int_col), + cudf::column_view(float_col), + cudf::column_view(str_col), + cudf::column_view(bool_col)}; + for (size_t col_type = 0; col_type < cols.size(); col_type++) { + std::map dtype_schema{{"0", {dtypes[col_type]}}}; + cudf::io::json_reader_options in_options = + cudf::io::json_reader_options::builder(cudf::io::source_info{data.data(), data.size()}) + .dtypes(dtype_schema) + .lines(true) + .experimental(test_experimental); + + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + + EXPECT_EQ(result.tbl->num_columns(), 1); + EXPECT_EQ(result.tbl->num_rows(), 16); + + EXPECT_EQ(result.tbl->get_column(0).type().id(), dtypes[col_type].id()); + + EXPECT_EQ(result.metadata.schema_info[0].name, "0"); + + std::cout << "Column: " << col_type << std::endl; + cudf::test::print(result.tbl->get_column(0)); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), cols[col_type]); + } +} + CUDF_TEST_PROGRAM_MAIN() diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py index fb2c24b3757..8d06d3e55f3 100644 --- a/python/cudf/cudf/tests/test_json.py +++ b/python/cudf/cudf/tests/test_json.py @@ -687,6 +687,58 @@ def test_json_types_data(): assert df.to_arrow().equals(pa_table_pdf) +@pytest.mark.parametrize( + "col_type,json_str", + [ + # without quotes + ("int", '[{"k": 1}, {"k": 2}, {"k": 3}, {"k": 4}]'), + # with quotes + ("int", '[{"k": "1"}, {"k": "2"}]'), + # with quotes, mixed + ("int", '[{"k": "1"}, {"k": "2"}, {"k": 3}, {"k": 4}]'), + # with quotes, null, mixed + ("int", '[{"k": "1"}, {"k": "2"}, {"k": null}, {"k": 4}]'), + # without quotes, null + ("int", '[{"k": 1}, {"k": 2}, {"k": null}, {"k": 4}]'), + # without quotes + ("float", '[{"k": 1}, {"k": 2}, {"k": 3}, {"k": 4}]'), + # with quotes + ("float", '[{"k": "1"}, {"k": "2"}]'), + # with quotes, mixed + ("float", '[{"k": "1"}, {"k": "2"}, {"k": 3}, {"k": 4}]'), + # with quotes, null, mixed + ("float", '[{"k": "1"}, {"k": "2"}, {"k": null}, {"k": 4}]'), + # with quotes, NAN + ("float", '[{"k": "1"}, {"k": "2"}, {"k": NaN}, {"k": "4"}]'), + # without quotes + ("str", '[{"k": 1}, {"k": 2}, {"k": 3}, {"k": 4}]'), + # with quotes + ("str", '[{"k": "1"}, {"k": "2"}]'), + # with quotes, mixed + ("str", '[{"k": "1"}, {"k": "2"}, {"k": 3}, {"k": 4}]'), + # with quotes, null, mixed + ("str", '[{"k": "1"}, {"k": "2"}, {"k": null}, {"k": 4}]'), + # without quotes, null + ("str", '[{"k": 1}, {"k": 2}, {"k": null}, {"k": 4}]'), + ], +) +def test_json_quoted_values_with_schema(col_type, json_str): + experimental_df = cudf.read_json( + StringIO(json_str), + engine="cudf_experimental", + orient="records", + dtype={"k": col_type}, + ) + cudf_df = cudf.read_json( + StringIO(json_str.replace(",", "\n")[1:-1]), + engine="cudf", + orient="records", + lines=True, + dtype={"k": col_type}, + ) + assert_eq(cudf_df, experimental_df) + + @pytest.mark.parametrize( "keep_quotes,result", [ From 3f8eb525aa7dd0ab292deb515ab580cb08f3d94e Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Fri, 28 Oct 2022 16:42:33 +0530 Subject: [PATCH 16/39] revert parsing error changes --- cpp/src/io/utilities/parsing_utils.cuh | 155 +++++++++++++++++-------- cpp/src/strings/json/json_path.cu | 7 +- cpp/tests/io/json_test.cpp | 90 -------------- python/cudf/cudf/_lib/csv.pyx | 2 +- python/cudf/cudf/tests/test_csv.py | 37 ------ python/cudf/cudf/tests/test_json.py | 52 --------- 6 files changed, 112 insertions(+), 231 deletions(-) diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh index 414375147f2..354abfa3985 100644 --- a/cpp/src/io/utilities/parsing_utils.cuh +++ b/cpp/src/io/utilities/parsing_utils.cuh @@ -117,9 +117,8 @@ struct parse_options { }; /** - * @brief Returns the numeric value of an ASCII/UTF-8 character. - * Handles hexadecimal digits, both uppercase and lowercase - * for integral types and only decimal digits for floating point types. + * @brief Returns the numeric value of an ASCII/UTF-8 character. Specialization + * for integral types. Handles hexadecimal digits, both uppercase and lowercase. * If the character is not a valid numeric digit then `0` is returned and * valid_flag is set to false. * @@ -128,14 +127,31 @@ struct parse_options { * * @return uint8_t Numeric value of the character, or `0` */ -template +template )> +constexpr uint8_t decode_digit(char c, bool* valid_flag) +{ + if (c >= '0' && c <= '9') return c - '0'; + if (c >= 'a' && c <= 'f') return c - 'a' + 10; + if (c >= 'A' && c <= 'F') return c - 'A' + 10; + + *valid_flag = false; + return 0; +} + +/** + * @brief Returns the numeric value of an ASCII/UTF-8 character. Specialization + * for non-integral types. Handles only decimal digits. If the character is not + * a valid numeric digit then `0` is returned and valid_flag is set to false. + * + * @param c ASCII or UTF-8 character + * @param valid_flag Set to false if input is not valid. Unchanged otherwise. + * + * @return uint8_t Numeric value of the character, or `0` + */ +template )> constexpr uint8_t decode_digit(char c, bool* valid_flag) { if (c >= '0' && c <= '9') return c - '0'; - if constexpr (as_hex and std::is_integral_v) { - if (c >= 'a' && c <= 'f') return c - 'a' + 10; - if (c >= 'A' && c <= 'F') return c - 'A' + 10; - } *valid_flag = false; return 0; @@ -178,13 +194,13 @@ constexpr bool is_infinity(char const* begin, char const* end) * @return The parsed and converted value */ template -__host__ __device__ std::optional parse_numeric(char const* begin, - char const* end, - parse_options_view const& opts) +constexpr T parse_numeric(char const* begin, + char const* end, + parse_options_view const& opts, + T error_result = std::numeric_limits::quiet_NaN()) { T value{}; bool all_digits_valid = true; - constexpr bool as_hex = (base == 16); // Handle negative values if necessary int32_t sign = (*begin == '-') ? -1 : 1; @@ -207,7 +223,7 @@ __host__ __device__ std::optional parse_numeric(char const* begin, } else if (base == 10 && (*begin == 'e' || *begin == 'E')) { break; } else if (*begin != opts.thousands && *begin != '+') { - value = (value * base) + decode_digit(*begin, &all_digits_valid); + value = (value * base) + decode_digit(*begin, &all_digits_valid); } ++begin; } @@ -221,7 +237,7 @@ __host__ __device__ std::optional parse_numeric(char const* begin, break; } else if (*begin != opts.thousands && *begin != '+') { divisor /= base; - value += decode_digit(*begin, &all_digits_valid) * divisor; + value += decode_digit(*begin, &all_digits_valid) * divisor; } ++begin; } @@ -232,12 +248,12 @@ __host__ __device__ std::optional parse_numeric(char const* begin, if (*begin == '-' || *begin == '+') { ++begin; } int32_t exponent = 0; while (begin < end) { - exponent = (exponent * 10) + decode_digit(*(begin++), &all_digits_valid); + exponent = (exponent * 10) + decode_digit(*(begin++), &all_digits_valid); } if (exponent != 0) { value *= exp10(double(exponent * exponent_sign)); } } } - if (!all_digits_valid) { return {}; } + if (!all_digits_valid) { return error_result; } return value * sign; } @@ -510,9 +526,9 @@ __inline__ __device__ std::pair trim_whitespaces_quote /** * @brief Adjusts the range to ignore starting/trailing quotation characters. * - * @param[in] begin Pointer to the first character in the parsing range - * @param[in] end pointer to the first character after the parsing range - * @param[in] quotechar The character used to denote quotes; '\0' if none + * @param begin Pointer to the first character in the parsing range + * @param end pointer to the first character after the parsing range + * @param quotechar The character used to denote quotes; '\0' if none * * @return Trimmed range */ @@ -528,6 +544,65 @@ __inline__ __device__ std::pair trim_quotes(char const return {begin, end}; } +/** + * @brief Decodes a numeric value base on templated cudf type T with specified + * base. + * + * @param[in] begin Beginning of the character string + * @param[in] end End of the character string + * @param opts The global parsing behavior options + * + * @return The parsed numeric value + */ +template +__inline__ __device__ T decode_value(char const* begin, + char const* end, + parse_options_view const& opts) +{ + return cudf::io::parse_numeric(begin, end, opts); +} + +/** + * @brief Decodes a numeric value base on templated cudf type T + * + * @param[in] begin Beginning of the character string + * @param[in] end End of the character string + * @param opts The global parsing behavior options + * + * @return The parsed numeric value + */ +template () and !cudf::is_duration())> +__inline__ __device__ T decode_value(char const* begin, + char const* end, + parse_options_view const& opts) +{ + return cudf::io::parse_numeric(begin, end, opts); +} + +template ())> +__inline__ __device__ T decode_value(char const* begin, + char const* end, + parse_options_view const& opts) +{ + // If this is a string value, remove quotes + if ((thrust::distance(begin, end) >= 2 && *begin == '\"' && *thrust::prev(end) == '\"')) { + thrust::advance(begin, 1); + thrust::advance(end, -1); + } + return to_timestamp(begin, end, opts.dayfirst); +} + +template ())> +__inline__ __device__ T decode_value(char const* begin, char const* end, parse_options_view const&) +{ + // If this is a string value, remove quotes + if ((thrust::distance(begin, end) >= 2 && *begin == '\"' && *thrust::prev(end) == '\"')) { + thrust::advance(begin, 1); + thrust::advance(end, -1); + } + return to_duration(begin, end); +} + struct ConvertFunctor { /** * @brief Dispatch for numeric types whose values can be convertible to @@ -547,17 +622,15 @@ struct ConvertFunctor { parse_options_view const& opts, bool as_hex = false) { - auto const value = [as_hex, &opts, begin, end]() -> std::optional { + static_cast(out_buffer)[row] = [as_hex, &opts, begin, end]() -> T { // Check for user-specified true/false values auto const field_len = static_cast(end - begin); if (serialized_trie_contains(opts.trie_true, {begin, field_len})) { return 1; } if (serialized_trie_contains(opts.trie_false, {begin, field_len})) { return 0; } - return as_hex ? cudf::io::parse_numeric(begin, end, opts) - : cudf::io::parse_numeric(begin, end, opts); + return as_hex ? decode_value(begin, end, opts) : decode_value(begin, end, opts); }(); - if (value.has_value()) { static_cast(out_buffer)[row] = *value; } - return value.has_value(); + return true; } /** @@ -574,15 +647,13 @@ struct ConvertFunctor { parse_options_view const& opts, bool as_hex) { - // TODO what's invalid input - bool all_digits_valid = true; static_cast*>(out_buffer)[row] = - [&opts, &all_digits_valid, output_type, begin, end]() -> device_storage_type_t { + [&opts, output_type, begin, end]() -> device_storage_type_t { return strings::detail::parse_decimal>( begin, end, output_type.scale()); }(); - return all_digits_valid; + return true; } /** @@ -597,16 +668,15 @@ struct ConvertFunctor { parse_options_view const& opts, bool as_hex) { - auto const value = [&opts, begin, end]() -> std::optional { + static_cast(out_buffer)[row] = [&opts, begin, end]() { // Check for user-specified true/false values auto const field_len = static_cast(end - begin); if (serialized_trie_contains(opts.trie_true, {begin, field_len})) { return true; } if (serialized_trie_contains(opts.trie_false, {begin, field_len})) { return false; } - return cudf::io::parse_numeric(begin, end, opts); + return decode_value(begin, end, opts); }(); - if (value.has_value()) { static_cast(out_buffer)[row] = *value; } - return value.has_value(); + return true; } /** @@ -622,16 +692,10 @@ struct ConvertFunctor { parse_options_view const& opts, bool as_hex) { - auto const value = [&opts, begin, end]() -> std::optional { - // Check for user-specified true/false values - auto const field_len = static_cast(end - begin); - if (serialized_trie_contains(opts.trie_true, {begin, field_len})) { return true; } - if (serialized_trie_contains(opts.trie_false, {begin, field_len})) { return false; } - return cudf::io::parse_numeric(begin, end, opts); - }(); - static_cast(out_buffer)[row] = value.value_or(std::numeric_limits::quiet_NaN()); + T const value = decode_value(begin, end, opts); + static_cast(out_buffer)[row] = value; - return value.has_value() and !std::isnan(*value); + return !std::isnan(value); } /** @@ -648,15 +712,12 @@ struct ConvertFunctor { parse_options_view const& opts, bool as_hex) { - // TODO what's invalid input - if constexpr (cudf::is_timestamp()) { - static_cast(out_buffer)[row] = to_timestamp(begin, end, opts.dayfirst); - } else if constexpr (cudf::is_duration()) { - static_cast(out_buffer)[row] = to_duration(begin, end); + if constexpr (cudf::is_timestamp() or cudf::is_duration()) { + static_cast(out_buffer)[row] = decode_value(begin, end, opts); + return true; } else { return false; } - return true; } }; diff --git a/cpp/src/strings/json/json_path.cu b/cpp/src/strings/json/json_path.cu index 5ce00eeee4a..303c35ea7fb 100644 --- a/cpp/src/strings/json/json_path.cu +++ b/cpp/src/strings/json/json_path.cu @@ -570,10 +570,9 @@ class path_state : private parser { op.type = path_operator_type::CHILD; op.expected_type = OBJECT; } else { - op.type = path_operator_type::CHILD_INDEX; - auto const value = cudf::io::parse_numeric( - op.name.data(), op.name.data() + op.name.size_bytes(), json_opts); - op.index = value.value_or(-1); + op.type = path_operator_type::CHILD_INDEX; + op.index = cudf::io::parse_numeric( + op.name.data(), op.name.data() + op.name.size_bytes(), json_opts, -1); CUDF_EXPECTS(op.index >= 0, "Invalid numeric index specified in JSONPath"); op.expected_type = ARRAY; } diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index c7cd8bef9e3..b8cd4622484 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -18,7 +18,6 @@ #include #include #include -#include #include #include @@ -30,7 +29,6 @@ #include #include -#include #include #include @@ -1453,92 +1451,4 @@ TEST_F(JsonReaderTest, JsonNestedDtypeSchema) int_wrapper{{1, 1, 2}, {true, true, true}}); } -TEST_P(JsonReaderParamTest, JsonDtypeParsing) -{ - auto const test_opt = GetParam(); - bool const test_experimental = (test_opt == json_test_t::json_experimental_record_orient); - // All corner cases of dtype parsing - // 0, "0", " 0", 1, "1", " 1", "a", "z", null, true, false, "null", "true", "false", nan, "nan" - // Test for dtypes: bool, int, float, str, duration, timestamp - std::string row_orient = - "[0]\n[\"0\"]\n[\" 0\"]\n[1]\n[\"1\"]\n[\" 1\"]\n[\"a\"]\n[\"z\"]\n" - "[null]\n[true]\n[false]\n[\"null\"]\n[\"true\"]\n[\"false\"]\n[nan]\n[\"nan\"]\n"; - std::string record_orient = to_records_orient({{{"0", "0"}}, - {{"0", "\"0\""}}, - {{"0", "\" 0\""}}, - {{"0", "1"}}, - {{"0", "\"1\""}}, - {{"0", "\" 1\""}}, - {{"0", "\"a\""}}, - {{"0", "\"z\""}}, - {{"0", "null"}}, - {{"0", "true"}}, - {{"0", "false"}}, - {{"0", "\"null\""}}, - {{"0", "\"true\""}}, - {{"0", "\"false\""}}, - {{"0", "nan"}}, - {{"0", "\"nan\""}}}, - "\n"); - - std::string data = (test_opt == json_test_t::json_lines_row_orient) ? row_orient : record_orient; - - auto make_validity = [](std::vector const& validity) { - return cudf::detail::make_counting_transform_iterator( - 0, [=](auto i) -> bool { return static_cast(validity[i]); }); - }; - - constexpr int iNA = -1; - constexpr double dNA = std::numeric_limits::quiet_NaN(); - constexpr bool bNA = false; - - std::vector const validity_int = {1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0}; - std::vector const validity_float = {1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0}; - std::vector const validity_bool = {1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0}; - - auto int_col = int_wrapper{{0, 0, iNA, 1, 1, iNA, iNA, iNA, iNA, 1, 0, iNA, 1, 0, iNA, iNA}, - make_validity(validity_int)}; - auto float_col = - float_wrapper{{0.0, 0.0, dNA, 1.0, 1.0, dNA, dNA, dNA, dNA, 1.0, 0.0, dNA, 1.0, 0.0, dNA, dNA}, - make_validity(validity_float)}; - auto str_col = - cudf::test::strings_column_wrapper{// clang-format off - {"0", "0", " 0", "1", "1", " 1", "a", "z", "", "true", "false", "null", "true", "false", "nan", "nan"}, - cudf::test::iterators::nulls_at(std::vector{8})}; - // clang-format on - auto bool_col = bool_wrapper{ - {false, false, bNA, true, true, bNA, bNA, bNA, bNA, true, false, bNA, true, false, bNA, bNA}, - make_validity(validity_bool)}; - - // Types to test - const std::vector dtypes = { - dtype(), dtype(), dtype(), dtype()}; - const std::vector cols{cudf::column_view(int_col), - cudf::column_view(float_col), - cudf::column_view(str_col), - cudf::column_view(bool_col)}; - for (size_t col_type = 0; col_type < cols.size(); col_type++) { - std::map dtype_schema{{"0", {dtypes[col_type]}}}; - cudf::io::json_reader_options in_options = - cudf::io::json_reader_options::builder(cudf::io::source_info{data.data(), data.size()}) - .dtypes(dtype_schema) - .lines(true) - .experimental(test_experimental); - - cudf::io::table_with_metadata result = cudf::io::read_json(in_options); - - EXPECT_EQ(result.tbl->num_columns(), 1); - EXPECT_EQ(result.tbl->num_rows(), 16); - - EXPECT_EQ(result.tbl->get_column(0).type().id(), dtypes[col_type].id()); - - EXPECT_EQ(result.metadata.schema_info[0].name, "0"); - - std::cout << "Column: " << col_type << std::endl; - cudf::test::print(result.tbl->get_column(0)); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), cols[col_type]); - } -} - CUDF_TEST_PROGRAM_MAIN() diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx index 52ef8d6c92d..f1a75baa951 100644 --- a/python/cudf/cudf/_lib/csv.pyx +++ b/python/cudf/cudf/_lib/csv.pyx @@ -304,7 +304,7 @@ cdef csv_reader_options make_csv_reader_options( if false_values is not None: c_false_values.reserve(len(false_values)) - for fv in false_values: + for fv in c_false_values: c_false_values.push_back(fv.encode()) csv_reader_options_c.set_false_values(c_false_values) diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py index 580b5a2b2df..b91893d8991 100644 --- a/python/cudf/cudf/tests/test_csv.py +++ b/python/cudf/cudf/tests/test_csv.py @@ -766,43 +766,6 @@ def test_csv_reader_bools(tmpdir, names, dtypes, data, trues, falses): assert_eq(df_out, out) -def test_csv_reader_bools_NA(): - names = ["text", "int"] - dtypes = ["str", "int"] - trues = ["foo"] - falses = ["bar"] - lines = [ - ",".join(names), - "true,true", - "false,false", - "foo,foo", - "bar,bar", - "qux,qux", - ] - - buffer = "\n".join(lines) - - df = read_csv( - StringIO(buffer), - names=names, - dtype=dtypes, - skiprows=1, - true_values=trues, - false_values=falses, - ) - assert len(df.columns) == 2 - assert df["text"].dtype == np.dtype("object") - assert df["int"].dtype == np.dtype("int64") - expected = pd.DataFrame( - { - "text": ["true", "false", "foo", "bar", "qux"], - "int": [1.0, 0.0, 1.0, 0.0, np.nan], - } - ) - # breaking behaviour is np.nan for qux - assert_eq(df, expected) - - def test_csv_quotednumbers(tmpdir): fname = tmpdir.mkdir("gdf_csv").join("tmp_csvreader_file12.csv") diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py index 8d06d3e55f3..fb2c24b3757 100644 --- a/python/cudf/cudf/tests/test_json.py +++ b/python/cudf/cudf/tests/test_json.py @@ -687,58 +687,6 @@ def test_json_types_data(): assert df.to_arrow().equals(pa_table_pdf) -@pytest.mark.parametrize( - "col_type,json_str", - [ - # without quotes - ("int", '[{"k": 1}, {"k": 2}, {"k": 3}, {"k": 4}]'), - # with quotes - ("int", '[{"k": "1"}, {"k": "2"}]'), - # with quotes, mixed - ("int", '[{"k": "1"}, {"k": "2"}, {"k": 3}, {"k": 4}]'), - # with quotes, null, mixed - ("int", '[{"k": "1"}, {"k": "2"}, {"k": null}, {"k": 4}]'), - # without quotes, null - ("int", '[{"k": 1}, {"k": 2}, {"k": null}, {"k": 4}]'), - # without quotes - ("float", '[{"k": 1}, {"k": 2}, {"k": 3}, {"k": 4}]'), - # with quotes - ("float", '[{"k": "1"}, {"k": "2"}]'), - # with quotes, mixed - ("float", '[{"k": "1"}, {"k": "2"}, {"k": 3}, {"k": 4}]'), - # with quotes, null, mixed - ("float", '[{"k": "1"}, {"k": "2"}, {"k": null}, {"k": 4}]'), - # with quotes, NAN - ("float", '[{"k": "1"}, {"k": "2"}, {"k": NaN}, {"k": "4"}]'), - # without quotes - ("str", '[{"k": 1}, {"k": 2}, {"k": 3}, {"k": 4}]'), - # with quotes - ("str", '[{"k": "1"}, {"k": "2"}]'), - # with quotes, mixed - ("str", '[{"k": "1"}, {"k": "2"}, {"k": 3}, {"k": 4}]'), - # with quotes, null, mixed - ("str", '[{"k": "1"}, {"k": "2"}, {"k": null}, {"k": 4}]'), - # without quotes, null - ("str", '[{"k": 1}, {"k": 2}, {"k": null}, {"k": 4}]'), - ], -) -def test_json_quoted_values_with_schema(col_type, json_str): - experimental_df = cudf.read_json( - StringIO(json_str), - engine="cudf_experimental", - orient="records", - dtype={"k": col_type}, - ) - cudf_df = cudf.read_json( - StringIO(json_str.replace(",", "\n")[1:-1]), - engine="cudf", - orient="records", - lines=True, - dtype={"k": col_type}, - ) - assert_eq(cudf_df, experimental_df) - - @pytest.mark.parametrize( "keep_quotes,result", [ From e0f6e3d28e4a5ff7f41d9991ca3180484f24e091 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Fri, 28 Oct 2022 17:04:15 +0530 Subject: [PATCH 17/39] unit tests for string quotes --- python/cudf/cudf/tests/test_json.py | 52 +++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py index fb2c24b3757..8d06d3e55f3 100644 --- a/python/cudf/cudf/tests/test_json.py +++ b/python/cudf/cudf/tests/test_json.py @@ -687,6 +687,58 @@ def test_json_types_data(): assert df.to_arrow().equals(pa_table_pdf) +@pytest.mark.parametrize( + "col_type,json_str", + [ + # without quotes + ("int", '[{"k": 1}, {"k": 2}, {"k": 3}, {"k": 4}]'), + # with quotes + ("int", '[{"k": "1"}, {"k": "2"}]'), + # with quotes, mixed + ("int", '[{"k": "1"}, {"k": "2"}, {"k": 3}, {"k": 4}]'), + # with quotes, null, mixed + ("int", '[{"k": "1"}, {"k": "2"}, {"k": null}, {"k": 4}]'), + # without quotes, null + ("int", '[{"k": 1}, {"k": 2}, {"k": null}, {"k": 4}]'), + # without quotes + ("float", '[{"k": 1}, {"k": 2}, {"k": 3}, {"k": 4}]'), + # with quotes + ("float", '[{"k": "1"}, {"k": "2"}]'), + # with quotes, mixed + ("float", '[{"k": "1"}, {"k": "2"}, {"k": 3}, {"k": 4}]'), + # with quotes, null, mixed + ("float", '[{"k": "1"}, {"k": "2"}, {"k": null}, {"k": 4}]'), + # with quotes, NAN + ("float", '[{"k": "1"}, {"k": "2"}, {"k": NaN}, {"k": "4"}]'), + # without quotes + ("str", '[{"k": 1}, {"k": 2}, {"k": 3}, {"k": 4}]'), + # with quotes + ("str", '[{"k": "1"}, {"k": "2"}]'), + # with quotes, mixed + ("str", '[{"k": "1"}, {"k": "2"}, {"k": 3}, {"k": 4}]'), + # with quotes, null, mixed + ("str", '[{"k": "1"}, {"k": "2"}, {"k": null}, {"k": 4}]'), + # without quotes, null + ("str", '[{"k": 1}, {"k": 2}, {"k": null}, {"k": 4}]'), + ], +) +def test_json_quoted_values_with_schema(col_type, json_str): + experimental_df = cudf.read_json( + StringIO(json_str), + engine="cudf_experimental", + orient="records", + dtype={"k": col_type}, + ) + cudf_df = cudf.read_json( + StringIO(json_str.replace(",", "\n")[1:-1]), + engine="cudf", + orient="records", + lines=True, + dtype={"k": col_type}, + ) + assert_eq(cudf_df, experimental_df) + + @pytest.mark.parametrize( "keep_quotes,result", [ From 1317be3b5515ca6f1dd4226ed1d97c6912a609ee Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Fri, 28 Oct 2022 17:21:51 +0530 Subject: [PATCH 18/39] remove debug prints --- cpp/tests/io/json_test.cpp | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index c7cd8bef9e3..396072e8b8a 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -1529,14 +1529,9 @@ TEST_P(JsonReaderParamTest, JsonDtypeParsing) EXPECT_EQ(result.tbl->num_columns(), 1); EXPECT_EQ(result.tbl->num_rows(), 16); - - EXPECT_EQ(result.tbl->get_column(0).type().id(), dtypes[col_type].id()); - EXPECT_EQ(result.metadata.schema_info[0].name, "0"); - std::cout << "Column: " << col_type << std::endl; - cudf::test::print(result.tbl->get_column(0)); - + EXPECT_EQ(result.tbl->get_column(0).type().id(), dtypes[col_type].id()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), cols[col_type]); } } From ce863b389fbbc5ea99e4522fcae02aad9ba9266c Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Fri, 28 Oct 2022 17:24:20 +0530 Subject: [PATCH 19/39] replace @param[in] with @param --- cpp/src/io/utilities/parsing_utils.cuh | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh index 414375147f2..405f425716d 100644 --- a/cpp/src/io/utilities/parsing_utils.cuh +++ b/cpp/src/io/utilities/parsing_utils.cuh @@ -469,7 +469,7 @@ cudf::size_type count_all_from_set(host_span data, /** * @brief Checks whether the given character is a whitespace character. * - * @param[in] ch The character to check + * @param ch The character to check * * @return True if the input is whitespace, False otherwise */ @@ -487,9 +487,9 @@ __inline__ __device__ It skip_character(It const& it, char ch) /** * @brief Adjusts the range to ignore starting/trailing whitespace and quotation characters. * - * @param[in] begin Pointer to the first character in the parsing range - * @param[in] end pointer to the first character after the parsing range - * @param[in] quotechar The character used to denote quotes; '\0' if none + * @param begin Pointer to the first character in the parsing range + * @param end pointer to the first character after the parsing range + * @param quotechar The character used to denote quotes; '\0' if none * * @return Trimmed range */ @@ -510,9 +510,9 @@ __inline__ __device__ std::pair trim_whitespaces_quote /** * @brief Adjusts the range to ignore starting/trailing quotation characters. * - * @param[in] begin Pointer to the first character in the parsing range - * @param[in] end pointer to the first character after the parsing range - * @param[in] quotechar The character used to denote quotes; '\0' if none + * @param begin Pointer to the first character in the parsing range + * @param end pointer to the first character after the parsing range + * @param quotechar The character used to denote quotes; '\0' if none * * @return Trimmed range */ From d9fc33b16390399cc2678401da68cf7a52c34c10 Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Fri, 28 Oct 2022 17:31:59 +0530 Subject: [PATCH 20/39] nan fix moved to PR #12022 --- python/cudf/cudf/tests/test_csv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py index 580b5a2b2df..0895da19c8b 100644 --- a/python/cudf/cudf/tests/test_csv.py +++ b/python/cudf/cudf/tests/test_csv.py @@ -796,7 +796,7 @@ def test_csv_reader_bools_NA(): expected = pd.DataFrame( { "text": ["true", "false", "foo", "bar", "qux"], - "int": [1.0, 0.0, 1.0, 0.0, np.nan], + "int": [1, 0, 1, 0, 0], } ) # breaking behaviour is np.nan for qux From 2ec4bd88b6e79c7f2fd9b720580c5a5d26c2c7a6 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Fri, 28 Oct 2022 17:35:29 +0530 Subject: [PATCH 21/39] null for parsing error in bool - update pytest --- python/cudf/cudf/tests/test_csv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py index 0895da19c8b..580b5a2b2df 100644 --- a/python/cudf/cudf/tests/test_csv.py +++ b/python/cudf/cudf/tests/test_csv.py @@ -796,7 +796,7 @@ def test_csv_reader_bools_NA(): expected = pd.DataFrame( { "text": ["true", "false", "foo", "bar", "qux"], - "int": [1, 0, 1, 0, 0], + "int": [1.0, 0.0, 1.0, 0.0, np.nan], } ) # breaking behaviour is np.nan for qux From ff77d5f7d6bd677ca85006cd59899a96af3e93fb Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Fri, 28 Oct 2022 18:03:27 +0530 Subject: [PATCH 22/39] doc update --- cpp/src/io/utilities/parsing_utils.cuh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh index 405f425716d..0077d3f1fa4 100644 --- a/cpp/src/io/utilities/parsing_utils.cuh +++ b/cpp/src/io/utilities/parsing_utils.cuh @@ -488,7 +488,7 @@ __inline__ __device__ It skip_character(It const& it, char ch) * @brief Adjusts the range to ignore starting/trailing whitespace and quotation characters. * * @param begin Pointer to the first character in the parsing range - * @param end pointer to the first character after the parsing range + * @param end Pointer to the first character after the parsing range * @param quotechar The character used to denote quotes; '\0' if none * * @return Trimmed range @@ -511,7 +511,7 @@ __inline__ __device__ std::pair trim_whitespaces_quote * @brief Adjusts the range to ignore starting/trailing quotation characters. * * @param begin Pointer to the first character in the parsing range - * @param end pointer to the first character after the parsing range + * @param end Pointer to the first character after the parsing range * @param quotechar The character used to denote quotes; '\0' if none * * @return Trimmed range From ba36b86e0e5c7232e70fd90f49a5289004eef26c Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Fri, 28 Oct 2022 18:03:42 +0530 Subject: [PATCH 23/39] skip writing NAN for floating type for nulls --- cpp/src/io/utilities/parsing_utils.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh index 0077d3f1fa4..f003724c77b 100644 --- a/cpp/src/io/utilities/parsing_utils.cuh +++ b/cpp/src/io/utilities/parsing_utils.cuh @@ -629,7 +629,7 @@ struct ConvertFunctor { if (serialized_trie_contains(opts.trie_false, {begin, field_len})) { return false; } return cudf::io::parse_numeric(begin, end, opts); }(); - static_cast(out_buffer)[row] = value.value_or(std::numeric_limits::quiet_NaN()); + if (value.has_value()) { static_cast(out_buffer)[row] = *value; } return value.has_value() and !std::isnan(*value); } From 3e7148237c76643fa13295875e919c208b8472fe Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Fri, 28 Oct 2022 19:11:19 +0530 Subject: [PATCH 24/39] fix dereferencing {begin - 1} in json reader --- cpp/src/io/json/json_gpu.cu | 28 ++++++++++++++++++-------- cpp/src/io/utilities/parsing_utils.cuh | 22 ++++++++++++++++++++ 2 files changed, 42 insertions(+), 8 deletions(-) diff --git a/cpp/src/io/json/json_gpu.cu b/cpp/src/io/json/json_gpu.cu index e4fb7e2edf5..ca2737a5930 100644 --- a/cpp/src/io/json/json_gpu.cu +++ b/cpp/src/io/json/json_gpu.cu @@ -156,6 +156,7 @@ struct field_descriptor { cudf::size_type column; char const* value_begin; char const* value_end; + bool is_quoted; }; /** @@ -178,7 +179,10 @@ __device__ field_descriptor next_field_descriptor(const char* begin, auto const desc_pre_trim = col_map.capacity() == 0 // No key - column and begin are trivial - ? field_descriptor{field_idx, begin, cudf::io::gpu::seek_field_end(begin, end, opts, true)} + ? field_descriptor{field_idx, + begin, + cudf::io::gpu::seek_field_end(begin, end, opts, true), + false} : [&]() { auto const key_range = get_next_key(begin, end, opts.quotechar); auto const key_hash = cudf::detail::MurmurHash3_32{}( @@ -189,14 +193,23 @@ __device__ field_descriptor next_field_descriptor(const char* begin, // Skip the colon between the key and the value auto const value_begin = thrust::find(thrust::seq, key_range.second, end, ':') + 1; - return field_descriptor{ - column, value_begin, cudf::io::gpu::seek_field_end(value_begin, end, opts, true)}; + return field_descriptor{column, + value_begin, + cudf::io::gpu::seek_field_end(value_begin, end, opts, true), + false}; }(); // Modify start & end to ignore whitespace and quotechars auto const trimmed_value_range = - trim_whitespaces_quotes(desc_pre_trim.value_begin, desc_pre_trim.value_end, opts.quotechar); - return {desc_pre_trim.column, trimmed_value_range.first, trimmed_value_range.second}; + trim_whitespaces(desc_pre_trim.value_begin, desc_pre_trim.value_end); + bool const is_quoted = + thrust::distance(trimmed_value_range.first, trimmed_value_range.second) >= 2 and + *trimmed_value_range.first == opts.quotechar and + *thrust::prev(trimmed_value_range.second) == opts.quotechar; + return {desc_pre_trim.column, + trimmed_value_range.first + is_quoted, + trimmed_value_range.second - is_quoted, + is_quoted}; } /** @@ -255,8 +268,7 @@ __global__ void convert_data_to_columns_kernel(parse_options_view opts, auto const desc = next_field_descriptor(current, row_data_range.second, opts, input_field_index, col_map); auto const value_len = static_cast(std::max(desc.value_end - desc.value_begin, 0L)); - const bool is_quoted = - *(desc.value_begin - 1) == opts.quotechar && *desc.value_end == opts.quotechar; + auto const is_quoted = static_cast(desc.is_quoted); current = desc.value_end + 1; @@ -348,7 +360,7 @@ __global__ void detect_data_types_kernel( atomicAdd(&column_infos[desc.column].null_count, -1); } // Don't need counts to detect strings, any field in quotes is deduced to be a string - if (*(desc.value_begin - 1) == opts.quotechar && *desc.value_end == opts.quotechar) { + if (desc.is_quoted) { atomicAdd(&column_infos[desc.column].string_count, 1); continue; } diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh index 354abfa3985..772e706fc80 100644 --- a/cpp/src/io/utilities/parsing_utils.cuh +++ b/cpp/src/io/utilities/parsing_utils.cuh @@ -523,6 +523,28 @@ __inline__ __device__ std::pair trim_whitespaces_quote return {skip_character(trim_begin, quotechar), skip_character(trim_end, quotechar).base()}; } +/** + * @brief Adjusts the range to ignore starting/trailing whitespace characters. + * + * @param[in] begin Pointer to the first character in the parsing range + * @param[in] end pointer to the first character after the parsing range + * + * @return Trimmed range + */ +__inline__ __device__ std::pair trim_whitespaces(char const* begin, + char const* end) +{ + auto not_whitespace = [] __device__(auto c) { return !is_whitespace(c); }; + + auto const trim_begin = thrust::find_if(thrust::seq, begin, end, not_whitespace); + auto const trim_end = thrust::find_if(thrust::seq, + thrust::make_reverse_iterator(end), + thrust::make_reverse_iterator(trim_begin), + not_whitespace); + + return {trim_begin, trim_end.base()}; +} + /** * @brief Adjusts the range to ignore starting/trailing quotation characters. * From 9b64cbd954af7480de79fbb39a1a5110629e3adb Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Fri, 28 Oct 2022 19:14:19 +0530 Subject: [PATCH 25/39] doc update --- cpp/src/io/utilities/parsing_utils.cuh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh index 772e706fc80..2da7a28e96c 100644 --- a/cpp/src/io/utilities/parsing_utils.cuh +++ b/cpp/src/io/utilities/parsing_utils.cuh @@ -504,7 +504,7 @@ __inline__ __device__ It skip_character(It const& it, char ch) * @brief Adjusts the range to ignore starting/trailing whitespace and quotation characters. * * @param[in] begin Pointer to the first character in the parsing range - * @param[in] end pointer to the first character after the parsing range + * @param[in] end Pointer to the first character after the parsing range * @param[in] quotechar The character used to denote quotes; '\0' if none * * @return Trimmed range @@ -527,7 +527,7 @@ __inline__ __device__ std::pair trim_whitespaces_quote * @brief Adjusts the range to ignore starting/trailing whitespace characters. * * @param[in] begin Pointer to the first character in the parsing range - * @param[in] end pointer to the first character after the parsing range + * @param[in] end Pointer to the first character after the parsing range * * @return Trimmed range */ @@ -549,7 +549,7 @@ __inline__ __device__ std::pair trim_whitespaces(char * @brief Adjusts the range to ignore starting/trailing quotation characters. * * @param begin Pointer to the first character in the parsing range - * @param end pointer to the first character after the parsing range + * @param end Pointer to the first character after the parsing range * @param quotechar The character used to denote quotes; '\0' if none * * @return Trimmed range From 90cdf3aa5838a1cce07ffc3093847762aaa35400 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Fri, 28 Oct 2022 23:20:54 +0530 Subject: [PATCH 26/39] ignore data for nulls in CsvReaderTest.InvalidFloatingPoint test --- cpp/tests/io/csv_test.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/tests/io/csv_test.cpp b/cpp/tests/io/csv_test.cpp index f532836ef95..7f975741ae8 100644 --- a/cpp/tests/io/csv_test.cpp +++ b/cpp/tests/io/csv_test.cpp @@ -1188,8 +1188,7 @@ TEST_F(CsvReaderTest, InvalidFloatingPoint) const auto col_data = cudf::test::to_host(view.column(0)); // col_data.first contains the column data - for (const auto& elem : col_data.first) - ASSERT_TRUE(std::isnan(elem)); + // ignore all data because it is all nulls. // col_data.second contains the bitmasks ASSERT_EQ(0u, col_data.second[0]); } From 6710b4701c098fcdc4de3de06be152380582a713 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Sat, 29 Oct 2022 01:06:49 +0530 Subject: [PATCH 27/39] add pandas test for custom bool literals --- python/cudf/cudf/tests/test_csv.py | 37 ++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py index 0895da19c8b..e805f4a6867 100644 --- a/python/cudf/cudf/tests/test_csv.py +++ b/python/cudf/cudf/tests/test_csv.py @@ -766,6 +766,43 @@ def test_csv_reader_bools(tmpdir, names, dtypes, data, trues, falses): assert_eq(df_out, out) +def test_csv_reader_bools_custom(): + names = ["text", "int"] + dtypes = ["str", "int"] + trues = ["foo"] + falses = ["bar"] + lines = [ + ",".join(names), + "true,true", + "false,false", + "foo,foo", + "bar,bar", + "0,0", + ] + buffer = "\n".join(lines) + + df = read_csv( + StringIO(buffer), + names=names, + dtype=dtypes, + skiprows=1, + true_values=trues, + false_values=falses, + ) + assert len(df.columns) == 2 + assert df["text"].dtype == np.dtype("object") + assert df["int"].dtype == np.dtype("int64") + expected = pd.read_csv( + StringIO(buffer), + names=names, + dtype=dtypes, + skiprows=1, + true_values=trues, + false_values=falses, + ) + assert_eq(df, expected) + + def test_csv_reader_bools_NA(): names = ["text", "int"] dtypes = ["str", "int"] From 99eedc059a38503da43a8434e16eb169830e992d Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Mon, 31 Oct 2022 12:57:02 +0530 Subject: [PATCH 28/39] Update python/cudf/cudf/tests/test_csv.py Co-authored-by: GALI PREM SAGAR --- python/cudf/cudf/tests/test_csv.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py index e805f4a6867..8a0d15cbfe2 100644 --- a/python/cudf/cudf/tests/test_csv.py +++ b/python/cudf/cudf/tests/test_csv.py @@ -789,9 +789,6 @@ def test_csv_reader_bools_custom(): true_values=trues, false_values=falses, ) - assert len(df.columns) == 2 - assert df["text"].dtype == np.dtype("object") - assert df["int"].dtype == np.dtype("int64") expected = pd.read_csv( StringIO(buffer), names=names, @@ -800,7 +797,7 @@ def test_csv_reader_bools_custom(): true_values=trues, false_values=falses, ) - assert_eq(df, expected) + assert_eq(df, expected, check_dtype=True) def test_csv_reader_bools_NA(): From 0415252a9ace6aaef406893eee12a88e5146ca41 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Mon, 31 Oct 2022 13:16:52 +0530 Subject: [PATCH 29/39] static_cast for is_quoted in pointer arithmetic --- cpp/src/io/json/json_gpu.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/json/json_gpu.cu b/cpp/src/io/json/json_gpu.cu index ca2737a5930..d2e69d086e6 100644 --- a/cpp/src/io/json/json_gpu.cu +++ b/cpp/src/io/json/json_gpu.cu @@ -207,8 +207,8 @@ __device__ field_descriptor next_field_descriptor(const char* begin, *trimmed_value_range.first == opts.quotechar and *thrust::prev(trimmed_value_range.second) == opts.quotechar; return {desc_pre_trim.column, - trimmed_value_range.first + is_quoted, - trimmed_value_range.second - is_quoted, + trimmed_value_range.first + static_cast(is_quoted), + trimmed_value_range.second - static_cast(is_quoted), is_quoted}; } From da4b16f6db4abf5742fe60c3111a89793f6504fa Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Mon, 31 Oct 2022 14:08:58 +0530 Subject: [PATCH 30/39] skip nan check for null elements in test --- cpp/tests/io/json_test.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index 396072e8b8a..999f56b9808 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -914,8 +914,7 @@ TEST_P(JsonReaderParamTest, InvalidFloatingPoint) const auto col_data = cudf::test::to_host(result.tbl->view().column(0)); // col_data.first contains the column data - for (const auto& elem : col_data.first) - ASSERT_TRUE(std::isnan(elem)); + // ignore all data because it is all nulls. // col_data.second contains the bitmasks ASSERT_EQ(0u, col_data.second[0]); } From 53e0f2b6f285e7f123ee936a30d4cb01f2238c1a Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Tue, 1 Nov 2022 15:17:10 +0530 Subject: [PATCH 31/39] Apply suggestions from code review Co-authored-by: Bradley Dice --- cpp/src/io/utilities/parsing_utils.cuh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh index 2da7a28e96c..02575767e52 100644 --- a/cpp/src/io/utilities/parsing_utils.cuh +++ b/cpp/src/io/utilities/parsing_utils.cuh @@ -550,13 +550,13 @@ __inline__ __device__ std::pair trim_whitespaces(char * * @param begin Pointer to the first character in the parsing range * @param end Pointer to the first character after the parsing range - * @param quotechar The character used to denote quotes; '\0' if none + * @param quotechar The character used to denote quotes. Provide '\0' if no quotes should be trimmed. * * @return Trimmed range */ __inline__ __device__ std::pair trim_quotes(char const* begin, char const* end, - char quotechar = '\0') + char quotechar) { if ((thrust::distance(begin, end) >= 2 && *begin == quotechar && *thrust::prev(end) == quotechar)) { From dd6a0ca9ec9d3eb9c06ce6d2573c73168f51c34d Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Tue, 1 Nov 2022 15:17:27 +0530 Subject: [PATCH 32/39] Update cpp/src/io/utilities/parsing_utils.cuh Co-authored-by: Bradley Dice --- cpp/src/io/utilities/parsing_utils.cuh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh index 02575767e52..19a520e2f6f 100644 --- a/cpp/src/io/utilities/parsing_utils.cuh +++ b/cpp/src/io/utilities/parsing_utils.cuh @@ -526,8 +526,8 @@ __inline__ __device__ std::pair trim_whitespaces_quote /** * @brief Adjusts the range to ignore starting/trailing whitespace characters. * - * @param[in] begin Pointer to the first character in the parsing range - * @param[in] end Pointer to the first character after the parsing range + * @param begin Pointer to the first character in the parsing range + * @param end Pointer to the first character after the parsing range * * @return Trimmed range */ From cc2a348038a227a6aca028cde9e92658103c8121 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Wed, 2 Nov 2022 13:18:36 +0530 Subject: [PATCH 33/39] review comments, style fix --- cpp/src/io/json/json_gpu.cu | 1 - cpp/src/io/utilities/parsing_utils.cuh | 9 +++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/src/io/json/json_gpu.cu b/cpp/src/io/json/json_gpu.cu index d2e69d086e6..8b6c0f9d528 100644 --- a/cpp/src/io/json/json_gpu.cu +++ b/cpp/src/io/json/json_gpu.cu @@ -274,7 +274,6 @@ __global__ void convert_data_to_columns_kernel(parse_options_view opts, using string_index_pair = thrust::pair; - // Empty fields are not legal values if (!serialized_trie_contains(opts.trie_na, {desc.value_begin - is_quoted, value_len + is_quoted * 2})) { // Type dispatcher does not handle strings diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh index 19a520e2f6f..cbd417c2b5b 100644 --- a/cpp/src/io/utilities/parsing_utils.cuh +++ b/cpp/src/io/utilities/parsing_utils.cuh @@ -503,9 +503,9 @@ __inline__ __device__ It skip_character(It const& it, char ch) /** * @brief Adjusts the range to ignore starting/trailing whitespace and quotation characters. * - * @param[in] begin Pointer to the first character in the parsing range - * @param[in] end Pointer to the first character after the parsing range - * @param[in] quotechar The character used to denote quotes; '\0' if none + * @param begin Pointer to the first character in the parsing range + * @param end Pointer to the first character after the parsing range + * @param quotechar The character used to denote quotes; '\0' if none * * @return Trimmed range */ @@ -550,7 +550,8 @@ __inline__ __device__ std::pair trim_whitespaces(char * * @param begin Pointer to the first character in the parsing range * @param end Pointer to the first character after the parsing range - * @param quotechar The character used to denote quotes. Provide '\0' if no quotes should be trimmed. + * @param quotechar The character used to denote quotes. Provide '\0' if no quotes should be + * trimmed. * * @return Trimmed range */ From dc228e5078b9b53837089936a590ffebd42bf774 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Wed, 2 Nov 2022 13:35:08 +0530 Subject: [PATCH 34/39] pandas vs cudf csv parser differences bool literals give parsing errors as int "0" and "1" give parsing errors as bool in pandas --- python/cudf/cudf/tests/test_csv.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py index 8a0d15cbfe2..e85d404d2c4 100644 --- a/python/cudf/cudf/tests/test_csv.py +++ b/python/cudf/cudf/tests/test_csv.py @@ -767,10 +767,10 @@ def test_csv_reader_bools(tmpdir, names, dtypes, data, trues, falses): def test_csv_reader_bools_custom(): - names = ["text", "int"] - dtypes = ["str", "int"] - trues = ["foo"] - falses = ["bar"] + names = ["text", "bool"] + dtypes = {"text": "str", "bool": "bool"} + trues = ["foo", "1"] + falses = ["bar", "0"] lines = [ ",".join(names), "true,true", @@ -778,6 +778,7 @@ def test_csv_reader_bools_custom(): "foo,foo", "bar,bar", "0,0", + "1,1", ] buffer = "\n".join(lines) @@ -789,6 +790,9 @@ def test_csv_reader_bools_custom(): true_values=trues, false_values=falses, ) + + # Note: bool literals give parsing errors as int + # "0" and "1" give parsing errors as bool in pandas expected = pd.read_csv( StringIO(buffer), names=names, From 7e231ac6aa34eaf92d88887c0da0b077120fe8ba Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Fri, 4 Nov 2022 03:20:39 +0530 Subject: [PATCH 35/39] Apply suggestions from code review --- cpp/src/io/utilities/parsing_utils.cuh | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh index 445ec871dbc..0015777279b 100644 --- a/cpp/src/io/utilities/parsing_utils.cuh +++ b/cpp/src/io/utilities/parsing_utils.cuh @@ -598,14 +598,13 @@ struct ConvertFunctor { bool as_hex) { // TODO what's invalid input - bool all_digits_valid = true; static_cast*>(out_buffer)[row] = - [&opts, &all_digits_valid, output_type, begin, end]() -> device_storage_type_t { + [&opts, output_type, begin, end]() -> device_storage_type_t { return strings::detail::parse_decimal>( begin, end, output_type.scale()); }(); - return all_digits_valid; + return true; } /** From d7fb88ca6157dc7059f513e46c0103139ad8f287 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Tue, 15 Nov 2022 02:58:08 +0530 Subject: [PATCH 36/39] address review comments --- cpp/src/io/utilities/parsing_utils.cuh | 18 +++++--- cpp/tests/io/json_test.cpp | 59 +++++++++++++++++++------- 2 files changed, 56 insertions(+), 21 deletions(-) diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh index 0015777279b..6c5c817eb02 100644 --- a/cpp/src/io/utilities/parsing_utils.cuh +++ b/cpp/src/io/utilities/parsing_utils.cuh @@ -237,7 +237,7 @@ __host__ __device__ std::optional parse_numeric(char const* begin, if (exponent != 0) { value *= exp10(double(exponent * exponent_sign)); } } } - if (!all_digits_valid) { return {}; } + if (!all_digits_valid) { return std::nullopt; } return value * sign; } @@ -622,8 +622,12 @@ struct ConvertFunctor { auto const value = [&opts, begin, end]() -> std::optional { // Check for user-specified true/false values auto const field_len = static_cast(end - begin); - if (serialized_trie_contains(opts.trie_true, {begin, field_len})) { return true; } - if (serialized_trie_contains(opts.trie_false, {begin, field_len})) { return false; } + if (serialized_trie_contains(opts.trie_true, {begin, field_len})) { + return static_cast(true); + } + if (serialized_trie_contains(opts.trie_false, {begin, field_len})) { + return static_cast(false); + } return cudf::io::parse_numeric(begin, end, opts); }(); if (value.has_value()) { static_cast(out_buffer)[row] = *value; } @@ -647,8 +651,12 @@ struct ConvertFunctor { auto const value = [&opts, begin, end]() -> std::optional { // Check for user-specified true/false values auto const field_len = static_cast(end - begin); - if (serialized_trie_contains(opts.trie_true, {begin, field_len})) { return true; } - if (serialized_trie_contains(opts.trie_false, {begin, field_len})) { return false; } + if (serialized_trie_contains(opts.trie_true, {begin, field_len})) { + return static_cast(true); + } + if (serialized_trie_contains(opts.trie_false, {begin, field_len})) { + return static_cast(false); + } return cudf::io::parse_numeric(begin, end, opts); }(); if (value.has_value()) { static_cast(out_buffer)[row] = *value; } diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index 999f56b9808..5da4d8969d6 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -1487,27 +1487,54 @@ TEST_P(JsonReaderParamTest, JsonDtypeParsing) 0, [=](auto i) -> bool { return static_cast(validity[i]); }); }; - constexpr int iNA = -1; - constexpr double dNA = std::numeric_limits::quiet_NaN(); - constexpr bool bNA = false; - - std::vector const validity_int = {1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0}; - std::vector const validity_float = {1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0}; - std::vector const validity_bool = {1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0}; - - auto int_col = int_wrapper{{0, 0, iNA, 1, 1, iNA, iNA, iNA, iNA, 1, 0, iNA, 1, 0, iNA, iNA}, - make_validity(validity_int)}; - auto float_col = - float_wrapper{{0.0, 0.0, dNA, 1.0, 1.0, dNA, dNA, dNA, dNA, 1.0, 0.0, dNA, 1.0, 0.0, dNA, dNA}, - make_validity(validity_float)}; + constexpr int int_NA = 0; + constexpr double double_NA = std::numeric_limits::quiet_NaN(); + constexpr bool bool_NA = false; + + std::vector const validity = {1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0}; + + auto int_col = int_wrapper{ + {0, 0, int_NA, 1, 1, int_NA, int_NA, int_NA, int_NA, 1, 0, int_NA, 1, 0, int_NA, int_NA}, + make_validity(validity)}; + auto float_col = float_wrapper{{0.0, + 0.0, + double_NA, + 1.0, + 1.0, + double_NA, + double_NA, + double_NA, + double_NA, + 1.0, + 0.0, + double_NA, + 1.0, + 0.0, + double_NA, + double_NA}, + make_validity(validity)}; auto str_col = cudf::test::strings_column_wrapper{// clang-format off {"0", "0", " 0", "1", "1", " 1", "a", "z", "", "true", "false", "null", "true", "false", "nan", "nan"}, cudf::test::iterators::nulls_at(std::vector{8})}; // clang-format on - auto bool_col = bool_wrapper{ - {false, false, bNA, true, true, bNA, bNA, bNA, bNA, true, false, bNA, true, false, bNA, bNA}, - make_validity(validity_bool)}; + auto bool_col = bool_wrapper{{false, + false, + bool_NA, + true, + true, + bool_NA, + bool_NA, + bool_NA, + bool_NA, + true, + false, + bool_NA, + true, + false, + bool_NA, + bool_NA}, + make_validity(validity)}; // Types to test const std::vector dtypes = { From bea7fc3aca340fd1e55c1de5ffaa8571261fa933 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Tue, 15 Nov 2022 04:57:54 +0530 Subject: [PATCH 37/39] replace std::nullopt --- cpp/src/io/utilities/parsing_utils.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh index 6c5c817eb02..a78700db484 100644 --- a/cpp/src/io/utilities/parsing_utils.cuh +++ b/cpp/src/io/utilities/parsing_utils.cuh @@ -237,7 +237,7 @@ __host__ __device__ std::optional parse_numeric(char const* begin, if (exponent != 0) { value *= exp10(double(exponent * exponent_sign)); } } } - if (!all_digits_valid) { return std::nullopt; } + if (!all_digits_valid) { return std::optional{}; } return value * sign; } From 0e45d133ce3631ac8b74d8188dc1c9d3361ee717 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Tue, 15 Nov 2022 05:28:27 +0530 Subject: [PATCH 38/39] remove breaking behavior on nulls --- cpp/src/io/utilities/parsing_utils.cuh | 10 +++++----- cpp/tests/io/csv_test.cpp | 3 ++- cpp/tests/io/json_test.cpp | 7 ++++--- python/cudf/cudf/tests/test_csv.py | 2 +- 4 files changed, 12 insertions(+), 10 deletions(-) diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh index a78700db484..71c86553c4d 100644 --- a/cpp/src/io/utilities/parsing_utils.cuh +++ b/cpp/src/io/utilities/parsing_utils.cuh @@ -578,9 +578,9 @@ struct ConvertFunctor { return as_hex ? cudf::io::parse_numeric(begin, end, opts) : cudf::io::parse_numeric(begin, end, opts); }(); - if (value.has_value()) { static_cast(out_buffer)[row] = *value; } + static_cast(out_buffer)[row] = value.value_or(std::numeric_limits::quiet_NaN()); - return value.has_value(); + return true; } /** @@ -630,9 +630,9 @@ struct ConvertFunctor { } return cudf::io::parse_numeric(begin, end, opts); }(); - if (value.has_value()) { static_cast(out_buffer)[row] = *value; } + static_cast(out_buffer)[row] = value.value_or(std::numeric_limits::quiet_NaN()); - return value.has_value(); + return true; } /** @@ -659,7 +659,7 @@ struct ConvertFunctor { } return cudf::io::parse_numeric(begin, end, opts); }(); - if (value.has_value()) { static_cast(out_buffer)[row] = *value; } + static_cast(out_buffer)[row] = value.value_or(std::numeric_limits::quiet_NaN()); return value.has_value() and !std::isnan(*value); } diff --git a/cpp/tests/io/csv_test.cpp b/cpp/tests/io/csv_test.cpp index af2e0f5abb9..8acc6f8f6ee 100644 --- a/cpp/tests/io/csv_test.cpp +++ b/cpp/tests/io/csv_test.cpp @@ -1170,7 +1170,8 @@ TEST_F(CsvReaderTest, InvalidFloatingPoint) const auto col_data = cudf::test::to_host(view.column(0)); // col_data.first contains the column data - // ignore all data because it is all nulls. + for (const auto& elem : col_data.first) + ASSERT_TRUE(std::isnan(elem)); // col_data.second contains the bitmasks ASSERT_EQ(0u, col_data.second[0]); } diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index 5da4d8969d6..f7b21008f70 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -914,7 +914,8 @@ TEST_P(JsonReaderParamTest, InvalidFloatingPoint) const auto col_data = cudf::test::to_host(result.tbl->view().column(0)); // col_data.first contains the column data - // ignore all data because it is all nulls. + for (const auto& elem : col_data.first) + ASSERT_TRUE(std::isnan(elem)); // col_data.second contains the bitmasks ASSERT_EQ(0u, col_data.second[0]); } @@ -1495,7 +1496,7 @@ TEST_P(JsonReaderParamTest, JsonDtypeParsing) auto int_col = int_wrapper{ {0, 0, int_NA, 1, 1, int_NA, int_NA, int_NA, int_NA, 1, 0, int_NA, 1, 0, int_NA, int_NA}, - make_validity(validity)}; + cudf::test::iterators::nulls_at(std::vector{8})}; auto float_col = float_wrapper{{0.0, 0.0, double_NA, @@ -1534,7 +1535,7 @@ TEST_P(JsonReaderParamTest, JsonDtypeParsing) false, bool_NA, bool_NA}, - make_validity(validity)}; + cudf::test::iterators::nulls_at(std::vector{8})}; // Types to test const std::vector dtypes = { diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py index 02569285efe..e85d404d2c4 100644 --- a/python/cudf/cudf/tests/test_csv.py +++ b/python/cudf/cudf/tests/test_csv.py @@ -834,7 +834,7 @@ def test_csv_reader_bools_NA(): expected = pd.DataFrame( { "text": ["true", "false", "foo", "bar", "qux"], - "int": [1.0, 0.0, 1.0, 0.0, np.nan], + "int": [1, 0, 1, 0, 0], } ) # breaking behaviour is np.nan for qux From df6844c6b3263ffc9f1b7045970ffb8d80855728 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Tue, 15 Nov 2022 15:29:43 +0530 Subject: [PATCH 39/39] TODO clarify --- cpp/src/io/utilities/parsing_utils.cuh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh index 71c86553c4d..89806956ae5 100644 --- a/cpp/src/io/utilities/parsing_utils.cuh +++ b/cpp/src/io/utilities/parsing_utils.cuh @@ -597,7 +597,7 @@ struct ConvertFunctor { parse_options_view const& opts, bool as_hex) { - // TODO what's invalid input + // TODO decide what's invalid input and update parsing functions static_cast*>(out_buffer)[row] = [&opts, output_type, begin, end]() -> device_storage_type_t { return strings::detail::parse_decimal>( @@ -678,7 +678,7 @@ struct ConvertFunctor { parse_options_view const& opts, bool as_hex) { - // TODO what's invalid input + // TODO decide what's invalid input and update parsing functions if constexpr (cudf::is_timestamp()) { static_cast(out_buffer)[row] = to_timestamp(begin, end, opts.dayfirst); } else if constexpr (cudf::is_duration()) {