From ba5daec7efc6dbd26178561cf2a255bf69a0061a Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Mon, 5 Dec 2022 10:30:58 +0530 Subject: [PATCH] Null element for parsing error in numeric types in JSON, CSV reader (#12272) This PR changes behaviour of nested json reader to return null element on parsing error of numeric types. Breaking behaviour in read_csv, and read_json: - old behaviour: parsing error returns quiet NAN for that type as result for that row. - new behaviour: parsing error returns null for that row. Moved commit 0e45d13 https://github.com/rapidsai/cudf/pull/12022#issuecomment-1314569271 to 23.02 Authors: - Karthikeyan (https://github.com/karthikeyann) Approvers: - Lawrence Mitchell (https://github.com/wence-) - Vyas Ramasubramani (https://github.com/vyasr) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/12272 --- cpp/src/io/utilities/parsing_utils.cuh | 10 ++-- cpp/tests/io/csv_test.cpp | 8 +-- cpp/tests/io/json_test.cpp | 68 +++++++++++++++----------- python/cudf/cudf/tests/test_csv.py | 3 +- 4 files changed, 47 insertions(+), 42 deletions(-) diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh index 816c4c2f4f8..5c3af588411 100644 --- a/cpp/src/io/utilities/parsing_utils.cuh +++ b/cpp/src/io/utilities/parsing_utils.cuh @@ -578,9 +578,9 @@ struct ConvertFunctor { return as_hex ? cudf::io::parse_numeric(begin, end, opts) : cudf::io::parse_numeric(begin, end, opts); }(); - static_cast(out_buffer)[row] = value.value_or(std::numeric_limits::quiet_NaN()); + if (value.has_value()) { static_cast(out_buffer)[row] = *value; } - return true; + return value.has_value(); } /** @@ -630,9 +630,9 @@ struct ConvertFunctor { } return cudf::io::parse_numeric(begin, end, opts); }(); - static_cast(out_buffer)[row] = value.value_or(std::numeric_limits::quiet_NaN()); + if (value.has_value()) { static_cast(out_buffer)[row] = *value; } - return true; + return value.has_value(); } /** @@ -659,7 +659,7 @@ struct ConvertFunctor { } return cudf::io::parse_numeric(begin, end, opts); }(); - static_cast(out_buffer)[row] = value.value_or(std::numeric_limits::quiet_NaN()); + if (value.has_value()) { static_cast(out_buffer)[row] = *value; } return value.has_value() and !std::isnan(*value); } diff --git a/cpp/tests/io/csv_test.cpp b/cpp/tests/io/csv_test.cpp index c0af2579299..233bb51695c 100644 --- a/cpp/tests/io/csv_test.cpp +++ b/cpp/tests/io/csv_test.cpp @@ -1168,12 +1168,8 @@ TEST_F(CsvReaderTest, InvalidFloatingPoint) EXPECT_EQ(1, view.num_columns()); ASSERT_EQ(type_id::FLOAT32, view.column(0).type().id()); - const auto col_data = cudf::test::to_host(view.column(0)); - // col_data.first contains the column data - for (const auto& elem : col_data.first) - ASSERT_TRUE(std::isnan(elem)); - // col_data.second contains the bitmasks - ASSERT_EQ(0u, col_data.second[0]); + // ignore all data because it is all nulls. + ASSERT_EQ(6u, result.tbl->view().column(0).null_count()); } TEST_F(CsvReaderTest, StringInference) diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index 3d5f047d52c..a6e4555c4db 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -960,12 +960,8 @@ TEST_P(JsonReaderParamTest, InvalidFloatingPoint) EXPECT_EQ(result.tbl->num_columns(), 1); EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::FLOAT32); - const auto col_data = cudf::test::to_host(result.tbl->view().column(0)); - // col_data.first contains the column data - for (const auto& elem : col_data.first) - ASSERT_TRUE(std::isnan(elem)); - // col_data.second contains the bitmasks - ASSERT_EQ(0u, col_data.second[0]); + // ignore all data because it is all nulls. + ASSERT_EQ(6u, result.tbl->view().column(0).null_count()); } TEST_P(JsonReaderParamTest, StringInference) @@ -1536,31 +1532,45 @@ TEST_P(JsonReaderParamTest, JsonDtypeParsing) 0, [=](auto i) -> bool { return static_cast(validity[i]); }); }; - constexpr int int_NA = 0; - constexpr double double_NA = std::numeric_limits::quiet_NaN(); - constexpr bool bool_NA = false; + constexpr int int_ignore{}; + constexpr double double_ignore{}; + constexpr bool bool_ignore{}; std::vector const validity = {1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0}; - auto int_col = int_wrapper{ - {0, 0, int_NA, 1, 1, int_NA, int_NA, int_NA, int_NA, 1, 0, int_NA, 1, 0, int_NA, int_NA}, - cudf::test::iterators::nulls_at(std::vector{8})}; + auto int_col = int_wrapper{{0, + 0, + int_ignore, + 1, + 1, + int_ignore, + int_ignore, + int_ignore, + int_ignore, + 1, + 0, + int_ignore, + 1, + 0, + int_ignore, + int_ignore}, + make_validity(validity)}; auto float_col = float_wrapper{{0.0, 0.0, - double_NA, + double_ignore, 1.0, 1.0, - double_NA, - double_NA, - double_NA, - double_NA, + double_ignore, + double_ignore, + double_ignore, + double_ignore, 1.0, 0.0, - double_NA, + double_ignore, 1.0, 0.0, - double_NA, - double_NA}, + double_ignore, + double_ignore}, make_validity(validity)}; auto str_col = cudf::test::strings_column_wrapper{// clang-format off @@ -1569,21 +1579,21 @@ TEST_P(JsonReaderParamTest, JsonDtypeParsing) // clang-format on auto bool_col = bool_wrapper{{false, false, - bool_NA, + bool_ignore, true, true, - bool_NA, - bool_NA, - bool_NA, - bool_NA, + bool_ignore, + bool_ignore, + bool_ignore, + bool_ignore, true, false, - bool_NA, + bool_ignore, true, false, - bool_NA, - bool_NA}, - cudf::test::iterators::nulls_at(std::vector{8})}; + bool_ignore, + bool_ignore}, + make_validity(validity)}; // Types to test const std::vector dtypes = { diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py index eae04ec1976..8f1ac532ef6 100644 --- a/python/cudf/cudf/tests/test_csv.py +++ b/python/cudf/cudf/tests/test_csv.py @@ -834,10 +834,9 @@ def test_csv_reader_bools_NA(): expected = pd.DataFrame( { "text": ["true", "false", "foo", "bar", "qux"], - "int": [1, 0, 1, 0, 0], + "int": [1.0, 0.0, 1.0, 0.0, np.nan], } ) - # breaking behaviour is np.nan for qux assert_eq(df, expected)