Null element for parsing error in numeric types in JSON, CSV reader (#…

…12272) This PR changes behaviour of nested json reader to return null element on parsing error of numeric types. Breaking behaviour in read_csv, and read_json: - old behaviour: parsing error returns quiet NAN for that type as result for that row. - new behaviour: parsing error returns null for that row. Moved commit 0e45d13 #12022 (comment) to 23.02 Authors: - Karthikeyan (https://github.com/karthikeyann) Approvers: - Lawrence Mitchell (https://github.com/wence-) - Vyas Ramasubramani (https://github.com/vyasr) - Vukasin Milovanovic (https://github.com/vuule) URL: #12272
rapidsai · Dec 5, 2022 · ba5daec · ba5daec
1 parent 77d1717
commit ba5daec
Show file tree

Hide file tree

Showing 4 changed files with 47 additions and 42 deletions.
diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh
@@ -578,9 +578,9 @@ struct ConvertFunctor {
       return as_hex ? cudf::io::parse_numeric<T, 16>(begin, end, opts)
                     : cudf::io::parse_numeric<T>(begin, end, opts);
     }();
-    static_cast<T*>(out_buffer)[row] = value.value_or(std::numeric_limits<T>::quiet_NaN());
+    if (value.has_value()) { static_cast<T*>(out_buffer)[row] = *value; }
 
-    return true;
+    return value.has_value();
   }
 
   /**
@@ -630,9 +630,9 @@ struct ConvertFunctor {
       }
       return cudf::io::parse_numeric<T>(begin, end, opts);
     }();
-    static_cast<T*>(out_buffer)[row] = value.value_or(std::numeric_limits<T>::quiet_NaN());
+    if (value.has_value()) { static_cast<T*>(out_buffer)[row] = *value; }
 
-    return true;
+    return value.has_value();
   }
 
   /**
@@ -659,7 +659,7 @@ struct ConvertFunctor {
       }
       return cudf::io::parse_numeric<T>(begin, end, opts);
     }();
-    static_cast<T*>(out_buffer)[row] = value.value_or(std::numeric_limits<T>::quiet_NaN());
+    if (value.has_value()) { static_cast<T*>(out_buffer)[row] = *value; }
 
     return value.has_value() and !std::isnan(*value);
   }

diff --git a/cpp/tests/io/csv_test.cpp b/cpp/tests/io/csv_test.cpp
@@ -1168,12 +1168,8 @@ TEST_F(CsvReaderTest, InvalidFloatingPoint)
   EXPECT_EQ(1, view.num_columns());
   ASSERT_EQ(type_id::FLOAT32, view.column(0).type().id());
 
-  const auto col_data = cudf::test::to_host<float>(view.column(0));
-  // col_data.first contains the column data
-  for (const auto& elem : col_data.first)
-    ASSERT_TRUE(std::isnan(elem));
-  // col_data.second contains the bitmasks
-  ASSERT_EQ(0u, col_data.second[0]);
+  // ignore all data because it is all nulls.
+  ASSERT_EQ(6u, result.tbl->view().column(0).null_count());
 }
 
 TEST_F(CsvReaderTest, StringInference)

diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp
@@ -960,12 +960,8 @@ TEST_P(JsonReaderParamTest, InvalidFloatingPoint)
   EXPECT_EQ(result.tbl->num_columns(), 1);
   EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::FLOAT32);
 
-  const auto col_data = cudf::test::to_host<float>(result.tbl->view().column(0));
-  // col_data.first contains the column data
-  for (const auto& elem : col_data.first)
-    ASSERT_TRUE(std::isnan(elem));
-  // col_data.second contains the bitmasks
-  ASSERT_EQ(0u, col_data.second[0]);
+  // ignore all data because it is all nulls.
+  ASSERT_EQ(6u, result.tbl->view().column(0).null_count());
 }
 
 TEST_P(JsonReaderParamTest, StringInference)
@@ -1536,31 +1532,45 @@ TEST_P(JsonReaderParamTest, JsonDtypeParsing)
       0, [=](auto i) -> bool { return static_cast<bool>(validity[i]); });
   };
 
-  constexpr int int_NA       = 0;
-  constexpr double double_NA = std::numeric_limits<double>::quiet_NaN();
-  constexpr bool bool_NA     = false;
+  constexpr int int_ignore{};
+  constexpr double double_ignore{};
+  constexpr bool bool_ignore{};
 
   std::vector<int> const validity = {1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0};
 
-  auto int_col = int_wrapper{
-    {0, 0, int_NA, 1, 1, int_NA, int_NA, int_NA, int_NA, 1, 0, int_NA, 1, 0, int_NA, int_NA},
-    cudf::test::iterators::nulls_at(std::vector<int>{8})};
+  auto int_col   = int_wrapper{{0,
+                              0,
+                              int_ignore,
+                              1,
+                              1,
+                              int_ignore,
+                              int_ignore,
+                              int_ignore,
+                              int_ignore,
+                              1,
+                              0,
+                              int_ignore,
+                              1,
+                              0,
+                              int_ignore,
+                              int_ignore},
+                             make_validity(validity)};
   auto float_col = float_wrapper{{0.0,
                                   0.0,
-                                  double_NA,
+                                  double_ignore,
                                   1.0,
                                   1.0,
-                                  double_NA,
-                                  double_NA,
-                                  double_NA,
-                                  double_NA,
+                                  double_ignore,
+                                  double_ignore,
+                                  double_ignore,
+                                  double_ignore,
                                   1.0,
                                   0.0,
-                                  double_NA,
+                                  double_ignore,
                                   1.0,
                                   0.0,
-                                  double_NA,
-                                  double_NA},
+                                  double_ignore,
+                                  double_ignore},
                                  make_validity(validity)};
   auto str_col =
     cudf::test::strings_column_wrapper{// clang-format off
@@ -1569,21 +1579,21 @@ TEST_P(JsonReaderParamTest, JsonDtypeParsing)
   // clang-format on
   auto bool_col = bool_wrapper{{false,
                                 false,
-                                bool_NA,
+                                bool_ignore,
                                 true,
                                 true,
-                                bool_NA,
-                                bool_NA,
-                                bool_NA,
-                                bool_NA,
+                                bool_ignore,
+                                bool_ignore,
+                                bool_ignore,
+                                bool_ignore,
                                 true,
                                 false,
-                                bool_NA,
+                                bool_ignore,
                                 true,
                                 false,
-                                bool_NA,
-                                bool_NA},
-                               cudf::test::iterators::nulls_at(std::vector<int>{8})};
+                                bool_ignore,
+                                bool_ignore},
+                               make_validity(validity)};
 
   // Types to test
   const std::vector<data_type> dtypes = {

diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py
@@ -834,10 +834,9 @@ def test_csv_reader_bools_NA():
     expected = pd.DataFrame(
         {
             "text": ["true", "false", "foo", "bar", "qux"],
-            "int": [1, 0, 1, 0, 0],
+            "int": [1.0, 0.0, 1.0, 0.0, np.nan],
         }
     )
-    # breaking behaviour is np.nan for qux
     assert_eq(df, expected)