Skip to content

Commit

Permalink
Null element for parsing error in numeric types in JSON, CSV reader (#…
Browse files Browse the repository at this point in the history
…12272)

This PR changes behaviour of nested json reader to return null element on parsing error of numeric types.
Breaking behaviour in read_csv, and read_json:

- old behaviour: parsing error returns quiet NAN for that type as result for that row.
- new behaviour: parsing error returns null for that row.

Moved commit 0e45d13  #12022 (comment) to 23.02

Authors:
  - Karthikeyan (https://github.com/karthikeyann)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Vukasin Milovanovic (https://github.com/vuule)

URL: #12272
  • Loading branch information
karthikeyann authored Dec 5, 2022
1 parent 77d1717 commit ba5daec
Show file tree
Hide file tree
Showing 4 changed files with 47 additions and 42 deletions.
10 changes: 5 additions & 5 deletions cpp/src/io/utilities/parsing_utils.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -578,9 +578,9 @@ struct ConvertFunctor {
return as_hex ? cudf::io::parse_numeric<T, 16>(begin, end, opts)
: cudf::io::parse_numeric<T>(begin, end, opts);
}();
static_cast<T*>(out_buffer)[row] = value.value_or(std::numeric_limits<T>::quiet_NaN());
if (value.has_value()) { static_cast<T*>(out_buffer)[row] = *value; }

return true;
return value.has_value();
}

/**
Expand Down Expand Up @@ -630,9 +630,9 @@ struct ConvertFunctor {
}
return cudf::io::parse_numeric<T>(begin, end, opts);
}();
static_cast<T*>(out_buffer)[row] = value.value_or(std::numeric_limits<T>::quiet_NaN());
if (value.has_value()) { static_cast<T*>(out_buffer)[row] = *value; }

return true;
return value.has_value();
}

/**
Expand All @@ -659,7 +659,7 @@ struct ConvertFunctor {
}
return cudf::io::parse_numeric<T>(begin, end, opts);
}();
static_cast<T*>(out_buffer)[row] = value.value_or(std::numeric_limits<T>::quiet_NaN());
if (value.has_value()) { static_cast<T*>(out_buffer)[row] = *value; }

return value.has_value() and !std::isnan(*value);
}
Expand Down
8 changes: 2 additions & 6 deletions cpp/tests/io/csv_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1168,12 +1168,8 @@ TEST_F(CsvReaderTest, InvalidFloatingPoint)
EXPECT_EQ(1, view.num_columns());
ASSERT_EQ(type_id::FLOAT32, view.column(0).type().id());

const auto col_data = cudf::test::to_host<float>(view.column(0));
// col_data.first contains the column data
for (const auto& elem : col_data.first)
ASSERT_TRUE(std::isnan(elem));
// col_data.second contains the bitmasks
ASSERT_EQ(0u, col_data.second[0]);
// ignore all data because it is all nulls.
ASSERT_EQ(6u, result.tbl->view().column(0).null_count());
}

TEST_F(CsvReaderTest, StringInference)
Expand Down
68 changes: 39 additions & 29 deletions cpp/tests/io/json_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -960,12 +960,8 @@ TEST_P(JsonReaderParamTest, InvalidFloatingPoint)
EXPECT_EQ(result.tbl->num_columns(), 1);
EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::FLOAT32);

const auto col_data = cudf::test::to_host<float>(result.tbl->view().column(0));
// col_data.first contains the column data
for (const auto& elem : col_data.first)
ASSERT_TRUE(std::isnan(elem));
// col_data.second contains the bitmasks
ASSERT_EQ(0u, col_data.second[0]);
// ignore all data because it is all nulls.
ASSERT_EQ(6u, result.tbl->view().column(0).null_count());
}

TEST_P(JsonReaderParamTest, StringInference)
Expand Down Expand Up @@ -1536,31 +1532,45 @@ TEST_P(JsonReaderParamTest, JsonDtypeParsing)
0, [=](auto i) -> bool { return static_cast<bool>(validity[i]); });
};

constexpr int int_NA = 0;
constexpr double double_NA = std::numeric_limits<double>::quiet_NaN();
constexpr bool bool_NA = false;
constexpr int int_ignore{};
constexpr double double_ignore{};
constexpr bool bool_ignore{};

std::vector<int> const validity = {1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0};

auto int_col = int_wrapper{
{0, 0, int_NA, 1, 1, int_NA, int_NA, int_NA, int_NA, 1, 0, int_NA, 1, 0, int_NA, int_NA},
cudf::test::iterators::nulls_at(std::vector<int>{8})};
auto int_col = int_wrapper{{0,
0,
int_ignore,
1,
1,
int_ignore,
int_ignore,
int_ignore,
int_ignore,
1,
0,
int_ignore,
1,
0,
int_ignore,
int_ignore},
make_validity(validity)};
auto float_col = float_wrapper{{0.0,
0.0,
double_NA,
double_ignore,
1.0,
1.0,
double_NA,
double_NA,
double_NA,
double_NA,
double_ignore,
double_ignore,
double_ignore,
double_ignore,
1.0,
0.0,
double_NA,
double_ignore,
1.0,
0.0,
double_NA,
double_NA},
double_ignore,
double_ignore},
make_validity(validity)};
auto str_col =
cudf::test::strings_column_wrapper{// clang-format off
Expand All @@ -1569,21 +1579,21 @@ TEST_P(JsonReaderParamTest, JsonDtypeParsing)
// clang-format on
auto bool_col = bool_wrapper{{false,
false,
bool_NA,
bool_ignore,
true,
true,
bool_NA,
bool_NA,
bool_NA,
bool_NA,
bool_ignore,
bool_ignore,
bool_ignore,
bool_ignore,
true,
false,
bool_NA,
bool_ignore,
true,
false,
bool_NA,
bool_NA},
cudf::test::iterators::nulls_at(std::vector<int>{8})};
bool_ignore,
bool_ignore},
make_validity(validity)};

// Types to test
const std::vector<data_type> dtypes = {
Expand Down
3 changes: 1 addition & 2 deletions python/cudf/cudf/tests/test_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -834,10 +834,9 @@ def test_csv_reader_bools_NA():
expected = pd.DataFrame(
{
"text": ["true", "false", "foo", "bar", "qux"],
"int": [1, 0, 1, 0, 0],
"int": [1.0, 0.0, 1.0, 0.0, np.nan],
}
)
# breaking behaviour is np.nan for qux
assert_eq(df, expected)


Expand Down

0 comments on commit ba5daec

Please sign in to comment.