Skip to content

Commit

Permalink
Update check for inf/nan strings in libcudf float conversion to ignor…
Browse files Browse the repository at this point in the history
…e case (#9694)

Reference https://github.com/rapidsai/cudf/pull/9613/files#r743579126

Add support to ignore case for strings `INF`, `INFINITY` and `NAN` to `cudf::strings::is_float` and `cudf::strings::to_float` for consistency with https://en.cppreference.com/w/cpp/string/basic_string/stof

Also, remove the expensive `replace` call in the cudf  before calling this from Python.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Robert Maynard (https://github.com/robertmaynard)
  - Nghia Truong (https://github.com/ttnghia)
  - Jason Lowe (https://github.com/jlowe)

URL: #9694
  • Loading branch information
davidwendt authored Nov 30, 2021
1 parent 1697f63 commit 69d5765
Show file tree
Hide file tree
Showing 5 changed files with 85 additions and 139 deletions.
64 changes: 52 additions & 12 deletions cpp/include/cudf/strings/string.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,43 @@ inline __device__ bool is_integer(string_view const& d_str)
thrust::seq, begin, end, [] __device__(auto chr) { return chr >= '0' && chr <= '9'; });
}

/**
* @brief Returns true if input contains the not-a-number string.
*
* The following are valid for this function: "NAN" and "NaN"
* @param d_str input string
* @return true if input is as valid NaN string.
*/
inline __device__ bool is_nan_str(string_view const& d_str)
{
auto const ptr = d_str.data();
return (d_str.size_bytes() == 3) && (ptr[0] == 'N' || ptr[0] == 'n') &&
(ptr[1] == 'A' || ptr[1] == 'a') && (ptr[2] == 'N' || ptr[2] == 'n');
}

/**
* @brief Returns true if input contains the infinity string.
*
* The following are valid for this function: "INF", "INFINITY", and "Inf"
* @param d_str input string
* @return true if input is as valid Inf string.
*/
inline __device__ bool is_inf_str(string_view const& d_str)
{
auto const ptr = d_str.data();
auto const size = d_str.size_bytes();

if (size != 3 && size != 8) return false;

auto const prefix_valid = (ptr[0] == 'I' || ptr[0] == 'i') && (ptr[1] == 'N' || ptr[1] == 'n') &&
(ptr[2] == 'F' || ptr[2] == 'f');

return prefix_valid &&
((size == 3) || ((ptr[3] == 'I' || ptr[3] == 'i') && (ptr[4] == 'N' || ptr[4] == 'n') &&
(ptr[5] == 'I' || ptr[5] == 'i') && (ptr[6] == 'T' || ptr[6] == 't') &&
(ptr[7] == 'Y' || ptr[7] == 'y')));
}

/**
* @brief Returns `true` if all characters in the string
* are valid for conversion to a float type.
Expand All @@ -65,38 +102,41 @@ inline __device__ bool is_integer(string_view const& d_str)
* An empty string returns `false`.
* No bounds checking is performed to verify if the value would fit
* within a specific float type.
* The following strings are also allowed "NaN", "Inf" and, "-Inf"
* and will return true.
* The following strings are also allowed and will return true:
* "NaN", "NAN", "Inf", "INF", "INFINITY"
*
* @param d_str String to check.
* @return true if string has valid float characters
*/
inline __device__ bool is_float(string_view const& d_str)
{
if (d_str.empty()) return false;
// strings allowed by the converter
if (d_str.compare("NaN", 3) == 0) return true;
if (d_str.compare("Inf", 3) == 0) return true;
if (d_str.compare("-Inf", 4) == 0) return true;
bool decimal_found = false;
bool exponent_found = false;
size_type bytes = d_str.size_bytes();
const char* data = d_str.data();
// sign character allowed at the beginning of the string
size_type chidx = (*data == '-' || *data == '+') ? 1 : 0;
bool result = chidx < bytes;
size_type ch_idx = (*data == '-' || *data == '+') ? 1 : 0;

bool result = ch_idx < bytes;
// check for nan and infinity strings
if (result && data[ch_idx] > '9') {
auto const inf_nan = string_view(data + ch_idx, bytes - ch_idx);
if (is_nan_str(inf_nan) || is_inf_str(inf_nan)) return true;
}

// check for float chars [0-9] and a single decimal '.'
// and scientific notation [eE][+-][0-9]
for (; chidx < bytes; ++chidx) {
auto chr = data[chidx];
for (; ch_idx < bytes; ++ch_idx) {
auto chr = data[ch_idx];
if (chr >= '0' && chr <= '9') continue;
if (!decimal_found && chr == '.') {
decimal_found = true; // no more decimals
continue;
}
if (!exponent_found && (chr == 'e' || chr == 'E')) {
if (chidx + 1 < bytes) chr = data[chidx + 1];
if (chr == '-' || chr == '+') ++chidx;
if (ch_idx + 1 < bytes) chr = data[ch_idx + 1];
if (chr == '-' || chr == '+') ++ch_idx;
decimal_found = true; // no decimal allowed in exponent
exponent_found = true; // no more exponents
continue;
Expand Down
13 changes: 8 additions & 5 deletions cpp/src/strings/convert/convert_floats.cu
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ namespace {
* @brief This function converts the given string into a
* floating point double value.
*
* This will also map strings containing "NaN", "Inf" and "-Inf"
* This will also map strings containing "NaN", "Inf", etc.
* to the appropriate float values.
*
* This function will also handle scientific notation format.
Expand All @@ -55,16 +55,19 @@ __device__ inline double stod(string_view const& d_str)
const char* in_ptr = d_str.data();
const char* end = in_ptr + d_str.size_bytes();
if (end == in_ptr) return 0.0;
// special strings
if (d_str.compare("NaN", 3) == 0) return std::numeric_limits<double>::quiet_NaN();
if (d_str.compare("Inf", 3) == 0) return std::numeric_limits<double>::infinity();
if (d_str.compare("-Inf", 4) == 0) return -std::numeric_limits<double>::infinity();
double sign{1.0};
if (*in_ptr == '-' || *in_ptr == '+') {
sign = (*in_ptr == '-' ? -1 : 1);
++in_ptr;
}

// special strings: NaN, Inf
if ((in_ptr < end) && *in_ptr > '9') {
auto const inf_nan = string_view(in_ptr, static_cast<size_type>(thrust::distance(in_ptr, end)));
if (string::is_nan_str(inf_nan)) return std::numeric_limits<double>::quiet_NaN();
if (string::is_inf_str(inf_nan)) return sign * std::numeric_limits<double>::infinity();
}

// Parse and store the mantissa as much as we can,
// until we are about to exceed the limit of uint64_t
constexpr uint64_t max_holding = (std::numeric_limits<uint64_t>::max() - 9L) / 10L;
Expand Down
51 changes: 13 additions & 38 deletions cpp/tests/strings/floats_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,32 +58,20 @@ TEST_F(StringsConvertTest, IsFloat)
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected1);

cudf::test::strings_column_wrapper strings2(
{"+175", "-34", "9.8", "1234567890", "6.7e17", "-917.2e5"});
{"-34", "9.8", "1234567890", "-917.2e5", "INF", "NAN", "-Inf", "INFINITY"});
results = cudf::strings::is_float(cudf::strings_column_view(strings2));
cudf::test::fixed_width_column_wrapper<bool> expected2({1, 1, 1, 1, 1, 1});
cudf::test::fixed_width_column_wrapper<bool> expected2({1, 1, 1, 1, 1, 1, 1, 1});
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected2);
}

TEST_F(StringsConvertTest, ToFloats32)
{
std::vector<const char*> h_strings{"1234",
nullptr,
"-876",
"543.2",
"-0.12",
".25",
"-.002",
"",
"-0.0",
"1.2e4",
"NaN",
"abc123",
"123abc",
"456e",
"-1.78e+5",
"-122.33644782123456789",
"12e+309",
"3.4028236E38"};
std::vector<const char*> h_strings{
"1234", nullptr, "-876", "543.2",
"-0.12", ".25", "-.002", "",
"-0.0", "1.2e4", "NAN", "abc123",
"123abc", "456e", "-1.78e+5", "-122.33644782123456789",
"12e+309", "3.4028236E38", "INF", "Infinity"};
cudf::test::strings_column_wrapper strings(
h_strings.begin(),
h_strings.end(),
Expand Down Expand Up @@ -135,24 +123,11 @@ TEST_F(StringsConvertTest, FromFloats32)

TEST_F(StringsConvertTest, ToFloats64)
{
std::vector<const char*> h_strings{"1234",
nullptr,
"-876",
"543.2",
"-0.12",
".25",
"-.002",
"",
"-0.0",
"1.28e256",
"NaN",
"abc123",
"123abc",
"456e",
"-1.78e+5",
"-122.33644782",
"12e+309",
"1.7976931348623159E308"};
std::vector<const char*> h_strings{
"1234", nullptr, "-876", "543.2", "-0.12", ".25",
"-.002", "", "-0.0", "1.28e256", "NaN", "abc123",
"123abc", "456e", "-1.78e+5", "-122.33644782", "12e+309", "1.7976931348623159E308",
"-Inf", "-INFINITY"};
cudf::test::strings_column_wrapper strings(
h_strings.begin(),
h_strings.end(),
Expand Down
23 changes: 12 additions & 11 deletions java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -4919,11 +4919,12 @@ void testIsFloat() {
try (ColumnVector floatStringCV = ColumnVector.fromStrings(floatStrings);
ColumnVector isFloat = floatStringCV.isFloat();
ColumnVector floats = floatStringCV.asFloats();
ColumnVector expectedFloats = ColumnVector.fromBoxedFloats(0f, 0f, Float.POSITIVE_INFINITY,
Float.NEGATIVE_INFINITY, 0f, 0f, -0f, 0f, Float.MAX_VALUE, Float.POSITIVE_INFINITY,
-Float.MAX_VALUE, Float.NEGATIVE_INFINITY, 1.2e-24f, 0f, 0f, null, 423f);
ColumnVector expected = ColumnVector.fromBoxedBooleans(false, false, true, true, false,
false, true, true, true, true, true, true, true, false, false, null, true)) {
ColumnVector expectedFloats = ColumnVector.fromBoxedFloats(0f, Float.NaN, Float.POSITIVE_INFINITY,
Float.NEGATIVE_INFINITY, Float.POSITIVE_INFINITY, Float.POSITIVE_INFINITY, -0f, 0f,
Float.MAX_VALUE, Float.POSITIVE_INFINITY, -Float.MAX_VALUE, Float.NEGATIVE_INFINITY,
1.2e-24f, 0f, 0f, null, 423f);
ColumnVector expected = ColumnVector.fromBoxedBooleans(false, true, true, true, true,
true, true, true, true, true, true, true, true, false, false, null, true)) {
assertColumnsAreEqual(expected, isFloat);
assertColumnsAreEqual(expectedFloats, floats);
}
Expand All @@ -4944,12 +4945,12 @@ void testIsDouble() {
try (ColumnVector doubleStringCV = ColumnVector.fromStrings(doubleStrings);
ColumnVector isDouble = doubleStringCV.isFloat();
ColumnVector doubles = doubleStringCV.asDoubles();
ColumnVector expectedDoubles = ColumnVector.fromBoxedDoubles(0d, 0d,
Double.POSITIVE_INFINITY, Double.NEGATIVE_INFINITY, 0d, 0d, -0d, 0d, Double.MAX_VALUE,
Double.POSITIVE_INFINITY, -Double.MAX_VALUE, Double.NEGATIVE_INFINITY, 1.2e-234d, 0d,
0d, null, 423d);
ColumnVector expected = ColumnVector.fromBoxedBooleans(false, false, true, true, false,
false, true, true, true, true, true, true, true, false, false, null, true)) {
ColumnVector expectedDoubles = ColumnVector.fromBoxedDoubles(0d, Double.NaN,
Double.POSITIVE_INFINITY, Double.NEGATIVE_INFINITY, Double.POSITIVE_INFINITY, Double.POSITIVE_INFINITY,
-0d, 0d, Double.MAX_VALUE, Double.POSITIVE_INFINITY, -Double.MAX_VALUE, Double.NEGATIVE_INFINITY,
1.2e-234d, 0d, 0d, null, 423d);
ColumnVector expected = ColumnVector.fromBoxedBooleans(false, true, true, true, true,
true, true, true, true, true, true, true, true, false, false, null, true)) {
assertColumnsAreEqual(expected, isDouble);
assertColumnsAreEqual(expectedDoubles, doubles);
}
Expand Down
73 changes: 0 additions & 73 deletions python/cudf/cudf/core/column/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,69 +97,6 @@ def str_to_boolean(column: StringColumn):
cudf.dtype("timedelta64[ns]"): str_cast.int2timedelta,
}

_NAN_INF_VARIATIONS = [
"nan",
"NAN",
"Nan",
"naN",
"nAN",
"NAn",
"nAn",
"-inf",
"-INF",
"-InF",
"-inF",
"-iNF",
"-INf",
"-iNf",
"+inf",
"+INF",
"+InF",
"+inF",
"+iNF",
"+INf",
"+Inf",
"+iNf",
"inf",
"INF",
"InF",
"inF",
"iNF",
"INf",
"iNf",
]
_LIBCUDF_SUPPORTED_NAN_INF_VARIATIONS = [
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"NaN",
"-Inf",
"-Inf",
"-Inf",
"-Inf",
"-Inf",
"-Inf",
"-Inf",
"Inf",
"Inf",
"Inf",
"Inf",
"Inf",
"Inf",
"Inf",
"Inf",
"Inf",
"Inf",
"Inf",
"Inf",
"Inf",
"Inf",
"Inf",
]


def _is_supported_regex_flags(flags):
return flags == 0 or (
Expand Down Expand Up @@ -5309,16 +5246,6 @@ def as_numerical_column(
"type due to presence of non-integer values."
)
elif out_dtype.kind == "f":
# TODO: Replace this `replace` call with a
# case-insensitive method once following
# issue is fixed: https://github.com/rapidsai/cudf/issues/5217
old_values = cudf.core.column.as_column(_NAN_INF_VARIATIONS)
new_values = cudf.core.column.as_column(
_LIBCUDF_SUPPORTED_NAN_INF_VARIATIONS
)
string_col = libcudf.replace.replace(
string_col, old_values, new_values
)
if not libstrings.is_float(string_col).all():
raise ValueError(
"Could not convert strings to float "
Expand Down

0 comments on commit 69d5765

Please sign in to comment.