diff --git a/cpp/include/cudf/strings/string.cuh b/cpp/include/cudf/strings/string.cuh index 82da5ad8f10..d85d19d7f10 100644 --- a/cpp/include/cudf/strings/string.cuh +++ b/cpp/include/cudf/strings/string.cuh @@ -52,6 +52,43 @@ inline __device__ bool is_integer(string_view const& d_str) thrust::seq, begin, end, [] __device__(auto chr) { return chr >= '0' && chr <= '9'; }); } +/** + * @brief Returns true if input contains the not-a-number string. + * + * The following are valid for this function: "NAN" and "NaN" + * @param d_str input string + * @return true if input is as valid NaN string. + */ +inline __device__ bool is_nan_str(string_view const& d_str) +{ + auto const ptr = d_str.data(); + return (d_str.size_bytes() == 3) && (ptr[0] == 'N' || ptr[0] == 'n') && + (ptr[1] == 'A' || ptr[1] == 'a') && (ptr[2] == 'N' || ptr[2] == 'n'); +} + +/** + * @brief Returns true if input contains the infinity string. + * + * The following are valid for this function: "INF", "INFINITY", and "Inf" + * @param d_str input string + * @return true if input is as valid Inf string. + */ +inline __device__ bool is_inf_str(string_view const& d_str) +{ + auto const ptr = d_str.data(); + auto const size = d_str.size_bytes(); + + if (size != 3 && size != 8) return false; + + auto const prefix_valid = (ptr[0] == 'I' || ptr[0] == 'i') && (ptr[1] == 'N' || ptr[1] == 'n') && + (ptr[2] == 'F' || ptr[2] == 'f'); + + return prefix_valid && + ((size == 3) || ((ptr[3] == 'I' || ptr[3] == 'i') && (ptr[4] == 'N' || ptr[4] == 'n') && + (ptr[5] == 'I' || ptr[5] == 'i') && (ptr[6] == 'T' || ptr[6] == 't') && + (ptr[7] == 'Y' || ptr[7] == 'y'))); +} + /** * @brief Returns `true` if all characters in the string * are valid for conversion to a float type. @@ -65,8 +102,8 @@ inline __device__ bool is_integer(string_view const& d_str) * An empty string returns `false`. * No bounds checking is performed to verify if the value would fit * within a specific float type. - * The following strings are also allowed "NaN", "Inf" and, "-Inf" - * and will return true. + * The following strings are also allowed and will return true: + * "NaN", "NAN", "Inf", "INF", "INFINITY" * * @param d_str String to check. * @return true if string has valid float characters @@ -74,29 +111,32 @@ inline __device__ bool is_integer(string_view const& d_str) inline __device__ bool is_float(string_view const& d_str) { if (d_str.empty()) return false; - // strings allowed by the converter - if (d_str.compare("NaN", 3) == 0) return true; - if (d_str.compare("Inf", 3) == 0) return true; - if (d_str.compare("-Inf", 4) == 0) return true; bool decimal_found = false; bool exponent_found = false; size_type bytes = d_str.size_bytes(); const char* data = d_str.data(); // sign character allowed at the beginning of the string - size_type chidx = (*data == '-' || *data == '+') ? 1 : 0; - bool result = chidx < bytes; + size_type ch_idx = (*data == '-' || *data == '+') ? 1 : 0; + + bool result = ch_idx < bytes; + // check for nan and infinity strings + if (result && data[ch_idx] > '9') { + auto const inf_nan = string_view(data + ch_idx, bytes - ch_idx); + if (is_nan_str(inf_nan) || is_inf_str(inf_nan)) return true; + } + // check for float chars [0-9] and a single decimal '.' // and scientific notation [eE][+-][0-9] - for (; chidx < bytes; ++chidx) { - auto chr = data[chidx]; + for (; ch_idx < bytes; ++ch_idx) { + auto chr = data[ch_idx]; if (chr >= '0' && chr <= '9') continue; if (!decimal_found && chr == '.') { decimal_found = true; // no more decimals continue; } if (!exponent_found && (chr == 'e' || chr == 'E')) { - if (chidx + 1 < bytes) chr = data[chidx + 1]; - if (chr == '-' || chr == '+') ++chidx; + if (ch_idx + 1 < bytes) chr = data[ch_idx + 1]; + if (chr == '-' || chr == '+') ++ch_idx; decimal_found = true; // no decimal allowed in exponent exponent_found = true; // no more exponents continue; diff --git a/cpp/src/strings/convert/convert_floats.cu b/cpp/src/strings/convert/convert_floats.cu index 366d4fe7d42..70b5f528213 100644 --- a/cpp/src/strings/convert/convert_floats.cu +++ b/cpp/src/strings/convert/convert_floats.cu @@ -45,7 +45,7 @@ namespace { * @brief This function converts the given string into a * floating point double value. * - * This will also map strings containing "NaN", "Inf" and "-Inf" + * This will also map strings containing "NaN", "Inf", etc. * to the appropriate float values. * * This function will also handle scientific notation format. @@ -55,16 +55,19 @@ __device__ inline double stod(string_view const& d_str) const char* in_ptr = d_str.data(); const char* end = in_ptr + d_str.size_bytes(); if (end == in_ptr) return 0.0; - // special strings - if (d_str.compare("NaN", 3) == 0) return std::numeric_limits::quiet_NaN(); - if (d_str.compare("Inf", 3) == 0) return std::numeric_limits::infinity(); - if (d_str.compare("-Inf", 4) == 0) return -std::numeric_limits::infinity(); double sign{1.0}; if (*in_ptr == '-' || *in_ptr == '+') { sign = (*in_ptr == '-' ? -1 : 1); ++in_ptr; } + // special strings: NaN, Inf + if ((in_ptr < end) && *in_ptr > '9') { + auto const inf_nan = string_view(in_ptr, static_cast(thrust::distance(in_ptr, end))); + if (string::is_nan_str(inf_nan)) return std::numeric_limits::quiet_NaN(); + if (string::is_inf_str(inf_nan)) return sign * std::numeric_limits::infinity(); + } + // Parse and store the mantissa as much as we can, // until we are about to exceed the limit of uint64_t constexpr uint64_t max_holding = (std::numeric_limits::max() - 9L) / 10L; diff --git a/cpp/tests/strings/floats_tests.cpp b/cpp/tests/strings/floats_tests.cpp index 126bffa1e49..e6f4f6bb8d9 100644 --- a/cpp/tests/strings/floats_tests.cpp +++ b/cpp/tests/strings/floats_tests.cpp @@ -58,32 +58,20 @@ TEST_F(StringsConvertTest, IsFloat) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected1); cudf::test::strings_column_wrapper strings2( - {"+175", "-34", "9.8", "1234567890", "6.7e17", "-917.2e5"}); + {"-34", "9.8", "1234567890", "-917.2e5", "INF", "NAN", "-Inf", "INFINITY"}); results = cudf::strings::is_float(cudf::strings_column_view(strings2)); - cudf::test::fixed_width_column_wrapper expected2({1, 1, 1, 1, 1, 1}); + cudf::test::fixed_width_column_wrapper expected2({1, 1, 1, 1, 1, 1, 1, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected2); } TEST_F(StringsConvertTest, ToFloats32) { - std::vector h_strings{"1234", - nullptr, - "-876", - "543.2", - "-0.12", - ".25", - "-.002", - "", - "-0.0", - "1.2e4", - "NaN", - "abc123", - "123abc", - "456e", - "-1.78e+5", - "-122.33644782123456789", - "12e+309", - "3.4028236E38"}; + std::vector h_strings{ + "1234", nullptr, "-876", "543.2", + "-0.12", ".25", "-.002", "", + "-0.0", "1.2e4", "NAN", "abc123", + "123abc", "456e", "-1.78e+5", "-122.33644782123456789", + "12e+309", "3.4028236E38", "INF", "Infinity"}; cudf::test::strings_column_wrapper strings( h_strings.begin(), h_strings.end(), @@ -135,24 +123,11 @@ TEST_F(StringsConvertTest, FromFloats32) TEST_F(StringsConvertTest, ToFloats64) { - std::vector h_strings{"1234", - nullptr, - "-876", - "543.2", - "-0.12", - ".25", - "-.002", - "", - "-0.0", - "1.28e256", - "NaN", - "abc123", - "123abc", - "456e", - "-1.78e+5", - "-122.33644782", - "12e+309", - "1.7976931348623159E308"}; + std::vector h_strings{ + "1234", nullptr, "-876", "543.2", "-0.12", ".25", + "-.002", "", "-0.0", "1.28e256", "NaN", "abc123", + "123abc", "456e", "-1.78e+5", "-122.33644782", "12e+309", "1.7976931348623159E308", + "-Inf", "-INFINITY"}; cudf::test::strings_column_wrapper strings( h_strings.begin(), h_strings.end(), diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index a582541a0d4..cf602c26717 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -4919,11 +4919,12 @@ void testIsFloat() { try (ColumnVector floatStringCV = ColumnVector.fromStrings(floatStrings); ColumnVector isFloat = floatStringCV.isFloat(); ColumnVector floats = floatStringCV.asFloats(); - ColumnVector expectedFloats = ColumnVector.fromBoxedFloats(0f, 0f, Float.POSITIVE_INFINITY, - Float.NEGATIVE_INFINITY, 0f, 0f, -0f, 0f, Float.MAX_VALUE, Float.POSITIVE_INFINITY, - -Float.MAX_VALUE, Float.NEGATIVE_INFINITY, 1.2e-24f, 0f, 0f, null, 423f); - ColumnVector expected = ColumnVector.fromBoxedBooleans(false, false, true, true, false, - false, true, true, true, true, true, true, true, false, false, null, true)) { + ColumnVector expectedFloats = ColumnVector.fromBoxedFloats(0f, Float.NaN, Float.POSITIVE_INFINITY, + Float.NEGATIVE_INFINITY, Float.POSITIVE_INFINITY, Float.POSITIVE_INFINITY, -0f, 0f, + Float.MAX_VALUE, Float.POSITIVE_INFINITY, -Float.MAX_VALUE, Float.NEGATIVE_INFINITY, + 1.2e-24f, 0f, 0f, null, 423f); + ColumnVector expected = ColumnVector.fromBoxedBooleans(false, true, true, true, true, + true, true, true, true, true, true, true, true, false, false, null, true)) { assertColumnsAreEqual(expected, isFloat); assertColumnsAreEqual(expectedFloats, floats); } @@ -4944,12 +4945,12 @@ void testIsDouble() { try (ColumnVector doubleStringCV = ColumnVector.fromStrings(doubleStrings); ColumnVector isDouble = doubleStringCV.isFloat(); ColumnVector doubles = doubleStringCV.asDoubles(); - ColumnVector expectedDoubles = ColumnVector.fromBoxedDoubles(0d, 0d, - Double.POSITIVE_INFINITY, Double.NEGATIVE_INFINITY, 0d, 0d, -0d, 0d, Double.MAX_VALUE, - Double.POSITIVE_INFINITY, -Double.MAX_VALUE, Double.NEGATIVE_INFINITY, 1.2e-234d, 0d, - 0d, null, 423d); - ColumnVector expected = ColumnVector.fromBoxedBooleans(false, false, true, true, false, - false, true, true, true, true, true, true, true, false, false, null, true)) { + ColumnVector expectedDoubles = ColumnVector.fromBoxedDoubles(0d, Double.NaN, + Double.POSITIVE_INFINITY, Double.NEGATIVE_INFINITY, Double.POSITIVE_INFINITY, Double.POSITIVE_INFINITY, + -0d, 0d, Double.MAX_VALUE, Double.POSITIVE_INFINITY, -Double.MAX_VALUE, Double.NEGATIVE_INFINITY, + 1.2e-234d, 0d, 0d, null, 423d); + ColumnVector expected = ColumnVector.fromBoxedBooleans(false, true, true, true, true, + true, true, true, true, true, true, true, true, false, false, null, true)) { assertColumnsAreEqual(expected, isDouble); assertColumnsAreEqual(expectedDoubles, doubles); } diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index a167383c65c..2a91abc5701 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -97,69 +97,6 @@ def str_to_boolean(column: StringColumn): cudf.dtype("timedelta64[ns]"): str_cast.int2timedelta, } -_NAN_INF_VARIATIONS = [ - "nan", - "NAN", - "Nan", - "naN", - "nAN", - "NAn", - "nAn", - "-inf", - "-INF", - "-InF", - "-inF", - "-iNF", - "-INf", - "-iNf", - "+inf", - "+INF", - "+InF", - "+inF", - "+iNF", - "+INf", - "+Inf", - "+iNf", - "inf", - "INF", - "InF", - "inF", - "iNF", - "INf", - "iNf", -] -_LIBCUDF_SUPPORTED_NAN_INF_VARIATIONS = [ - "NaN", - "NaN", - "NaN", - "NaN", - "NaN", - "NaN", - "NaN", - "-Inf", - "-Inf", - "-Inf", - "-Inf", - "-Inf", - "-Inf", - "-Inf", - "Inf", - "Inf", - "Inf", - "Inf", - "Inf", - "Inf", - "Inf", - "Inf", - "Inf", - "Inf", - "Inf", - "Inf", - "Inf", - "Inf", - "Inf", -] - def _is_supported_regex_flags(flags): return flags == 0 or ( @@ -5309,16 +5246,6 @@ def as_numerical_column( "type due to presence of non-integer values." ) elif out_dtype.kind == "f": - # TODO: Replace this `replace` call with a - # case-insensitive method once following - # issue is fixed: https://github.com/rapidsai/cudf/issues/5217 - old_values = cudf.core.column.as_column(_NAN_INF_VARIATIONS) - new_values = cudf.core.column.as_column( - _LIBCUDF_SUPPORTED_NAN_INF_VARIATIONS - ) - string_col = libcudf.replace.replace( - string_col, old_values, new_values - ) if not libstrings.is_float(string_col).all(): raise ValueError( "Could not convert strings to float "