Update check for inf/nan strings in libcudf float conversion to ignor…

…e case (#9694) Reference https://github.com/rapidsai/cudf/pull/9613/files#r743579126 Add support to ignore case for strings `INF`, `INFINITY` and `NAN` to `cudf::strings::is_float` and `cudf::strings::to_float` for consistency with https://en.cppreference.com/w/cpp/string/basic_string/stof Also, remove the expensive `replace` call in the cudf before calling this from Python. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Robert Maynard (https://github.com/robertmaynard) - Nghia Truong (https://github.com/ttnghia) - Jason Lowe (https://github.com/jlowe) URL: #9694
rapidsai · Nov 30, 2021 · 69d5765 · 69d5765
1 parent 1697f63
commit 69d5765
Show file tree

Hide file tree

Showing 5 changed files with 85 additions and 139 deletions.
diff --git a/cpp/include/cudf/strings/string.cuh b/cpp/include/cudf/strings/string.cuh
@@ -52,6 +52,43 @@ inline __device__ bool is_integer(string_view const& d_str)
            thrust::seq, begin, end, [] __device__(auto chr) { return chr >= '0' && chr <= '9'; });
 }
 
+/**
+ * @brief Returns true if input contains the not-a-number string.
+ *
+ * Matches "nan" ignoring case (e.g. "NAN", "NaN", "nan"); no sign prefix is accepted here.
+ * @param d_str input string
+ * @return true if input is a valid NaN string.
+ */
+inline __device__ bool is_nan_str(string_view const& d_str)
+{
+  auto const ptr = d_str.data();
+  return (d_str.size_bytes() == 3) && (ptr[0] == 'N' || ptr[0] == 'n') &&
+         (ptr[1] == 'A' || ptr[1] == 'a') && (ptr[2] == 'N' || ptr[2] == 'n');
+}
+
+/**
+ * @brief Returns true if input contains the infinity string.
+ *
+ * Matches "inf" and "infinity" ignoring case (e.g. "INF", "Inf", "INFINITY", "Infinity").
+ * @param d_str input string
+ * @return true if input is a valid Inf string.
+ */
+inline __device__ bool is_inf_str(string_view const& d_str)
+{
+  auto const ptr  = d_str.data();
+  auto const size = d_str.size_bytes();
+
+  if (size != 3 && size != 8) return false;  // only the lengths of "inf" and "infinity" can match
+
+  auto const prefix_valid = (ptr[0] == 'I' || ptr[0] == 'i') && (ptr[1] == 'N' || ptr[1] == 'n') &&
+                            (ptr[2] == 'F' || ptr[2] == 'f');
+
+  return prefix_valid &&
+         ((size == 3) || ((ptr[3] == 'I' || ptr[3] == 'i') && (ptr[4] == 'N' || ptr[4] == 'n') &&
+                          (ptr[5] == 'I' || ptr[5] == 'i') && (ptr[6] == 'T' || ptr[6] == 't') &&
+                          (ptr[7] == 'Y' || ptr[7] == 'y')));
+}
+
 /**
  * @brief Returns `true` if all characters in the string
  * are valid for conversion to a float type.
@@ -65,38 +102,41 @@ inline __device__ bool is_integer(string_view const& d_str)
  * An empty string returns `false`.
  * No bounds checking is performed to verify if the value would fit
  * within a specific float type.
- * The following strings are also allowed "NaN", "Inf" and, "-Inf"
- * and will return true.
+ * The following strings are also allowed and will return true, ignoring case
+ * and with an optional leading '+' or '-' sign: "NaN", "Inf", "Infinity"
  *
  * @param d_str String to check.
  * @return true if string has valid float characters
  */
 inline __device__ bool is_float(string_view const& d_str)
 {
   if (d_str.empty()) return false;
-  // strings allowed by the converter
-  if (d_str.compare("NaN", 3) == 0) return true;
-  if (d_str.compare("Inf", 3) == 0) return true;
-  if (d_str.compare("-Inf", 4) == 0) return true;
   bool decimal_found  = false;
   bool exponent_found = false;
   size_type bytes     = d_str.size_bytes();
   const char* data    = d_str.data();
   // sign character allowed at the beginning of the string
-  size_type chidx = (*data == '-' || *data == '+') ? 1 : 0;
-  bool result     = chidx < bytes;
+  size_type ch_idx = (*data == '-' || *data == '+') ? 1 : 0;
+
+  bool result = ch_idx < bytes;
+  // check for nan and infinity strings
+  if (result && data[ch_idx] > '9') {
+    auto const inf_nan = string_view(data + ch_idx, bytes - ch_idx);
+    if (is_nan_str(inf_nan) || is_inf_str(inf_nan)) return true;
+  }
+
   // check for float chars [0-9] and a single decimal '.'
   // and scientific notation [eE][+-][0-9]
-  for (; chidx < bytes; ++chidx) {
-    auto chr = data[chidx];
+  for (; ch_idx < bytes; ++ch_idx) {
+    auto chr = data[ch_idx];
     if (chr >= '0' && chr <= '9') continue;
     if (!decimal_found && chr == '.') {
       decimal_found = true;  // no more decimals
       continue;
     }
     if (!exponent_found && (chr == 'e' || chr == 'E')) {
-      if (chidx + 1 < bytes) chr = data[chidx + 1];
-      if (chr == '-' || chr == '+') ++chidx;
+      if (ch_idx + 1 < bytes) chr = data[ch_idx + 1];
+      if (chr == '-' || chr == '+') ++ch_idx;
       decimal_found  = true;  // no decimal allowed in exponent
       exponent_found = true;  // no more exponents
       continue;

diff --git a/cpp/src/strings/convert/convert_floats.cu b/cpp/src/strings/convert/convert_floats.cu
@@ -45,7 +45,7 @@ namespace {
  * @brief This function converts the given string into a
  * floating point double value.
  *
- * This will also map strings containing "NaN", "Inf" and "-Inf"
+ * This will also map the strings "NaN", "Inf" and "Infinity" (any case, optional sign)
  * to the appropriate float values.
  *
  * This function will also handle scientific notation format.
@@ -55,16 +55,19 @@ __device__ inline double stod(string_view const& d_str)
   const char* in_ptr = d_str.data();
   const char* end    = in_ptr + d_str.size_bytes();
   if (end == in_ptr) return 0.0;
-  // special strings
-  if (d_str.compare("NaN", 3) == 0) return std::numeric_limits<double>::quiet_NaN();
-  if (d_str.compare("Inf", 3) == 0) return std::numeric_limits<double>::infinity();
-  if (d_str.compare("-Inf", 4) == 0) return -std::numeric_limits<double>::infinity();
   double sign{1.0};
   if (*in_ptr == '-' || *in_ptr == '+') {
     sign = (*in_ptr == '-' ? -1 : 1);
     ++in_ptr;
   }
 
+  // special strings: NaN, Inf
+  if ((in_ptr < end) && *in_ptr > '9') {
+    auto const inf_nan = string_view(in_ptr, static_cast<size_type>(thrust::distance(in_ptr, end)));
+    if (string::is_nan_str(inf_nan)) return std::numeric_limits<double>::quiet_NaN();
+    if (string::is_inf_str(inf_nan)) return sign * std::numeric_limits<double>::infinity();
+  }
+
   // Parse and store the mantissa as much as we can,
   // until we are about to exceed the limit of uint64_t
   constexpr uint64_t max_holding = (std::numeric_limits<uint64_t>::max() - 9L) / 10L;

diff --git a/cpp/tests/strings/floats_tests.cpp b/cpp/tests/strings/floats_tests.cpp
@@ -58,32 +58,20 @@ TEST_F(StringsConvertTest, IsFloat)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected1);
 
   cudf::test::strings_column_wrapper strings2(
-    {"+175", "-34", "9.8", "1234567890", "6.7e17", "-917.2e5"});
+    {"-34", "9.8", "1234567890", "-917.2e5", "INF", "NAN", "-Inf", "INFINITY"});
   results = cudf::strings::is_float(cudf::strings_column_view(strings2));
-  cudf::test::fixed_width_column_wrapper<bool> expected2({1, 1, 1, 1, 1, 1});
+  cudf::test::fixed_width_column_wrapper<bool> expected2({1, 1, 1, 1, 1, 1, 1, 1});
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected2);
 }
 
 TEST_F(StringsConvertTest, ToFloats32)
 {
-  std::vector<const char*> h_strings{"1234",
-                                     nullptr,
-                                     "-876",
-                                     "543.2",
-                                     "-0.12",
-                                     ".25",
-                                     "-.002",
-                                     "",
-                                     "-0.0",
-                                     "1.2e4",
-                                     "NaN",
-                                     "abc123",
-                                     "123abc",
-                                     "456e",
-                                     "-1.78e+5",
-                                     "-122.33644782123456789",
-                                     "12e+309",
-                                     "3.4028236E38"};
+  std::vector<const char*> h_strings{
+    "1234",    nullptr,        "-876",     "543.2",
+    "-0.12",   ".25",          "-.002",    "",
+    "-0.0",    "1.2e4",        "NAN",      "abc123",
+    "123abc",  "456e",         "-1.78e+5", "-122.33644782123456789",
+    "12e+309", "3.4028236E38", "INF",      "Infinity"};
   cudf::test::strings_column_wrapper strings(
     h_strings.begin(),
     h_strings.end(),
@@ -135,24 +123,11 @@ TEST_F(StringsConvertTest, FromFloats32)
 
 TEST_F(StringsConvertTest, ToFloats64)
 {
-  std::vector<const char*> h_strings{"1234",
-                                     nullptr,
-                                     "-876",
-                                     "543.2",
-                                     "-0.12",
-                                     ".25",
-                                     "-.002",
-                                     "",
-                                     "-0.0",
-                                     "1.28e256",
-                                     "NaN",
-                                     "abc123",
-                                     "123abc",
-                                     "456e",
-                                     "-1.78e+5",
-                                     "-122.33644782",
-                                     "12e+309",
-                                     "1.7976931348623159E308"};
+  std::vector<const char*> h_strings{
+    "1234",   nullptr,    "-876",     "543.2",         "-0.12",   ".25",
+    "-.002",  "",         "-0.0",     "1.28e256",      "NaN",     "abc123",
+    "123abc", "456e",     "-1.78e+5", "-122.33644782", "12e+309", "1.7976931348623159E308",
+    "-Inf",   "-INFINITY"};
   cudf::test::strings_column_wrapper strings(
     h_strings.begin(),
     h_strings.end(),

diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
@@ -4919,11 +4919,12 @@ void testIsFloat() {
     try (ColumnVector floatStringCV = ColumnVector.fromStrings(floatStrings);
          ColumnVector isFloat = floatStringCV.isFloat();
          ColumnVector floats = floatStringCV.asFloats();
-         ColumnVector expectedFloats = ColumnVector.fromBoxedFloats(0f, 0f, Float.POSITIVE_INFINITY,
-             Float.NEGATIVE_INFINITY, 0f, 0f, -0f, 0f, Float.MAX_VALUE, Float.POSITIVE_INFINITY,
-             -Float.MAX_VALUE, Float.NEGATIVE_INFINITY, 1.2e-24f, 0f, 0f, null, 423f);
-         ColumnVector expected = ColumnVector.fromBoxedBooleans(false, false, true, true, false,
-             false, true, true, true, true, true, true, true, false, false, null, true)) {
+         ColumnVector expectedFloats = ColumnVector.fromBoxedFloats(0f, Float.NaN, Float.POSITIVE_INFINITY,
+             Float.NEGATIVE_INFINITY, Float.POSITIVE_INFINITY, Float.POSITIVE_INFINITY, -0f, 0f,
+             Float.MAX_VALUE, Float.POSITIVE_INFINITY, -Float.MAX_VALUE, Float.NEGATIVE_INFINITY,
+             1.2e-24f, 0f, 0f, null, 423f);
+         ColumnVector expected = ColumnVector.fromBoxedBooleans(false, true, true, true, true,
+             true, true, true, true, true, true, true, true, false, false, null, true)) {
       assertColumnsAreEqual(expected, isFloat);
       assertColumnsAreEqual(expectedFloats, floats);
     }
@@ -4944,12 +4945,12 @@ void testIsDouble() {
     try (ColumnVector doubleStringCV = ColumnVector.fromStrings(doubleStrings);
          ColumnVector isDouble = doubleStringCV.isFloat();
          ColumnVector doubles = doubleStringCV.asDoubles();
-         ColumnVector expectedDoubles = ColumnVector.fromBoxedDoubles(0d, 0d,
-             Double.POSITIVE_INFINITY, Double.NEGATIVE_INFINITY, 0d, 0d, -0d, 0d, Double.MAX_VALUE,
-             Double.POSITIVE_INFINITY, -Double.MAX_VALUE, Double.NEGATIVE_INFINITY, 1.2e-234d, 0d,
-             0d, null, 423d);
-         ColumnVector expected = ColumnVector.fromBoxedBooleans(false, false, true, true, false,
-             false, true, true, true, true, true, true, true, false, false, null, true)) {
+         ColumnVector expectedDoubles = ColumnVector.fromBoxedDoubles(0d, Double.NaN,
+             Double.POSITIVE_INFINITY, Double.NEGATIVE_INFINITY, Double.POSITIVE_INFINITY, Double.POSITIVE_INFINITY,
+             -0d, 0d, Double.MAX_VALUE, Double.POSITIVE_INFINITY, -Double.MAX_VALUE, Double.NEGATIVE_INFINITY,
+             1.2e-234d, 0d, 0d, null, 423d);
+         ColumnVector expected = ColumnVector.fromBoxedBooleans(false, true, true, true, true,
+             true, true, true, true, true, true, true, true, false, false, null, true)) {
       assertColumnsAreEqual(expected, isDouble);
       assertColumnsAreEqual(expectedDoubles, doubles);
     }

diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
@@ -97,69 +97,6 @@ def str_to_boolean(column: StringColumn):
     cudf.dtype("timedelta64[ns]"): str_cast.int2timedelta,
 }
 
-_NAN_INF_VARIATIONS = [
-    "nan",
-    "NAN",
-    "Nan",
-    "naN",
-    "nAN",
-    "NAn",
-    "nAn",
-    "-inf",
-    "-INF",
-    "-InF",
-    "-inF",
-    "-iNF",
-    "-INf",
-    "-iNf",
-    "+inf",
-    "+INF",
-    "+InF",
-    "+inF",
-    "+iNF",
-    "+INf",
-    "+Inf",
-    "+iNf",
-    "inf",
-    "INF",
-    "InF",
-    "inF",
-    "iNF",
-    "INf",
-    "iNf",
-]
-_LIBCUDF_SUPPORTED_NAN_INF_VARIATIONS = [
-    "NaN",
-    "NaN",
-    "NaN",
-    "NaN",
-    "NaN",
-    "NaN",
-    "NaN",
-    "-Inf",
-    "-Inf",
-    "-Inf",
-    "-Inf",
-    "-Inf",
-    "-Inf",
-    "-Inf",
-    "Inf",
-    "Inf",
-    "Inf",
-    "Inf",
-    "Inf",
-    "Inf",
-    "Inf",
-    "Inf",
-    "Inf",
-    "Inf",
-    "Inf",
-    "Inf",
-    "Inf",
-    "Inf",
-    "Inf",
-]
-
 
 def _is_supported_regex_flags(flags):
     return flags == 0 or (
@@ -5309,16 +5246,6 @@ def as_numerical_column(
                     "type due to presence of non-integer values."
                 )
         elif out_dtype.kind == "f":
-            # TODO: Replace this `replace` call with a
-            # case-insensitive method once following
-            # issue is fixed: https://github.com/rapidsai/cudf/issues/5217
-            old_values = cudf.core.column.as_column(_NAN_INF_VARIATIONS)
-            new_values = cudf.core.column.as_column(
-                _LIBCUDF_SUPPORTED_NAN_INF_VARIATIONS
-            )
-            string_col = libcudf.replace.replace(
-                string_col, old_values, new_values
-            )
             if not libstrings.is_float(string_col).all():
                 raise ValueError(
                     "Could not convert strings to float "