rapidsai · rapids-bot · Mar 2, 2021 · Feb 18, 2021 · Feb 18, 2021 · Feb 18, 2021
@@ -181,6 +181,7 @@ ConfigureBench(STRINGS_BENCH
   string/case_benchmark.cpp
   string/contains_benchmark.cpp
   string/convert_durations_benchmark.cpp
+  string/convert_floats_benchmark.cpp
   string/copy_benchmark.cpp
   string/filter_benchmark.cpp
   string/find_benchmark.cpp

@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <fixture/benchmark_fixture.hpp>
+#include <synchronization/synchronization.hpp>
+
+#include <benchmark/benchmark.h>
+#include <benchmarks/common/generate_benchmark_input.hpp>
+
+#include <cudf/strings/convert/convert_floats.hpp>
+#include <cudf/types.hpp>
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_wrapper.hpp>
+
+namespace {
+/**
+ * @brief Generates a one-column table with create_random_table and returns its only column.
+ *
+ * The column type is FLOAT32 when `FloatType` has the same size as `float`,
+ * and FLOAT64 otherwise.
+ *
+ * @tparam FloatType Host floating-point type used to select the column's type id
+ * @param array_size Number of rows to generate
+ * @return The generated column
+ */
+template <class FloatType>
+std::unique_ptr<cudf::column> get_floats_column(int64_t array_size)
+{
+  auto const column_type =
+    sizeof(FloatType) == sizeof(float) ? cudf::type_id::FLOAT32 : cudf::type_id::FLOAT64;
+  auto const num_rows = static_cast<cudf::size_type>(array_size);
+
+  auto generated = create_random_table({column_type}, 1, row_count{num_rows});
+  auto columns   = generated->release();
+  return std::move(columns.front());
+}
+
+/**
+ * @brief Builds a strings column by passing a generated FLOAT64 column through
+ * cudf::strings::from_floats.
+ *
+ * @param array_size Number of float rows to generate and convert
+ * @return The column returned by from_floats
+ */
+std::unique_ptr<cudf::column> get_floats_string_column(int64_t array_size)
+{
+  auto const source_column = get_floats_column<double>(array_size);
+  auto const source_view   = source_column->view();
+  return cudf::strings::from_floats(source_view);
+}
+}  // anonymous namespace
+
+// Fixture under which the string -> float conversion benchmarks are defined and registered.
+class StringToFloatNumber : public cudf::benchmark {};
+
+/**
+ * @brief Times cudf::strings::to_floats on a strings column produced by
+ * get_floats_string_column.
+ *
+ * @tparam float_type Type id of the output column requested from to_floats
+ * @param state Benchmark state; `state.range(0)` supplies the number of rows
+ */
+template <cudf::type_id float_type>
+void convert_to_float_number(benchmark::State& state)
+{
+  auto const num_rows    = state.range(0);
+  auto const input       = get_floats_string_column(num_rows);
+  auto const input_view  = cudf::strings_column_view(input->view());
+  auto const output_type = cudf::data_type{float_type};
+
+  for (auto _ : state) {
+    cuda_event_timer timer(state, true);
+    volatile auto results = cudf::strings::to_floats(input_view, output_type);
+  }
+
+  // bytes_processed = bytes_input + bytes_output
+  auto const bytes_per_iteration =
+    input_view.chars_size() + num_rows * cudf::size_of(output_type);
+  state.SetBytesProcessed(state.iterations() * bytes_per_iteration);
+}
+
+// Fixture under which the float -> string conversion benchmarks are defined and registered.
+class StringFromFloatNumber : public cudf::benchmark {};
+
+/**
+ * @brief Times cudf::strings::from_floats on a column produced by get_floats_column.
+ *
+ * @tparam FloatType Host floating-point type selecting the input column type
+ * @param state Benchmark state; `state.range(0)` supplies the number of rows
+ */
+template <class FloatType>
+void convert_from_float_number(benchmark::State& state)
+{
+  auto const num_rows   = state.range(0);
+  auto const input      = get_floats_column<FloatType>(num_rows);
+  auto const input_view = input->view();
+  std::unique_ptr<cudf::column> output{};
+
+  for (auto _ : state) {
+    cuda_event_timer timer(state, true);  // flush_l2_cache = true, stream = 0
+    output = cudf::strings::from_floats(input_view);
+  }
+
+  // bytes_processed = bytes_input + bytes_output
+  auto const output_chars = cudf::strings_column_view(output->view()).chars_size();
+  state.SetBytesProcessed(state.iterations() * (output_chars + num_rows * sizeof(FloatType)));
+}
+
+// Defines benchmark `bench_name` on the StringToFloatNumber fixture so that it runs
+// convert_to_float_number<type_id_value>, then registers it with manual timing reported in
+// microseconds for sizes from 1 << 10 to 1 << 17 using a range multiplier of 4.
+#define CV_TO_FLOATS_BENCHMARK_DEFINE(bench_name, type_id_value)                  \
+  BENCHMARK_DEFINE_F(StringToFloatNumber, bench_name)(::benchmark::State & state) \
+  {                                                                               \
+    convert_to_float_number<type_id_value>(state);                                \
+  }                                                                               \
+  BENCHMARK_REGISTER_F(StringToFloatNumber, bench_name)                           \
+    ->UseManualTime()                                                             \
+    ->Unit(benchmark::kMicrosecond)                                               \
+    ->RangeMultiplier(4)                                                          \
+    ->Range(1 << 10, 1 << 17);
+
+// Defines benchmark `bench_name` on the StringFromFloatNumber fixture so that it runs
+// convert_from_float_number<host_type>, then registers it with manual timing reported in
+// microseconds for sizes from 1 << 10 to 1 << 17 using a range multiplier of 4.
+#define CV_FROM_FLOATS_BENCHMARK_DEFINE(bench_name, host_type)                      \
+  BENCHMARK_DEFINE_F(StringFromFloatNumber, bench_name)(::benchmark::State & state) \
+  {                                                                                 \
+    convert_from_float_number<host_type>(state);                                    \
+  }                                                                                 \
+  BENCHMARK_REGISTER_F(StringFromFloatNumber, bench_name)                           \
+    ->UseManualTime()                                                               \
+    ->Unit(benchmark::kMicrosecond)                                                 \
+    ->RangeMultiplier(4)                                                            \
+    ->Range(1 << 10, 1 << 17);
+
+CV_TO_FLOATS_BENCHMARK_DEFINE(string_to_float32, cudf::type_id::FLOAT32);  // strings -> FLOAT32
+CV_TO_FLOATS_BENCHMARK_DEFINE(string_to_float64, cudf::type_id::FLOAT64);  // strings -> FLOAT64
+
+CV_FROM_FLOATS_BENCHMARK_DEFINE(string_from_float32, float);   // float column -> strings
+CV_FROM_FLOATS_BENCHMARK_DEFINE(string_from_float64, double);  // double column -> strings
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -58,15 +58,18 @@ __device__ inline double stod(string_view const& d_str)
   if (d_str.compare("NaN", 3) == 0) return std::numeric_limits<double>::quiet_NaN();
   if (d_str.compare("Inf", 3) == 0) return std::numeric_limits<double>::infinity();
   if (d_str.compare("-Inf", 4) == 0) return -std::numeric_limits<double>::infinity();
-  double sign = 1.0;
+  double sign{1.0};
   if (*in_ptr == '-' || *in_ptr == '+') {
     sign = (*in_ptr == '-' ? -1 : 1);
     ++in_ptr;
   }
-  unsigned long max_mantissa = 0x0FFFFFFFFFFFFF;
-  unsigned long digits       = 0;
-  int exp_off                = 0;
-  bool decimal               = false;
+
+  // Accumulate mantissa digits in a uint64_t for as long as one more digit cannot overflow it;
+  // after that, further digits are not stored and integer-part digits only bump the exponent offset
+  constexpr uint64_t max_holding = (std::numeric_limits<uint64_t>::max() - 9L) / 10L;
+  uint64_t digits                = 0;
+  int exp_off                    = 0;
+  bool decimal                   = false;
   while (in_ptr < end) {
     char ch = *in_ptr;
     if (ch == '.') {
@@ -75,18 +78,20 @@ __device__ inline double stod(string_view const& d_str)
       continue;
     }
     if (ch < '0' || ch > '9') break;
-    if (digits > max_mantissa)
+    if (digits > max_holding)
       exp_off += (int)!decimal;
     else {
-      digits = (digits * 10L) + (unsigned long)(ch - '0');
-      if (digits > max_mantissa) {
+      digits = (digits * 10L) + static_cast<uint64_t>(ch - '0');
+      if (digits > max_holding) {
         digits = digits / 10L;
         exp_off += (int)!decimal;
       } else
         exp_off -= (int)decimal;
     }
     ++in_ptr;
   }
+  if (digits == 0) return sign * static_cast<double>(0);
+
   // check for exponent char
   int exp_ten  = 0;
   int exp_sign = 1;
@@ -107,17 +112,23 @@ __device__ inline double stod(string_view const& d_str)
       }
     }
   }
+
+  int const num_digits = static_cast<int>(log10(digits)) + 1;
   exp_ten *= exp_sign;
   exp_ten += exp_off;
-  if (exp_ten > 308)
+  exp_ten += num_digits - 1;
+  if (exp_ten > std::numeric_limits<double>::max_exponent10)
     return sign > 0 ? std::numeric_limits<double>::infinity()
                     : -std::numeric_limits<double>::infinity();
-  else if (exp_ten < -308)
-    return 0.0;
+  else if (exp_ten < std::numeric_limits<double>::min_exponent10)
+    return double{0};
+
   // using exp10() since the pow(10.0,exp_ten) function is
   // very inaccurate in 10.2: http://nvbugs/2971187
-  double value = static_cast<double>(digits) * exp10(static_cast<double>(exp_ten));
-  return (value * sign);
+  double const base =
+    sign * static_cast<double>(digits) * exp10(static_cast<double>(1 - num_digits));
+  double const exponent = exp10(static_cast<double>(exp_ten));
+  return base * exponent;
 }
 
 /**
@@ -132,8 +143,8 @@ struct string_to_float_fn {
   __device__ FloatType operator()(size_type idx)
   {
     if (strings_column.is_null(idx)) return static_cast<FloatType>(0);
-    // the cast to FloatType will create predictable results
-    // for floats that are larger than the FloatType can hold
+    // The cast to FloatType will create predictable results for floats that are larger than the
+    // FloatType can hold
     return static_cast<FloatType>(stod(strings_column.element<string_view>(idx)));
   }
 };

@@ -45,31 +45,17 @@ TEST_F(StringsConvertTest, ToFloats32)
                                      "456e",
                                      "-1.78e+5",
                                      "-122.33644782123456789",
-                                     "12e+309"};
+                                     "12e+309",
+                                     "3.4028236E38"};
   cudf::test::strings_column_wrapper strings(
     h_strings.begin(),
     h_strings.end(),
     thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }));
 
-  float nanval = std::numeric_limits<float>::quiet_NaN();
-  float infval = std::numeric_limits<float>::infinity();
-  std::vector<float> h_expected{1234.0,
-                                0,
-                                -876.0,
-                                543.2,
-                                -0.12,
-                                0.25,
-                                -0.002,
-                                0,
-                                -0.0,
-                                12000,
-                                nanval,
-                                0,
-                                123.0,
-                                456.0,
-                                -178000.0,
-                                -122.3364486694336,
-                                infval};
+  std::vector<float> h_expected;
+  std::for_each(h_strings.begin(), h_strings.end(), [&](const char* str) {
+    h_expected.push_back(str ? std::atof(str) : 0);
+  });
 
   auto strings_view = cudf::strings_column_view(strings);
   auto results = cudf::strings::to_floats(strings_view, cudf::data_type{cudf::type_id::FLOAT32});
@@ -78,7 +64,7 @@ TEST_F(StringsConvertTest, ToFloats32)
     h_expected.begin(),
     h_expected.end(),
     thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }));
-  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected);
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected, true);
 }
 
 TEST_F(StringsConvertTest, FromFloats32)
@@ -107,7 +93,7 @@ TEST_F(StringsConvertTest, FromFloats32)
     h_expected.end(),
     thrust::make_transform_iterator(h_expected.begin(), [](auto str) { return str != nullptr; }));
 
-  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected);
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected, true);
 }
 
 TEST_F(StringsConvertTest, ToFloats64)
@@ -128,31 +114,17 @@ TEST_F(StringsConvertTest, ToFloats64)
                                      "456e",
                                      "-1.78e+5",
                                      "-122.33644782",
-                                     "12e+309"};
+                                     "12e+309",
+                                     "1.7976931348623159E308"};
   cudf::test::strings_column_wrapper strings(
     h_strings.begin(),
     h_strings.end(),
     thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }));
 
-  double nanval = std::numeric_limits<double>::quiet_NaN();
-  double infval = std::numeric_limits<double>::infinity();
-  std::vector<double> h_expected{1234.0,
-                                 0,
-                                 -876.0,
-                                 543.2,
-                                 -0.12,
-                                 0.25,
-                                 -0.002,
-                                 0,
-                                 -0.0,
-                                 1.28e256,
-                                 nanval,
-                                 0,
-                                 123.0,
-                                 456.0,
-                                 -178000.0,
-                                 -122.33644781999999,
-                                 infval};
+  std::vector<double> h_expected;
+  std::for_each(h_strings.begin(), h_strings.end(), [&](const char* str) {
+    h_expected.push_back(str ? std::atof(str) : 0);
+  });
 
   auto strings_view = cudf::strings_column_view(strings);
   auto results = cudf::strings::to_floats(strings_view, cudf::data_type{cudf::type_id::FLOAT64});
@@ -161,7 +133,7 @@ TEST_F(StringsConvertTest, ToFloats64)
     h_expected.begin(),
     h_expected.end(),
     thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }));
-  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected);
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected, true);
 }
 
 TEST_F(StringsConvertTest, FromFloats64)
@@ -190,7 +162,7 @@ TEST_F(StringsConvertTest, FromFloats64)
     h_expected.end(),
     thrust::make_transform_iterator(h_expected.begin(), [](auto str) { return str != nullptr; }));
 
-  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected);
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected, true);
 }
 
 TEST_F(StringsConvertTest, ZeroSizeStringsColumnFloat)

@@ -127,11 +127,20 @@ class corresponding_rows_not_equivalent {
       column_device_view const& lhs, column_device_view const& rhs, size_type index)
     {
       if (lhs.is_valid(index) and rhs.is_valid(index)) {
-        int ulp = 4;  // value taken from google test
-        T x     = lhs.element<T>(index);
-        T y     = rhs.element<T>(index);
-        return std::abs(x - y) > std::numeric_limits<T>::epsilon() * std::abs(x + y) * ulp &&
-               std::abs(x - y) >= std::numeric_limits<T>::min();
+        T const x = lhs.element<T>(index);
+        T const y = rhs.element<T>(index);
+
+        // Must handle inf and nan separately
+        if (std::isinf(x) || std::isinf(y)) {
+          return x != y;  // comparison of (inf==inf) returns true
+        } else if (std::isnan(x) || std::isnan(y)) {
+          return std::isnan(x) != std::isnan(y);  // comparison of (nan==nan) returns false
+        } else {
+          constexpr int ulp     = 4;  // ulp = unit of least precision, value taken from google test
+          T const abs_x_minus_y = std::abs(x - y);
+          return abs_x_minus_y >= std::numeric_limits<T>::min() &&
+                 abs_x_minus_y > std::numeric_limits<T>::epsilon() * std::abs(x + y) * ulp;
+        }
       } else {
         // if either is null, then the inequality was checked already
         return true;