Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix string to double conversion and row equivalent comparison #7410

Merged
merged 24 commits into from
Mar 2, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
7a93e07
Add a benchmark for string <=> floats conversion.
ttnghia Feb 18, 2021
3ea06fb
Fix error and improve the function converting from string to float t…
ttnghia Feb 18, 2021
4bfd906
Fix the equivalent check function for floating point numbers that inc…
ttnghia Feb 18, 2021
0cfdbcd
Add a test for converting string to double number
ttnghia Feb 18, 2021
7778f85
Merge remote-tracking branch 'origin/branch-0.19' into branch-0.19-is…
ttnghia Feb 18, 2021
a5ede5d
Update copyright header
ttnghia Feb 18, 2021
2584840
Fix const qualifier position and change CMakeLists.txt
ttnghia Feb 18, 2021
7fadcd0
Some improvement to atof
ttnghia Feb 19, 2021
f3ead3d
Use the results generated from std::stof to test our atof implementation
ttnghia Feb 19, 2021
30e98b3
Add a simple test case
ttnghia Feb 19, 2021
354e65e
Fix convert_floats_benchmark
ttnghia Feb 19, 2021
0130e51
Merge remote-tracking branch 'origin/branch-0.19' into branch-0.19-is…
ttnghia Feb 19, 2021
d411f7f
Merge remote-tracking branch 'origin/branch-0.19' into branch-0.19-is…
ttnghia Feb 20, 2021
8f5a580
Improve accuracy by casting from string to double then to float
ttnghia Feb 20, 2021
a112a1b
Change the template function `stof` to a regular function `stod`.
ttnghia Feb 22, 2021
f8077a7
Re-instating important comment
ttnghia Feb 22, 2021
1238dba
Minor change in handling nan in typed_element_not_equivalent struct
ttnghia Feb 22, 2021
4aaa83f
Simplify data generation and change ranges for the benchmarks
ttnghia Feb 22, 2021
6b1af3b
Merge remote-tracking branch 'origin/branch-0.19' into branch-0.19-is…
ttnghia Feb 23, 2021
3a7c52e
Update header format for cpp/benchmarks/string/convert_floats_benchma…
ttnghia Feb 23, 2021
4fbd4f6
Add a comment to the stod function
ttnghia Mar 1, 2021
0b8633b
Generate random float numbers by calling to create_random_table, and …
ttnghia Mar 1, 2021
886fc57
Fix format check
ttnghia Mar 1, 2021
da19661
Merge branch 'branch-0.19' into branch-0.19-issue-5225
ttnghia Mar 2, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,7 @@ ConfigureBench(STRINGS_BENCH
string/case_benchmark.cpp
string/contains_benchmark.cpp
string/convert_durations_benchmark.cpp
string/convert_floats_benchmark.cpp
string/copy_benchmark.cpp
string/filter_benchmark.cpp
string/find_benchmark.cpp
Expand Down
120 changes: 120 additions & 0 deletions cpp/benchmarks/string/convert_floats_benchmark.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <fixture/benchmark_fixture.hpp>
#include <synchronization/synchronization.hpp>

#include <benchmark/benchmark.h>
#include <benchmarks/common/generate_benchmark_input.hpp>

#include <cudf/strings/convert/convert_floats.hpp>
#include <cudf/types.hpp>

#include <cudf_test/base_fixture.hpp>
#include <cudf_test/column_wrapper.hpp>

namespace {
template <class FloatType>
std::unique_ptr<cudf::column> get_floats_column(int64_t array_size)
{
std::unique_ptr<cudf::table> tbl;
if (sizeof(FloatType) == sizeof(float)) {
tbl = create_random_table(
{cudf::type_id::FLOAT32}, 1, row_count{static_cast<cudf::size_type>(array_size)});
} else {
tbl = create_random_table(
{cudf::type_id::FLOAT64}, 1, row_count{static_cast<cudf::size_type>(array_size)});
}
return std::move(tbl->release().front());
}

std::unique_ptr<cudf::column> get_floats_string_column(int64_t array_size)
{
const auto floats = get_floats_column<double>(array_size);
return cudf::strings::from_floats(floats->view());
}
} // anonymous namespace

// Fixture for the string -> floating-point conversion benchmarks; adds nothing to cudf::benchmark.
class StringToFloatNumber : public cudf::benchmark {};

/**
 * @brief Benchmark body: time `cudf::strings::to_floats` on a generated strings column.
 *
 * The input has `state.range(0)` rows and is built once, outside the timed loop.
 *
 * @tparam float_type Type id of the floating-point column to convert into
 * @param state Google Benchmark state supplying the row count and iteration loop
 */
template <cudf::type_id float_type>
void convert_to_float_number(benchmark::State& state)
{
  auto const num_rows   = state.range(0);
  auto const input_col  = get_floats_string_column(num_rows);
  auto const input_view = cudf::strings_column_view(input_col->view());
  auto const out_type   = cudf::data_type{float_type};

  for (auto _ : state) {
    cuda_event_timer timer(state, true);
    volatile auto results = cudf::strings::to_floats(input_view, out_type);
  }

  // bytes processed per iteration = input characters + output elements
  auto const bytes_in  = input_view.chars_size();
  auto const bytes_out = num_rows * cudf::size_of(out_type);
  state.SetBytesProcessed(state.iterations() * (bytes_in + bytes_out));
}

// Fixture for the floating-point -> string conversion benchmarks; adds nothing to cudf::benchmark.
class StringFromFloatNumber : public cudf::benchmark {};

/**
 * @brief Benchmark body: time `cudf::strings::from_floats` on a generated float column.
 *
 * The input has `state.range(0)` rows and is built once, outside the timed loop.
 * The output of the last iteration is kept so its character count can be reported.
 *
 * @tparam FloatType Host floating-point type selecting the input column type
 * @param state Google Benchmark state supplying the row count and iteration loop
 */
template <class FloatType>
void convert_from_float_number(benchmark::State& state)
{
  auto const num_rows   = state.range(0);
  auto const input_col  = get_floats_column<FloatType>(num_rows);
  auto const input_view = input_col->view();

  std::unique_ptr<cudf::column> output_col = nullptr;
  for (auto _ : state) {
    cuda_event_timer timer(state, true);  // flush_l2_cache = true, stream = 0
    output_col = cudf::strings::from_floats(input_view);
  }

  // bytes processed per iteration = output characters + input elements
  auto const bytes_out = cudf::strings_column_view(output_col->view()).chars_size();
  auto const bytes_in  = num_rows * sizeof(FloatType);
  state.SetBytesProcessed(state.iterations() * (bytes_out + bytes_in));
}

// Defines a StringToFloatNumber fixture benchmark called `name` whose body runs
// convert_to_float_number<float_type_id>, then registers it for input sizes from
// 1 << 10 up to 1 << 17 (multiplier 4), using manual timing reported in microseconds.
// The expansion already ends with ';'.
#define CV_TO_FLOATS_BENCHMARK_DEFINE(name, float_type_id) \
BENCHMARK_DEFINE_F(StringToFloatNumber, name)(::benchmark::State & state) \
{ \
convert_to_float_number<float_type_id>(state); \
} \
BENCHMARK_REGISTER_F(StringToFloatNumber, name) \
->RangeMultiplier(4) \
->Range(1 << 10, 1 << 17) \
->UseManualTime() \
->Unit(benchmark::kMicrosecond);

// Defines a StringFromFloatNumber fixture benchmark called `name` whose body runs
// convert_from_float_number<float_type>, then registers it for input sizes from
// 1 << 10 up to 1 << 17 (multiplier 4), using manual timing reported in microseconds.
// The expansion already ends with ';'.
#define CV_FROM_FLOATS_BENCHMARK_DEFINE(name, float_type) \
BENCHMARK_DEFINE_F(StringFromFloatNumber, name)(::benchmark::State & state) \
{ \
convert_from_float_number<float_type>(state); \
} \
BENCHMARK_REGISTER_F(StringFromFloatNumber, name) \
->RangeMultiplier(4) \
->Range(1 << 10, 1 << 17) \
->UseManualTime() \
->Unit(benchmark::kMicrosecond);

// String -> float benchmarks, one per output column type.
CV_TO_FLOATS_BENCHMARK_DEFINE(string_to_float32, cudf::type_id::FLOAT32);
CV_TO_FLOATS_BENCHMARK_DEFINE(string_to_float64, cudf::type_id::FLOAT64);

// Float -> string benchmarks, one per input element type.
CV_FROM_FLOATS_BENCHMARK_DEFINE(string_from_float32, float);
CV_FROM_FLOATS_BENCHMARK_DEFINE(string_from_float64, double);
43 changes: 27 additions & 16 deletions cpp/src/strings/convert/convert_floats.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2020, NVIDIA CORPORATION.
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -58,15 +58,18 @@ __device__ inline double stod(string_view const& d_str)
if (d_str.compare("NaN", 3) == 0) return std::numeric_limits<double>::quiet_NaN();
if (d_str.compare("Inf", 3) == 0) return std::numeric_limits<double>::infinity();
if (d_str.compare("-Inf", 4) == 0) return -std::numeric_limits<double>::infinity();
double sign = 1.0;
double sign{1.0};
if (*in_ptr == '-' || *in_ptr == '+') {
sign = (*in_ptr == '-' ? -1 : 1);
++in_ptr;
}
unsigned long max_mantissa = 0x0FFFFFFFFFFFFF;
unsigned long digits = 0;
int exp_off = 0;
bool decimal = false;

// Parse and store the mantissa as much as we can,
// until we are about to exceed the limit of uint64_t
constexpr uint64_t max_holding = (std::numeric_limits<uint64_t>::max() - 9L) / 10L;
ttnghia marked this conversation as resolved.
Show resolved Hide resolved
uint64_t digits = 0;
int exp_off = 0;
bool decimal = false;
while (in_ptr < end) {
char ch = *in_ptr;
if (ch == '.') {
Expand All @@ -75,18 +78,20 @@ __device__ inline double stod(string_view const& d_str)
continue;
}
if (ch < '0' || ch > '9') break;
if (digits > max_mantissa)
if (digits > max_holding)
exp_off += (int)!decimal;
else {
digits = (digits * 10L) + (unsigned long)(ch - '0');
if (digits > max_mantissa) {
digits = (digits * 10L) + static_cast<uint64_t>(ch - '0');
if (digits > max_holding) {
digits = digits / 10L;
exp_off += (int)!decimal;
} else
exp_off -= (int)decimal;
}
++in_ptr;
}
if (digits == 0) return sign * static_cast<double>(0);

// check for exponent char
int exp_ten = 0;
int exp_sign = 1;
Expand All @@ -107,17 +112,23 @@ __device__ inline double stod(string_view const& d_str)
}
}
}

int const num_digits = static_cast<int>(log10(digits)) + 1;
exp_ten *= exp_sign;
exp_ten += exp_off;
if (exp_ten > 308)
exp_ten += num_digits - 1;
if (exp_ten > std::numeric_limits<double>::max_exponent10)
return sign > 0 ? std::numeric_limits<double>::infinity()
: -std::numeric_limits<double>::infinity();
else if (exp_ten < -308)
return 0.0;
else if (exp_ten < std::numeric_limits<double>::min_exponent10)
return double{0};

// using exp10() since the pow(10.0,exp_ten) function is
// very inaccurate in 10.2: http://nvbugs/2971187
double value = static_cast<double>(digits) * exp10(static_cast<double>(exp_ten));
return (value * sign);
double const base =
sign * static_cast<double>(digits) * exp10(static_cast<double>(1 - num_digits));
double const exponent = exp10(static_cast<double>(exp_ten));
return base * exponent;
}

/**
Expand All @@ -132,8 +143,8 @@ struct string_to_float_fn {
__device__ FloatType operator()(size_type idx)
{
if (strings_column.is_null(idx)) return static_cast<FloatType>(0);
// the cast to FloatType will create predictable results
// for floats that are larger than the FloatType can hold
// The cast to FloatType will create predictable results for floats that are larger than the
// FloatType can hold
return static_cast<FloatType>(stod(strings_column.element<string_view>(idx)));
ttnghia marked this conversation as resolved.
Show resolved Hide resolved
}
};
Expand Down
60 changes: 16 additions & 44 deletions cpp/tests/strings/floats_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,31 +45,17 @@ TEST_F(StringsConvertTest, ToFloats32)
"456e",
"-1.78e+5",
"-122.33644782123456789",
"12e+309"};
"12e+309",
"3.4028236E38"};
cudf::test::strings_column_wrapper strings(
h_strings.begin(),
h_strings.end(),
thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }));

float nanval = std::numeric_limits<float>::quiet_NaN();
float infval = std::numeric_limits<float>::infinity();
std::vector<float> h_expected{1234.0,
0,
-876.0,
543.2,
-0.12,
0.25,
-0.002,
0,
-0.0,
12000,
nanval,
0,
123.0,
456.0,
-178000.0,
-122.3364486694336,
infval};
std::vector<float> h_expected;
std::for_each(h_strings.begin(), h_strings.end(), [&](const char* str) {
h_expected.push_back(str ? std::atof(str) : 0);
});

auto strings_view = cudf::strings_column_view(strings);
auto results = cudf::strings::to_floats(strings_view, cudf::data_type{cudf::type_id::FLOAT32});
Expand All @@ -78,7 +64,7 @@ TEST_F(StringsConvertTest, ToFloats32)
h_expected.begin(),
h_expected.end(),
thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }));
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected);
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected, true);
}

TEST_F(StringsConvertTest, FromFloats32)
Expand Down Expand Up @@ -107,7 +93,7 @@ TEST_F(StringsConvertTest, FromFloats32)
h_expected.end(),
thrust::make_transform_iterator(h_expected.begin(), [](auto str) { return str != nullptr; }));

CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected);
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected, true);
}

TEST_F(StringsConvertTest, ToFloats64)
Expand All @@ -128,31 +114,17 @@ TEST_F(StringsConvertTest, ToFloats64)
"456e",
"-1.78e+5",
"-122.33644782",
"12e+309"};
"12e+309",
"1.7976931348623159E308"};
ttnghia marked this conversation as resolved.
Show resolved Hide resolved
cudf::test::strings_column_wrapper strings(
h_strings.begin(),
h_strings.end(),
thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }));

double nanval = std::numeric_limits<double>::quiet_NaN();
double infval = std::numeric_limits<double>::infinity();
std::vector<double> h_expected{1234.0,
0,
-876.0,
543.2,
-0.12,
0.25,
-0.002,
0,
-0.0,
1.28e256,
nanval,
0,
123.0,
456.0,
-178000.0,
-122.33644781999999,
infval};
std::vector<double> h_expected;
std::for_each(h_strings.begin(), h_strings.end(), [&](const char* str) {
h_expected.push_back(str ? std::atof(str) : 0);
});

auto strings_view = cudf::strings_column_view(strings);
auto results = cudf::strings::to_floats(strings_view, cudf::data_type{cudf::type_id::FLOAT64});
Expand All @@ -161,7 +133,7 @@ TEST_F(StringsConvertTest, ToFloats64)
h_expected.begin(),
h_expected.end(),
thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }));
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected);
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected, true);
}

TEST_F(StringsConvertTest, FromFloats64)
Expand Down Expand Up @@ -190,7 +162,7 @@ TEST_F(StringsConvertTest, FromFloats64)
h_expected.end(),
thrust::make_transform_iterator(h_expected.begin(), [](auto str) { return str != nullptr; }));

CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected);
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected, true);
}

TEST_F(StringsConvertTest, ZeroSizeStringsColumnFloat)
Expand Down
19 changes: 14 additions & 5 deletions cpp/tests/utilities/column_utilities.cu
Original file line number Diff line number Diff line change
Expand Up @@ -127,11 +127,20 @@ class corresponding_rows_not_equivalent {
column_device_view const& lhs, column_device_view const& rhs, size_type index)
{
if (lhs.is_valid(index) and rhs.is_valid(index)) {
int ulp = 4; // value taken from google test
T x = lhs.element<T>(index);
T y = rhs.element<T>(index);
return std::abs(x - y) > std::numeric_limits<T>::epsilon() * std::abs(x + y) * ulp &&
std::abs(x - y) >= std::numeric_limits<T>::min();
T const x = lhs.element<T>(index);
T const y = rhs.element<T>(index);

// Must handle inf and nan separately
if (std::isinf(x) || std::isinf(y)) {
return x != y; // comparison of (inf==inf) returns true
} else if (std::isnan(x) || std::isnan(y)) {
return std::isnan(x) != std::isnan(y); // comparison of (nan==nan) returns false
} else {
constexpr int ulp = 4; // ulp = unit of least precision, value taken from google test
T const abs_x_minus_y = std::abs(x - y);
return abs_x_minus_y >= std::numeric_limits<T>::min() &&
abs_x_minus_y > std::numeric_limits<T>::epsilon() * std::abs(x + y) * ulp;
}
} else {
// if either is null, then the inequality was checked already
return true;
Expand Down