From f144689d69bba97559798be4b11acbf5e8939df1 Mon Sep 17 00:00:00 2001 From: davidwendt Date: Wed, 10 Feb 2021 07:49:12 -0500 Subject: [PATCH 01/21] Add libcudf strings to/from fixed-point convert APIs --- cpp/include/cudf/fixed_point/fixed_point.hpp | 14 +- .../strings/convert/convert_fixed_point.hpp | 102 ++++++ .../cudf/strings/detail/converters.hpp | 21 +- .../strings/convert/convert_fixed_point.cu | 322 ++++++++++++++++++ cpp/tests/CMakeLists.txt | 1 + cpp/tests/strings/fixed_point_tests.cpp | 115 +++++++ 6 files changed, 568 insertions(+), 7 deletions(-) create mode 100644 cpp/include/cudf/strings/convert/convert_fixed_point.hpp create mode 100644 cpp/src/strings/convert/convert_fixed_point.cu create mode 100644 cpp/tests/strings/fixed_point_tests.cpp diff --git a/cpp/include/cudf/fixed_point/fixed_point.hpp b/cpp/include/cudf/fixed_point/fixed_point.hpp index 20246022d8e..a1b6c625b23 100644 --- a/cpp/include/cudf/fixed_point/fixed_point.hpp +++ b/cpp/include/cudf/fixed_point/fixed_point.hpp @@ -537,21 +537,23 @@ class fixed_point { explicit operator std::string() const { if (_scale < 0) { - int const n = std::pow(10, -_scale); - int const f = _value % n; + auto const av = std::abs(_value); + int const n = std::pow(10, -_scale); + int const f = av % n; auto const num_zeros = std::max(0, (-_scale - static_cast(std::to_string(f).size()))); auto const zeros = std::string(num_zeros, '0'); - return std::to_string(_value / n) + std::string(".") + zeros + - std::to_string(std::abs(_value) % n); + auto const sign = _value < 0 ? std::string("-") : std::string(); + return sign + std::to_string(av / n) + std::string(".") + zeros + std::to_string(av % n); } else { auto const zeros = std::string(_scale, '0'); return std::to_string(_value) + zeros; } } -}; // namespace numeric +}; -/** @brief Function for identifying integer overflow when adding +/** + * @brief Function for identifying integer overflow when adding * * @tparam Rep Type of integer to check for overflow on * @tparam T Types of lhs and rhs (ensures they are the same type) diff --git a/cpp/include/cudf/strings/convert/convert_fixed_point.hpp b/cpp/include/cudf/strings/convert/convert_fixed_point.hpp new file mode 100644 index 00000000000..f8d84707ecf --- /dev/null +++ b/cpp/include/cudf/strings/convert/convert_fixed_point.hpp @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +namespace cudf { +namespace strings { +/** + * @addtogroup strings_convert + * @{ + * @file + */ + +/** + * @brief Returns a new fixed-point column parsing decimal values from the + * provided strings column. + * + * Any null entries will result in corresponding null entries in the output column. + * + * Only characters [0-9] plus a prefix '-' and '+' and a single decimal point + * are recognized. When any other character is encountered, the parsing ends + * for that string and the current digits are converted into a fixed-point value. + * + * Overflow of the resulting value type is not checked. + * Each string is converted using an int64 type and then cast to the + * target type before storing it into the output column. + * The decimal point is used only for determing the output scale value. + * + * @throw cudf::logic_error if output_type is not a fixed-point type. + * + * @param strings Strings instance for this operation. + * @param output_type Type of fixed-point column to return. + * @param mr Device memory resource used to allocate the returned column's device memory. + * @return New column of `output_type`. + */ +std::unique_ptr to_fixed_point( + strings_column_view const& input, + data_type output_type, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Returns a new strings column converting the fixed-point values from the + * provided column into strings. + * + * Any null entries will result in corresponding null entries in the output column. + * + * For each value, a string is created in base-10 decimal. + * Negative numbers will include a '-' prefix. + * The column's scale value is used to place the decimal point. + * + * @throw cudf::logic_error if the input column is not a fixed-point type. + * + * @param input Fixed-point column to convert. + * @param mr Device memory resource used to allocate the returned column's device memory. + * @return New strings column. + */ +std::unique_ptr from_fixed_point( + column_view const& input, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Returns a boolean column identifying strings in which all + * characters are valid for conversion to fixed-point. + * + * The output row entry will be set to `true` if the corresponding string element + * has at least one character in [+-0-9.]. + * + * @code{.pseudo} + * Example: + * s = ['123', '-456', '', '1.2.3', '+17E30', '12.34' '.789'] + * b = s.is_hex(s) + * b is [true, false, false, false, false, true, true] + * @endcode + * + * Any null row results in a null entry for that row in the output column. + * + * @param strings Strings instance for this operation. + * @param mr Device memory resource used to allocate the returned column's device memory. + * @return New column of boolean results for each string. + */ +std::unique_ptr is_fixed_point( + strings_column_view const& input, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** @} */ // end of doxygen group +} // namespace strings +} // namespace cudf diff --git a/cpp/include/cudf/strings/detail/converters.hpp b/cpp/include/cudf/strings/detail/converters.hpp index 098dc1a38dc..d91979708e0 100644 --- a/cpp/include/cudf/strings/detail/converters.hpp +++ b/cpp/include/cudf/strings/detail/converters.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -132,6 +132,25 @@ std::unique_ptr from_durations(column_view const& durations, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); +/** + * @copydoc to_fixed_point(strings_column_view const&,data_type,rmm::mr::device_memory_resource*) + * + * @param stream CUDA stream used for device memory operations and kernel launches. + */ +std::unique_ptr to_fixed_point(strings_column_view const& strings, + data_type output_type, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + +/** + * @copydoc from_fixed_point(strings_column_view const&,rmm::mr::device_memory_resource*) + * + * @param stream CUDA stream used for device memory operations and kernel launches. + */ +std::unique_ptr from_fixed_point(column_view const& integers, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + } // namespace detail } // namespace strings } // namespace cudf diff --git a/cpp/src/strings/convert/convert_fixed_point.cu b/cpp/src/strings/convert/convert_fixed_point.cu new file mode 100644 index 00000000000..13890225754 --- /dev/null +++ b/cpp/src/strings/convert/convert_fixed_point.cu @@ -0,0 +1,322 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +#include +#include + +namespace cudf { +namespace strings { +namespace detail { +namespace { +/** + * @brief Converts strings into an integers and records decimal point. + * + * Used by the dispatch method to convert to different fixed-point types. + */ +template +struct string_to_decimal_fn { + column_device_view const d_strings; + DecimalType* values; + int32_t* scales; + + __device__ void operator()(size_type idx) + { + values[idx] = DecimalType{0}; + scales[idx] = numeric::scale_type{0}; + if (d_strings.is_null(idx)) return; + auto const d_str = d_strings.element(idx); + if (d_str.empty()) return; + auto const sign = [&] { + auto const first = d_str.data(); + if (*first == '-') return -1; + if (*first == '+') return 1; + return 0; + }(); + auto iter = d_str.begin() + (sign != 0); + + int64_t value = 0; + bool decimal_found = false; + while (iter != d_str.end()) { + auto const chr = *iter++; + if (chr >= '0' && chr <= '9') { + value = (value * 10) + static_cast(chr - '0'); + scales[idx] -= static_cast(decimal_found); + } else if (chr == '.') + decimal_found = true; + else + break; + } + values[idx] = static_cast(value * (sign == 0 ? 1 : sign)); + } +}; + +template +struct rescale_decimals_fn { + column_device_view const d_strings; + int32_t const max_scale; + int32_t const* scales; + DecimalType* values; + + __device__ void operator()(size_type idx) + { + if (d_strings.is_null(idx)) return; + numeric::scaled_integer si{values[idx], numeric::scale_type{scales[idx]}}; + numeric::fixed_point fp{si}; + values[idx] = fp.rescaled(numeric::scale_type{max_scale}).value(); + } +}; + +/** + * @brief The dispatch functions for converting strings. + * + * The output_column is expected to be one of the integer types only. + */ +struct dispatch_to_fixed_point_fn { + template ()>* = nullptr> + std::unique_ptr operator()(strings_column_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const + { + using DecimalType = device_storage_type_t; + rmm::device_uvector d_scales(input.size(), stream); + rmm::device_uvector d_values(input.size(), stream, mr); + auto d_column = column_device_view::create(input.parent(), stream); + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + input.size(), + string_to_decimal_fn{*d_column, d_values.data(), d_scales.data()}); + // find the maximum scale size -- min is used since all scale values will be <= 0 + auto const min_elem = + thrust::min_element(rmm::exec_policy(stream), d_scales.begin(), d_scales.end()); + auto const scale = d_scales.element(thrust::distance(d_scales.begin(), min_elem), stream); + // re-scale all the values to the max scale + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + input.size(), + rescale_decimals_fn{*d_column, scale, d_scales.data(), d_values.data()}); + // build output column + return std::make_unique(data_type{type_to_id(), scale}, + input.size(), + d_values.release(), + cudf::detail::copy_bitmask(input.parent(), stream, mr), + input.null_count()); + } + // non-integral types throw an exception + template ()>* = nullptr> + std::unique_ptr operator()(strings_column_view const&, + rmm::cuda_stream_view, + rmm::mr::device_memory_resource*) const + { + CUDF_FAIL("Output for to_fixed_point must be an fixed-point type."); + } +}; + +} // namespace + +// This will convert a strings column into any integer column type. +std::unique_ptr to_fixed_point(strings_column_view const& input, + data_type output_type, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + if (input.is_empty()) return make_empty_column(output_type); + return type_dispatcher(output_type, dispatch_to_fixed_point_fn{}, input, stream, mr); +} + +} // namespace detail + +// external API +std::unique_ptr to_fixed_point(strings_column_view const& strings, + data_type output_type, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::to_fixed_point(strings, output_type, rmm::cuda_stream_default, mr); +} + +namespace detail { +namespace { +/** + * @brief Calculate the size of the each string required for + * converting each value in base-10 format. + */ +template +struct decimal_to_string_size_fn { + column_device_view const d_column; + + __device__ int32_t operator()(size_type idx) const + { + if (d_column.is_null(idx)) return 0; + auto const value = d_column.element(idx); + auto const scale = d_column.type().scale(); + + if (scale >= 0) return count_digits(value) + scale; + + auto const abs_value = std::abs(value); + auto const exp_ten = static_cast(exp10(static_cast(-scale))); + auto const num_zeros = std::max(0, (-scale - count_digits(abs_value % exp_ten))); + return static_cast(value < 0) + count_digits(abs_value / exp_ten) + 1 + num_zeros + + count_digits(abs_value % exp_ten); + } +}; + +/** + * @brief Convert each value into a string. + * + * The value is converted into base-10 using only characters [0-9]. + */ +template +struct decimal_to_string_fn { + column_device_view const d_column; + int32_t const* d_offsets; + char* d_chars; + + __device__ void operator()(size_type idx) + { + if (d_column.is_null(idx)) return; + auto const value = d_column.element(idx); + auto const scale = d_column.type().scale(); + char* d_buffer = d_chars + d_offsets[idx]; + if (scale >= 0) { + integer_to_string(value, d_buffer); + d_buffer += count_digits(value); + // add zeros + thrust::generate_n(thrust::seq, d_buffer, scale, []() { return '0'; }); + return; + } + + // scale < 0 + // write format: [-]integer.fraction + // where integer = abs(value) / (10 ^ abs(scale)) + // fraction = abs(value) % (10 ^ abs(scale)) + auto const abs_value = std::abs(value); + if (value < 0) *d_buffer++ = '-'; // add sign + auto const exp_ten = static_cast(exp10(static_cast(-scale))); + auto const num_zeros = std::max(0, (-scale - count_digits(abs_value % exp_ten))); + // add the integer part + integer_to_string(abs_value / exp_ten, d_buffer); + d_buffer += count_digits(abs_value / exp_ten); + *d_buffer++ = '.'; // add decimal point + // add zeros + thrust::generate_n(thrust::seq, d_buffer, num_zeros, []() { return '0'; }); + d_buffer += num_zeros; + // add the fractional part + integer_to_string(abs_value % exp_ten, d_buffer); + } +}; + +/** + * @brief The dispatcher functor for converting fixed-point values into strings. + */ +struct dispatch_from_fixed_point_fn { + template ()>* = nullptr> + std::unique_ptr operator()(column_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const + { + using DecimalType = device_storage_type_t; // underlying value type + auto d_column = column_device_view::create(input, stream); + + // build offsets column + // !!!!!!!!!!!!!!!!!!!! use cudf::detail::make-something-iterator + auto offsets_transformer_itr = + thrust::make_transform_iterator(thrust::make_counting_iterator(0), + decimal_to_string_size_fn{*d_column}); + // !!!!!!!!!!!!!!!!!!!! + auto offsets_column = detail::make_offsets_child_column( + offsets_transformer_itr, offsets_transformer_itr + input.size(), stream, mr); + auto d_offsets = offsets_column->view().template data(); + + // build chars column + auto const bytes = + cudf::detail::get_value(offsets_column->view(), input.size(), stream); + auto chars_column = + detail::create_chars_child_column(input.size(), input.null_count(), bytes, stream, mr); + auto chars_view = chars_column->mutable_view(); + auto d_chars = chars_view.template data(); + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + input.size(), + decimal_to_string_fn{*d_column, d_offsets, d_chars}); + + return make_strings_column(input.size(), + std::move(offsets_column), + std::move(chars_column), + input.null_count(), + cudf::detail::copy_bitmask(input, stream, mr), + stream, + mr); + } + + // non-integral types throw an exception + template ()>* = nullptr> + std::unique_ptr operator()(column_view const&, + rmm::cuda_stream_view, + rmm::mr::device_memory_resource*) const + { + CUDF_FAIL("Values for from_fixed_point function must be an fixed-point type."); + } +}; + +} // namespace + +// This will convert all integer column types into a strings column. +std::unique_ptr from_fixed_point(column_view const& integers, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + size_type strings_count = integers.size(); + if (strings_count == 0) return detail::make_empty_strings_column(stream, mr); + + return type_dispatcher(integers.type(), dispatch_from_fixed_point_fn{}, integers, stream, mr); +} + +} // namespace detail + +// external API + +std::unique_ptr from_fixed_point(column_view const& integers, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::from_fixed_point(integers, rmm::cuda_stream_default, mr); +} + +} // namespace strings +} // namespace cudf diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 55b1d50767f..1dcbf17f9e5 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -576,6 +576,7 @@ set(STRINGS_TEST_SRC "${CMAKE_CURRENT_SOURCE_DIR}/strings/findall_tests.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/strings/find_tests.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/strings/find_multiple_tests.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/strings/fixed_point_tests.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/strings/floats_tests.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/strings/hash_string.cu" "${CMAKE_CURRENT_SOURCE_DIR}/strings/integers_tests.cu" diff --git a/cpp/tests/strings/fixed_point_tests.cpp b/cpp/tests/strings/fixed_point_tests.cpp new file mode 100644 index 00000000000..717dc42f897 --- /dev/null +++ b/cpp/tests/strings/fixed_point_tests.cpp @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include + +struct StringsConvertTest : public cudf::test::BaseFixture { +}; + +template +class StringsFixedPointConvertTest : public StringsConvertTest { +}; + +TYPED_TEST_CASE(StringsFixedPointConvertTest, cudf::test::FixedPointTypes); + +TYPED_TEST(StringsFixedPointConvertTest, ToFixedPoint) +{ + using DecimalType = TypeParam; + using RepType = cudf::device_storage_type_t; + using fp_wrapper = cudf::test::fixed_point_column_wrapper; + + cudf::test::strings_column_wrapper strings( + {"1234", "-876", "543.2", "-0.12", ".25", "-.002", "", "-0.0"}); + auto results = cudf::strings::to_fixed_point(cudf::strings_column_view(strings), + cudf::data_type{cudf::type_to_id()}); + auto const expected = + fp_wrapper{{1234000, -876000, 543200, -120, 250, -2, 0, 0}, numeric::scale_type{-3}}; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); + + cudf::test::strings_column_wrapper strings_nulls( + {"1234", "-876", "543", "900000", "2500000", "", ""}, {1, 1, 1, 1, 1, 1, 0}); + results = cudf::strings::to_fixed_point(cudf::strings_column_view(strings_nulls), + cudf::data_type{cudf::type_to_id()}); + auto const expected_nulls = fp_wrapper{ + {1234, -876, 543, 900000, 2500000, 0, 0}, {1, 1, 1, 1, 1, 1, 0}, numeric::scale_type{0}}; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_nulls); +} + +TYPED_TEST(StringsFixedPointConvertTest, FromFixedPoint) +{ + using DecimalType = TypeParam; + using RepType = cudf::device_storage_type_t; + using fp_wrapper = cudf::test::fixed_point_column_wrapper; + + auto const negative_scale = + fp_wrapper{{110, 222, 3330, 4444, -550, -6666}, numeric::scale_type{-2}}; + auto results = cudf::strings::from_fixed_point(negative_scale); + cudf::test::strings_column_wrapper negative_expected( + {"1.10", "2.22", "33.30", "44.44", "-5.50", "-66.66"}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, negative_expected); + + auto const positive_scale = + fp_wrapper({110, -222, 3330, 4, -550, 0}, {1, 1, 1, 1, 1, 0}, numeric::scale_type{2}); + results = cudf::strings::from_fixed_point(positive_scale); + cudf::test::strings_column_wrapper positive_expected( + {"11000", "-22200", "333000", "400", "-55000", ""}, {1, 1, 1, 1, 1, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, positive_expected); + + auto const zero_scale = + fp_wrapper({0, -222, 3330, 4, -550, 0}, {0, 1, 1, 1, 1, 1}, numeric::scale_type{0}); + results = cudf::strings::from_fixed_point(zero_scale); + cudf::test::strings_column_wrapper zero_expected({"", "-222", "3330", "4", "-550", "0"}, + {0, 1, 1, 1, 1, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, zero_expected); +} + +TEST_F(StringsConvertTest, ZeroSizeStringsColumnFixedPoint) +{ + cudf::column_view zero_size_column( + cudf::data_type{cudf::type_id::DECIMAL32}, 0, nullptr, nullptr, 0); + auto results = cudf::strings::from_fixed_point(zero_size_column); + cudf::test::expect_strings_empty(results->view()); +} + +TEST_F(StringsConvertTest, ZeroSizeFixedPointColumn) +{ + cudf::column_view zero_size_column( + cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0); + auto results = + cudf::strings::to_fixed_point(zero_size_column, cudf::data_type{cudf::type_id::DECIMAL32}); + EXPECT_EQ(0, results->size()); +} + +TEST_F(StringsConvertTest, FromToFixedPointError) +{ + auto dtype = cudf::data_type{cudf::type_id::INT32}; + auto column = cudf::make_numeric_column(dtype, 100); + EXPECT_THROW(cudf::strings::from_fixed_point(column->view()), cudf::logic_error); + + cudf::test::strings_column_wrapper strings{"this string intentionally left blank"}; + EXPECT_THROW(cudf::strings::to_fixed_point(column->view(), dtype), cudf::logic_error); +} From 66ab11167782d4d382092f09a652d0e32b6a19d0 Mon Sep 17 00:00:00 2001 From: davidwendt Date: Wed, 10 Feb 2021 08:43:42 -0500 Subject: [PATCH 02/21] use make_counting_transform_iterator utility --- cpp/src/strings/convert/convert_fixed_point.cu | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/cpp/src/strings/convert/convert_fixed_point.cu b/cpp/src/strings/convert/convert_fixed_point.cu index 13890225754..149bc9484fb 100644 --- a/cpp/src/strings/convert/convert_fixed_point.cu +++ b/cpp/src/strings/convert/convert_fixed_point.cu @@ -17,15 +17,14 @@ #include #include #include +#include #include #include -#include #include #include #include #include #include -#include #include #include @@ -36,7 +35,6 @@ #include #include -#include namespace cudf { namespace strings { @@ -254,11 +252,8 @@ struct dispatch_from_fixed_point_fn { auto d_column = column_device_view::create(input, stream); // build offsets column - // !!!!!!!!!!!!!!!!!!!! use cudf::detail::make-something-iterator - auto offsets_transformer_itr = - thrust::make_transform_iterator(thrust::make_counting_iterator(0), - decimal_to_string_size_fn{*d_column}); - // !!!!!!!!!!!!!!!!!!!! + auto offsets_transformer_itr = cudf::detail::make_counting_transform_iterator( + 0, decimal_to_string_size_fn{*d_column}); auto offsets_column = detail::make_offsets_child_column( offsets_transformer_itr, offsets_transformer_itr + input.size(), stream, mr); auto d_offsets = offsets_column->view().template data(); From 224f07b3f098edd58e3ffefa9054602a87114220 Mon Sep 17 00:00:00 2001 From: davidwendt Date: Wed, 10 Feb 2021 08:53:11 -0500 Subject: [PATCH 03/21] fix doxygen example for is_fixed_point --- cpp/include/cudf/strings/convert/convert_fixed_point.hpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/cpp/include/cudf/strings/convert/convert_fixed_point.hpp b/cpp/include/cudf/strings/convert/convert_fixed_point.hpp index f8d84707ecf..f2e6628f41d 100644 --- a/cpp/include/cudf/strings/convert/convert_fixed_point.hpp +++ b/cpp/include/cudf/strings/convert/convert_fixed_point.hpp @@ -37,9 +37,7 @@ namespace strings { * for that string and the current digits are converted into a fixed-point value. * * Overflow of the resulting value type is not checked. - * Each string is converted using an int64 type and then cast to the - * target type before storing it into the output column. - * The decimal point is used only for determing the output scale value. + * The decimal point is used only for determining the output scale value. * * @throw cudf::logic_error if output_type is not a fixed-point type. * @@ -83,7 +81,7 @@ std::unique_ptr from_fixed_point( * @code{.pseudo} * Example: * s = ['123', '-456', '', '1.2.3', '+17E30', '12.34' '.789'] - * b = s.is_hex(s) + * b = s.is_fixed_point(s) * b is [true, false, false, false, false, true, true] * @endcode * From b11a5a9874c7c9711deb750efd2f1d1d9e449d55 Mon Sep 17 00:00:00 2001 From: davidwendt Date: Wed, 10 Feb 2021 09:06:38 -0500 Subject: [PATCH 04/21] add new header file to meta.yaml --- conda/recipes/libcudf/meta.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index a1953a2d358..d8799e276c5 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -153,6 +153,7 @@ test: - test -f $PREFIX/include/cudf/strings/convert/convert_booleans.hpp - test -f $PREFIX/include/cudf/strings/convert/convert_datetime.hpp - test -f $PREFIX/include/cudf/strings/convert/convert_durations.hpp + - test -f $PREFIX/include/cudf/strings/convert/convert_fixed_point.hpp - test -f $PREFIX/include/cudf/strings/convert/convert_floats.hpp - test -f $PREFIX/include/cudf/strings/convert/convert_integers.hpp - test -f $PREFIX/include/cudf/strings/convert/convert_ipv4.hpp From bf68768d3917633348755cdac09e92b06051519c Mon Sep 17 00:00:00 2001 From: davidwendt Date: Wed, 10 Feb 2021 09:06:59 -0500 Subject: [PATCH 05/21] add examples to doxygen comments --- .../strings/convert/convert_fixed_point.hpp | 20 ++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/cpp/include/cudf/strings/convert/convert_fixed_point.hpp b/cpp/include/cudf/strings/convert/convert_fixed_point.hpp index f2e6628f41d..911f7c0bebe 100644 --- a/cpp/include/cudf/strings/convert/convert_fixed_point.hpp +++ b/cpp/include/cudf/strings/convert/convert_fixed_point.hpp @@ -36,6 +36,13 @@ namespace strings { * are recognized. When any other character is encountered, the parsing ends * for that string and the current digits are converted into a fixed-point value. * + * @code{.pseudo} + * Example: + * s = ['123', '-876', '543.2', '-0.12'] + * fp = to_fixed_point(s) + * fp is [123400, -87600, 54320, -12] with scale = -2 + * @endcode + * * Overflow of the resulting value type is not checked. * The decimal point is used only for determining the output scale value. * @@ -61,6 +68,13 @@ std::unique_ptr to_fixed_point( * Negative numbers will include a '-' prefix. * The column's scale value is used to place the decimal point. * + * @code{.pseudo} + * Example: + * fp is [110, 222, 3330, -440, -1] with scale = -2 + * s = from_fixed_point(fp) + * s is now ['1.10', '2.22', '33.30', '-4.40', '-0.01'] + * @endcode + * * @throw cudf::logic_error if the input column is not a fixed-point type. * * @param input Fixed-point column to convert. @@ -80,9 +94,9 @@ std::unique_ptr from_fixed_point( * * @code{.pseudo} * Example: - * s = ['123', '-456', '', '1.2.3', '+17E30', '12.34' '.789'] - * b = s.is_fixed_point(s) - * b is [true, false, false, false, false, true, true] + * s = ['123', '-456', '', '1.2.3', '+17E30', '12.34' '.789', '-0.005] + * b = is_fixed_point(s) + * b is [true, true, false, false, false, true, true, true] * @endcode * * Any null row results in a null entry for that row in the output column. From b2b232effcc4d548baa88f57ee48e3ea2de475b5 Mon Sep 17 00:00:00 2001 From: davidwendt Date: Wed, 10 Feb 2021 09:07:56 -0500 Subject: [PATCH 06/21] include fraction-only test value in FromFixedPoint test --- cpp/tests/strings/fixed_point_tests.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/cpp/tests/strings/fixed_point_tests.cpp b/cpp/tests/strings/fixed_point_tests.cpp index 717dc42f897..4b88aabae5d 100644 --- a/cpp/tests/strings/fixed_point_tests.cpp +++ b/cpp/tests/strings/fixed_point_tests.cpp @@ -65,11 +65,10 @@ TYPED_TEST(StringsFixedPointConvertTest, FromFixedPoint) using RepType = cudf::device_storage_type_t; using fp_wrapper = cudf::test::fixed_point_column_wrapper; - auto const negative_scale = - fp_wrapper{{110, 222, 3330, 4444, -550, -6666}, numeric::scale_type{-2}}; - auto results = cudf::strings::from_fixed_point(negative_scale); + auto const negative_scale = fp_wrapper{{110, 222, 3330, 4444, -550, -6}, numeric::scale_type{-2}}; + auto results = cudf::strings::from_fixed_point(negative_scale); cudf::test::strings_column_wrapper negative_expected( - {"1.10", "2.22", "33.30", "44.44", "-5.50", "-66.66"}); + {"1.10", "2.22", "33.30", "44.44", "-5.50", "-0.06"}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, negative_expected); auto const positive_scale = From 00915858e1cc20fd993ce6c0e4ec5d5378e9f72a Mon Sep 17 00:00:00 2001 From: davidwendt Date: Wed, 10 Feb 2021 14:20:59 -0500 Subject: [PATCH 07/21] fix doxygen comments --- cpp/include/cudf/strings/convert/convert_fixed_point.hpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cpp/include/cudf/strings/convert/convert_fixed_point.hpp b/cpp/include/cudf/strings/convert/convert_fixed_point.hpp index 911f7c0bebe..deb739d1dd7 100644 --- a/cpp/include/cudf/strings/convert/convert_fixed_point.hpp +++ b/cpp/include/cudf/strings/convert/convert_fixed_point.hpp @@ -90,7 +90,7 @@ std::unique_ptr from_fixed_point( * characters are valid for conversion to fixed-point. * * The output row entry will be set to `true` if the corresponding string element - * has at least one character in [+-0-9.]. + * has at least one character in [+-0123456789.]. * * @code{.pseudo} * Example: @@ -101,12 +101,14 @@ std::unique_ptr from_fixed_point( * * Any null row results in a null entry for that row in the output column. * - * @param strings Strings instance for this operation. + * @param input Strings instance for this operation. + * @param decimal_type Fixed-point type used only for checking overflow * @param mr Device memory resource used to allocate the returned column's device memory. * @return New column of boolean results for each string. */ std::unique_ptr is_fixed_point( strings_column_view const& input, + data_type decimal_type = data_type{type_id::DECIMAL64}, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group From 8cde31a90f245f418d0e4e3954f8557b679210e4 Mon Sep 17 00:00:00 2001 From: davidwendt Date: Wed, 10 Feb 2021 14:21:17 -0500 Subject: [PATCH 08/21] add is_fixed_point code logic --- .../strings/convert/convert_fixed_point.cu | 143 +++++++++++++++--- cpp/tests/strings/fixed_point_tests.cpp | 48 +++++- 2 files changed, 158 insertions(+), 33 deletions(-) diff --git a/cpp/src/strings/convert/convert_fixed_point.cu b/cpp/src/strings/convert/convert_fixed_point.cu index 149bc9484fb..8276d8b3abe 100644 --- a/cpp/src/strings/convert/convert_fixed_point.cu +++ b/cpp/src/strings/convert/convert_fixed_point.cu @@ -41,9 +41,7 @@ namespace strings { namespace detail { namespace { /** - * @brief Converts strings into an integers and records decimal point. - * - * Used by the dispatch method to convert to different fixed-point types. + * @brief Converts strings into an integers and records decimal places. */ template struct string_to_decimal_fn { @@ -55,9 +53,11 @@ struct string_to_decimal_fn { { values[idx] = DecimalType{0}; scales[idx] = numeric::scale_type{0}; + if (d_strings.is_null(idx)) return; auto const d_str = d_strings.element(idx); if (d_str.empty()) return; + auto const sign = [&] { auto const first = d_str.data(); if (*first == '-') return -1; @@ -78,30 +78,67 @@ struct string_to_decimal_fn { else break; } + values[idx] = static_cast(value * (sign == 0 ? 1 : sign)); } }; +/** + * @brief This only checks the string format for valid decimal characters. + */ +template +struct string_to_decimal_check_fn { + column_device_view const d_strings; + + __device__ bool operator()(size_type idx) + { + if (d_strings.is_null(idx)) return false; + auto const d_str = d_strings.element(idx); + if (d_str.empty()) return false; + + auto iter = d_str.begin() + (d_str.data()[0] == '-' || d_str.data()[0] == '+'); + + DecimalType value = 0; + bool decimal_found = false; + while (iter != d_str.end()) { + auto const chr = *iter++; + if (chr == '.' && !decimal_found) { + decimal_found = true; + continue; + } + if (chr < '0' || chr > '9') return false; + auto const digit = static_cast(chr - '0'); + auto const max_check = (std::numeric_limits::max() - digit) / DecimalType{10}; + if (value > max_check) return false; + value = (value * DecimalType{10}) + digit; + } + return true; + } +}; + +/** + * @brief Uses the max_scale to rescale all the integer values. + */ template struct rescale_decimals_fn { column_device_view const d_strings; - int32_t const max_scale; - int32_t const* scales; - DecimalType* values; + int32_t const new_scale; + int32_t const* scales; // original scales + DecimalType* values; // values to rescale __device__ void operator()(size_type idx) { if (d_strings.is_null(idx)) return; - numeric::scaled_integer si{values[idx], numeric::scale_type{scales[idx]}}; - numeric::fixed_point fp{si}; - values[idx] = fp.rescaled(numeric::scale_type{max_scale}).value(); + + auto const fp = numeric::fixed_point{ + numeric::scaled_integer{values[idx], numeric::scale_type{scales[idx]}}}; + + values[idx] = fp.rescaled(numeric::scale_type{new_scale}).value(); } }; /** - * @brief The dispatch functions for converting strings. - * - * The output_column is expected to be one of the integer types only. + * @brief The dispatch function for converting strings column to fixed-point column. */ struct dispatch_to_fixed_point_fn { template ()>* = nullptr> @@ -118,11 +155,11 @@ struct dispatch_to_fixed_point_fn { thrust::make_counting_iterator(0), input.size(), string_to_decimal_fn{*d_column, d_values.data(), d_scales.data()}); - // find the maximum scale size -- min is used since all scale values will be <= 0 + // find the largest scale size -- min is used since all scale values will be <= 0 auto const min_elem = thrust::min_element(rmm::exec_policy(stream), d_scales.begin(), d_scales.end()); auto const scale = d_scales.element(thrust::distance(d_scales.begin(), min_elem), stream); - // re-scale all the values to the max scale + // re-scale all the values to the new scale thrust::for_each_n( rmm::exec_policy(stream), thrust::make_counting_iterator(0), @@ -135,13 +172,13 @@ struct dispatch_to_fixed_point_fn { cudf::detail::copy_bitmask(input.parent(), stream, mr), input.null_count()); } - // non-integral types throw an exception + template ()>* = nullptr> std::unique_ptr operator()(strings_column_view const&, rmm::cuda_stream_view, rmm::mr::device_memory_resource*) const { - CUDF_FAIL("Output for to_fixed_point must be an fixed-point type."); + CUDF_FAIL("Output for to_fixed_point must be a decimal type."); } }; @@ -279,7 +316,6 @@ struct dispatch_from_fixed_point_fn { mr); } - // non-integral types throw an exception template ()>* = nullptr> std::unique_ptr operator()(column_view const&, rmm::cuda_stream_view, @@ -291,26 +327,83 @@ struct dispatch_from_fixed_point_fn { } // namespace -// This will convert all integer column types into a strings column. -std::unique_ptr from_fixed_point(column_view const& integers, +std::unique_ptr from_fixed_point(column_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - size_type strings_count = integers.size(); - if (strings_count == 0) return detail::make_empty_strings_column(stream, mr); - - return type_dispatcher(integers.type(), dispatch_from_fixed_point_fn{}, integers, stream, mr); + if (input.is_empty()) return detail::make_empty_strings_column(stream, mr); + return type_dispatcher(input.type(), dispatch_from_fixed_point_fn{}, input, stream, mr); } } // namespace detail // external API -std::unique_ptr from_fixed_point(column_view const& integers, +std::unique_ptr from_fixed_point(column_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::from_fixed_point(integers, rmm::cuda_stream_default, mr); + return detail::from_fixed_point(input, rmm::cuda_stream_default, mr); +} + +namespace detail { +namespace { + +struct dispatch_is_fixed_point_fn { + template ()>* = nullptr> + std::unique_ptr operator()(strings_column_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const + { + using DecimalType = device_storage_type_t; + auto d_column = column_device_view::create(input.parent(), stream); + + // create output column + auto results = make_numeric_column(data_type{type_id::BOOL8}, + input.size(), + cudf::detail::copy_bitmask(input.parent(), stream, mr), + input.null_count(), + stream, + mr); + auto d_results = results->mutable_view().data(); + + // check strings for valid fixed-point chars + thrust::transform(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(input.size()), + d_results, + string_to_decimal_check_fn{*d_column}); + results->set_null_count(input.null_count()); + return results; + } + + template ()>* = nullptr> + std::unique_ptr operator()(strings_column_view const&, + rmm::cuda_stream_view, + rmm::mr::device_memory_resource*) const + { + CUDF_FAIL("is_fixed_point is expecting a decimal type"); + } +}; + +} // namespace + +std::unique_ptr is_fixed_point(strings_column_view const& input, + data_type decimal_type, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + if (input.is_empty()) return cudf::make_empty_column(data_type{type_id::BOOL8}); + return type_dispatcher(decimal_type, dispatch_is_fixed_point_fn{}, input, stream, mr); +} +} // namespace detail + +std::unique_ptr is_fixed_point(strings_column_view const& input, + data_type decimal_type, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::is_fixed_point(input, decimal_type, rmm::cuda_stream_default, mr); } } // namespace strings diff --git a/cpp/tests/strings/fixed_point_tests.cpp b/cpp/tests/strings/fixed_point_tests.cpp index 4b88aabae5d..82a473ee51c 100644 --- a/cpp/tests/strings/fixed_point_tests.cpp +++ b/cpp/tests/strings/fixed_point_tests.cpp @@ -88,18 +88,20 @@ TYPED_TEST(StringsFixedPointConvertTest, FromFixedPoint) TEST_F(StringsConvertTest, ZeroSizeStringsColumnFixedPoint) { - cudf::column_view zero_size_column( - cudf::data_type{cudf::type_id::DECIMAL32}, 0, nullptr, nullptr, 0); - auto results = cudf::strings::from_fixed_point(zero_size_column); + auto zero_size_column = cudf::make_empty_column(cudf::data_type{cudf::type_id::DECIMAL32}); + + auto results = cudf::strings::from_fixed_point(zero_size_column->view()); cudf::test::expect_strings_empty(results->view()); } TEST_F(StringsConvertTest, ZeroSizeFixedPointColumn) { - cudf::column_view zero_size_column( - cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0); - auto results = - cudf::strings::to_fixed_point(zero_size_column, cudf::data_type{cudf::type_id::DECIMAL32}); + auto zero_size_column = cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); + + auto results = cudf::strings::to_fixed_point(zero_size_column->view(), + cudf::data_type{cudf::type_id::DECIMAL32}); + EXPECT_EQ(0, results->size()); + results = cudf::strings::is_fixed_point(zero_size_column->view()); EXPECT_EQ(0, results->size()); } @@ -110,5 +112,35 @@ TEST_F(StringsConvertTest, FromToFixedPointError) EXPECT_THROW(cudf::strings::from_fixed_point(column->view()), cudf::logic_error); cudf::test::strings_column_wrapper strings{"this string intentionally left blank"}; - EXPECT_THROW(cudf::strings::to_fixed_point(column->view(), dtype), cudf::logic_error); + cudf::strings_column_view strings_view(strings); + EXPECT_THROW(cudf::strings::to_fixed_point(strings_view, dtype), cudf::logic_error); + EXPECT_THROW(cudf::strings::is_fixed_point(strings_view, dtype), cudf::logic_error); +} + +TEST_F(StringsConvertTest, IsFixedPoint) +{ + cudf::test::strings_column_wrapper strings( + {"1234", "+876", "543.2", "-00.120", "1E34", "1.0.02", "", "-0.0"}); + auto results = cudf::strings::is_fixed_point(cudf::strings_column_view(strings)); + auto const expected = cudf::test::fixed_width_column_wrapper( + {true, true, true, true, false, false, false, true}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); + + cudf::test::strings_column_wrapper big_numbers({"2147483647", + "-2147483647", + "2147483648", + "9223372036854775807", + "-9223372036854775807", + "9223372036854775808"}); + results = cudf::strings::is_fixed_point(cudf::strings_column_view(big_numbers), + cudf::data_type{cudf::type_id::DECIMAL32}); + auto const expected32 = + cudf::test::fixed_width_column_wrapper({true, true, false, false, false, false}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected32); + + results = cudf::strings::is_fixed_point(cudf::strings_column_view(big_numbers), + cudf::data_type{cudf::type_id::DECIMAL64}); + auto const expected64 = + cudf::test::fixed_width_column_wrapper({true, true, true, true, true, false}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected64); } From f3825e0561824e73196926aa7ec2af270496e1a5 Mon Sep 17 00:00:00 2001 From: davidwendt Date: Thu, 11 Feb 2021 08:26:14 -0500 Subject: [PATCH 09/21] add missing @throw doxygen tag --- .../strings/convert/convert_fixed_point.hpp | 20 ++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/cpp/include/cudf/strings/convert/convert_fixed_point.hpp b/cpp/include/cudf/strings/convert/convert_fixed_point.hpp index deb739d1dd7..c978608b3b8 100644 --- a/cpp/include/cudf/strings/convert/convert_fixed_point.hpp +++ b/cpp/include/cudf/strings/convert/convert_fixed_point.hpp @@ -44,9 +44,9 @@ namespace strings { * @endcode * * Overflow of the resulting value type is not checked. - * The decimal point is used only for determining the output scale value. + * The decimal point is used for determining the output scale value. * - * @throw cudf::logic_error if output_type is not a fixed-point type. + * @throw cudf::logic_error if output_type is not a fixed-point decimal type. * * @param strings Strings instance for this operation. * @param output_type Type of fixed-point column to return. @@ -59,14 +59,15 @@ std::unique_ptr to_fixed_point( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Returns a new strings column converting the fixed-point values from the - * provided column into strings. + * @brief Returns a new strings column converting the fixed-point values + * into a strings column. * * Any null entries will result in corresponding null entries in the output column. * * For each value, a string is created in base-10 decimal. * Negative numbers will include a '-' prefix. * The column's scale value is used to place the decimal point. + * A negative scale value may add padded zeros after the decimal point. * * @code{.pseudo} * Example: @@ -75,7 +76,7 @@ std::unique_ptr to_fixed_point( * s is now ['1.10', '2.22', '33.30', '-4.40', '-0.01'] * @endcode * - * @throw cudf::logic_error if the input column is not a fixed-point type. + * @throw cudf::logic_error if the input column is not a fixed-point decimal type. * * @param input Fixed-point column to convert. * @param mr Device memory resource used to allocate the returned column's device memory. @@ -90,7 +91,10 @@ std::unique_ptr from_fixed_point( * characters are valid for conversion to fixed-point. * * The output row entry will be set to `true` if the corresponding string element - * has at least one character in [+-0123456789.]. + * has at least one character in [+-0123456789.]. The optional sign character + * must only be in the first position. The decimal point may only appear once. + * Also, the integer component must fit within the size limits of the + * underlying fixed-point storage type. * * @code{.pseudo} * Example: @@ -101,8 +105,10 @@ std::unique_ptr from_fixed_point( * * Any null row results in a null entry for that row in the output column. * + * @throw cudf::logic_error if the `decimal_type` is not a fixed-point decimal type. + * * @param input Strings instance for this operation. - * @param decimal_type Fixed-point type used only for checking overflow + * @param decimal_type Fixed-point type used only for checking overflow. * @param mr Device memory resource used to allocate the returned column's device memory. * @return New column of boolean results for each string. */ From ea7090714547145e78c30c0bb016b88caead1f3b Mon Sep 17 00:00:00 2001 From: davidwendt Date: Thu, 11 Feb 2021 08:26:50 -0500 Subject: [PATCH 10/21] add const to variables and member functions --- .../strings/convert/convert_fixed_point.cu | 50 ++++++++++++------- 1 file changed, 31 insertions(+), 19 deletions(-) diff --git a/cpp/src/strings/convert/convert_fixed_point.cu b/cpp/src/strings/convert/convert_fixed_point.cu index 8276d8b3abe..d73ac779073 100644 --- a/cpp/src/strings/convert/convert_fixed_point.cu +++ b/cpp/src/strings/convert/convert_fixed_point.cu @@ -46,10 +46,10 @@ namespace { template struct string_to_decimal_fn { column_device_view const d_strings; - DecimalType* values; - int32_t* scales; + DecimalType* const values; + int32_t* const scales; - __device__ void operator()(size_type idx) + __device__ void operator()(size_type idx) const { values[idx] = DecimalType{0}; scales[idx] = numeric::scale_type{0}; @@ -90,7 +90,7 @@ template struct string_to_decimal_check_fn { column_device_view const d_strings; - __device__ bool operator()(size_type idx) + __device__ bool operator()(size_type idx) const { if (d_strings.is_null(idx)) return false; auto const d_str = d_strings.element(idx); @@ -123,10 +123,10 @@ template struct rescale_decimals_fn { column_device_view const d_strings; int32_t const new_scale; - int32_t const* scales; // original scales - DecimalType* values; // values to rescale + int32_t const* scales; // original scales + DecimalType* const values; // values to rescale - __device__ void operator()(size_type idx) + __device__ void operator()(size_type idx) const { if (d_strings.is_null(idx)) return; @@ -147,25 +147,30 @@ struct dispatch_to_fixed_point_fn { rmm::mr::device_memory_resource* mr) const { using DecimalType = device_storage_type_t; - rmm::device_uvector d_scales(input.size(), stream); - rmm::device_uvector d_values(input.size(), stream, mr); - auto d_column = column_device_view::create(input.parent(), stream); + + auto d_scales = rmm::device_uvector(input.size(), stream); + auto d_values = rmm::device_uvector(input.size(), stream, mr); + + auto const d_column = column_device_view::create(input.parent(), stream); + // compute the integer component values and count the decimal places thrust::for_each_n( rmm::exec_policy(stream), thrust::make_counting_iterator(0), input.size(), string_to_decimal_fn{*d_column, d_values.data(), d_scales.data()}); + // find the largest scale size -- min is used since all scale values will be <= 0 auto const min_elem = thrust::min_element(rmm::exec_policy(stream), d_scales.begin(), d_scales.end()); auto const scale = d_scales.element(thrust::distance(d_scales.begin(), min_elem), stream); + // re-scale all the values to the new scale thrust::for_each_n( rmm::exec_policy(stream), thrust::make_counting_iterator(0), input.size(), rescale_decimals_fn{*d_column, scale, d_scales.data(), d_values.data()}); - // build output column + return std::make_unique(data_type{type_to_id(), scale}, input.size(), d_values.release(), @@ -210,6 +215,8 @@ namespace { /** * @brief Calculate the size of the each string required for * converting each value in base-10 format. + * + * ouput format is [-]integer.fraction */ template struct decimal_to_string_size_fn { @@ -226,15 +233,19 @@ struct decimal_to_string_size_fn { auto const abs_value = std::abs(value); auto const exp_ten = static_cast(exp10(static_cast(-scale))); auto const num_zeros = std::max(0, (-scale - count_digits(abs_value % exp_ten))); - return static_cast(value < 0) + count_digits(abs_value / exp_ten) + 1 + num_zeros + - count_digits(abs_value % exp_ten); + return static_cast(value < 0) + // sign if negative + count_digits(abs_value / exp_ten) + // integer + 1 + // decimal point + num_zeros + // zeros padding + count_digits(abs_value % exp_ten); // fraction } }; /** * @brief Convert each value into a string. * - * The value is converted into base-10 using only characters [0-9]. + * The value is converted into base-10 digits [0-9] + * plus the decimal point and a negative sign prefix. */ template struct decimal_to_string_fn { @@ -286,22 +297,22 @@ struct dispatch_from_fixed_point_fn { rmm::mr::device_memory_resource* mr) const { using DecimalType = device_storage_type_t; // underlying value type - auto d_column = column_device_view::create(input, stream); + + auto const d_column = column_device_view::create(input, stream); // build offsets column auto offsets_transformer_itr = cudf::detail::make_counting_transform_iterator( 0, decimal_to_string_size_fn{*d_column}); auto offsets_column = detail::make_offsets_child_column( offsets_transformer_itr, offsets_transformer_itr + input.size(), stream, mr); - auto d_offsets = offsets_column->view().template data(); + auto const d_offsets = offsets_column->view().template data(); // build chars column auto const bytes = cudf::detail::get_value(offsets_column->view(), input.size(), stream); auto chars_column = detail::create_chars_child_column(input.size(), input.null_count(), bytes, stream, mr); - auto chars_view = chars_column->mutable_view(); - auto d_chars = chars_view.template data(); + auto d_chars = chars_column->mutable_view().template data(); thrust::for_each_n(rmm::exec_policy(stream), thrust::make_counting_iterator(0), input.size(), @@ -356,7 +367,8 @@ struct dispatch_is_fixed_point_fn { rmm::mr::device_memory_resource* mr) const { using DecimalType = device_storage_type_t; - auto d_column = column_device_view::create(input.parent(), stream); + + auto const d_column = column_device_view::create(input.parent(), stream); // create output column auto results = make_numeric_column(data_type{type_id::BOOL8}, From cfece1d6b4a18a2d71ba90d22f3c27fe0242c872 Mon Sep 17 00:00:00 2001 From: davidwendt Date: Fri, 12 Feb 2021 15:54:42 -0500 Subject: [PATCH 11/21] recode to use passed in scale type instead of inferring --- .../strings/convert/convert_fixed_point.cu | 194 ++++++++++-------- cpp/tests/strings/fixed_point_tests.cpp | 35 +++- 2 files changed, 144 insertions(+), 85 deletions(-) diff --git a/cpp/src/strings/convert/convert_fixed_point.cu b/cpp/src/strings/convert/convert_fixed_point.cu index d73ac779073..fec0ea4965b 100644 --- a/cpp/src/strings/convert/convert_fixed_point.cu +++ b/cpp/src/strings/convert/convert_fixed_point.cu @@ -31,10 +31,9 @@ #include #include -#include #include -#include +#include namespace cudf { namespace strings { @@ -42,21 +41,20 @@ namespace detail { namespace { /** * @brief Converts strings into an integers and records decimal places. + * + * The conversion uses the provided scale to build the resulting + * integer. This can minimize overflow for strings with many digits. */ template struct string_to_decimal_fn { column_device_view const d_strings; - DecimalType* const values; - int32_t* const scales; + int32_t const scale; - __device__ void operator()(size_type idx) const + __device__ DecimalType operator()(size_type idx) const { - values[idx] = DecimalType{0}; - scales[idx] = numeric::scale_type{0}; - - if (d_strings.is_null(idx)) return; + if (d_strings.is_null(idx)) return 0; auto const d_str = d_strings.element(idx); - if (d_str.empty()) return; + if (d_str.empty()) return 0; auto const sign = [&] { auto const first = d_str.data(); @@ -64,31 +62,59 @@ struct string_to_decimal_fn { if (*first == '+') return 1; return 0; }(); - auto iter = d_str.begin() + (sign != 0); + auto iter = d_str.data() + (sign != 0); - int64_t value = 0; - bool decimal_found = false; - while (iter != d_str.end()) { - auto const chr = *iter++; - if (chr >= '0' && chr <= '9') { + int64_t value = 0; + if (scale >= 0) { + // find end-point which is (begin + max(0,length-scale)) + // where length = number bytes up to the decimal point + auto const iter_end = + iter + + std::max(0, + static_cast(thrust::distance( + iter, thrust::find(thrust::seq, iter, d_str.data() + d_str.size_bytes(), '.'))) - + scale); + // only convert up to the number characters needed for the specified scale + while (iter != iter_end) { + auto const chr = *iter++; + if (chr < '0' || chr > '9') break; value = (value * 10) + static_cast(chr - '0'); - scales[idx] -= static_cast(decimal_found); - } else if (chr == '.') - decimal_found = true; - else - break; + } + } else { // scale < 0 + auto const iter_end = d_str.data() + d_str.size_bytes(); + int32_t curr_scale = scale; + bool decimal_found = false; + // convert up through the decimal point until the + // end of the string or until curr_scale==0 + while (iter != iter_end) { + auto const chr = *iter++; + if (chr >= '0' && chr <= '9') { + if (decimal_found && (curr_scale == 0)) break; // processing done + value = (value * 10) + static_cast(chr - '0'); + curr_scale += (decimal_found && (curr_scale < 0)); + } else if (chr == '.') { + decimal_found = true; + } else + break; + } + // account for any left over scale + value *= static_cast(exp10(static_cast(-curr_scale))); } - values[idx] = static_cast(value * (sign == 0 ? 1 : sign)); + return static_cast(value * (sign == 0 ? 1 : sign)); } }; /** * @brief This only checks the string format for valid decimal characters. + * + * This follows closely the logic above but mainly ensures there are valid + * characters for conversion. */ template struct string_to_decimal_check_fn { column_device_view const d_strings; + int32_t const scale; __device__ bool operator()(size_type idx) const { @@ -96,44 +122,50 @@ struct string_to_decimal_check_fn { auto const d_str = d_strings.element(idx); if (d_str.empty()) return false; - auto iter = d_str.begin() + (d_str.data()[0] == '-' || d_str.data()[0] == '+'); - - DecimalType value = 0; - bool decimal_found = false; - while (iter != d_str.end()) { + auto iter = d_str.data() + static_cast((d_str.data()[0] == '-' || d_str.data()[0] == '+')); + + // These identify 3 possible locations in the decimal string + // +123456789.09876543 + // ^ ^ ^ + // check-^ ^ ^- end + // ^- decimal + // The iter_check value will be unique when scale > 0 and + // the number of digits left of the decimal is larger than the scale. + auto const iter_end = d_str.data() + d_str.size_bytes(); + auto const iter_decimal = thrust::find(thrust::seq, iter, iter_end, '.'); + auto const iter_check = + scale < 0 + ? iter_decimal + : iter + std::max(0, static_cast(thrust::distance(iter, iter_decimal)) - scale); + + DecimalType value = 0; // running value for overflow check + bool decimal_found = false; // mainly for checking duplicate decimal points + int32_t curr_scale = scale; // running scale for scale < 0 case + while (iter != iter_end) { // check all bytes for valid characters auto const chr = *iter++; if (chr == '.' && !decimal_found) { decimal_found = true; continue; } - if (chr < '0' || chr > '9') return false; + if (chr < '0' || chr > '9') return false; // invalid character check + if (iter > iter_check && curr_scale >= 0) continue; // overflow checking no longer needed + auto const digit = static_cast(chr - '0'); auto const max_check = (std::numeric_limits::max() - digit) / DecimalType{10}; if (value > max_check) return false; value = (value * DecimalType{10}) + digit; - } - return true; - } -}; - -/** - * @brief Uses the max_scale to rescale all the integer values. - */ -template -struct rescale_decimals_fn { - column_device_view const d_strings; - int32_t const new_scale; - int32_t const* scales; // original scales - DecimalType* const values; // values to rescale - - __device__ void operator()(size_type idx) const - { - if (d_strings.is_null(idx)) return; - auto const fp = numeric::fixed_point{ - numeric::scaled_integer{values[idx], numeric::scale_type{scales[idx]}}}; + // increment running scale if we are right of the decimal point + curr_scale += (decimal_found && curr_scale < 0); + } + // check overflow on any remaining negative scale value + if ((curr_scale < 0) && + (value > (std::numeric_limits::max() / + static_cast(exp10(static_cast(-curr_scale)))))) + return false; - values[idx] = fp.rescaled(numeric::scale_type{new_scale}).value(); + // everything passed + return true; } }; @@ -143,43 +175,36 @@ struct rescale_decimals_fn { struct dispatch_to_fixed_point_fn { template ()>* = nullptr> std::unique_ptr operator()(strings_column_view const& input, + data_type output_type, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const { using DecimalType = device_storage_type_t; - auto d_scales = rmm::device_uvector(input.size(), stream); - auto d_values = rmm::device_uvector(input.size(), stream, mr); - auto const d_column = column_device_view::create(input.parent(), stream); - // compute the integer component values and count the decimal places - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - input.size(), - string_to_decimal_fn{*d_column, d_values.data(), d_scales.data()}); - - // find the largest scale size -- min is used since all scale values will be <= 0 - auto const min_elem = - thrust::min_element(rmm::exec_policy(stream), d_scales.begin(), d_scales.end()); - auto const scale = d_scales.element(thrust::distance(d_scales.begin(), min_elem), stream); - - // re-scale all the values to the new scale - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - input.size(), - rescale_decimals_fn{*d_column, scale, d_scales.data(), d_values.data()}); - - return std::make_unique(data_type{type_to_id(), scale}, - input.size(), - d_values.release(), - cudf::detail::copy_bitmask(input.parent(), stream, mr), - input.null_count()); + + // create output column + auto results = make_fixed_point_column(output_type, + input.size(), + cudf::detail::copy_bitmask(input.parent(), stream, mr), + input.null_count(), + stream, + mr); + auto d_results = results->mutable_view().data(); + + // convert strings into decimal values + thrust::transform(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(input.size()), + d_results, + string_to_decimal_fn{*d_column, output_type.scale()}); + results->set_null_count(input.null_count()); + return results; } template ()>* = nullptr> std::unique_ptr operator()(strings_column_view const&, + data_type, rmm::cuda_stream_view, rmm::mr::device_memory_resource*) const { @@ -196,7 +221,7 @@ std::unique_ptr to_fixed_point(strings_column_view const& input, rmm::mr::device_memory_resource* mr) { if (input.is_empty()) return make_empty_column(output_type); - return type_dispatcher(output_type, dispatch_to_fixed_point_fn{}, input, stream, mr); + return type_dispatcher(output_type, dispatch_to_fixed_point_fn{}, input, output_type, stream, mr); } } // namespace detail @@ -269,8 +294,8 @@ struct decimal_to_string_fn { // scale < 0 // write format: [-]integer.fraction - // where integer = abs(value) / (10 ^ abs(scale)) - // fraction = abs(value) % (10 ^ abs(scale)) + // where integer = abs(value) / (10^abs(scale)) + // fraction = abs(value) % (10^abs(scale)) auto const abs_value = std::abs(value); if (value < 0) *d_buffer++ = '-'; // add sign auto const exp_ten = static_cast(exp10(static_cast(-scale))); @@ -282,7 +307,7 @@ struct decimal_to_string_fn { // add zeros thrust::generate_n(thrust::seq, d_buffer, num_zeros, []() { return '0'; }); d_buffer += num_zeros; - // add the fractional part + // add the fraction part integer_to_string(abs_value % exp_ten, d_buffer); } }; @@ -332,7 +357,7 @@ struct dispatch_from_fixed_point_fn { rmm::cuda_stream_view, rmm::mr::device_memory_resource*) const { - CUDF_FAIL("Values for from_fixed_point function must be an fixed-point type."); + CUDF_FAIL("Values for from_fixed_point function must be a decimal type."); } }; @@ -363,6 +388,7 @@ namespace { struct dispatch_is_fixed_point_fn { template ()>* = nullptr> std::unique_ptr operator()(strings_column_view const& input, + data_type decimal_type, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const { @@ -384,13 +410,14 @@ struct dispatch_is_fixed_point_fn { thrust::make_counting_iterator(0), thrust::make_counting_iterator(input.size()), d_results, - string_to_decimal_check_fn{*d_column}); + string_to_decimal_check_fn{*d_column, decimal_type.scale()}); results->set_null_count(input.null_count()); return results; } template ()>* = nullptr> std::unique_ptr operator()(strings_column_view const&, + data_type, rmm::cuda_stream_view, rmm::mr::device_memory_resource*) const { @@ -406,7 +433,8 @@ std::unique_ptr is_fixed_point(strings_column_view const& input, rmm::mr::device_memory_resource* mr) { if (input.is_empty()) return cudf::make_empty_column(data_type{type_id::BOOL8}); - return type_dispatcher(decimal_type, dispatch_is_fixed_point_fn{}, input, stream, mr); + return type_dispatcher( + decimal_type, dispatch_is_fixed_point_fn{}, input, decimal_type, stream, mr); } } // namespace detail diff --git a/cpp/tests/strings/fixed_point_tests.cpp b/cpp/tests/strings/fixed_point_tests.cpp index 82a473ee51c..a9210b7dadf 100644 --- a/cpp/tests/strings/fixed_point_tests.cpp +++ b/cpp/tests/strings/fixed_point_tests.cpp @@ -44,12 +44,19 @@ TYPED_TEST(StringsFixedPointConvertTest, ToFixedPoint) cudf::test::strings_column_wrapper strings( {"1234", "-876", "543.2", "-0.12", ".25", "-.002", "", "-0.0"}); - auto results = cudf::strings::to_fixed_point(cudf::strings_column_view(strings), - cudf::data_type{cudf::type_to_id()}); + auto results = cudf::strings::to_fixed_point( + cudf::strings_column_view(strings), + cudf::data_type{cudf::type_to_id(), numeric::scale_type{-3}}); auto const expected = fp_wrapper{{1234000, -876000, 543200, -120, 250, -2, 0, 0}, numeric::scale_type{-3}}; CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); + results = cudf::strings::to_fixed_point( + cudf::strings_column_view(strings), + cudf::data_type{cudf::type_to_id(), numeric::scale_type{2}}); + auto const expected_scaled = fp_wrapper{{12, -8, 5, 0, 0, 0, 0, 0}, numeric::scale_type{2}}; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_scaled); + cudf::test::strings_column_wrapper strings_nulls( {"1234", "-876", "543", "900000", "2500000", "", ""}, {1, 1, 1, 1, 1, 1, 0}); results = cudf::strings::to_fixed_point(cudf::strings_column_view(strings_nulls), @@ -126,6 +133,16 @@ TEST_F(StringsConvertTest, IsFixedPoint) {true, true, true, true, false, false, false, true}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); + results = cudf::strings::is_fixed_point( + cudf::strings_column_view(strings), + cudf::data_type{cudf::type_id::DECIMAL32, numeric::scale_type{-1}}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); + + results = cudf::strings::is_fixed_point( + cudf::strings_column_view(strings), + cudf::data_type{cudf::type_id::DECIMAL32, numeric::scale_type{1}}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); + cudf::test::strings_column_wrapper big_numbers({"2147483647", "-2147483647", "2147483648", @@ -143,4 +160,18 @@ TEST_F(StringsConvertTest, IsFixedPoint) auto const expected64 = cudf::test::fixed_width_column_wrapper({true, true, true, true, true, false}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected64); + + results = cudf::strings::is_fixed_point( + cudf::strings_column_view(big_numbers), + cudf::data_type{cudf::type_id::DECIMAL32, numeric::scale_type{10}}); + auto const expected32_scaled = + cudf::test::fixed_width_column_wrapper({true, true, true, true, true, true}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected32_scaled); + + results = cudf::strings::is_fixed_point( + cudf::strings_column_view(big_numbers), + cudf::data_type{cudf::type_id::DECIMAL64, numeric::scale_type{-5}}); + auto const expected64_scaled = + cudf::test::fixed_width_column_wrapper({true, true, true, false, false, false}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected64_scaled); } From f9b8707ddec311733e3f5951f8209518c549d318 Mon Sep 17 00:00:00 2001 From: davidwendt Date: Tue, 16 Feb 2021 10:43:19 -0500 Subject: [PATCH 12/21] fix doxygen for scale input --- .../strings/convert/convert_fixed_point.hpp | 14 ++++---- .../strings/convert/convert_fixed_point.cu | 34 +++++++++---------- 2 files changed, 25 insertions(+), 23 deletions(-) diff --git a/cpp/include/cudf/strings/convert/convert_fixed_point.hpp b/cpp/include/cudf/strings/convert/convert_fixed_point.hpp index c978608b3b8..88d36568fb2 100644 --- a/cpp/include/cudf/strings/convert/convert_fixed_point.hpp +++ b/cpp/include/cudf/strings/convert/convert_fixed_point.hpp @@ -39,17 +39,18 @@ namespace strings { * @code{.pseudo} * Example: * s = ['123', '-876', '543.2', '-0.12'] - * fp = to_fixed_point(s) - * fp is [123400, -87600, 54320, -12] with scale = -2 + * dt = {DECIMAL32, scale=-2} + * fp = to_fixed_point(s, dt) + * fp is [123400, -87600, 54320, -12] * @endcode * * Overflow of the resulting value type is not checked. - * The decimal point is used for determining the output scale value. + * The scale in the `output_type` is used for setting the integer component. * * @throw cudf::logic_error if output_type is not a fixed-point decimal type. * * @param strings Strings instance for this operation. - * @param output_type Type of fixed-point column to return. + * @param output_type Type of fixed-point column to return including the scale value. * @param mr Device memory resource used to allocate the returned column's device memory. * @return New column of `output_type`. */ @@ -94,7 +95,8 @@ std::unique_ptr from_fixed_point( * has at least one character in [+-0123456789.]. The optional sign character * must only be in the first position. The decimal point may only appear once. * Also, the integer component must fit within the size limits of the - * underlying fixed-point storage type. + * underlying fixed-point storage type. The value of the integer component + * is based on the scale of the `decimal_type` provided. * * @code{.pseudo} * Example: @@ -108,7 +110,7 @@ std::unique_ptr from_fixed_point( * @throw cudf::logic_error if the `decimal_type` is not a fixed-point decimal type. * * @param input Strings instance for this operation. - * @param decimal_type Fixed-point type used only for checking overflow. + * @param decimal_type Fixed-point type (with scale) used only for checking overflow. * @param mr Device memory resource used to allocate the returned column's device memory. * @return New column of boolean results for each string. */ diff --git a/cpp/src/strings/convert/convert_fixed_point.cu b/cpp/src/strings/convert/convert_fixed_point.cu index fec0ea4965b..922f0cc3fd3 100644 --- a/cpp/src/strings/convert/convert_fixed_point.cu +++ b/cpp/src/strings/convert/convert_fixed_point.cu @@ -43,7 +43,7 @@ namespace { * @brief Converts strings into an integers and records decimal places. * * The conversion uses the provided scale to build the resulting - * integer. This can minimize overflow for strings with many digits. + * integer. This can prevent overflow for strings with many digits. */ template struct string_to_decimal_fn { @@ -57,9 +57,8 @@ struct string_to_decimal_fn { if (d_str.empty()) return 0; auto const sign = [&] { - auto const first = d_str.data(); - if (*first == '-') return -1; - if (*first == '+') return 1; + if (d_str.data()[0] == '-') return -1; + if (d_str.data()[0] == '+') return 1; return 0; }(); auto iter = d_str.data() + (sign != 0); @@ -108,8 +107,8 @@ struct string_to_decimal_fn { /** * @brief This only checks the string format for valid decimal characters. * - * This follows closely the logic above but mainly ensures there are valid - * characters for conversion. + * This follows closely the logic above but just ensures there are valid + * characters for conversion and the integer component does not overflow. */ template struct string_to_decimal_check_fn { @@ -124,13 +123,13 @@ struct string_to_decimal_check_fn { auto iter = d_str.data() + static_cast((d_str.data()[0] == '-' || d_str.data()[0] == '+')); - // These identify 3 possible locations in the decimal string + // The following variables identify 3 possible locations in the decimal string // +123456789.09876543 // ^ ^ ^ // check-^ ^ ^- end // ^- decimal // The iter_check value will be unique when scale > 0 and - // the number of digits left of the decimal is larger than the scale. + // the number of digits left of the decimal point is larger than the scale. auto const iter_end = d_str.data() + d_str.size_bytes(); auto const iter_decimal = thrust::find(thrust::seq, iter, iter_end, '.'); auto const iter_check = @@ -138,7 +137,7 @@ struct string_to_decimal_check_fn { ? iter_decimal : iter + std::max(0, static_cast(thrust::distance(iter, iter_decimal)) - scale); - DecimalType value = 0; // running value for overflow check + DecimalType value = 0; // used for overflow checking bool decimal_found = false; // mainly for checking duplicate decimal points int32_t curr_scale = scale; // running scale for scale < 0 case while (iter != iter_end) { // check all bytes for valid characters @@ -150,6 +149,7 @@ struct string_to_decimal_check_fn { if (chr < '0' || chr > '9') return false; // invalid character check if (iter > iter_check && curr_scale >= 0) continue; // overflow checking no longer needed + // check for overflow in the integer component auto const digit = static_cast(chr - '0'); auto const max_check = (std::numeric_limits::max() - digit) / DecimalType{10}; if (value > max_check) return false; @@ -284,11 +284,11 @@ struct decimal_to_string_fn { auto const value = d_column.element(idx); auto const scale = d_column.type().scale(); char* d_buffer = d_chars + d_offsets[idx]; + if (scale >= 0) { integer_to_string(value, d_buffer); d_buffer += count_digits(value); - // add zeros - thrust::generate_n(thrust::seq, d_buffer, scale, []() { return '0'; }); + thrust::generate_n(thrust::seq, d_buffer, scale, []() { return '0'; }); // add zeros return; } @@ -300,15 +300,15 @@ struct decimal_to_string_fn { if (value < 0) *d_buffer++ = '-'; // add sign auto const exp_ten = static_cast(exp10(static_cast(-scale))); auto const num_zeros = std::max(0, (-scale - count_digits(abs_value % exp_ten))); - // add the integer part - integer_to_string(abs_value / exp_ten, d_buffer); + + integer_to_string(abs_value / exp_ten, d_buffer); // add the integer part d_buffer += count_digits(abs_value / exp_ten); *d_buffer++ = '.'; // add decimal point - // add zeros - thrust::generate_n(thrust::seq, d_buffer, num_zeros, []() { return '0'; }); + + thrust::generate_n(thrust::seq, d_buffer, num_zeros, []() { return '0'; }); // add zeros d_buffer += num_zeros; - // add the fraction part - integer_to_string(abs_value % exp_ten, d_buffer); + + integer_to_string(abs_value % exp_ten, d_buffer); // add the fraction part } }; From 5a380fc3e0acff8faad4b4d0cef7f3bf0b7509bf Mon Sep 17 00:00:00 2001 From: davidwendt Date: Tue, 16 Feb 2021 13:40:26 -0500 Subject: [PATCH 13/21] return size from integer_to_string; make count_digits constexpr --- cpp/src/strings/convert/utilities.cuh | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/cpp/src/strings/convert/utilities.cuh b/cpp/src/strings/convert/utilities.cuh index 591d3650fb3..bdc94fadba4 100644 --- a/cpp/src/strings/convert/utilities.cuh +++ b/cpp/src/strings/convert/utilities.cuh @@ -58,11 +58,11 @@ __device__ inline int64_t string_to_integer(string_view const& d_str) * @param d_buffer character buffer to store the converted string */ template -__device__ inline void integer_to_string(IntegerType value, char* d_buffer) +__device__ inline size_type integer_to_string(IntegerType value, char* d_buffer) { if (value == 0) { *d_buffer = '0'; - return; + return 1; } bool is_negative = std::is_signed::value ? (value < 0) : false; // @@ -76,10 +76,13 @@ __device__ inline void integer_to_string(IntegerType value, char* d_buffer) // next digit value = value / base; } + size_type const bytes = digits_idx + static_cast(is_negative); + char* ptr = d_buffer; if (is_negative) *ptr++ = '-'; // digits are backwards, reverse the string into the output while (digits_idx-- > 0) *ptr++ = digits[digits_idx]; + return bytes; } /** @@ -90,7 +93,7 @@ __device__ inline void integer_to_string(IntegerType value, char* d_buffer) * @return size_type number of digits in input value */ template -__device__ inline size_type count_digits(IntegerType value) +constexpr size_type count_digits(IntegerType value) { if (value == 0) return 1; bool is_negative = std::is_signed::value ? (value < 0) : false; From d261be924b8eabf861085d6b9e56e60285600123 Mon Sep 17 00:00:00 2001 From: davidwendt Date: Tue, 16 Feb 2021 13:40:58 -0500 Subject: [PATCH 14/21] fix doxygen parameter name reference --- cpp/include/cudf/strings/convert/convert_fixed_point.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/include/cudf/strings/convert/convert_fixed_point.hpp b/cpp/include/cudf/strings/convert/convert_fixed_point.hpp index 88d36568fb2..a1808a38825 100644 --- a/cpp/include/cudf/strings/convert/convert_fixed_point.hpp +++ b/cpp/include/cudf/strings/convert/convert_fixed_point.hpp @@ -47,7 +47,7 @@ namespace strings { * Overflow of the resulting value type is not checked. * The scale in the `output_type` is used for setting the integer component. * - * @throw cudf::logic_error if output_type is not a fixed-point decimal type. + * @throw cudf::logic_error if `output_type` is not a fixed-point decimal type. * * @param strings Strings instance for this operation. * @param output_type Type of fixed-point column to return including the scale value. @@ -77,7 +77,7 @@ std::unique_ptr to_fixed_point( * s is now ['1.10', '2.22', '33.30', '-4.40', '-0.01'] * @endcode * - * @throw cudf::logic_error if the input column is not a fixed-point decimal type. + * @throw cudf::logic_error if the `input` column is not a fixed-point decimal type. * * @param input Fixed-point column to convert. * @param mr Device memory resource used to allocate the returned column's device memory. From d4d84419209dcabef9f910bfb14b0f7bbc5f9c6c Mon Sep 17 00:00:00 2001 From: davidwendt Date: Tue, 16 Feb 2021 13:41:21 -0500 Subject: [PATCH 15/21] use integer_to_string return to incr output buffer pointer --- cpp/src/strings/convert/convert_fixed_point.cu | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/cpp/src/strings/convert/convert_fixed_point.cu b/cpp/src/strings/convert/convert_fixed_point.cu index 922f0cc3fd3..f16ad4eca16 100644 --- a/cpp/src/strings/convert/convert_fixed_point.cu +++ b/cpp/src/strings/convert/convert_fixed_point.cu @@ -257,12 +257,13 @@ struct decimal_to_string_size_fn { auto const abs_value = std::abs(value); auto const exp_ten = static_cast(exp10(static_cast(-scale))); - auto const num_zeros = std::max(0, (-scale - count_digits(abs_value % exp_ten))); + auto const fraction = count_digits(abs_value % exp_ten); + auto const num_zeros = std::max(0, (-scale - fraction)); return static_cast(value < 0) + // sign if negative count_digits(abs_value / exp_ten) + // integer 1 + // decimal point num_zeros + // zeros padding - count_digits(abs_value % exp_ten); // fraction + fraction; // size of fraction } }; @@ -286,8 +287,7 @@ struct decimal_to_string_fn { char* d_buffer = d_chars + d_offsets[idx]; if (scale >= 0) { - integer_to_string(value, d_buffer); - d_buffer += count_digits(value); + d_buffer += integer_to_string(value, d_buffer); thrust::generate_n(thrust::seq, d_buffer, scale, []() { return '0'; }); // add zeros return; } @@ -301,9 +301,8 @@ struct decimal_to_string_fn { auto const exp_ten = static_cast(exp10(static_cast(-scale))); auto const num_zeros = std::max(0, (-scale - count_digits(abs_value % exp_ten))); - integer_to_string(abs_value / exp_ten, d_buffer); // add the integer part - d_buffer += count_digits(abs_value / exp_ten); - *d_buffer++ = '.'; // add decimal point + d_buffer += integer_to_string(abs_value / exp_ten, d_buffer); // add the integer part + *d_buffer++ = '.'; // add decimal point thrust::generate_n(thrust::seq, d_buffer, num_zeros, []() { return '0'; }); // add zeros d_buffer += num_zeros; From 279708867e63fad71c2efcccfdf6ae72dcd89cfa Mon Sep 17 00:00:00 2001 From: davidwendt Date: Wed, 17 Feb 2021 08:24:41 -0500 Subject: [PATCH 16/21] update doxygen to claim UB for invalid data format --- .../strings/convert/convert_fixed_point.hpp | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/cpp/include/cudf/strings/convert/convert_fixed_point.hpp b/cpp/include/cudf/strings/convert/convert_fixed_point.hpp index a1808a38825..d631e1bacdb 100644 --- a/cpp/include/cudf/strings/convert/convert_fixed_point.hpp +++ b/cpp/include/cudf/strings/convert/convert_fixed_point.hpp @@ -30,17 +30,16 @@ namespace strings { * @brief Returns a new fixed-point column parsing decimal values from the * provided strings column. * - * Any null entries will result in corresponding null entries in the output column. + * Any null entries result in corresponding null entries in the output column. * - * Only characters [0-9] plus a prefix '-' and '+' and a single decimal point - * are recognized. When any other character is encountered, the parsing ends - * for that string and the current digits are converted into a fixed-point value. + * The expected format is [-+][0-9].[0-9] for each string. An invalid data format + * result in undefined behavior in the corresponding output row result. * * @code{.pseudo} * Example: * s = ['123', '-876', '543.2', '-0.12'] - * dt = {DECIMAL32, scale=-2} - * fp = to_fixed_point(s, dt) + * datatype = {DECIMAL32, scale=-2} + * fp = to_fixed_point(s, datatype) * fp is [123400, -87600, 54320, -12] * @endcode * @@ -63,10 +62,10 @@ std::unique_ptr to_fixed_point( * @brief Returns a new strings column converting the fixed-point values * into a strings column. * - * Any null entries will result in corresponding null entries in the output column. + * Any null entries result in corresponding null entries in the output column. * * For each value, a string is created in base-10 decimal. - * Negative numbers will include a '-' prefix. + * Negative numbers include a '-' prefix in the output string. * The column's scale value is used to place the decimal point. * A negative scale value may add padded zeros after the decimal point. * @@ -91,7 +90,7 @@ std::unique_ptr from_fixed_point( * @brief Returns a boolean column identifying strings in which all * characters are valid for conversion to fixed-point. * - * The output row entry will be set to `true` if the corresponding string element + * The output row entry is set to `true` if the corresponding string element * has at least one character in [+-0123456789.]. The optional sign character * must only be in the first position. The decimal point may only appear once. * Also, the integer component must fit within the size limits of the @@ -105,7 +104,7 @@ std::unique_ptr from_fixed_point( * b is [true, true, false, false, false, true, true, true] * @endcode * - * Any null row results in a null entry for that row in the output column. + * Any null entries result in corresponding null entries in the output column. * * @throw cudf::logic_error if the `decimal_type` is not a fixed-point decimal type. * From 28b70213d7bcc67aaba6f6190a9f511fedfb5cd2 Mon Sep 17 00:00:00 2001 From: davidwendt Date: Wed, 17 Feb 2021 08:24:52 -0500 Subject: [PATCH 17/21] update copyright year --- cpp/src/strings/convert/utilities.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/strings/convert/utilities.cuh b/cpp/src/strings/convert/utilities.cuh index bdc94fadba4..b90b7db62bc 100644 --- a/cpp/src/strings/convert/utilities.cuh +++ b/cpp/src/strings/convert/utilities.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. From 3aa8d773e663162d6581f6b4b663c086c9d5d9ae Mon Sep 17 00:00:00 2001 From: davidwendt Date: Wed, 17 Feb 2021 13:29:07 -0500 Subject: [PATCH 18/21] add const to bool var declaration --- cpp/src/strings/convert/utilities.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/strings/convert/utilities.cuh b/cpp/src/strings/convert/utilities.cuh index b90b7db62bc..75ae7b3af6c 100644 --- a/cpp/src/strings/convert/utilities.cuh +++ b/cpp/src/strings/convert/utilities.cuh @@ -64,7 +64,7 @@ __device__ inline size_type integer_to_string(IntegerType value, char* d_buffer) *d_buffer = '0'; return 1; } - bool is_negative = std::is_signed::value ? (value < 0) : false; + bool const is_negative = std::is_signed::value ? (value < 0) : false; // constexpr IntegerType base = 10; constexpr int MAX_DIGITS = 20; // largest 64-bit integer is 20 digits From b01c3476b85d1a022292656f02a7be1a2106d083 Mon Sep 17 00:00:00 2001 From: davidwendt Date: Wed, 17 Feb 2021 16:14:54 -0500 Subject: [PATCH 19/21] update invalid data format doxygen statement --- cpp/include/cudf/strings/convert/convert_fixed_point.hpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/cpp/include/cudf/strings/convert/convert_fixed_point.hpp b/cpp/include/cudf/strings/convert/convert_fixed_point.hpp index d631e1bacdb..b0ad9a0bc67 100644 --- a/cpp/include/cudf/strings/convert/convert_fixed_point.hpp +++ b/cpp/include/cudf/strings/convert/convert_fixed_point.hpp @@ -32,8 +32,11 @@ namespace strings { * * Any null entries result in corresponding null entries in the output column. * - * The expected format is [-+][0-9].[0-9] for each string. An invalid data format - * result in undefined behavior in the corresponding output row result. + * The expected format is `[sign][integer][.][fraction]`, where the sign is either + * not present, `-` or `+`, The decimal point `[.]` may or may not be present, and + * `integer` and `fraction` are comprised of zero or more digits in [0-9]. + * An invalid data format results in undefined behavior in the corresponding + * output row result. * * @code{.pseudo} * Example: From 4f7ab54231317fdd00fd24fddb2b242300bed8cc Mon Sep 17 00:00:00 2001 From: davidwendt Date: Thu, 18 Feb 2021 16:11:49 -0500 Subject: [PATCH 20/21] add very-large and very-small test strings --- cpp/tests/strings/fixed_point_tests.cpp | 47 +++++++++++++++++++++++-- 1 file changed, 44 insertions(+), 3 deletions(-) diff --git a/cpp/tests/strings/fixed_point_tests.cpp b/cpp/tests/strings/fixed_point_tests.cpp index a9210b7dadf..b7bfb9e3924 100644 --- a/cpp/tests/strings/fixed_point_tests.cpp +++ b/cpp/tests/strings/fixed_point_tests.cpp @@ -43,18 +43,18 @@ TYPED_TEST(StringsFixedPointConvertTest, ToFixedPoint) using fp_wrapper = cudf::test::fixed_point_column_wrapper; cudf::test::strings_column_wrapper strings( - {"1234", "-876", "543.2", "-0.12", ".25", "-.002", "", "-0.0"}); + {"1234", "-876", "543.2", "-0.12", ".25", "-.002", "-.0027", "", "-0.0"}); auto results = cudf::strings::to_fixed_point( cudf::strings_column_view(strings), cudf::data_type{cudf::type_to_id(), numeric::scale_type{-3}}); auto const expected = - fp_wrapper{{1234000, -876000, 543200, -120, 250, -2, 0, 0}, numeric::scale_type{-3}}; + fp_wrapper{{1234000, -876000, 543200, -120, 250, -2, -2, 0, 0}, numeric::scale_type{-3}}; CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); results = cudf::strings::to_fixed_point( cudf::strings_column_view(strings), cudf::data_type{cudf::type_to_id(), numeric::scale_type{2}}); - auto const expected_scaled = fp_wrapper{{12, -8, 5, 0, 0, 0, 0, 0}, numeric::scale_type{2}}; + auto const expected_scaled = fp_wrapper{{12, -8, 5, 0, 0, 0, 0, 0, 0}, numeric::scale_type{2}}; CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_scaled); cudf::test::strings_column_wrapper strings_nulls( @@ -66,6 +66,47 @@ TYPED_TEST(StringsFixedPointConvertTest, ToFixedPoint) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_nulls); } +TYPED_TEST(StringsFixedPointConvertTest, ToFixedPointVeryLarge) +{ + using DecimalType = TypeParam; + using RepType = cudf::device_storage_type_t; + using fp_wrapper = cudf::test::fixed_point_column_wrapper; + auto const strings = cudf::test::strings_column_wrapper({"1234000000000000000000", + "-876000000000000000000", + "543200000000000000000", + "-120000000000000000", + "250000000000000000", + "-2800000000000000", + "", + "-0.0"}); + auto const results = cudf::strings::to_fixed_point( + cudf::strings_column_view(strings), + cudf::data_type{cudf::type_to_id(), numeric::scale_type{20}}); + auto const expected = fp_wrapper{{12, -8, 5, 0, 0, 0, 0, 0}, numeric::scale_type{20}}; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); +} + +TYPED_TEST(StringsFixedPointConvertTest, ToFixedPointVerySmall) +{ + using DecimalType = TypeParam; + using RepType = cudf::device_storage_type_t; + using fp_wrapper = cudf::test::fixed_point_column_wrapper; + auto const strings = cudf::test::strings_column_wrapper({"0.00000000000000001234", + "-0.0000000000000000876", + ".000000000000000005432", + "-.000000000000000012", + "+.000000000000000025", + "-.00000000002147483647", + "", + "+0.0"}); + auto const results = cudf::strings::to_fixed_point( + cudf::strings_column_view(strings), + cudf::data_type{cudf::type_to_id(), numeric::scale_type{-20}}); + auto const expected = + fp_wrapper{{1234, -8760, 543, -1200, 2500, -2147483647, 0, 0}, numeric::scale_type{-20}}; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); +} + TYPED_TEST(StringsFixedPointConvertTest, FromFixedPoint) { using DecimalType = TypeParam; From 89921f25f0be8d806491f167f905d315bd8c83ec Mon Sep 17 00:00:00 2001 From: davidwendt Date: Thu, 18 Feb 2021 16:12:36 -0500 Subject: [PATCH 21/21] fix overflow bug handling very small numbers --- cpp/include/cudf/fixed_point/fixed_point.hpp | 6 +++--- cpp/src/strings/convert/convert_fixed_point.cu | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/include/cudf/fixed_point/fixed_point.hpp b/cpp/include/cudf/fixed_point/fixed_point.hpp index a1b6c625b23..8f8e2b7394c 100644 --- a/cpp/include/cudf/fixed_point/fixed_point.hpp +++ b/cpp/include/cudf/fixed_point/fixed_point.hpp @@ -537,9 +537,9 @@ class fixed_point { explicit operator std::string() const { if (_scale < 0) { - auto const av = std::abs(_value); - int const n = std::pow(10, -_scale); - int const f = av % n; + auto const av = std::abs(_value); + int64_t const n = std::pow(10, -_scale); + int64_t const f = av % n; auto const num_zeros = std::max(0, (-_scale - static_cast(std::to_string(f).size()))); auto const zeros = std::string(num_zeros, '0'); diff --git a/cpp/src/strings/convert/convert_fixed_point.cu b/cpp/src/strings/convert/convert_fixed_point.cu index f16ad4eca16..c22fee5ec05 100644 --- a/cpp/src/strings/convert/convert_fixed_point.cu +++ b/cpp/src/strings/convert/convert_fixed_point.cu @@ -256,7 +256,7 @@ struct decimal_to_string_size_fn { if (scale >= 0) return count_digits(value) + scale; auto const abs_value = std::abs(value); - auto const exp_ten = static_cast(exp10(static_cast(-scale))); + auto const exp_ten = static_cast(exp10(static_cast(-scale))); auto const fraction = count_digits(abs_value % exp_ten); auto const num_zeros = std::max(0, (-scale - fraction)); return static_cast(value < 0) + // sign if negative @@ -298,7 +298,7 @@ struct decimal_to_string_fn { // fraction = abs(value) % (10^abs(scale)) auto const abs_value = std::abs(value); if (value < 0) *d_buffer++ = '-'; // add sign - auto const exp_ten = static_cast(exp10(static_cast(-scale))); + auto const exp_ten = static_cast(exp10(static_cast(-scale))); auto const num_zeros = std::max(0, (-scale - count_digits(abs_value % exp_ten))); d_buffer += integer_to_string(abs_value / exp_ten, d_buffer); // add the integer part