From de55832127fba6fd79cf4f4ce3fb647ea1c4ab8b Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 24 Mar 2021 03:36:06 -0600 Subject: [PATCH] Add is_integer API that can check for the validity of a string-to-integer conversion (#7642) This PR addresses #5110, #7080, and rework https://github.com/rapidsai/cudf/pull/7094. It adds the function `cudf::strings::is_integer` that can check if strings can be correctly converted into integer values. Underflow and overflow are also taken into account. Note that this `cudf::strings::is_integer` is different from the existing `cudf::strings::string::is_integer`, which only checks for pattern and does not care about under/overflow. Examples: ``` s = { "eee", "-200", "-100", "127", "128", "1.5", NULL} is_integer(s, INT8) = { 0, 0, 1, 1, 0, 0, NULL} is_integer(s, INT32) = { 0, 1, 1, 1, 1, 0, NULL} ``` Authors: - Nghia Truong (@ttnghia) Approvers: - David (@davidwendt) - Jake Hemstad (@jrhemstad) - Mark Harris (@harrism) URL: https://github.com/rapidsai/cudf/pull/7642 --- .../cudf/strings/convert/convert_integers.hpp | 43 +++- cpp/src/strings/convert/convert_integers.cu | 212 ++++++++++++++---- cpp/tests/strings/integers_tests.cu | 197 ++++++++++++++-- 3 files changed, 383 insertions(+), 69 deletions(-) diff --git a/cpp/include/cudf/strings/convert/convert_integers.hpp b/cpp/include/cudf/strings/convert/convert_integers.hpp index 1e2fa80b129..4d29b0a5b6a 100644 --- a/cpp/include/cudf/strings/convert/convert_integers.hpp +++ b/cpp/include/cudf/strings/convert/convert_integers.hpp @@ -78,7 +78,10 @@ std::unique_ptr from_integers( * characters are valid for conversion to integers. * * The output row entry will be set to `true` if the corresponding string element - * has at least one character in [-+0-9]. + * have all characters in [-+0-9]. The optional sign character must only be in the first + * position. Notice that the the integer value is not checked to be within its storage limits. + * For strict integer type check, use the other `is_integer()` API which accepts `data_type` + * argument. * * @code{.pseudo} * Example: @@ -89,12 +92,44 @@ std::unique_ptr from_integers( * * Any null row results in a null entry for that row in the output column. * - * @param strings Strings instance for this operation. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New column of boolean results for each string. + * @param strings Strings instance for this operation. + * @param mr Device memory resource used to allocate the returned column's device memory. + * @return New column of boolean results for each string. + */ +std::unique_ptr is_integer( + strings_column_view const& strings, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Returns a boolean column identifying strings in which all + * characters are valid for conversion to integers. + * + * The output row entry will be set to `true` if the corresponding string element + * has all characters in [-+0-9]. The optional sign character must only be in the first + * position. Also, the integer component must fit within the size limits of the underlying + * storage type, which is provided by the int_type parameter. + * + * @code{.pseudo} + * Example: + * s = ['123456', '-456', '', 'A', '+7'] + * + * output1 = s.is_integer(s, data_type{type_id::INT32}) + * output1 is [true, true, false, false, true] + * + * output2 = s.is_integer(s, data_type{type_id::INT8}) + * output2 is [false, false, false, false, true] + * @endcode + * + * Any null row results in a null entry for that row in the output column. + * + * @param strings Strings instance for this operation. + * @param int_type Integer type used for checking underflow and overflow. + * @param mr Device memory resource used to allocate the returned column's device memory. + * @return New column of boolean results for each string. */ std::unique_ptr is_integer( strings_column_view const& strings, + data_type int_type, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** diff --git a/cpp/src/strings/convert/convert_integers.cu b/cpp/src/strings/convert/convert_integers.cu index 5c5032b5c87..7eee2b3cc0e 100644 --- a/cpp/src/strings/convert/convert_integers.cu +++ b/cpp/src/strings/convert/convert_integers.cu @@ -25,7 +25,6 @@ #include #include #include -#include #include #include #include @@ -38,6 +37,160 @@ namespace cudf { namespace strings { + +namespace detail { +namespace { + +/** + * @brief This only checks if a string is a valid integer within the bounds of its storage type. + */ +template +struct string_to_integer_check_fn { + __device__ bool operator()(thrust::pair const& p) const + { + if (!p.second || p.first.empty()) { return false; } + + auto const d_str = p.first.data(); + if (d_str[0] == '-' && std::is_unsigned::value) { return false; } + + auto iter = d_str + static_cast((d_str[0] == '-' || d_str[0] == '+')); + auto const iter_end = d_str + p.first.size_bytes(); + if (iter == iter_end) { return false; } + + auto const sign = d_str[0] == '-' ? IntegerType{-1} : IntegerType{1}; + auto const bound_val = + sign > 0 ? std::numeric_limits::max() : std::numeric_limits::min(); + + IntegerType value = 0; // parse the string to integer and check for overflow along the way + while (iter != iter_end) { // check all bytes for valid characters + auto const chr = *iter++; + // Check for valid character + if (chr < '0' || chr > '9') { return false; } + + // Check for underflow and overflow: + auto const digit = static_cast(chr - '0'); + auto const bound_check = (bound_val - sign * digit) / IntegerType{10} * sign; + if (value > bound_check) return false; + value = value * IntegerType{10} + digit; + } + + return true; + } +}; + +/** + * @brief The dispatch functions for checking if strings are valid integers. + */ +struct dispatch_is_integer_fn { + template ::value>* = nullptr> + std::unique_ptr operator()(strings_column_view const& strings, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const + { + auto const d_column = column_device_view::create(strings.parent(), stream); + auto results = make_numeric_column(data_type{type_id::BOOL8}, + strings.size(), + cudf::detail::copy_bitmask(strings.parent(), stream, mr), + strings.null_count(), + stream, + mr); + + auto d_results = results->mutable_view().data(); + if (strings.has_nulls()) { + thrust::transform(rmm::exec_policy(stream), + d_column->pair_begin(), + d_column->pair_end(), + d_results, + string_to_integer_check_fn{}); + } else { + thrust::transform(rmm::exec_policy(stream), + d_column->pair_begin(), + d_column->pair_end(), + d_results, + string_to_integer_check_fn{}); + } + + // Calling mutable_view() on a column invalidates it's null count so we need to set it back + results->set_null_count(strings.null_count()); + + return results; + } + + template ::value>* = nullptr> + std::unique_ptr operator()(strings_column_view const&, + rmm::cuda_stream_view, + rmm::mr::device_memory_resource*) const + { + CUDF_FAIL("is_integer is expecting an integer type"); + } +}; + +} // namespace + +std::unique_ptr is_integer( + strings_column_view const& strings, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +{ + auto const d_column = column_device_view::create(strings.parent(), stream); + auto results = make_numeric_column(data_type{type_id::BOOL8}, + strings.size(), + cudf::detail::copy_bitmask(strings.parent(), stream, mr), + strings.null_count(), + stream, + mr); + + auto d_results = results->mutable_view().data(); + if (strings.has_nulls()) { + thrust::transform( + rmm::exec_policy(stream), + d_column->pair_begin(), + d_column->pair_end(), + d_results, + [] __device__(auto const& p) { return p.second ? string::is_integer(p.first) : false; }); + } else { + thrust::transform( + rmm::exec_policy(stream), + d_column->pair_begin(), + d_column->pair_end(), + d_results, + [] __device__(auto const& p) { return p.second ? string::is_integer(p.first) : false; }); + } + + // Calling mutable_view() on a column invalidates it's null count so we need to set it back + results->set_null_count(strings.null_count()); + + return results; +} + +std::unique_ptr is_integer( + strings_column_view const& strings, + data_type int_type, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +{ + if (strings.is_empty()) { return cudf::make_empty_column(data_type{type_id::BOOL8}); } + return type_dispatcher(int_type, dispatch_is_integer_fn{}, strings, stream, mr); +} + +} // namespace detail + +// external APIs +std::unique_ptr is_integer(strings_column_view const& strings, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::is_integer(strings, rmm::cuda_stream_default, mr); +} + +std::unique_ptr is_integer(strings_column_view const& strings, + data_type int_type, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::is_integer(strings, int_type, rmm::cuda_stream_default, mr); +} + namespace detail { namespace { /** @@ -69,11 +222,10 @@ struct dispatch_to_integers_fn { mutable_column_view& output_column, rmm::cuda_stream_view stream) const { - auto d_results = output_column.data(); thrust::transform(rmm::exec_policy(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_column.size()), - d_results, + output_column.data(), string_to_integer_fn{strings_column}); } // non-integral types throw an exception @@ -102,19 +254,22 @@ std::unique_ptr to_integers(strings_column_view const& strings, { size_type strings_count = strings.size(); if (strings_count == 0) return make_numeric_column(output_type, 0); - auto strings_column = column_device_view::create(strings.parent(), stream); - auto d_strings = *strings_column; - // create integer output column copying the strings null-mask - auto results = make_numeric_column(output_type, + + // Create integer output column copying the strings null-mask + auto results = make_numeric_column(output_type, strings_count, cudf::detail::copy_bitmask(strings.parent(), stream, mr), strings.null_count(), stream, mr); - auto results_view = results->mutable_view(); - // fill output column with integers - type_dispatcher(output_type, dispatch_to_integers_fn{}, d_strings, results_view, stream); + // Fill output column with integers + auto const strings_dev_view = column_device_view::create(strings.parent(), stream); + auto results_view = results->mutable_view(); + type_dispatcher(output_type, dispatch_to_integers_fn{}, *strings_dev_view, results_view, stream); + + // Calling mutable_view() on a column invalidates it's null count so we need to set it back results->set_null_count(strings.null_count()); + return results; } @@ -253,42 +408,5 @@ std::unique_ptr from_integers(column_view const& integers, return detail::from_integers(integers, rmm::cuda_stream_default, mr); } -namespace detail { -std::unique_ptr is_integer( - strings_column_view const& strings, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) -{ - auto strings_column = column_device_view::create(strings.parent(), stream); - auto d_column = *strings_column; - // create output column - auto results = make_numeric_column(data_type{type_id::BOOL8}, - strings.size(), - cudf::detail::copy_bitmask(strings.parent(), stream, mr), - strings.null_count(), - stream, - mr); - auto d_results = results->mutable_view().data(); - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings.size()), - d_results, - [d_column] __device__(size_type idx) { - if (d_column.is_null(idx)) return false; - return string::is_integer(d_column.element(idx)); - }); - results->set_null_count(strings.null_count()); - return results; -} -} // namespace detail - -// external API -std::unique_ptr is_integer(strings_column_view const& strings, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::is_integer(strings, rmm::cuda_stream_default, mr); -} - } // namespace strings } // namespace cudf diff --git a/cpp/tests/strings/integers_tests.cu b/cpp/tests/strings/integers_tests.cu index d6bf03b3f76..f15116ae4c2 100644 --- a/cpp/tests/strings/integers_tests.cu +++ b/cpp/tests/strings/integers_tests.cu @@ -26,20 +26,18 @@ #include #include +// Using an alias variable for the null elements +// This will make the code looks cleaner +constexpr auto NULL_VAL = 0; + struct StringsConvertTest : public cudf::test::BaseFixture { }; -TEST_F(StringsConvertTest, IsInteger) +TEST_F(StringsConvertTest, IsIntegerBasicCheck) { - cudf::test::strings_column_wrapper strings; - auto strings_view = cudf::strings_column_view(strings); - auto results = cudf::strings::is_integer(strings_view); - EXPECT_EQ(cudf::type_id::BOOL8, results->view().type().id()); - EXPECT_EQ(0, results->view().size()); - cudf::test::strings_column_wrapper strings1( {"+175", "-34", "9.8", "17+2", "+-14", "1234567890", "67de", "", "1e10", "-", "++", ""}); - results = cudf::strings::is_integer(cudf::strings_column_view(strings1)); + auto results = cudf::strings::is_integer(cudf::strings_column_view(strings1)); cudf::test::fixed_width_column_wrapper expected1({1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected1); @@ -50,24 +48,187 @@ TEST_F(StringsConvertTest, IsInteger) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected2); } +TEST_F(StringsConvertTest, ZeroSizeIsIntegerBasicCheck) +{ + cudf::test::strings_column_wrapper strings; + auto strings_view = cudf::strings_column_view(strings); + auto results = cudf::strings::is_integer(strings_view); + EXPECT_EQ(cudf::type_id::BOOL8, results->view().type().id()); + EXPECT_EQ(0, results->view().size()); +} + +TEST_F(StringsConvertTest, IsIntegerBoundCheckNoNull) +{ + auto strings = cudf::test::strings_column_wrapper( + {"+175", "-34", "9.8", "17+2", "+-14", "1234567890", "67de", "", "1e10", "-", "++", ""}); + auto results = cudf::strings::is_integer(cudf::strings_column_view(strings), + cudf::data_type{cudf::type_id::INT32}); + auto expected = + cudf::test::fixed_width_column_wrapper({1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + strings = cudf::test::strings_column_wrapper( + {"0", "+0", "-0", "1234567890", "-27341132", "+012", "023", "-045"}); + results = cudf::strings::is_integer(cudf::strings_column_view(strings), + cudf::data_type{cudf::type_id::INT32}); + expected = cudf::test::fixed_width_column_wrapper({1, 1, 1, 1, 1, 1, 1, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + +TEST_F(StringsConvertTest, IsIntegerBoundCheckWithNulls) +{ + std::vector const h_strings{ + "eee", "1234", nullptr, "", "-9832", "93.24", "765é", nullptr}; + auto const strings = cudf::test::strings_column_wrapper( + h_strings.begin(), + h_strings.end(), + thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); + auto const results = cudf::strings::is_integer(cudf::strings_column_view(strings), + cudf::data_type{cudf::type_id::INT32}); + // Input has null elements then the output should have the same null mask + auto const expected = cudf::test::fixed_width_column_wrapper( + std::initializer_list{0, 1, NULL_VAL, 0, 1, 0, 0, NULL_VAL}, + thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + +TEST_F(StringsConvertTest, ZeroSizeIsIntegerBoundCheck) +{ + // Empty input + auto strings = cudf::test::strings_column_wrapper{}; + auto results = cudf::strings::is_integer(cudf::strings_column_view(strings), + cudf::data_type{cudf::type_id::INT32}); + EXPECT_EQ(cudf::type_id::BOOL8, results->view().type().id()); + EXPECT_EQ(0, results->view().size()); +} + +TEST_F(StringsConvertTest, IsIntegerBoundCheckSmallNumbers) +{ + auto strings = cudf::test::strings_column_wrapper( + {"-200", "-129", "-128", "-120", "0", "120", "127", "130", "150", "255", "300", "500"}); + auto results = cudf::strings::is_integer(cudf::strings_column_view(strings), + cudf::data_type{cudf::type_id::INT8}); + auto expected = + cudf::test::fixed_width_column_wrapper({0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + results = cudf::strings::is_integer(cudf::strings_column_view(strings), + cudf::data_type{cudf::type_id::UINT8}); + expected = cudf::test::fixed_width_column_wrapper({0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + strings = cudf::test::strings_column_wrapper( + {"-40000", "-32769", "-32768", "-32767", "-32766", "32765", "32766", "32767", "32768"}); + results = cudf::strings::is_integer(cudf::strings_column_view(strings), + cudf::data_type{cudf::type_id::INT16}); + expected = cudf::test::fixed_width_column_wrapper({0, 0, 1, 1, 1, 1, 1, 1, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + results = cudf::strings::is_integer(cudf::strings_column_view(strings), + cudf::data_type{cudf::type_id::UINT16}); + expected = cudf::test::fixed_width_column_wrapper({0, 0, 0, 0, 0, 1, 1, 1, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + results = cudf::strings::is_integer(cudf::strings_column_view(strings), + cudf::data_type{cudf::type_id::INT32}); + expected = cudf::test::fixed_width_column_wrapper({1, 1, 1, 1, 1, 1, 1, 1, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + +TEST_F(StringsConvertTest, IsIntegerBoundCheckLargeNumbers) +{ + auto strings = + cudf::test::strings_column_wrapper({"-2147483649", // std::numeric_limits::min() - 1 + "-2147483648", // std::numeric_limits::min() + "-2147483647", // std::numeric_limits::min() + 1 + "2147483646", // std::numeric_limits::max() - 1 + "2147483647", // std::numeric_limits::max() + "2147483648", // std::numeric_limits::max() + 1 + "4294967294", // std::numeric_limits::max() - 1 + "4294967295", // std::numeric_limits::max() + "4294967296"}); // std::numeric_limits::max() + 1 + auto results = cudf::strings::is_integer(cudf::strings_column_view(strings), + cudf::data_type{cudf::type_id::INT32}); + auto expected = cudf::test::fixed_width_column_wrapper({0, 1, 1, 1, 1, 0, 0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + results = cudf::strings::is_integer(cudf::strings_column_view(strings), + cudf::data_type{cudf::type_id::UINT32}); + expected = cudf::test::fixed_width_column_wrapper({0, 0, 0, 1, 1, 1, 1, 1, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + strings = cudf::test::strings_column_wrapper( + {"-9223372036854775809", // std::numeric_limits::min() - 1 + "-9223372036854775808", // std::numeric_limits::min() + "-9223372036854775807", // std::numeric_limits::min() + 1 + "9223372036854775806", // std::numeric_limits::max() - 1 + "9223372036854775807", // std::numeric_limits::max() + "9223372036854775808", // std::numeric_limits::max() + 1 + "18446744073709551614", // std::numeric_limits::max() - 1 + "18446744073709551615", // std::numeric_limits::max() + "18446744073709551616"}); // std::numeric_limits::max() + 1 + results = cudf::strings::is_integer(cudf::strings_column_view(strings), + cudf::data_type{cudf::type_id::INT64}); + expected = cudf::test::fixed_width_column_wrapper({0, 1, 1, 1, 1, 0, 0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + results = cudf::strings::is_integer(cudf::strings_column_view(strings), + cudf::data_type{cudf::type_id::UINT64}); + expected = cudf::test::fixed_width_column_wrapper({0, 0, 0, 1, 1, 1, 1, 1, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + TEST_F(StringsConvertTest, ToInteger) { - std::vector h_strings{ - "eee", "1234", nullptr, "", "-9832", "93.24", "765é", "-1.78e+5", "2147483647", "-2147483648"}; + std::vector h_strings{"eee", + "1234", + nullptr, + "", + "-9832", + "93.24", + "765é", + nullptr, + "-1.78e+5", + "2147483647", + "-2147483648", + "2147483648"}; cudf::test::strings_column_wrapper strings( h_strings.begin(), h_strings.end(), thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); - std::vector h_expected{0, 1234, 0, 0, -9832, 93, 765, -1, 2147483647, -2147483648}; - auto strings_view = cudf::strings_column_view(strings); - auto results = cudf::strings::to_integers(strings_view, cudf::data_type{cudf::type_id::INT32}); + auto results = cudf::strings::to_integers(cudf::strings_column_view(strings), + cudf::data_type{cudf::type_id::INT16}); + auto const expected_i16 = cudf::test::fixed_width_column_wrapper( + std::initializer_list{0, 1234, NULL_VAL, 0, -9832, 93, 765, NULL_VAL, -1, -1, 0, 0}, + thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected_i16); - cudf::test::fixed_width_column_wrapper expected( - h_expected.begin(), - h_expected.end(), + results = cudf::strings::to_integers(cudf::strings_column_view(strings), + cudf::data_type{cudf::type_id::INT32}); + auto const expected_i32 = cudf::test::fixed_width_column_wrapper( + std::initializer_list{ + 0, 1234, NULL_VAL, 0, -9832, 93, 765, NULL_VAL, -1, 2147483647, -2147483648, -2147483648}, thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected_i32); + + results = cudf::strings::to_integers(cudf::strings_column_view(strings), + cudf::data_type{cudf::type_id::UINT32}); + auto const expected_u32 = cudf::test::fixed_width_column_wrapper( + std::initializer_list{0, + 1234, + NULL_VAL, + 0, + 4294957464, + 93, + 765, + NULL_VAL, + 4294967295, + 2147483647, + 2147483648, + 2147483648}, + thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected_u32); } TEST_F(StringsConvertTest, FromInteger) @@ -114,7 +275,7 @@ TEST_F(StringsConvertTest, EmptyStringsColumn) cudf::test::strings_column_wrapper strings({"", "", ""}); auto results = cudf::strings::to_integers(cudf::strings_column_view(strings), cudf::data_type{cudf::type_id::INT64}); - cudf::test::fixed_width_column_wrapper expected({0, 0, 0}); + cudf::test::fixed_width_column_wrapper expected{0, 0, 0}; CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); }