From 3349764b9a0699bf96f7403895df88f7308647fb Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 17 Mar 2021 12:48:09 -0600 Subject: [PATCH 1/8] Refactor string conversion check (#7599) This addresses #7557. In summary: * Move `cudf::strings::is_integer()` code from `strings/chars_types.*` to `strings/convert/convert_integers.hpp/cu` * Move `cudf::strings::is_float()` code from `strings/chars_types.*` to `strings/convert/convert_floats.hpp/cu` * Remove `cudf::strings::all_integer()` and `cudf::strings::all_float()` Authors: - Nghia Truong (@ttnghia) Approvers: - GALI PREM SAGAR (@galipremsagar) - Jason Lowe (@jlowe) - Jake Hemstad (@jrhemstad) - David (@davidwendt) URL: https://github.com/rapidsai/cudf/pull/7599 --- .../cudf/strings/char_types/char_types.hpp | 78 +----------- .../cudf/strings/convert/convert_floats.hpp | 26 +++- .../cudf/strings/convert/convert_integers.hpp | 26 +++- cpp/src/strings/char_types/char_types.cu | 113 +----------------- cpp/src/strings/convert/convert_floats.cu | 41 ++++++- cpp/src/strings/convert/convert_integers.cu | 41 ++++++- cpp/tests/strings/chars_types_tests.cpp | 63 ---------- cpp/tests/strings/floats_tests.cpp | 35 ++++++ cpp/tests/strings/integers_tests.cu | 23 +++- java/src/main/native/src/ColumnViewJni.cpp | 1 - .../cudf/cudf/_lib/cpp/strings/char_types.pxd | 10 +- .../cpp/strings/convert/convert_floats.pxd | 6 +- .../cpp/strings/convert/convert_integers.pxd | 6 +- python/cudf/cudf/_lib/strings/char_types.pyx | 36 +----- .../_lib/strings/convert/convert_floats.pyx | 29 +++++ .../_lib/strings/convert/convert_integers.pyx | 29 +++++ python/cudf/cudf/core/column/string.py | 6 +- python/cudf/cudf/core/tools/datetimes.py | 4 +- 18 files changed, 265 insertions(+), 308 deletions(-) create mode 100644 python/cudf/cudf/_lib/strings/convert/convert_floats.pyx create mode 100644 python/cudf/cudf/_lib/strings/convert/convert_integers.pyx diff --git a/cpp/include/cudf/strings/char_types/char_types.hpp b/cpp/include/cudf/strings/char_types/char_types.hpp index 300722920f4..1f5b6241850 100644 --- a/cpp/include/cudf/strings/char_types/char_types.hpp +++ b/cpp/include/cudf/strings/char_types/char_types.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -146,82 +146,6 @@ std::unique_ptr filter_characters_of_type( string_character_types types_to_keep = string_character_types::ALL_TYPES, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); -/** - * @brief Returns a boolean column identifying strings in which all - * characters are valid for conversion to integers. - * - * The output row entry will be set to `true` if the corresponding string element - * has at least one character in [-+0-9]. - * - * @code{.pseudo} - * Example: - * s = ['123', '-456', '', 'A', '+7'] - * b = s.is_integer(s) - * b is [true, true, false, false, true] - * @endcode - * - * Any null row results in a null entry for that row in the output column. - * - * @param strings Strings instance for this operation. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New column of boolean results for each string. - */ -std::unique_ptr is_integer( - strings_column_view const& strings, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Returns `true` if all strings contain - * characters that are valid for conversion to integers. - * - * This function will return `true` if all string elements - * has at least one character in [-+0-9]. - * - * Any null entry or empty string will cause this function to return `false`. - * - * @param strings Strings instance for this operation. - * @return true if all string are valid - */ -bool all_integer(strings_column_view const& strings); - -/** - * @brief Returns a boolean column identifying strings in which all - * characters are valid for conversion to floats. - * - * The output row entry will be set to `true` if the corresponding string element - * has at least one character in [-+0-9eE.]. - * - * @code{.pseudo} - * Example: - * s = ['123', '-456', '', 'A', '+7', '8.9' '3.7e+5'] - * b = s.is_float(s) - * b is [true, true, false, false, true, true, true] - * @endcode - * - * Any null row results in a null entry for that row in the output column. - * - * @param strings Strings instance for this operation. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New column of boolean results for each string. - */ -std::unique_ptr is_float( - strings_column_view const& strings, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Returns `true` if all strings contain - * characters that are valid for conversion to floats. - * - * This function will return `true` if all string elements - * has at least one character in [-+0-9eE.]. - * - * Any null entry or empty string will cause this function to return `false`. - * - * @param strings Strings instance for this operation. - * @return true if all string are valid - */ -bool all_float(strings_column_view const& strings); - /** @} */ // end of doxygen group } // namespace strings } // namespace cudf diff --git a/cpp/include/cudf/strings/convert/convert_floats.hpp b/cpp/include/cudf/strings/convert/convert_floats.hpp index cb4746dbf40..d1e00b36f6f 100644 --- a/cpp/include/cudf/strings/convert/convert_floats.hpp +++ b/cpp/include/cudf/strings/convert/convert_floats.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -68,6 +68,30 @@ std::unique_ptr from_floats( column_view const& floats, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Returns a boolean column identifying strings in which all + * characters are valid for conversion to floats. + * + * The output row entry will be set to `true` if the corresponding string element + * has at least one character in [-+0-9eE.]. + * + * @code{.pseudo} + * Example: + * s = ['123', '-456', '', 'A', '+7', '8.9' '3.7e+5'] + * b = s.is_float(s) + * b is [true, true, false, false, true, true, true] + * @endcode + * + * Any null row results in a null entry for that row in the output column. + * + * @param strings Strings instance for this operation. + * @param mr Device memory resource used to allocate the returned column's device memory. + * @return New column of boolean results for each string. + */ +std::unique_ptr is_float( + strings_column_view const& strings, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** @} */ // end of doxygen group } // namespace strings } // namespace cudf diff --git a/cpp/include/cudf/strings/convert/convert_integers.hpp b/cpp/include/cudf/strings/convert/convert_integers.hpp index 8f42deb380d..1e2fa80b129 100644 --- a/cpp/include/cudf/strings/convert/convert_integers.hpp +++ b/cpp/include/cudf/strings/convert/convert_integers.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -73,6 +73,30 @@ std::unique_ptr from_integers( column_view const& integers, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Returns a boolean column identifying strings in which all + * characters are valid for conversion to integers. + * + * The output row entry will be set to `true` if the corresponding string element + * has at least one character in [-+0-9]. + * + * @code{.pseudo} + * Example: + * s = ['123', '-456', '', 'A', '+7'] + * b = s.is_integer(s) + * b is [true, true, false, false, true] + * @endcode + * + * Any null row results in a null entry for that row in the output column. + * + * @param strings Strings instance for this operation. + * @param mr Device memory resource used to allocate the returned column's device memory. + * @return New column of boolean results for each string. + */ +std::unique_ptr is_integer( + strings_column_view const& strings, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Returns a new integer numeric column parsing hexadecimal values from the * provided strings column. diff --git a/cpp/src/strings/char_types/char_types.cu b/cpp/src/strings/char_types/char_types.cu index 10496b89328..0b384ad0631 100644 --- a/cpp/src/strings/char_types/char_types.cu +++ b/cpp/src/strings/char_types/char_types.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -186,91 +186,6 @@ std::unique_ptr filter_characters_of_type(strings_column_view const& str mr); } -std::unique_ptr is_integer( - strings_column_view const& strings, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) -{ - auto strings_column = column_device_view::create(strings.parent(), stream); - auto d_column = *strings_column; - // create output column - auto results = make_numeric_column(data_type{type_id::BOOL8}, - strings.size(), - cudf::detail::copy_bitmask(strings.parent(), stream, mr), - strings.null_count(), - stream, - mr); - auto d_results = results->mutable_view().data(); - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings.size()), - d_results, - [d_column] __device__(size_type idx) { - if (d_column.is_null(idx)) return false; - return string::is_integer(d_column.element(idx)); - }); - results->set_null_count(strings.null_count()); - return results; -} - -bool all_integer(strings_column_view const& strings, rmm::cuda_stream_view stream) -{ - auto strings_column = column_device_view::create(strings.parent(), stream); - auto d_column = *strings_column; - auto transformer_itr = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), [d_column] __device__(size_type idx) { - if (d_column.is_null(idx)) return false; - return string::is_integer(d_column.element(idx)); - }); - return thrust::all_of(rmm::exec_policy(stream), - transformer_itr, - transformer_itr + strings.size(), - thrust::identity()); -} - -std::unique_ptr is_float( - strings_column_view const& strings, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) -{ - auto strings_column = column_device_view::create(strings.parent(), stream); - auto d_column = *strings_column; - // create output column - auto results = make_numeric_column(data_type{type_id::BOOL8}, - strings.size(), - cudf::detail::copy_bitmask(strings.parent(), stream, mr), - strings.null_count(), - stream, - mr); - auto d_results = results->mutable_view().data(); - // check strings for valid float chars - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings.size()), - d_results, - [d_column] __device__(size_type idx) { - if (d_column.is_null(idx)) return false; - return string::is_float(d_column.element(idx)); - }); - results->set_null_count(strings.null_count()); - return results; -} - -bool all_float(strings_column_view const& strings, rmm::cuda_stream_view stream) -{ - auto strings_column = column_device_view::create(strings.parent(), stream); - auto d_column = *strings_column; - auto transformer_itr = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), [d_column] __device__(size_type idx) { - if (d_column.is_null(idx)) return false; - return string::is_float(d_column.element(idx)); - }); - return thrust::all_of(rmm::exec_policy(stream), - transformer_itr, - transformer_itr + strings.size(), - thrust::identity()); -} - } // namespace detail // external API @@ -295,31 +210,5 @@ std::unique_ptr filter_characters_of_type(strings_column_view const& str strings, types_to_remove, replacement, types_to_keep, rmm::cuda_stream_default, mr); } -std::unique_ptr is_integer(strings_column_view const& strings, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::is_integer(strings, rmm::cuda_stream_default, mr); -} - -std::unique_ptr is_float(strings_column_view const& strings, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::is_float(strings, rmm::cuda_stream_default, mr); -} - -bool all_integer(strings_column_view const& strings) -{ - CUDF_FUNC_RANGE(); - return detail::all_integer(strings, rmm::cuda_stream_default); -} - -bool all_float(strings_column_view const& strings) -{ - CUDF_FUNC_RANGE(); - return detail::all_float(strings, rmm::cuda_stream_default); -} - } // namespace strings } // namespace cudf diff --git a/cpp/src/strings/convert/convert_floats.cu b/cpp/src/strings/convert/convert_floats.cu index 2bf65976986..b6d99efd51f 100644 --- a/cpp/src/strings/convert/convert_floats.cu +++ b/cpp/src/strings/convert/convert_floats.cu @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -536,12 +537,50 @@ std::unique_ptr from_floats(column_view const& floats, } // namespace detail // external API - std::unique_ptr from_floats(column_view const& floats, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); return detail::from_floats(floats, rmm::cuda_stream_default, mr); } +namespace detail { +std::unique_ptr is_float( + strings_column_view const& strings, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +{ + auto strings_column = column_device_view::create(strings.parent(), stream); + auto d_column = *strings_column; + // create output column + auto results = make_numeric_column(data_type{type_id::BOOL8}, + strings.size(), + cudf::detail::copy_bitmask(strings.parent(), stream, mr), + strings.null_count(), + stream, + mr); + auto d_results = results->mutable_view().data(); + // check strings for valid float chars + thrust::transform(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(strings.size()), + d_results, + [d_column] __device__(size_type idx) { + if (d_column.is_null(idx)) return false; + return string::is_float(d_column.element(idx)); + }); + results->set_null_count(strings.null_count()); + return results; +} + +} // namespace detail + +// external API +std::unique_ptr is_float(strings_column_view const& strings, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::is_float(strings, rmm::cuda_stream_default, mr); +} + } // namespace strings } // namespace cudf diff --git a/cpp/src/strings/convert/convert_integers.cu b/cpp/src/strings/convert/convert_integers.cu index 112550fc25b..5c5032b5c87 100644 --- a/cpp/src/strings/convert/convert_integers.cu +++ b/cpp/src/strings/convert/convert_integers.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -245,7 +246,6 @@ std::unique_ptr from_integers(column_view const& integers, } // namespace detail // external API - std::unique_ptr from_integers(column_view const& integers, rmm::mr::device_memory_resource* mr) { @@ -253,5 +253,42 @@ std::unique_ptr from_integers(column_view const& integers, return detail::from_integers(integers, rmm::cuda_stream_default, mr); } +namespace detail { +std::unique_ptr is_integer( + strings_column_view const& strings, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +{ + auto strings_column = column_device_view::create(strings.parent(), stream); + auto d_column = *strings_column; + // create output column + auto results = make_numeric_column(data_type{type_id::BOOL8}, + strings.size(), + cudf::detail::copy_bitmask(strings.parent(), stream, mr), + strings.null_count(), + stream, + mr); + auto d_results = results->mutable_view().data(); + thrust::transform(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(strings.size()), + d_results, + [d_column] __device__(size_type idx) { + if (d_column.is_null(idx)) return false; + return string::is_integer(d_column.element(idx)); + }); + results->set_null_count(strings.null_count()); + return results; +} +} // namespace detail + +// external API +std::unique_ptr is_integer(strings_column_view const& strings, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::is_integer(strings, rmm::cuda_stream_default, mr); +} + } // namespace strings } // namespace cudf diff --git a/cpp/tests/strings/chars_types_tests.cpp b/cpp/tests/strings/chars_types_tests.cpp index 803a9b01b07..702329edaba 100644 --- a/cpp/tests/strings/chars_types_tests.cpp +++ b/cpp/tests/strings/chars_types_tests.cpp @@ -14,7 +14,6 @@ * limitations under the License. */ -#include #include #include #include @@ -228,54 +227,6 @@ TEST_F(StringsCharsTest, Numerics) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } -TEST_F(StringsCharsTest, Integers) -{ - cudf::test::strings_column_wrapper strings1( - {"+175", "-34", "9.8", "17+2", "+-14", "1234567890", "67de", "", "1e10", "-", "++", ""}); - auto results = cudf::strings::is_integer(cudf::strings_column_view(strings1)); - cudf::test::fixed_width_column_wrapper expected1({1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected1); - EXPECT_FALSE(cudf::strings::all_integer(cudf::strings_column_view(strings1))); - - cudf::test::strings_column_wrapper strings2( - {"0", "+0", "-0", "1234567890", "-27341132", "+012", "023", "-045"}); - results = cudf::strings::is_integer(cudf::strings_column_view(strings2)); - cudf::test::fixed_width_column_wrapper expected2({1, 1, 1, 1, 1, 1, 1, 1}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected2); - EXPECT_TRUE(cudf::strings::all_integer(cudf::strings_column_view(strings2))); -} - -TEST_F(StringsCharsTest, Floats) -{ - cudf::test::strings_column_wrapper strings1({"+175", - "-9.8", - "7+2", - "+-4", - "6.7e17", - "-1.2e-5", - "e", - ".e", - "1.e+-2", - "00.00", - "1.0e+1.0", - "1.2.3", - "+", - "--", - ""}); - auto results = cudf::strings::is_float(cudf::strings_column_view(strings1)); - cudf::test::fixed_width_column_wrapper expected1( - {1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected1); - EXPECT_FALSE(cudf::strings::all_float(cudf::strings_column_view(strings1))); - - cudf::test::strings_column_wrapper strings2( - {"+175", "-34", "9.8", "1234567890", "6.7e17", "-917.2e5"}); - results = cudf::strings::is_float(cudf::strings_column_view(strings2)); - cudf::test::fixed_width_column_wrapper expected2({1, 1, 1, 1, 1, 1}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected2); - EXPECT_TRUE(cudf::strings::all_float(cudf::strings_column_view(strings2))); -} - TEST_F(StringsCharsTest, EmptyStrings) { cudf::test::strings_column_wrapper strings({"", "", ""}); @@ -284,12 +235,6 @@ TEST_F(StringsCharsTest, EmptyStrings) auto results = cudf::strings::all_characters_of_type( strings_view, cudf::strings::string_character_types::ALPHANUM); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - results = cudf::strings::is_integer(strings_view); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - EXPECT_FALSE(cudf::strings::all_integer(strings_view)); - results = cudf::strings::is_float(strings_view); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - EXPECT_FALSE(cudf::strings::all_float(strings_view)); } TEST_F(StringsCharsTest, FilterCharTypes) @@ -379,14 +324,6 @@ TEST_F(StringsCharsTest, EmptyStringsColumn) EXPECT_EQ(cudf::type_id::BOOL8, results->view().type().id()); EXPECT_EQ(0, results->view().size()); - results = cudf::strings::is_integer(strings_view); - EXPECT_EQ(cudf::type_id::BOOL8, results->view().type().id()); - EXPECT_EQ(0, results->view().size()); - - results = cudf::strings::is_float(strings_view); - EXPECT_EQ(cudf::type_id::BOOL8, results->view().type().id()); - EXPECT_EQ(0, results->view().size()); - results = cudf::strings::filter_characters_of_type( strings_view, cudf::strings::string_character_types::NUMERIC); EXPECT_EQ(cudf::type_id::STRING, results->view().type().id()); diff --git a/cpp/tests/strings/floats_tests.cpp b/cpp/tests/strings/floats_tests.cpp index b98416d9edd..f7151363d83 100644 --- a/cpp/tests/strings/floats_tests.cpp +++ b/cpp/tests/strings/floats_tests.cpp @@ -27,6 +27,41 @@ struct StringsConvertTest : public cudf::test::BaseFixture { }; +TEST_F(StringsConvertTest, IsFloat) +{ + cudf::test::strings_column_wrapper strings; + auto strings_view = cudf::strings_column_view(strings); + auto results = cudf::strings::is_float(strings_view); + EXPECT_EQ(cudf::type_id::BOOL8, results->view().type().id()); + EXPECT_EQ(0, results->view().size()); + + cudf::test::strings_column_wrapper strings1({"+175", + "-9.8", + "7+2", + "+-4", + "6.7e17", + "-1.2e-5", + "e", + ".e", + "1.e+-2", + "00.00", + "1.0e+1.0", + "1.2.3", + "+", + "--", + ""}); + results = cudf::strings::is_float(cudf::strings_column_view(strings1)); + cudf::test::fixed_width_column_wrapper expected1( + {1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected1); + + cudf::test::strings_column_wrapper strings2( + {"+175", "-34", "9.8", "1234567890", "6.7e17", "-917.2e5"}); + results = cudf::strings::is_float(cudf::strings_column_view(strings2)); + cudf::test::fixed_width_column_wrapper expected2({1, 1, 1, 1, 1, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected2); +} + TEST_F(StringsConvertTest, ToFloats32) { std::vector h_strings{"1234", diff --git a/cpp/tests/strings/integers_tests.cu b/cpp/tests/strings/integers_tests.cu index 9e2b9809b26..d6bf03b3f76 100644 --- a/cpp/tests/strings/integers_tests.cu +++ b/cpp/tests/strings/integers_tests.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -29,6 +29,27 @@ struct StringsConvertTest : public cudf::test::BaseFixture { }; +TEST_F(StringsConvertTest, IsInteger) +{ + cudf::test::strings_column_wrapper strings; + auto strings_view = cudf::strings_column_view(strings); + auto results = cudf::strings::is_integer(strings_view); + EXPECT_EQ(cudf::type_id::BOOL8, results->view().type().id()); + EXPECT_EQ(0, results->view().size()); + + cudf::test::strings_column_wrapper strings1( + {"+175", "-34", "9.8", "17+2", "+-14", "1234567890", "67de", "", "1e10", "-", "++", ""}); + results = cudf::strings::is_integer(cudf::strings_column_view(strings1)); + cudf::test::fixed_width_column_wrapper expected1({1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected1); + + cudf::test::strings_column_wrapper strings2( + {"0", "+0", "-0", "1234567890", "-27341132", "+012", "023", "-045"}); + results = cudf::strings::is_integer(cudf::strings_column_view(strings2)); + cudf::test::fixed_width_column_wrapper expected2({1, 1, 1, 1, 1, 1, 1, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected2); +} + TEST_F(StringsConvertTest, ToInteger) { std::vector h_strings{ diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index 0ce9d6303e4..ac14e1605d7 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -37,7 +37,6 @@ #include #include #include -#include #include #include #include diff --git a/python/cudf/cudf/_lib/cpp/strings/char_types.pxd b/python/cudf/cudf/_lib/cpp/strings/char_types.pxd index ad675027c10..934269c6f25 100644 --- a/python/cudf/cudf/_lib/cpp/strings/char_types.pxd +++ b/python/cudf/cudf/_lib/cpp/strings/char_types.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2021, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from cudf._lib.cpp.column.column_view cimport column_view @@ -33,11 +33,3 @@ cdef extern from "cudf/strings/char_types/char_types.hpp" \ string_character_types types_to_remove, string_scalar replacement, string_character_types types_to_keep) except + - - cdef unique_ptr[column] is_integer( - column_view source_strings - ) except + - - cdef unique_ptr[column] is_float( - column_view source_strings - ) except + diff --git a/python/cudf/cudf/_lib/cpp/strings/convert/convert_floats.pxd b/python/cudf/cudf/_lib/cpp/strings/convert/convert_floats.pxd index baee01b8f99..55a84b60efd 100644 --- a/python/cudf/cudf/_lib/cpp/strings/convert/convert_floats.pxd +++ b/python/cudf/cudf/_lib/cpp/strings/convert/convert_floats.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2021, NVIDIA CORPORATION. from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view @@ -14,3 +14,7 @@ cdef extern from "cudf/strings/convert/convert_floats.hpp" namespace \ cdef unique_ptr[column] from_floats( column_view input_col) except + + + cdef unique_ptr[column] is_float( + column_view source_strings + ) except + diff --git a/python/cudf/cudf/_lib/cpp/strings/convert/convert_integers.pxd b/python/cudf/cudf/_lib/cpp/strings/convert/convert_integers.pxd index 92f99a2f5cb..6e45d4ba869 100644 --- a/python/cudf/cudf/_lib/cpp/strings/convert/convert_integers.pxd +++ b/python/cudf/cudf/_lib/cpp/strings/convert/convert_integers.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2021, NVIDIA CORPORATION. from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view @@ -15,6 +15,10 @@ cdef extern from "cudf/strings/convert/convert_integers.hpp" namespace \ cdef unique_ptr[column] from_integers( column_view input_col) except + + cdef unique_ptr[column] is_integer( + column_view source_strings + ) except + + cdef unique_ptr[column] hex_to_integers( column_view input_col, data_type output_type) except + diff --git a/python/cudf/cudf/_lib/strings/char_types.pyx b/python/cudf/cudf/_lib/strings/char_types.pyx index 5d8d1522418..1890e98f956 100644 --- a/python/cudf/cudf/_lib/strings/char_types.pyx +++ b/python/cudf/cudf/_lib/strings/char_types.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2021, NVIDIA CORPORATION. from libcpp cimport bool from libcpp.memory cimport unique_ptr @@ -14,8 +14,6 @@ from cudf._lib.cpp.strings.char_types cimport ( all_characters_of_type as cpp_all_characters_of_type, filter_characters_of_type as cpp_filter_characters_of_type, string_character_types as string_character_types, - is_integer as cpp_is_integer, - is_float as cpp_is_float, ) @@ -191,35 +189,3 @@ def is_space(Column source_strings): )) return Column.from_unique_ptr(move(c_result)) - - -def is_integer(Column source_strings): - """ - Returns a Column of boolean values with True for `source_strings` - that have intergers. - """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_is_integer( - source_view - )) - - return Column.from_unique_ptr(move(c_result)) - - -def is_float(Column source_strings): - """ - Returns a Column of boolean values with True for `source_strings` - that have floats. - """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_is_float( - source_view - )) - - return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_lib/strings/convert/convert_floats.pyx b/python/cudf/cudf/_lib/strings/convert/convert_floats.pyx new file mode 100644 index 00000000000..195d9b71f6e --- /dev/null +++ b/python/cudf/cudf/_lib/strings/convert/convert_floats.pyx @@ -0,0 +1,29 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. + +from libcpp cimport bool +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move + +from cudf._lib.cpp.column.column_view cimport column_view +from cudf._lib.column cimport Column +from cudf._lib.cpp.column.column cimport column + +from cudf._lib.cpp.strings.convert.convert_floats cimport ( + is_float as cpp_is_float, +) + + +def is_float(Column source_strings): + """ + Returns a Column of boolean values with True for `source_strings` + that have floats. + """ + cdef unique_ptr[column] c_result + cdef column_view source_view = source_strings.view() + + with nogil: + c_result = move(cpp_is_float( + source_view + )) + + return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_lib/strings/convert/convert_integers.pyx b/python/cudf/cudf/_lib/strings/convert/convert_integers.pyx new file mode 100644 index 00000000000..d1bae1edd37 --- /dev/null +++ b/python/cudf/cudf/_lib/strings/convert/convert_integers.pyx @@ -0,0 +1,29 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. + +from libcpp cimport bool +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move + +from cudf._lib.cpp.column.column_view cimport column_view +from cudf._lib.column cimport Column +from cudf._lib.cpp.column.column cimport column + +from cudf._lib.cpp.strings.convert.convert_integers cimport ( + is_integer as cpp_is_integer, +) + + +def is_integer(Column source_strings): + """ + Returns a Column of boolean values with True for `source_strings` + that have intergers. + """ + cdef unique_ptr[column] c_result + cdef column_view source_view = source_strings.view() + + with nogil: + c_result = move(cpp_is_integer( + source_view + )) + + return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index ea01aa07b91..11dd7556812 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -70,13 +70,15 @@ is_alpha as cpp_is_alpha, is_decimal as cpp_is_decimal, is_digit as cpp_is_digit, - is_float as cpp_is_float, - is_integer as cpp_is_integer, is_lower as cpp_is_lower, is_numeric as cpp_is_numeric, is_space as cpp_isspace, is_upper as cpp_is_upper, ) +from cudf._lib.strings.convert.convert_integers import ( + is_integer as cpp_is_integer, +) +from cudf._lib.strings.convert.convert_floats import is_float as cpp_is_float from cudf._lib.strings.combine import ( concatenate as cpp_concatenate, join as cpp_join, diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 4e5e4ce1987..535e497e8dc 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -8,7 +8,9 @@ from pandas.core.tools.datetimes import _unit_map import cudf -from cudf._lib.strings.char_types import is_integer as cpp_is_integer +from cudf._lib.strings.convert.convert_integers import ( + is_integer as cpp_is_integer, +) from cudf.core import column from cudf.core.index import as_index from cudf.utils.dtypes import is_scalar From 168c489a9415ae7bbbec5ef600b0d3dcde44b583 Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Wed, 17 Mar 2021 20:37:11 -0500 Subject: [PATCH 2/8] Fix Series/Dataframe Mixed Arithmetic (#7491) Fixes https://github.com/rapidsai/cudf/issues/7385 Authors: - @brandon-b-miller Approvers: - GALI PREM SAGAR (@galipremsagar) - Michael Wang (@isVoid) URL: https://github.com/rapidsai/cudf/pull/7491 --- python/cudf/cudf/core/dataframe.py | 8 ++--- python/cudf/cudf/core/series.py | 4 +-- python/cudf/cudf/tests/test_dataframe.py | 42 ++++++++++++------------ 3 files changed, 24 insertions(+), 30 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 25f57748765..9672ab3002f 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1518,11 +1518,7 @@ def fallback(col, fn): else: if col not in df_cols: r_opr = other_cols[col] - l_opr = Series( - column_empty( - len(self), masked=True, dtype=other.dtype - ) - ) + l_opr = Series(as_column(np.nan, length=len(self))) if col not in other_cols_keys: r_opr = None l_opr = self[col] @@ -2198,7 +2194,7 @@ def rpow(self, other, axis="columns", level=None, fill_value=None): return self._apply_op("rpow", other, fill_value) def __rpow__(self, other): - return self._apply_op("__pow__", other) + return self._apply_op("__rpow__", other) def floordiv(self, other, axis="columns", level=None, fill_value=None): """ diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 5e7121c0488..b06fef178f6 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1501,9 +1501,7 @@ def _binaryop( If ``reflect`` is ``True``, swap the order of the operands. """ if isinstance(other, cudf.DataFrame): - # TODO: fn is not the same as arg expected by _apply_op - # e.g. for fn = 'and', _apply_op equivalent is '__and__' - return other._apply_op(self, fn) + return NotImplemented result_name = utils.get_result_name(self, other) if isinstance(other, Series): diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 77548b95277..5f4d571e8c5 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -4996,13 +4996,13 @@ def test_cov_nans(): @pytest.mark.parametrize( "gsr", [ - cudf.Series([1, 2, 3]), - cudf.Series([1, 2, 3], index=["a", "b", "c"]), - cudf.Series([1, 2, 3], index=["a", "b", "d"]), - cudf.Series([1, 2], index=["a", "b"]), - cudf.Series([1, 2, 3], index=cudf.core.index.RangeIndex(0, 3)), + cudf.Series([4, 2, 3]), + cudf.Series([4, 2, 3], index=["a", "b", "c"]), + cudf.Series([4, 2, 3], index=["a", "b", "d"]), + cudf.Series([4, 2], index=["a", "b"]), + cudf.Series([4, 2, 3], index=cudf.core.index.RangeIndex(0, 3)), pytest.param( - cudf.Series([1, 2, 3, 4, 5], index=["a", "b", "d", "0", "12"]), + cudf.Series([4, 2, 3, 4, 5], index=["a", "b", "d", "0", "12"]), marks=pytest.mark.xfail, ), ], @@ -5017,32 +5017,32 @@ def test_cov_nans(): operator.truediv, operator.mod, operator.pow, - # comparison ops will temporarily XFAIL - # see PR https://github.com/rapidsai/cudf/pull/7491 - pytest.param(operator.eq, marks=pytest.mark.xfail()), - pytest.param(operator.lt, marks=pytest.mark.xfail()), - pytest.param(operator.le, marks=pytest.mark.xfail()), - pytest.param(operator.gt, marks=pytest.mark.xfail()), - pytest.param(operator.ge, marks=pytest.mark.xfail()), - pytest.param(operator.ne, marks=pytest.mark.xfail()), + operator.eq, + operator.lt, + operator.le, + operator.gt, + operator.ge, + operator.ne, ], ) def test_df_sr_binop(gsr, colnames, op): - data = [[0, 2, 5], [3, None, 5], [6, 7, np.nan]] + data = [[3.0, 2.0, 5.0], [3.0, None, 5.0], [6.0, 7.0, np.nan]] data = dict(zip(colnames, data)) + gsr = gsr.astype("float64") + gdf = cudf.DataFrame(data) - pdf = pd.DataFrame.from_dict(data) + pdf = gdf.to_pandas(nullable=True) - psr = gsr.to_pandas() + psr = gsr.to_pandas(nullable=True) expect = op(pdf, psr) - got = op(gdf, gsr) - assert_eq(expect.astype(float), got.astype(float)) + got = op(gdf, gsr).to_pandas(nullable=True) + assert_eq(expect, got, check_dtype=False) expect = op(psr, pdf) - got = op(psr, pdf) - assert_eq(expect.astype(float), got.astype(float)) + got = op(gsr, gdf).to_pandas(nullable=True) + assert_eq(expect, got, check_dtype=False) @pytest.mark.parametrize( From 99001d2c8d9b3898e58c74d7979ab6204c5e5bee Mon Sep 17 00:00:00 2001 From: Alfred Xu Date: Thu, 18 Mar 2021 09:57:30 +0800 Subject: [PATCH 3/8] Java support on explode_outer (#7625) This pull request aims to enable `cudf::explode_outer` and `cudf::explode_outer_position` in Java package. Authors: - Alfred Xu (@sperlingxx) Approvers: - Robert (Bobby) Evans (@revans2) URL: https://github.com/rapidsai/cudf/pull/7625 --- java/src/main/java/ai/rapids/cudf/Table.java | 141 +++++++++++++++--- java/src/main/native/src/TableJni.cpp | 28 ++++ .../test/java/ai/rapids/cudf/TableTest.java | 86 +++++++++-- 3 files changed, 218 insertions(+), 37 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index 7385b55d0df..d0e59fdc105 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -516,6 +516,10 @@ private static native long[] repeatColumnCount(long tableHandle, private static native long[] explodePosition(long tableHandle, int index); + private static native long[] explodeOuter(long tableHandle, int index); + + private static native long[] explodeOuterPosition(long tableHandle, int index); + private static native long createCudfTableView(long[] nativeColumnViewHandles); private static native long[] columnViewsFromPacked(ByteBuffer metadata, long dataAddress); @@ -1725,7 +1729,7 @@ public ContiguousTable[] contiguousSplit(int... indices) { * Example: * input: [[5,10,15], 100], * [[20,25], 200], - * [[30], 300], + * [[30], 300] * index: 0 * output: [5, 100], * [10, 100], @@ -1737,12 +1741,12 @@ public ContiguousTable[] contiguousSplit(int... indices) { * * Nulls propagate in different ways depending on what is null. * - * [[5,null,15], 100], - * [null, 200] - * returns: - * [5, 100], - * [null, 100], - * [15, 100] + * input: [[5,null,15], 100], + * [null, 200] + * index: 0 + * output: [5, 100], + * [null, 100], + * [15, 100] * * Note that null lists are completely removed from the output * and nulls inside lists are pulled out and remain. @@ -1763,27 +1767,26 @@ public Table explode(int index) { * in the output. The corresponding rows for other columns in the input are duplicated. A position * column is added that has the index inside the original list for each row. Example: * - * [[5,10,15], 100], - * [[20,25], 200], - * [[30], 300], - * returns - * [0, 5, 100], - * [1, 10, 100], - * [2, 15, 100], - * [0, 20, 200], - * [1, 25, 200], - * [0, 30, 300], + * input: [[5,10,15], 100], + * [[20,25], 200], + * [[30], 300] + * index: 0 + * output: [0, 5, 100], + * [1, 10, 100], + * [2, 15, 100], + * [0, 20, 200], + * [1, 25, 200], + * [0, 30, 300] * * * Nulls and empty lists propagate in different ways depending on what is null or empty. * - * [[5,null,15], 100], - * [null, 200], - * [[], 300], - * returns - * [0, 5, 100], - * [1, null, 100], - * [2, 15, 100], + * input: [[5,null,15], 100], + * [null, 200] + * index: 0 + * output: [5, 100], + * [null, 100], + * [15, 100] * * * Note that null lists are not included in the resulting table, but nulls inside @@ -1799,6 +1802,96 @@ public Table explodePosition(int index) { return new Table(explodePosition(nativeHandle, index)); } + /** + * Explodes a list column's elements. + * + * Any list is exploded, which means the elements of the list in each row are expanded + * into new rows in the output. The corresponding rows for other columns in the input + * are duplicated. + * + * + * Example: + * input: [[5,10,15], 100], + * [[20,25], 200], + * [[30], 300], + * index: 0 + * output: [5, 100], + * [10, 100], + * [15, 100], + * [20, 200], + * [25, 200], + * [30, 300] + * + * + * Nulls propagate in different ways depending on what is null. + * + * input: [[5,null,15], 100], + * [null, 200] + * index: 0 + * output: [5, 100], + * [null, 100], + * [15, 100], + * [null, 200] + * + * Note that null lists are completely removed from the output + * and nulls inside lists are pulled out and remain. + * + * @param index Column index to explode inside the table. + * @return A new table with explode_col exploded. + */ + public Table explodeOuter(int index) { + assert 0 <= index && index < columns.length : "Column index is out of range"; + assert columns[index].getType().equals(DType.LIST) : "Column to explode must be of type LIST"; + return new Table(explodeOuter(nativeHandle, index)); + } + + /** + * Explodes a list column's elements retaining any null entries or empty lists and includes a + * position column. + * + * Any list is exploded, which means the elements of the list in each row are expanded into new rows + * in the output. The corresponding rows for other columns in the input are duplicated. A position + * column is added that has the index inside the original list for each row. Example: + * + * + * Example: + * input: [[5,10,15], 100], + * [[20,25], 200], + * [[30], 300], + * index: 0 + * output: [0, 5, 100], + * [1, 10, 100], + * [2, 15, 100], + * [0, 20, 200], + * [1, 25, 200], + * [0, 30, 300] + * + * + * Nulls and empty lists propagate as null entries in the result. + * + * input: [[5,null,15], 100], + * [null, 200], + * [[], 300] + * index: 0 + * output: [0, 5, 100], + * [1, null, 100], + * [2, 15, 100], + * [0, null, 200], + * [0, null, 300] + * + * + * returns + * + * @param index Column index to explode inside the table. + * @return A new table with exploded value and position. The column order of return table is + * [cols before explode_input, explode_position, explode_value, cols after explode_input]. + */ + public Table explodeOuterPosition(int index) { + assert 0 <= index && index < columns.length : "Column index is out of range"; + assert columns[index].getType().equals(DType.LIST) : "Column to explode must be of type LIST"; + return new Table(explodeOuterPosition(nativeHandle, index)); + } + /** * Gathers the rows of this table according to `gatherMap` such that row "i" * in the resulting table's columns will contain row "gatherMap[i]" from this table. diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 4548156055a..02385a453d0 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -2052,4 +2052,32 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_explodePosition(JNIEnv *e CATCH_STD(env, 0); } +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_explodeOuter(JNIEnv *env, jclass, + jlong input_jtable, + jint column_index) { + JNI_NULL_CHECK(env, input_jtable, "explode: input table is null", 0); + try { + cudf::jni::auto_set_device(env); + cudf::table_view *input_table = reinterpret_cast(input_jtable); + cudf::size_type col_index = static_cast(column_index); + std::unique_ptr exploded = cudf::explode_outer(*input_table, col_index); + return cudf::jni::convert_table_for_return(env, exploded); + } + CATCH_STD(env, 0); +} + +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_explodeOuterPosition(JNIEnv *env, jclass, + jlong input_jtable, + jint column_index) { + JNI_NULL_CHECK(env, input_jtable, "explode: input table is null", 0); + try { + cudf::jni::auto_set_device(env); + cudf::table_view *input_table = reinterpret_cast(input_jtable); + cudf::size_type col_index = static_cast(column_index); + std::unique_ptr exploded = cudf::explode_outer_position(*input_table, col_index); + return cudf::jni::convert_table_for_return(env, exploded); + } + CATCH_STD(env, 0); +} + } // extern "C" diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index 626f7828012..c2e28e1cad8 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -4635,7 +4635,7 @@ private Table[] buildExplodeTestTableWithPrimitiveTypes(boolean pos, boolean out } } - private Table[] buildExplodeTestTableWithNestedTypes(boolean pos) { + private Table[] buildExplodeTestTableWithNestedTypes(boolean pos, boolean outer) { StructType nestedType = new StructType(true, new BasicType(false, DType.INT32), new BasicType(false, DType.STRING)); try (Table input = new Table.TestBuilder() @@ -4644,23 +4644,42 @@ private Table[] buildExplodeTestTableWithNestedTypes(boolean pos) { Arrays.asList(struct(4, "k4"), struct(5, "k5")), Arrays.asList(struct(6, "k6")), Arrays.asList(new HostColumnVector.StructData((List) null)), - Arrays.asList()) + null) .column("s1", "s2", "s3", "s4", "s5") .column(1, 3, 5, 7, 9) .column(12.0, 14.0, 13.0, 11.0, 15.0) .build()) { Table.TestBuilder expectedBuilder = new Table.TestBuilder(); if (pos) { - expectedBuilder.column(0, 1, 2, 0, 1, 0, 0); + if (!outer) + expectedBuilder.column(0, 1, 2, 0, 1, 0, 0); + else + expectedBuilder.column(0, 1, 2, 0, 1, 0, 0, 0); } - try (Table expected = expectedBuilder - .column(nestedType, + List expectedData = new ArrayList(){{ + if (!outer) { + this.add(new HostColumnVector.StructData[]{ + struct(1, "k1"), struct(2, "k2"), struct(3, "k3"), + struct(4, "k4"), struct(5, "k5"), struct(6, "k6"), + new HostColumnVector.StructData((List) null)}); + this.add(new String[]{"s1", "s1", "s1", "s2", "s2", "s3", "s4"}); + this.add(new Integer[]{1, 1, 1, 3, 3, 5, 7}); + this.add(new Double[]{12.0, 12.0, 12.0, 14.0, 14.0, 13.0, 11.0}); + } else { + this.add(new HostColumnVector.StructData[]{ struct(1, "k1"), struct(2, "k2"), struct(3, "k3"), struct(4, "k4"), struct(5, "k5"), struct(6, "k6"), - new HostColumnVector.StructData((List) null)) - .column("s1", "s1", "s1", "s2", "s2", "s3", "s4") - .column(1, 1, 1, 3, 3, 5, 7) - .column(12.0, 12.0, 12.0, 14.0, 14.0, 13.0, 11.0) + new HostColumnVector.StructData((List) null), null}); + this.add(new String[]{"s1", "s1", "s1", "s2", "s2", "s3", "s4", "s5"}); + this.add(new Integer[]{1, 1, 1, 3, 3, 5, 7, 9}); + this.add(new Double[]{12.0, 12.0, 12.0, 14.0, 14.0, 13.0, 11.0, 15.0}); + } + }}; + try (Table expected = expectedBuilder + .column(nestedType, (HostColumnVector.StructData[]) expectedData.get(0)) + .column((String[]) expectedData.get(1)) + .column((Integer[]) expectedData.get(2)) + .column((Double[]) expectedData.get(3)) .build()) { return new Table[]{new Table(input.getColumns()), new Table(expected.getColumns())}; } @@ -4679,7 +4698,7 @@ void testExplode() { } // Child is nested type - Table[] testTables2 = buildExplodeTestTableWithNestedTypes(false); + Table[] testTables2 = buildExplodeTestTableWithNestedTypes(false, false); try (Table input = testTables2[0]; Table expected = testTables2[1]) { try (Table exploded = input.explode(0)) { @@ -4689,7 +4708,7 @@ void testExplode() { } @Test - void testPosExplode() { + void testExplodePosition() { // Child is primitive type Table[] testTables = buildExplodeTestTableWithPrimitiveTypes(true, false); try (Table input = testTables[0]; @@ -4699,8 +4718,8 @@ void testPosExplode() { } } - // Child is primitive type - Table[] testTables2 = buildExplodeTestTableWithNestedTypes(true); + // Child is nested type + Table[] testTables2 = buildExplodeTestTableWithNestedTypes(true, false); try (Table input = testTables2[0]; Table expected = testTables2[1]) { try (Table exploded = input.explodePosition(0)) { @@ -4709,4 +4728,45 @@ void testPosExplode() { } } + @Test + void testExplodeOuter() { + // Child is primitive type + Table[] testTables = buildExplodeTestTableWithPrimitiveTypes(false, true); + try (Table input = testTables[0]; + Table expected = testTables[1]) { + try (Table exploded = input.explodeOuter(0)) { + assertTablesAreEqual(expected, exploded); + } + } + + // Child is nested type + Table[] testTables2 = buildExplodeTestTableWithNestedTypes(false, true); + try (Table input = testTables2[0]; + Table expected = testTables2[1]) { + try (Table exploded = input.explodeOuter(0)) { + assertTablesAreEqual(expected, exploded); + } + } + } + + @Test + void testExplodeOuterPosition() { + // Child is primitive type + Table[] testTables = buildExplodeTestTableWithPrimitiveTypes(true, true); + try (Table input = testTables[0]; + Table expected = testTables[1]) { + try (Table exploded = input.explodeOuterPosition(0)) { + assertTablesAreEqual(expected, exploded); + } + } + + // Child is nested type + Table[] testTables2 = buildExplodeTestTableWithNestedTypes(true, true); + try (Table input = testTables2[0]; + Table expected = testTables2[1]) { + try (Table exploded = input.explodeOuterPosition(0)) { + assertTablesAreEqual(expected, exploded); + } + } + } } From 9aa33efa9e931c9de78e047a20e6a7481ee13559 Mon Sep 17 00:00:00 2001 From: David <45795991+davidwendt@users.noreply.github.com> Date: Wed, 17 Mar 2021 22:54:30 -0400 Subject: [PATCH 4/8] Optimize cudf::make_strings_column for long strings (#7576) Reference #7571 This improves the performance of the `cudf::make_strings_column` for long strings. It uses a similar approach from `cudf::strings::detail::gather` and also use thresholding as in the optimized `cudf::strings::replace`. This may not be the right solution for overall optimizing #7571 but may be helpful in other places where long strings are used for created a strings column in libcudf. This PR also includes a gbenchmark to help measure the performance results of this factory function. The results of the benchmark are that longer strings (~ >64 bytes on average) showed about a 10x improvement. I can post benchmark results here if needed. The character-parallel algorithm was slower for shorter strings so the existing algorithm is used based on the a threshold calculation. I also added an additional gtest with a mixture of nulls and empty strings to make sure the new algorithm handles these correctly. Authors: - David (@davidwendt) Approvers: - Jason Lowe (@jlowe) - Nghia Truong (@ttnghia) - Jake Hemstad (@jrhemstad) URL: https://github.com/rapidsai/cudf/pull/7576 --- cpp/benchmarks/CMakeLists.txt | 1 + cpp/benchmarks/string/factory_benchmark.cu | 93 ++++++++++++++++++ cpp/benchmarks/string/string_bench_args.hpp | 2 + cpp/include/cudf/strings/detail/gather.cuh | 87 ++++++++++------ .../detail/strings_column_factories.cuh | 98 +++++++++++++++---- cpp/include/cudf/utilities/traits.hpp | 12 +++ cpp/tests/strings/factories_test.cu | 34 +++++++ 7 files changed, 280 insertions(+), 47 deletions(-) create mode 100644 cpp/benchmarks/string/factory_benchmark.cu diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index dfc340b1459..e63ea38a31b 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -185,6 +185,7 @@ ConfigureBench(STRINGS_BENCH string/convert_floats_benchmark.cpp string/copy_benchmark.cpp string/extract_benchmark.cpp + string/factory_benchmark.cu string/filter_benchmark.cpp string/find_benchmark.cpp string/replace_benchmark.cpp diff --git a/cpp/benchmarks/string/factory_benchmark.cu b/cpp/benchmarks/string/factory_benchmark.cu new file mode 100644 index 00000000000..6c5dceffaa8 --- /dev/null +++ b/cpp/benchmarks/string/factory_benchmark.cu @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "string_bench_args.hpp" + +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include +#include + +#include + +namespace { +using string_pair = thrust::pair; +struct string_view_to_pair { + __device__ string_pair operator()(thrust::pair const& p) + { + return (p.second) ? string_pair{p.first.data(), p.first.size_bytes()} : string_pair{nullptr, 0}; + } +}; +} // namespace + +class StringsFactory : public cudf::benchmark { +}; + +static void BM_factory(benchmark::State& state) +{ + cudf::size_type const n_rows{static_cast(state.range(0))}; + cudf::size_type const max_str_length{static_cast(state.range(1))}; + data_profile table_profile; + table_profile.set_distribution_params( + cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length); + auto const table = + create_random_table({cudf::type_id::STRING}, 1, row_count{n_rows}, table_profile); + auto d_column = cudf::column_device_view::create(table->view().column(0)); + rmm::device_vector pairs(d_column->size()); + thrust::transform(thrust::device, + d_column->pair_begin(), + d_column->pair_end(), + pairs.data(), + string_view_to_pair{}); + + for (auto _ : state) { + cuda_event_timer raii(state, true, 0); + cudf::make_strings_column(pairs); + } + + cudf::strings_column_view input(table->view().column(0)); + state.SetBytesProcessed(state.iterations() * input.chars_size()); +} + +static void generate_bench_args(benchmark::internal::Benchmark* b) +{ + int const min_rows = 1 << 12; + int const max_rows = 1 << 24; + int const row_mult = 8; + int const min_rowlen = 1 << 5; + int const max_rowlen = 1 << 13; + int const len_mult = 4; + generate_string_bench_args(b, min_rows, max_rows, row_mult, min_rowlen, max_rowlen, len_mult); +} + +#define STRINGS_BENCHMARK_DEFINE(name) \ + BENCHMARK_DEFINE_F(StringsFactory, name) \ + (::benchmark::State & st) { BM_factory(st); } \ + BENCHMARK_REGISTER_F(StringsFactory, name) \ + ->Apply(generate_bench_args) \ + ->UseManualTime() \ + ->Unit(benchmark::kMillisecond); + +STRINGS_BENCHMARK_DEFINE(factory) diff --git a/cpp/benchmarks/string/string_bench_args.hpp b/cpp/benchmarks/string/string_bench_args.hpp index f81f859de74..9c709b064dd 100644 --- a/cpp/benchmarks/string/string_bench_args.hpp +++ b/cpp/benchmarks/string/string_bench_args.hpp @@ -17,6 +17,8 @@ #include +#include + /** * @brief Generate row count and row length argument ranges for a string benchmark. * diff --git a/cpp/include/cudf/strings/detail/gather.cuh b/cpp/include/cudf/strings/detail/gather.cuh index 28da8ef4324..988fa552100 100644 --- a/cpp/include/cudf/strings/detail/gather.cuh +++ b/cpp/include/cudf/strings/detail/gather.cuh @@ -31,15 +31,60 @@ #include namespace cudf { +namespace strings { +namespace detail { -template -constexpr inline bool is_signed_iterator() +/** + * @brief Returns a new chars column using the specified indices to select + * strings from the input iterator. + * + * This uses a character-parallel gather CUDA kernel that performs very + * well on a strings column with long strings (e.g. average > 64 bytes). + * + * @tparam StringIterator Iterator should produce `string_view` objects. + * @tparam MapIterator Iterator for retrieving integer indices of the `StringIterator`. + * + * @param strings_begin Start of the iterator to retrieve `string_view` instances + * @param map_begin Start of index iterator. + * @param map_end End of index iterator. + * @param offsets The offset values to be associated with the output chars column. + * @param chars_bytes The total number of bytes for the output chars column. + * @param mr Device memory resource used to allocate the returned column's device memory. + * @param stream CUDA stream used for device memory operations and kernel launches. + * @return New chars column fit for a strings column. + */ +template +std::unique_ptr gather_chars(StringIterator strings_begin, + MapIterator map_begin, + MapIterator map_end, + cudf::device_span const offsets, + size_type chars_bytes, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - return std::is_signed::value_type>::value; -} + auto const output_count = std::distance(map_begin, map_end); + if (output_count == 0) return make_empty_column(data_type{type_id::INT8}); -namespace strings { -namespace detail { + auto chars_column = create_chars_child_column(output_count, 0, chars_bytes, stream, mr); + auto const d_chars = chars_column->mutable_view().template data(); + + auto gather_chars_fn = [strings_begin, map_begin, offsets] __device__(size_type out_idx) -> char { + auto const out_row = + thrust::prev(thrust::upper_bound(thrust::seq, offsets.begin(), offsets.end(), out_idx)); + auto const row_idx = map_begin[thrust::distance(offsets.begin(), out_row)]; // get row index + auto const d_str = strings_begin[row_idx]; // get row's string + auto const offset = out_idx - *out_row; // get string's char + return d_str.data()[offset]; + }; + + thrust::transform(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(chars_bytes), + d_chars, + gather_chars_fn); + + return chars_column; +} /** * @brief Returns a new strings column using the specified indices to select @@ -107,29 +152,15 @@ std::unique_ptr gather( rmm::exec_policy(stream), d_out_offsets, d_out_offsets + output_count + 1, d_out_offsets); // build chars column - size_type const out_chars_bytes = static_cast(total_bytes); - auto out_chars_column = create_chars_child_column(output_count, 0, out_chars_bytes, stream, mr); - auto const d_out_chars = out_chars_column->mutable_view().template data(); - - // fill in chars cudf::device_span const d_out_offsets_span(d_out_offsets, output_count + 1); - auto const d_in_chars = (strings_count > 0) ? strings.chars().data() : nullptr; - auto gather_chars_fn = - [d_out_offsets_span, begin, d_in_offsets, d_in_chars] __device__(size_type out_char_idx) { - // find output row index for this output char index - auto const next_row_ptr = thrust::upper_bound( - thrust::seq, d_out_offsets_span.begin(), d_out_offsets_span.end(), out_char_idx); - auto const out_row_idx = thrust::distance(d_out_offsets_span.begin(), next_row_ptr) - 1; - auto const str_char_offset = out_char_idx - d_out_offsets_span[out_row_idx]; - auto const in_row_idx = begin[out_row_idx]; - auto const in_char_offset = d_in_offsets[in_row_idx] + str_char_offset; - return d_in_chars[in_char_offset]; - }; - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(out_chars_bytes), - d_out_chars, - gather_chars_fn); + auto const d_strings = column_device_view::create(strings.parent(), stream); + auto out_chars_column = gather_chars(d_strings->begin(), + begin, + end, + d_out_offsets_span, + static_cast(total_bytes), + stream, + mr); return make_strings_column(output_count, std::move(out_offsets_column), diff --git a/cpp/include/cudf/strings/detail/strings_column_factories.cuh b/cpp/include/cudf/strings/detail/strings_column_factories.cuh index 8e843c555c5..932f7eb0926 100644 --- a/cpp/include/cudf/strings/detail/strings_column_factories.cuh +++ b/cpp/include/cudf/strings/detail/strings_column_factories.cuh @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -27,6 +28,7 @@ #include #include +#include #include #include @@ -34,7 +36,27 @@ namespace cudf { namespace strings { namespace detail { -// Create a strings-type column from iterators of pointer/size pairs +/** + * @brief Average string byte-length threshold for deciding character-level + * vs. row-level parallel algorithm. + * + * This value was determined by running the factory_benchmark against different + * string lengths and observing the point where the performance is faster for + * long strings. + */ +constexpr size_type FACTORY_BYTES_PER_ROW_THRESHOLD = 64; + +/** + * @brief Create a strings-type column from iterators of pointer/size pairs + * + * @tparam IndexPairIterator iterator over type `pair` values + * + * @param begin First string row (inclusive) + * @param end Last string row (exclusive) + * @param stream CUDA stream used for device memory operations + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column + */ template std::unique_ptr make_strings_column(IndexPairIterator begin, IndexPairIterator end, @@ -51,7 +73,7 @@ std::unique_ptr make_strings_column(IndexPairIterator begin, auto size_checker = [] __device__(string_index_pair const& item) { return (item.first != nullptr) ? item.second : 0; }; - size_t bytes = thrust::transform_reduce( + size_t const bytes = thrust::transform_reduce( rmm::exec_policy(stream), begin, end, size_checker, 0, thrust::plus()); CUDF_EXPECTS(bytes < static_cast(std::numeric_limits::max()), "total size of strings is too large for cudf column"); @@ -65,26 +87,49 @@ std::unique_ptr make_strings_column(IndexPairIterator begin, offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); // create null mask - auto validator = [] __device__(string_index_pair const item) { return item.first != nullptr; }; - auto new_nulls = cudf::detail::valid_if(begin, end, validator, stream, mr); - auto null_count = new_nulls.second; + auto validator = [] __device__(string_index_pair const item) { return item.first != nullptr; }; + auto new_nulls = cudf::detail::valid_if(begin, end, validator, stream, mr); + auto const null_count = new_nulls.second; auto null_mask = (null_count > 0) ? std::move(new_nulls.first) : rmm::device_buffer{0, stream, mr}; // build chars column - auto chars_column = - strings::detail::create_chars_child_column(strings_count, null_count, bytes, stream, mr); - auto d_chars = chars_column->mutable_view().template data(); - auto copy_chars = [d_chars] __device__(auto item) { - string_index_pair str = thrust::get<0>(item); - size_type offset = thrust::get<1>(item); - if (str.first != nullptr) memcpy(d_chars + offset, str.first, str.second); - }; - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_zip_iterator( - thrust::make_tuple(begin, offsets_column->view().template begin())), - strings_count, - copy_chars); + std::unique_ptr chars_column = [&] { + // use a character-parallel kernel for long string lengths + auto const avg_bytes_per_row = bytes / std::max(strings_count - null_count, 1); + if (avg_bytes_per_row > FACTORY_BYTES_PER_ROW_THRESHOLD) { + auto const d_offsets = + device_span{offsets_column->view().template data(), + static_cast(offsets_column->size())}; + auto const str_begin = thrust::make_transform_iterator(begin, [] __device__(auto ip) { + return string_view{ip.first, ip.second}; + }); + + return gather_chars(str_begin, + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(strings_count), + d_offsets, + static_cast(bytes), + stream, + mr); + } else { + // this approach is 2-3x faster for a large number of smaller string lengths + auto chars_column = + strings::detail::create_chars_child_column(strings_count, null_count, bytes, stream, mr); + auto d_chars = chars_column->mutable_view().template data(); + auto copy_chars = [d_chars] __device__(auto item) { + string_index_pair const str = thrust::get<0>(item); + size_type const offset = thrust::get<1>(item); + if (str.first != nullptr) memcpy(d_chars + offset, str.first, str.second); + }; + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_zip_iterator(thrust::make_tuple( + begin, offsets_column->view().template begin())), + strings_count, + copy_chars); + return chars_column; + } + }(); return make_strings_column(strings_count, std::move(offsets_column), @@ -95,7 +140,22 @@ std::unique_ptr make_strings_column(IndexPairIterator begin, mr); } -// Create a strings-type column from iterators to chars, offsets, and bitmask. +/** + * @brief Create a strings-type column from iterators to chars, offsets, and bitmask. + * + * @tparam CharIterator iterator over character bytes (int8) + * @tparam OffsetIterator iterator over offset values (size_type) + * + * @param chars_begin First character byte (inclusive) + * @param chars_end Last character byte (exclusive) + * @param offset_begin First offset value (inclusive) + * @param offset_end Last offset value (exclusive) + * @param null_count Number of null rows + * @param null_mask The validity bitmask in Arrow format + * @param stream CUDA stream used for device memory operations + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column + */ template std::unique_ptr make_strings_column(CharIterator chars_begin, CharIterator chars_end, diff --git a/cpp/include/cudf/utilities/traits.hpp b/cpp/include/cudf/utilities/traits.hpp index e045476ea77..1e0d45d081d 100644 --- a/cpp/include/cudf/utilities/traits.hpp +++ b/cpp/include/cudf/utilities/traits.hpp @@ -224,6 +224,18 @@ constexpr inline bool is_unsigned(data_type type) return cudf::type_dispatcher(type, is_unsigned_impl{}); } +/** + * @brief Indicates whether the `Iterator` value type is unsigned. + * + * @tparam Iterator The type to verify + * @return true if the iterator's value type is unsigned + */ +template +constexpr inline bool is_signed_iterator() +{ + return std::is_signed::value_type>::value; +} + /** * @brief Indicates whether the type `T` is a floating point type. * diff --git a/cpp/tests/strings/factories_test.cu b/cpp/tests/strings/factories_test.cu index f904c404251..bd463a7ab0d 100644 --- a/cpp/tests/strings/factories_test.cu +++ b/cpp/tests/strings/factories_test.cu @@ -19,12 +19,18 @@ #include #include #include +#include #include #include #include #include #include +#include + +#include +#include + #include #include @@ -198,3 +204,31 @@ TEST_F(StringsFactoriesTest, CreateOffsets) } } } + +namespace { +using string_pair = thrust::pair; +struct string_view_to_pair { + __device__ string_pair operator()(thrust::pair const& p) + { + return (p.second) ? string_pair{p.first.data(), p.first.size_bytes()} : string_pair{nullptr, 0}; + } +}; +} // namespace + +TEST_F(StringsFactoriesTest, StringPairWithNullsAndEmpty) +{ + cudf::test::strings_column_wrapper data( + {"", "this", "is", "", "a", "", "column", "of", "strings", "", ""}, + {0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1}); + + auto d_column = cudf::column_device_view::create(data); + rmm::device_vector pairs(d_column->size()); + thrust::transform(thrust::device, + d_column->pair_begin(), + d_column->pair_end(), + pairs.data(), + string_view_to_pair{}); + + auto result = cudf::make_strings_column(pairs); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), data); +} From 951b455b14a37efcbffc38638ab0b89d787d5b59 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Wed, 17 Mar 2021 20:29:36 -0700 Subject: [PATCH 5/8] Adds `list.take`, python binding for `cudf::lists::segmented_gather` (#7591) Closes #7465 Implements `ListColumn.list.take` based on `cudf::lists:segmented_gather`. Gather elements inside each list based on the provided positions. Example: ```python >>> s = cudf.Series([[1, 2, 3], [4, 5]]) >>> s 0 [1, 2, 3] 1 [4, 5] dtype: list >>> s.list.take([[2, 1], [1, 0]]) 0 [3, 2] 1 [5, 4] dtype: list ``` Authors: - Michael Wang (@isVoid) Approvers: - Keith Kraus (@kkraus14) URL: https://github.com/rapidsai/cudf/pull/7591 --- python/cudf/cudf/_lib/copying.pyx | 25 ++++++++- python/cudf/cudf/_lib/cpp/lists/gather.pxd | 13 +++++ python/cudf/cudf/core/column/lists.py | 61 +++++++++++++++++++++- python/cudf/cudf/tests/test_list.py | 47 +++++++++++++++++ 4 files changed, 143 insertions(+), 3 deletions(-) create mode 100644 python/cudf/cudf/_lib/cpp/lists/gather.pxd diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx index ad798a73ed2..e5501428624 100644 --- a/python/cudf/cudf/_lib/copying.pyx +++ b/python/cudf/cudf/_lib/copying.pyx @@ -3,7 +3,7 @@ import pandas as pd from libcpp cimport bool -from libcpp.memory cimport make_unique, unique_ptr +from libcpp.memory cimport make_unique, unique_ptr, shared_ptr, make_shared from libcpp.vector cimport vector from libcpp.utility cimport move from libc.stdint cimport int32_t, int64_t @@ -24,6 +24,10 @@ from cudf._lib.cpp.scalar.scalar cimport scalar from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport size_type +from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view +from cudf._lib.cpp.lists.gather cimport ( + segmented_gather as cpp_segmented_gather +) cimport cudf._lib.cpp.copying as cpp_copying # workaround for https://github.com/cython/cython/issues/3885 @@ -704,3 +708,22 @@ def sample(Table input, size_type n, else input._index_names ) ) + + +def segmented_gather(Column source_column, Column gather_map): + cdef shared_ptr[lists_column_view] source_LCV = ( + make_shared[lists_column_view](source_column.view()) + ) + cdef shared_ptr[lists_column_view] gather_map_LCV = ( + make_shared[lists_column_view](gather_map.view()) + ) + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_segmented_gather( + source_LCV.get()[0], gather_map_LCV.get()[0]) + ) + + result = Column.from_unique_ptr(move(c_result)) + return result diff --git a/python/cudf/cudf/_lib/cpp/lists/gather.pxd b/python/cudf/cudf/_lib/cpp/lists/gather.pxd new file mode 100644 index 00000000000..ea664eee82e --- /dev/null +++ b/python/cudf/cudf/_lib/cpp/lists/gather.pxd @@ -0,0 +1,13 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr + +from cudf._lib.cpp.column.column cimport column +from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view + + +cdef extern from "cudf/lists/gather.hpp" namespace "cudf::lists" nogil: + cdef unique_ptr[column] segmented_gather( + const lists_column_view source_column, + const lists_column_view gather_map_list + ) except + diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index a60fe627acb..1d3f73822a9 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -2,14 +2,16 @@ import pickle +import numpy as np import pyarrow as pa import cudf +from cudf._lib.copying import segmented_gather from cudf._lib.lists import count_elements from cudf.core.buffer import Buffer -from cudf.core.column import ColumnBase, column +from cudf.core.column import ColumnBase, as_column, column from cudf.core.column.methods import ColumnMethodsMixin -from cudf.utils.dtypes import is_list_dtype +from cudf.utils.dtypes import is_list_dtype, is_numerical_dtype class ListColumn(ColumnBase): @@ -228,3 +230,58 @@ def len(self): dtype: int32 """ return self._return_or_inplace(count_elements(self._column)) + + def take(self, lists_indices): + """ + Collect list elements based on given indices. + + Parameters + ---------- + lists_indices: List type arrays + Specifies what to collect from each row + + Returns + ------- + ListColumn + + Examples + -------- + >>> s = cudf.Series([[1, 2, 3], None, [4, 5]]) + >>> s + 0 [1, 2, 3] + 1 None + 2 [4, 5] + dtype: list + >>> s.list.take([[0, 1], [], []]) + 0 [1, 2] + 1 None + 2 [] + dtype: list + """ + + lists_indices_col = as_column(lists_indices) + if not isinstance(lists_indices_col, ListColumn): + raise ValueError("lists_indices should be list type array.") + if not lists_indices_col.size == self._column.size: + raise ValueError( + "lists_indices and list column is of different " "size." + ) + if not is_numerical_dtype( + lists_indices_col.children[1].dtype + ) or not np.issubdtype( + lists_indices_col.children[1].dtype, np.integer + ): + raise TypeError( + "lists_indices should be column of values of index types." + ) + + try: + res = self._return_or_inplace( + segmented_gather(self._column, lists_indices_col) + ) + except RuntimeError as e: + if "contains nulls" in str(e): + raise ValueError("lists_indices contains null.") from e + raise + else: + return res diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py index 195d8749ec6..33812cfa7a7 100644 --- a/python/cudf/cudf/tests/test_list.py +++ b/python/cudf/cudf/tests/test_list.py @@ -112,3 +112,50 @@ def test_len(data): got = gsr.list.len() assert_eq(expect, got, check_dtype=False) + + +@pytest.mark.parametrize( + ("data", "idx"), + [ + ([[1, 2, 3], [3, 4, 5], [4, 5, 6]], [[0, 1], [2], [1, 2]]), + ([[1, 2, 3], [3, 4, 5], [4, 5, 6]], [[1, 2, 0], [1, 0, 2], [0, 1, 2]]), + ([[1, 2, 3], []], [[0, 1], []]), + ([[1, 2, 3], [None]], [[0, 1], []]), + ([[1, None, 3], None], [[0, 1], []]), + ], +) +def test_take(data, idx): + ps = pd.Series(data) + gs = cudf.from_pandas(ps) + + expected = pd.Series(zip(ps, idx)).map( + lambda x: [x[0][i] for i in x[1]] if x[0] is not None else None + ) + got = gs.list.take(idx) + assert_eq(expected, got) + + +@pytest.mark.parametrize( + ("invalid", "exception"), + [ + ([[0]], pytest.raises(ValueError, match="different size")), + ([1, 2, 3, 4], pytest.raises(ValueError, match="should be list type")), + ( + [["a", "b"], ["c"]], + pytest.raises( + TypeError, match="should be column of values of index types" + ), + ), + ( + [[[1], [0]], [[0]]], + pytest.raises( + TypeError, match="should be column of values of index types" + ), + ), + ([[0, 1], None], pytest.raises(ValueError, match="contains null")), + ], +) +def test_take_invalid(invalid, exception): + gs = cudf.Series([[0, 1], [2, 3]]) + with exception: + gs.list.take(invalid) From 873955e08ae47de118b73ba9c52dfaa3062fce81 Mon Sep 17 00:00:00 2001 From: Conor Hoekstra <36027403+codereport@users.noreply.github.com> Date: Thu, 18 Mar 2021 09:32:06 -0400 Subject: [PATCH 6/8] Add explicit fixed_point merge test (#7635) While confirming that libcudf does indeed support `cudf::merge` with `fixed_point` ended up writing an explicit test that there is no reason not to just add as a unit test. Authors: - Conor Hoekstra (@codereport) Approvers: - Karthikeyan (@karthikeyann) - David (@davidwendt) URL: https://github.com/rapidsai/cudf/pull/7635 --- cpp/tests/merge/merge_test.cpp | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/cpp/tests/merge/merge_test.cpp b/cpp/tests/merge/merge_test.cpp index fa3bde8cb52..451fa82d5a3 100644 --- a/cpp/tests/merge/merge_test.cpp +++ b/cpp/tests/merge/merge_test.cpp @@ -729,4 +729,36 @@ TEST_F(MergeTest, KeysWithNulls) } } +template +struct FixedPointTestBothReps : public cudf::test::BaseFixture { +}; + +template +using fp_wrapper = cudf::test::fixed_point_column_wrapper; + +TYPED_TEST_CASE(FixedPointTestBothReps, cudf::test::FixedPointTypes); + +TYPED_TEST(FixedPointTestBothReps, FixedPointMerge) +{ + using namespace numeric; + using decimalXX = TypeParam; + using RepType = cudf::device_storage_type_t; + + auto const a = fp_wrapper{{4, 22, 33, 44, 55}, scale_type{-1}}; + auto const b = fp_wrapper{{5, 7, 10}, scale_type{-1}}; + auto const table_a = cudf::table_view(std::vector{a}); + auto const table_b = cudf::table_view(std::vector{b}); + auto const tables = std::vector{table_a, table_b}; + + auto const key_cols = std::vector{0}; + auto const order = std::vector{cudf::order::ASCENDING}; + + auto const exp = fp_wrapper{{4, 5, 7, 10, 22, 33, 44, 55}, scale_type{-1}}; + auto const exp_table = cudf::table_view(std::vector{exp}); + + auto const result = cudf::merge(tables, key_cols, order); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(exp_table.column(0), result->view().column(0)); +} + CUDF_TEST_PROGRAM_MAIN() From d6cc6943411c00d01368f8a1a53567997ffaff39 Mon Sep 17 00:00:00 2001 From: "Robert (Bobby) Evans" Date: Thu, 18 Mar 2021 08:55:43 -0500 Subject: [PATCH 7/8] Add in JNI support for table partition (#7637) This adds in support for partition. Which will partition a table based off of a partition map. Authors: - Robert (Bobby) Evans (@revans2) Approvers: - Jason Lowe (@jlowe) URL: https://github.com/rapidsai/cudf/pull/7637 --- java/src/main/java/ai/rapids/cudf/Table.java | 21 ++++++++++++ java/src/main/native/src/TableJni.cpp | 33 +++++++++++++++++++ .../test/java/ai/rapids/cudf/TableTest.java | 17 ++++++++++ 3 files changed, 71 insertions(+) diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index d0e59fdc105..0dc529d423f 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -183,6 +183,9 @@ public long getDeviceMemorySize() { private static native ContiguousTable[] contiguousSplit(long inputTable, int[] indices); + private static native long[] partition(long inputTable, long partitionView, + int numberOfPartitions, int[] outputOffsets); + private static native long[] hashPartition(long inputTable, int[] columnsToHash, int hashTypeId, @@ -1257,6 +1260,24 @@ public Table repeat(ColumnVector counts, boolean checkCount) { return new Table(repeatColumnCount(this.nativeHandle, counts.getNativeView(), checkCount)); } + /** + * Partition this table using the mapping in partitionMap. partitionMap must be an integer + * column. The number of rows in partitionMap must be the same as this table. Each row + * in the map will indicate which partition the rows in the table belong to. + * @param partitionMap the partitions for each row. + * @param numberOfPartitions number of partitions + * @return {@link PartitionedTable} Table that exposes a limited functionality of the + * {@link Table} class + */ + public PartitionedTable partition(ColumnView partitionMap, int numberOfPartitions) { + int[] partitionOffsets = new int[numberOfPartitions]; + return new PartitionedTable(new Table(partition( + getNativeView(), + partitionMap.getNativeView(), + partitionOffsets.length, + partitionOffsets)), partitionOffsets); + } + /** * Find smallest indices in a sorted table where values should be inserted to maintain order. *
diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp
index 02385a453d0..81b9882104f 100644
--- a/java/src/main/native/src/TableJni.cpp
+++ b/java/src/main/native/src/TableJni.cpp
@@ -1614,6 +1614,39 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_concatenate(JNIEnv *env,
   CATCH_STD(env, NULL);
 }
 
+JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_partition(JNIEnv *env, jclass,
+                                                                 jlong input_table,
+                                                                 jlong partition_column,
+                                                                 jint number_of_partitions,
+                                                                 jintArray output_offsets) {
+
+  JNI_NULL_CHECK(env, input_table, "input table is null", NULL);
+  JNI_NULL_CHECK(env, partition_column, "partition_column is null", NULL);
+  JNI_NULL_CHECK(env, output_offsets, "output_offsets is null", NULL);
+  JNI_ARG_CHECK(env, number_of_partitions > 0, "number_of_partitions is zero", NULL);
+
+  try {
+    cudf::jni::auto_set_device(env);
+    cudf::table_view *n_input_table = reinterpret_cast(input_table);
+    cudf::column_view *n_part_column = reinterpret_cast(partition_column);
+    cudf::jni::native_jintArray n_output_offsets(env, output_offsets);
+
+    auto result = cudf::partition(*n_input_table,
+                                  *n_part_column,
+                                  number_of_partitions);
+
+    for (size_t i = 0; i < result.second.size() - 1; i++) {
+      // for what ever reason partition returns the length of the result at then
+      // end and hash partition/round robin do not, so skip the last entry for
+      // consistency
+      n_output_offsets[i] = result.second[i];
+    }
+
+    return cudf::jni::convert_table_for_return(env, result.first);
+  }
+  CATCH_STD(env, NULL);
+}
+
 JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_hashPartition(JNIEnv *env, jclass,
                                                                      jlong input_table,
                                                                      jintArray columns_to_hash,
diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java
index c2e28e1cad8..c075f074068 100644
--- a/java/src/test/java/ai/rapids/cudf/TableTest.java
+++ b/java/src/test/java/ai/rapids/cudf/TableTest.java
@@ -1773,6 +1773,23 @@ void testPartStability() {
     }
   }
 
+  @Test
+  void testPartition() {
+    try (Table t = new Table.TestBuilder()
+        .column(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
+        .build();
+         ColumnVector parts = ColumnVector
+             .fromInts(1, 2, 1, 2, 1, 2, 1, 2, 1, 2);
+         PartitionedTable pt = t.partition(parts, 3);
+         Table expected = new Table.TestBuilder()
+             .column(1, 3, 5, 7, 9, 2, 4, 6, 8, 10)
+             .build()) {
+      int[] partCutoffs = pt.getPartitions();
+      assertArrayEquals(new int[]{0, 0, 5}, partCutoffs);
+      assertTablesAreEqual(expected, pt.getTable());
+    }
+  }
+
   @Test
   void testIdentityHashPartition() {
     final int count = 1024 * 1024;

From 472305172bd373823db0976e1b0f53b3e7cc11f1 Mon Sep 17 00:00:00 2001
From: David <45795991+davidwendt@users.noreply.github.com>
Date: Thu, 18 Mar 2021 10:33:55 -0400
Subject: [PATCH 8/8] Add gbenchmarks for string substrings functions (#7603)

Reference #5698
This creates a gbenchmark for the 4 variations of `cudf::strings::slice_strings()` API. The benchmarks measures various sized rows as well as strings lengths.
This PR also includes changes to `substring.cu` implementation cleaning up the code and using the more efficient `make_strings_children`. This change improved performance for all 4 functions on average by 2-3x.

Authors:
  - David (@davidwendt)

Approvers:
  - Nghia Truong (@ttnghia)
  - @nvdbaranec
  - Keith Kraus (@kkraus14)

URL: https://github.com/rapidsai/cudf/pull/7603
---
 cpp/benchmarks/CMakeLists.txt                 |   1 +
 cpp/benchmarks/string/substring_benchmark.cpp |  93 ++++++++++++
 cpp/src/strings/substring.cu                  | 136 ++++++++----------
 3 files changed, 155 insertions(+), 75 deletions(-)
 create mode 100644 cpp/benchmarks/string/substring_benchmark.cpp

diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index e63ea38a31b..682f1ac5fca 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -190,4 +190,5 @@ ConfigureBench(STRINGS_BENCH
   string/find_benchmark.cpp
   string/replace_benchmark.cpp
   string/split_benchmark.cpp
+  string/substring_benchmark.cpp
   string/url_decode_benchmark.cpp)
diff --git a/cpp/benchmarks/string/substring_benchmark.cpp b/cpp/benchmarks/string/substring_benchmark.cpp
new file mode 100644
index 00000000000..d47c42e45be
--- /dev/null
+++ b/cpp/benchmarks/string/substring_benchmark.cpp
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "string_bench_args.hpp"
+
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+#include 
+
+class StringSubstring : public cudf::benchmark {
+};
+
+enum substring_type { position, multi_position, delimiter, multi_delimiter };
+
+static void BM_substring(benchmark::State& state, substring_type rt)
+{
+  cudf::size_type const n_rows{static_cast(state.range(0))};
+  cudf::size_type const max_str_length{static_cast(state.range(1))};
+  data_profile table_profile;
+  table_profile.set_distribution_params(
+    cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length);
+  auto const table =
+    create_random_table({cudf::type_id::STRING}, 1, row_count{n_rows}, table_profile);
+  cudf::strings_column_view input(table->view().column(0));
+  auto starts_itr = thrust::constant_iterator(1);
+  auto stops_itr  = thrust::constant_iterator(max_str_length / 2);
+  cudf::test::fixed_width_column_wrapper starts(starts_itr, starts_itr + n_rows);
+  cudf::test::fixed_width_column_wrapper stops(stops_itr, stops_itr + n_rows);
+  auto delim_itr = thrust::constant_iterator(" ");
+  cudf::test::strings_column_wrapper delimiters(delim_itr, delim_itr + n_rows);
+
+  for (auto _ : state) {
+    cuda_event_timer raii(state, true, 0);
+    switch (rt) {
+      case position: cudf::strings::slice_strings(input, 1, max_str_length / 2); break;
+      case multi_position: cudf::strings::slice_strings(input, starts, stops); break;
+      case delimiter: cudf::strings::slice_strings(input, std::string{" "}, 1); break;
+      case multi_delimiter:
+        cudf::strings::slice_strings(input, cudf::strings_column_view(delimiters), 1);
+        break;
+    }
+  }
+
+  state.SetBytesProcessed(state.iterations() * input.chars_size());
+}
+
+static void generate_bench_args(benchmark::internal::Benchmark* b)
+{
+  int const min_rows   = 1 << 12;
+  int const max_rows   = 1 << 24;
+  int const row_mult   = 8;
+  int const min_rowlen = 1 << 5;
+  int const max_rowlen = 1 << 13;
+  int const len_mult   = 4;
+  generate_string_bench_args(b, min_rows, max_rows, row_mult, min_rowlen, max_rowlen, len_mult);
+}
+
+#define STRINGS_BENCHMARK_DEFINE(name)                                  \
+  BENCHMARK_DEFINE_F(StringSubstring, name)                             \
+  (::benchmark::State & st) { BM_substring(st, substring_type::name); } \
+  BENCHMARK_REGISTER_F(StringSubstring, name)                           \
+    ->Apply(generate_bench_args)                                        \
+    ->UseManualTime()                                                   \
+    ->Unit(benchmark::kMillisecond);
+
+STRINGS_BENCHMARK_DEFINE(position)
+STRINGS_BENCHMARK_DEFINE(multi_position)
+STRINGS_BENCHMARK_DEFINE(delimiter)
+STRINGS_BENCHMARK_DEFINE(multi_delimiter)
diff --git a/cpp/src/strings/substring.cu b/cpp/src/strings/substring.cu
index 68080c0eb89..f712b0cb6aa 100644
--- a/cpp/src/strings/substring.cu
+++ b/cpp/src/strings/substring.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -43,17 +43,25 @@ namespace {
  * using the provided start, stop, and step parameters.
  */
 struct substring_fn {
-  const column_device_view d_column;
-  numeric_scalar_device_view d_start, d_stop, d_step;
-  const int32_t* d_offsets{};
+  column_device_view const d_column;
+  numeric_scalar_device_view const d_start;
+  numeric_scalar_device_view const d_stop;
+  numeric_scalar_device_view const d_step;
+  int32_t* d_offsets{};
   char* d_chars{};
 
-  __device__ cudf::size_type operator()(size_type idx)
+  __device__ void operator()(size_type idx)
   {
-    if (d_column.is_null(idx)) return 0;  // null string
-    string_view d_str = d_column.template element(idx);
+    if (d_column.is_null(idx)) {
+      if (!d_chars) d_offsets[idx] = 0;
+      return;
+    }
+    auto const d_str  = d_column.template element(idx);
     auto const length = d_str.length();
-    if (length == 0) return 0;  // empty string
+    if (length == 0) {
+      if (!d_chars) d_offsets[idx] = 0;
+      return;
+    }
     size_type const step = d_step.is_valid() ? d_step.value() : 1;
     auto const begin     = [&] {  // always inclusive
       // when invalid, default depends on step
@@ -88,7 +96,7 @@ struct substring_fn {
       if (d_buffer) d_buffer += from_char_utf8(*itr, d_buffer);
       itr += step;
     }
-    return bytes;
+    if (!d_chars) d_offsets[idx] = bytes;
   }
 };
 
@@ -103,42 +111,26 @@ std::unique_ptr slice_strings(
   rmm::cuda_stream_view stream           = rmm::cuda_stream_default,
   rmm::mr::device_memory_resource* mr    = rmm::mr::get_current_device_resource())
 {
-  size_type strings_count = strings.size();
-  if (strings_count == 0) return make_empty_strings_column(stream, mr);
+  if (strings.is_empty()) return make_empty_strings_column(stream, mr);
 
   if (step.is_valid()) CUDF_EXPECTS(step.value(stream) != 0, "Step parameter must not be 0");
 
-  auto strings_column = column_device_view::create(strings.parent(), stream);
-  auto d_column       = *strings_column;
-  auto d_start        = get_scalar_device_view(const_cast&>(start));
-  auto d_stop         = get_scalar_device_view(const_cast&>(stop));
-  auto d_step         = get_scalar_device_view(const_cast&>(step));
-
-  // copy the null mask
-  rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr);
-
-  // build offsets column
-  auto offsets_transformer_itr = thrust::make_transform_iterator(
-    thrust::make_counting_iterator(0), substring_fn{d_column, d_start, d_stop, d_step});
-  auto offsets_column = make_offsets_child_column(
-    offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr);
-  auto d_new_offsets = offsets_column->view().data();
-
-  // build chars column
-  auto bytes = cudf::detail::get_value(offsets_column->view(), strings_count, stream);
-  auto chars_column = strings::detail::create_chars_child_column(
-    strings_count, strings.null_count(), bytes, stream, mr);
-  auto d_chars = chars_column->mutable_view().data();
-  thrust::for_each_n(rmm::exec_policy(stream),
-                     thrust::make_counting_iterator(0),
-                     strings_count,
-                     substring_fn{d_column, d_start, d_stop, d_step, d_new_offsets, d_chars});
+  auto const d_column = column_device_view::create(strings.parent(), stream);
+  auto const d_start  = get_scalar_device_view(const_cast&>(start));
+  auto const d_stop   = get_scalar_device_view(const_cast&>(stop));
+  auto const d_step   = get_scalar_device_view(const_cast&>(step));
 
-  return make_strings_column(strings_count,
-                             std::move(offsets_column),
-                             std::move(chars_column),
+  auto children = make_strings_children(substring_fn{*d_column, d_start, d_stop, d_step},
+                                        strings.size(),
+                                        strings.null_count(),
+                                        stream,
+                                        mr);
+
+  return make_strings_column(strings.size(),
+                             std::move(children.first),
+                             std::move(children.second),
                              strings.null_count(),
-                             std::move(null_mask),
+                             cudf::detail::copy_bitmask(strings.parent(), stream, mr),
                              stream,
                              mr);
 }
@@ -166,25 +158,33 @@ namespace {
  * This both calculates the output size and executes the substring.
  */
 struct substring_from_fn {
-  const column_device_view d_column;
-  const cudf::detail::input_indexalator starts;
-  const cudf::detail::input_indexalator stops;
-  const int32_t* d_offsets{};
+  column_device_view const d_column;
+  cudf::detail::input_indexalator const starts;
+  cudf::detail::input_indexalator const stops;
+  int32_t* d_offsets{};
   char* d_chars{};
 
-  __device__ size_type operator()(size_type idx)
+  __device__ void operator()(size_type idx)
   {
-    if (d_column.is_null(idx)) return 0;  // null string
-    string_view d_str = d_column.template element(idx);
+    if (d_column.is_null(idx)) {
+      if (!d_chars) d_offsets[idx] = 0;
+      return;
+    }
+    auto const d_str  = d_column.template element(idx);
     auto const length = d_str.length();
     auto const start  = starts[idx];
-    if (start >= length) return 0;  // empty string
+    if (start >= length) {
+      if (!d_chars) d_offsets[idx] = 0;
+      return;
+    }
     auto const stop = stops[idx];
     auto const end  = (((stop < 0) || (stop > length)) ? length : stop);
 
-    string_view d_substr = d_str.substr(start, end - start);
-    if (d_chars) memcpy(d_chars + d_offsets[idx], d_substr.data(), d_substr.size_bytes());
-    return d_substr.size_bytes();
+    auto const d_substr = d_str.substr(start, end - start);
+    if (d_chars)
+      memcpy(d_chars + d_offsets[idx], d_substr.data(), d_substr.size_bytes());
+    else
+      d_offsets[idx] = d_substr.size_bytes();
   }
 };
 
@@ -212,32 +212,18 @@ std::unique_ptr compute_substrings_from_fn(column_device_view const& d_c
   auto strings_count = d_column.size();
 
   // Copy the null mask
-  rmm::device_buffer null_mask{0, stream, mr};
-  if (d_column.nullable())
-    null_mask = rmm::device_buffer(
-      d_column.null_mask(), cudf::bitmask_allocation_size_bytes(strings_count), stream, mr);
-
-  // Build offsets column
-  auto offsets_transformer_itr = thrust::make_transform_iterator(
-    thrust::make_counting_iterator(0), substring_from_fn{d_column, starts, stops});
-  auto offsets_column = cudf::strings::detail::make_offsets_child_column(
-    offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr);
-  auto d_new_offsets = offsets_column->view().data();
-
-  // Build chars column
-  auto bytes = cudf::detail::get_value(offsets_column->view(), strings_count, stream);
-  auto chars_column =
-    cudf::strings::detail::create_chars_child_column(strings_count, null_count, bytes, stream, mr);
-  auto chars_view = chars_column->mutable_view();
-  auto d_chars    = chars_view.template data();
-  thrust::for_each_n(rmm::exec_policy(stream),
-                     thrust::make_counting_iterator(0),
-                     strings_count,
-                     substring_from_fn{d_column, starts, stops, d_new_offsets, d_chars});
+  rmm::device_buffer null_mask =
+    !d_column.nullable()
+      ? rmm::device_buffer{0, stream, mr}
+      : rmm::device_buffer(
+          d_column.null_mask(), cudf::bitmask_allocation_size_bytes(strings_count), stream, mr);
+
+  auto children = make_strings_children(
+    substring_from_fn{d_column, starts, stops}, strings_count, null_count, stream, mr);
 
   return make_strings_column(strings_count,
-                             std::move(offsets_column),
-                             std::move(chars_column),
+                             std::move(children.first),
+                             std::move(children.second),
                              null_count,
                              std::move(null_mask),
                              stream,