From 3349764b9a0699bf96f7403895df88f7308647fb Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 17 Mar 2021 12:48:09 -0600 Subject: [PATCH 1/3] Refactor string conversion check (#7599) This addresses #7557. In summary: * Move `cudf::strings::is_integer()` code from `strings/chars_types.*` to `strings/convert/convert_integers.hpp/cu` * Move `cudf::strings::is_float()` code from `strings/chars_types.*` to `strings/convert/convert_floats.hpp/cu` * Remove `cudf::strings::all_integer()` and `cudf::strings::all_float()` Authors: - Nghia Truong (@ttnghia) Approvers: - GALI PREM SAGAR (@galipremsagar) - Jason Lowe (@jlowe) - Jake Hemstad (@jrhemstad) - David (@davidwendt) URL: https://github.com/rapidsai/cudf/pull/7599 --- .../cudf/strings/char_types/char_types.hpp | 78 +----------- .../cudf/strings/convert/convert_floats.hpp | 26 +++- .../cudf/strings/convert/convert_integers.hpp | 26 +++- cpp/src/strings/char_types/char_types.cu | 113 +----------------- cpp/src/strings/convert/convert_floats.cu | 41 ++++++- cpp/src/strings/convert/convert_integers.cu | 41 ++++++- cpp/tests/strings/chars_types_tests.cpp | 63 ---------- cpp/tests/strings/floats_tests.cpp | 35 ++++++ cpp/tests/strings/integers_tests.cu | 23 +++- java/src/main/native/src/ColumnViewJni.cpp | 1 - .../cudf/cudf/_lib/cpp/strings/char_types.pxd | 10 +- .../cpp/strings/convert/convert_floats.pxd | 6 +- .../cpp/strings/convert/convert_integers.pxd | 6 +- python/cudf/cudf/_lib/strings/char_types.pyx | 36 +----- .../_lib/strings/convert/convert_floats.pyx | 29 +++++ .../_lib/strings/convert/convert_integers.pyx | 29 +++++ python/cudf/cudf/core/column/string.py | 6 +- python/cudf/cudf/core/tools/datetimes.py | 4 +- 18 files changed, 265 insertions(+), 308 deletions(-) create mode 100644 python/cudf/cudf/_lib/strings/convert/convert_floats.pyx create mode 100644 python/cudf/cudf/_lib/strings/convert/convert_integers.pyx diff --git a/cpp/include/cudf/strings/char_types/char_types.hpp b/cpp/include/cudf/strings/char_types/char_types.hpp index 300722920f4..1f5b6241850 100644 --- a/cpp/include/cudf/strings/char_types/char_types.hpp +++ b/cpp/include/cudf/strings/char_types/char_types.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -146,82 +146,6 @@ std::unique_ptr filter_characters_of_type( string_character_types types_to_keep = string_character_types::ALL_TYPES, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); -/** - * @brief Returns a boolean column identifying strings in which all - * characters are valid for conversion to integers. - * - * The output row entry will be set to `true` if the corresponding string element - * has at least one character in [-+0-9]. - * - * @code{.pseudo} - * Example: - * s = ['123', '-456', '', 'A', '+7'] - * b = s.is_integer(s) - * b is [true, true, false, false, true] - * @endcode - * - * Any null row results in a null entry for that row in the output column. - * - * @param strings Strings instance for this operation. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New column of boolean results for each string. - */ -std::unique_ptr is_integer( - strings_column_view const& strings, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Returns `true` if all strings contain - * characters that are valid for conversion to integers. - * - * This function will return `true` if all string elements - * has at least one character in [-+0-9]. - * - * Any null entry or empty string will cause this function to return `false`. - * - * @param strings Strings instance for this operation. - * @return true if all string are valid - */ -bool all_integer(strings_column_view const& strings); - -/** - * @brief Returns a boolean column identifying strings in which all - * characters are valid for conversion to floats. - * - * The output row entry will be set to `true` if the corresponding string element - * has at least one character in [-+0-9eE.]. - * - * @code{.pseudo} - * Example: - * s = ['123', '-456', '', 'A', '+7', '8.9' '3.7e+5'] - * b = s.is_float(s) - * b is [true, true, false, false, true, true, true] - * @endcode - * - * Any null row results in a null entry for that row in the output column. - * - * @param strings Strings instance for this operation. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New column of boolean results for each string. - */ -std::unique_ptr is_float( - strings_column_view const& strings, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Returns `true` if all strings contain - * characters that are valid for conversion to floats. - * - * This function will return `true` if all string elements - * has at least one character in [-+0-9eE.]. - * - * Any null entry or empty string will cause this function to return `false`. - * - * @param strings Strings instance for this operation. - * @return true if all string are valid - */ -bool all_float(strings_column_view const& strings); - /** @} */ // end of doxygen group } // namespace strings } // namespace cudf diff --git a/cpp/include/cudf/strings/convert/convert_floats.hpp b/cpp/include/cudf/strings/convert/convert_floats.hpp index cb4746dbf40..d1e00b36f6f 100644 --- a/cpp/include/cudf/strings/convert/convert_floats.hpp +++ b/cpp/include/cudf/strings/convert/convert_floats.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -68,6 +68,30 @@ std::unique_ptr from_floats( column_view const& floats, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Returns a boolean column identifying strings in which all + * characters are valid for conversion to floats. + * + * The output row entry will be set to `true` if the corresponding string element + * has at least one character in [-+0-9eE.]. + * + * @code{.pseudo} + * Example: + * s = ['123', '-456', '', 'A', '+7', '8.9' '3.7e+5'] + * b = s.is_float(s) + * b is [true, true, false, false, true, true, true] + * @endcode + * + * Any null row results in a null entry for that row in the output column. + * + * @param strings Strings instance for this operation. + * @param mr Device memory resource used to allocate the returned column's device memory. + * @return New column of boolean results for each string. + */ +std::unique_ptr is_float( + strings_column_view const& strings, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** @} */ // end of doxygen group } // namespace strings } // namespace cudf diff --git a/cpp/include/cudf/strings/convert/convert_integers.hpp b/cpp/include/cudf/strings/convert/convert_integers.hpp index 8f42deb380d..1e2fa80b129 100644 --- a/cpp/include/cudf/strings/convert/convert_integers.hpp +++ b/cpp/include/cudf/strings/convert/convert_integers.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -73,6 +73,30 @@ std::unique_ptr from_integers( column_view const& integers, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Returns a boolean column identifying strings in which all + * characters are valid for conversion to integers. + * + * The output row entry will be set to `true` if the corresponding string element + * has at least one character in [-+0-9]. + * + * @code{.pseudo} + * Example: + * s = ['123', '-456', '', 'A', '+7'] + * b = s.is_integer(s) + * b is [true, true, false, false, true] + * @endcode + * + * Any null row results in a null entry for that row in the output column. + * + * @param strings Strings instance for this operation. + * @param mr Device memory resource used to allocate the returned column's device memory. + * @return New column of boolean results for each string. + */ +std::unique_ptr is_integer( + strings_column_view const& strings, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Returns a new integer numeric column parsing hexadecimal values from the * provided strings column. diff --git a/cpp/src/strings/char_types/char_types.cu b/cpp/src/strings/char_types/char_types.cu index 10496b89328..0b384ad0631 100644 --- a/cpp/src/strings/char_types/char_types.cu +++ b/cpp/src/strings/char_types/char_types.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -186,91 +186,6 @@ std::unique_ptr filter_characters_of_type(strings_column_view const& str mr); } -std::unique_ptr is_integer( - strings_column_view const& strings, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) -{ - auto strings_column = column_device_view::create(strings.parent(), stream); - auto d_column = *strings_column; - // create output column - auto results = make_numeric_column(data_type{type_id::BOOL8}, - strings.size(), - cudf::detail::copy_bitmask(strings.parent(), stream, mr), - strings.null_count(), - stream, - mr); - auto d_results = results->mutable_view().data(); - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings.size()), - d_results, - [d_column] __device__(size_type idx) { - if (d_column.is_null(idx)) return false; - return string::is_integer(d_column.element(idx)); - }); - results->set_null_count(strings.null_count()); - return results; -} - -bool all_integer(strings_column_view const& strings, rmm::cuda_stream_view stream) -{ - auto strings_column = column_device_view::create(strings.parent(), stream); - auto d_column = *strings_column; - auto transformer_itr = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), [d_column] __device__(size_type idx) { - if (d_column.is_null(idx)) return false; - return string::is_integer(d_column.element(idx)); - }); - return thrust::all_of(rmm::exec_policy(stream), - transformer_itr, - transformer_itr + strings.size(), - thrust::identity()); -} - -std::unique_ptr is_float( - strings_column_view const& strings, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) -{ - auto strings_column = column_device_view::create(strings.parent(), stream); - auto d_column = *strings_column; - // create output column - auto results = make_numeric_column(data_type{type_id::BOOL8}, - strings.size(), - cudf::detail::copy_bitmask(strings.parent(), stream, mr), - strings.null_count(), - stream, - mr); - auto d_results = results->mutable_view().data(); - // check strings for valid float chars - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings.size()), - d_results, - [d_column] __device__(size_type idx) { - if (d_column.is_null(idx)) return false; - return string::is_float(d_column.element(idx)); - }); - results->set_null_count(strings.null_count()); - return results; -} - -bool all_float(strings_column_view const& strings, rmm::cuda_stream_view stream) -{ - auto strings_column = column_device_view::create(strings.parent(), stream); - auto d_column = *strings_column; - auto transformer_itr = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), [d_column] __device__(size_type idx) { - if (d_column.is_null(idx)) return false; - return string::is_float(d_column.element(idx)); - }); - return thrust::all_of(rmm::exec_policy(stream), - transformer_itr, - transformer_itr + strings.size(), - thrust::identity()); -} - } // namespace detail // external API @@ -295,31 +210,5 @@ std::unique_ptr filter_characters_of_type(strings_column_view const& str strings, types_to_remove, replacement, types_to_keep, rmm::cuda_stream_default, mr); } -std::unique_ptr is_integer(strings_column_view const& strings, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::is_integer(strings, rmm::cuda_stream_default, mr); -} - -std::unique_ptr is_float(strings_column_view const& strings, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::is_float(strings, rmm::cuda_stream_default, mr); -} - -bool all_integer(strings_column_view const& strings) -{ - CUDF_FUNC_RANGE(); - return detail::all_integer(strings, rmm::cuda_stream_default); -} - -bool all_float(strings_column_view const& strings) -{ - CUDF_FUNC_RANGE(); - return detail::all_float(strings, rmm::cuda_stream_default); -} - } // namespace strings } // namespace cudf diff --git a/cpp/src/strings/convert/convert_floats.cu b/cpp/src/strings/convert/convert_floats.cu index 2bf65976986..b6d99efd51f 100644 --- a/cpp/src/strings/convert/convert_floats.cu +++ b/cpp/src/strings/convert/convert_floats.cu @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -536,12 +537,50 @@ std::unique_ptr from_floats(column_view const& floats, } // namespace detail // external API - std::unique_ptr from_floats(column_view const& floats, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); return detail::from_floats(floats, rmm::cuda_stream_default, mr); } +namespace detail { +std::unique_ptr is_float( + strings_column_view const& strings, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +{ + auto strings_column = column_device_view::create(strings.parent(), stream); + auto d_column = *strings_column; + // create output column + auto results = make_numeric_column(data_type{type_id::BOOL8}, + strings.size(), + cudf::detail::copy_bitmask(strings.parent(), stream, mr), + strings.null_count(), + stream, + mr); + auto d_results = results->mutable_view().data(); + // check strings for valid float chars + thrust::transform(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(strings.size()), + d_results, + [d_column] __device__(size_type idx) { + if (d_column.is_null(idx)) return false; + return string::is_float(d_column.element(idx)); + }); + results->set_null_count(strings.null_count()); + return results; +} + +} // namespace detail + +// external API +std::unique_ptr is_float(strings_column_view const& strings, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::is_float(strings, rmm::cuda_stream_default, mr); +} + } // namespace strings } // namespace cudf diff --git a/cpp/src/strings/convert/convert_integers.cu b/cpp/src/strings/convert/convert_integers.cu index 112550fc25b..5c5032b5c87 100644 --- a/cpp/src/strings/convert/convert_integers.cu +++ b/cpp/src/strings/convert/convert_integers.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -245,7 +246,6 @@ std::unique_ptr from_integers(column_view const& integers, } // namespace detail // external API - std::unique_ptr from_integers(column_view const& integers, rmm::mr::device_memory_resource* mr) { @@ -253,5 +253,42 @@ std::unique_ptr from_integers(column_view const& integers, return detail::from_integers(integers, rmm::cuda_stream_default, mr); } +namespace detail { +std::unique_ptr is_integer( + strings_column_view const& strings, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +{ + auto strings_column = column_device_view::create(strings.parent(), stream); + auto d_column = *strings_column; + // create output column + auto results = make_numeric_column(data_type{type_id::BOOL8}, + strings.size(), + cudf::detail::copy_bitmask(strings.parent(), stream, mr), + strings.null_count(), + stream, + mr); + auto d_results = results->mutable_view().data(); + thrust::transform(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(strings.size()), + d_results, + [d_column] __device__(size_type idx) { + if (d_column.is_null(idx)) return false; + return string::is_integer(d_column.element(idx)); + }); + results->set_null_count(strings.null_count()); + return results; +} +} // namespace detail + +// external API +std::unique_ptr is_integer(strings_column_view const& strings, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::is_integer(strings, rmm::cuda_stream_default, mr); +} + } // namespace strings } // namespace cudf diff --git a/cpp/tests/strings/chars_types_tests.cpp b/cpp/tests/strings/chars_types_tests.cpp index 803a9b01b07..702329edaba 100644 --- a/cpp/tests/strings/chars_types_tests.cpp +++ b/cpp/tests/strings/chars_types_tests.cpp @@ -14,7 +14,6 @@ * limitations under the License. */ -#include #include #include #include @@ -228,54 +227,6 @@ TEST_F(StringsCharsTest, Numerics) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } -TEST_F(StringsCharsTest, Integers) -{ - cudf::test::strings_column_wrapper strings1( - {"+175", "-34", "9.8", "17+2", "+-14", "1234567890", "67de", "", "1e10", "-", "++", ""}); - auto results = cudf::strings::is_integer(cudf::strings_column_view(strings1)); - cudf::test::fixed_width_column_wrapper expected1({1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected1); - EXPECT_FALSE(cudf::strings::all_integer(cudf::strings_column_view(strings1))); - - cudf::test::strings_column_wrapper strings2( - {"0", "+0", "-0", "1234567890", "-27341132", "+012", "023", "-045"}); - results = cudf::strings::is_integer(cudf::strings_column_view(strings2)); - cudf::test::fixed_width_column_wrapper expected2({1, 1, 1, 1, 1, 1, 1, 1}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected2); - EXPECT_TRUE(cudf::strings::all_integer(cudf::strings_column_view(strings2))); -} - -TEST_F(StringsCharsTest, Floats) -{ - cudf::test::strings_column_wrapper strings1({"+175", - "-9.8", - "7+2", - "+-4", - "6.7e17", - "-1.2e-5", - "e", - ".e", - "1.e+-2", - "00.00", - "1.0e+1.0", - "1.2.3", - "+", - "--", - ""}); - auto results = cudf::strings::is_float(cudf::strings_column_view(strings1)); - cudf::test::fixed_width_column_wrapper expected1( - {1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected1); - EXPECT_FALSE(cudf::strings::all_float(cudf::strings_column_view(strings1))); - - cudf::test::strings_column_wrapper strings2( - {"+175", "-34", "9.8", "1234567890", "6.7e17", "-917.2e5"}); - results = cudf::strings::is_float(cudf::strings_column_view(strings2)); - cudf::test::fixed_width_column_wrapper expected2({1, 1, 1, 1, 1, 1}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected2); - EXPECT_TRUE(cudf::strings::all_float(cudf::strings_column_view(strings2))); -} - TEST_F(StringsCharsTest, EmptyStrings) { cudf::test::strings_column_wrapper strings({"", "", ""}); @@ -284,12 +235,6 @@ TEST_F(StringsCharsTest, EmptyStrings) auto results = cudf::strings::all_characters_of_type( strings_view, cudf::strings::string_character_types::ALPHANUM); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - results = cudf::strings::is_integer(strings_view); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - EXPECT_FALSE(cudf::strings::all_integer(strings_view)); - results = cudf::strings::is_float(strings_view); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - EXPECT_FALSE(cudf::strings::all_float(strings_view)); } TEST_F(StringsCharsTest, FilterCharTypes) @@ -379,14 +324,6 @@ TEST_F(StringsCharsTest, EmptyStringsColumn) EXPECT_EQ(cudf::type_id::BOOL8, results->view().type().id()); EXPECT_EQ(0, results->view().size()); - results = cudf::strings::is_integer(strings_view); - EXPECT_EQ(cudf::type_id::BOOL8, results->view().type().id()); - EXPECT_EQ(0, results->view().size()); - - results = cudf::strings::is_float(strings_view); - EXPECT_EQ(cudf::type_id::BOOL8, results->view().type().id()); - EXPECT_EQ(0, results->view().size()); - results = cudf::strings::filter_characters_of_type( strings_view, cudf::strings::string_character_types::NUMERIC); EXPECT_EQ(cudf::type_id::STRING, results->view().type().id()); diff --git a/cpp/tests/strings/floats_tests.cpp b/cpp/tests/strings/floats_tests.cpp index b98416d9edd..f7151363d83 100644 --- a/cpp/tests/strings/floats_tests.cpp +++ b/cpp/tests/strings/floats_tests.cpp @@ -27,6 +27,41 @@ struct StringsConvertTest : public cudf::test::BaseFixture { }; +TEST_F(StringsConvertTest, IsFloat) +{ + cudf::test::strings_column_wrapper strings; + auto strings_view = cudf::strings_column_view(strings); + auto results = cudf::strings::is_float(strings_view); + EXPECT_EQ(cudf::type_id::BOOL8, results->view().type().id()); + EXPECT_EQ(0, results->view().size()); + + cudf::test::strings_column_wrapper strings1({"+175", + "-9.8", + "7+2", + "+-4", + "6.7e17", + "-1.2e-5", + "e", + ".e", + "1.e+-2", + "00.00", + "1.0e+1.0", + "1.2.3", + "+", + "--", + ""}); + results = cudf::strings::is_float(cudf::strings_column_view(strings1)); + cudf::test::fixed_width_column_wrapper expected1( + {1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected1); + + cudf::test::strings_column_wrapper strings2( + {"+175", "-34", "9.8", "1234567890", "6.7e17", "-917.2e5"}); + results = cudf::strings::is_float(cudf::strings_column_view(strings2)); + cudf::test::fixed_width_column_wrapper expected2({1, 1, 1, 1, 1, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected2); +} + TEST_F(StringsConvertTest, ToFloats32) { std::vector h_strings{"1234", diff --git a/cpp/tests/strings/integers_tests.cu b/cpp/tests/strings/integers_tests.cu index 9e2b9809b26..d6bf03b3f76 100644 --- a/cpp/tests/strings/integers_tests.cu +++ b/cpp/tests/strings/integers_tests.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -29,6 +29,27 @@ struct StringsConvertTest : public cudf::test::BaseFixture { }; +TEST_F(StringsConvertTest, IsInteger) +{ + cudf::test::strings_column_wrapper strings; + auto strings_view = cudf::strings_column_view(strings); + auto results = cudf::strings::is_integer(strings_view); + EXPECT_EQ(cudf::type_id::BOOL8, results->view().type().id()); + EXPECT_EQ(0, results->view().size()); + + cudf::test::strings_column_wrapper strings1( + {"+175", "-34", "9.8", "17+2", "+-14", "1234567890", "67de", "", "1e10", "-", "++", ""}); + results = cudf::strings::is_integer(cudf::strings_column_view(strings1)); + cudf::test::fixed_width_column_wrapper expected1({1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected1); + + cudf::test::strings_column_wrapper strings2( + {"0", "+0", "-0", "1234567890", "-27341132", "+012", "023", "-045"}); + results = cudf::strings::is_integer(cudf::strings_column_view(strings2)); + cudf::test::fixed_width_column_wrapper expected2({1, 1, 1, 1, 1, 1, 1, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected2); +} + TEST_F(StringsConvertTest, ToInteger) { std::vector h_strings{ diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index 0ce9d6303e4..ac14e1605d7 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -37,7 +37,6 @@ #include #include #include -#include #include #include #include diff --git a/python/cudf/cudf/_lib/cpp/strings/char_types.pxd b/python/cudf/cudf/_lib/cpp/strings/char_types.pxd index ad675027c10..934269c6f25 100644 --- a/python/cudf/cudf/_lib/cpp/strings/char_types.pxd +++ b/python/cudf/cudf/_lib/cpp/strings/char_types.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2021, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from cudf._lib.cpp.column.column_view cimport column_view @@ -33,11 +33,3 @@ cdef extern from "cudf/strings/char_types/char_types.hpp" \ string_character_types types_to_remove, string_scalar replacement, string_character_types types_to_keep) except + - - cdef unique_ptr[column] is_integer( - column_view source_strings - ) except + - - cdef unique_ptr[column] is_float( - column_view source_strings - ) except + diff --git a/python/cudf/cudf/_lib/cpp/strings/convert/convert_floats.pxd b/python/cudf/cudf/_lib/cpp/strings/convert/convert_floats.pxd index baee01b8f99..55a84b60efd 100644 --- a/python/cudf/cudf/_lib/cpp/strings/convert/convert_floats.pxd +++ b/python/cudf/cudf/_lib/cpp/strings/convert/convert_floats.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2021, NVIDIA CORPORATION. from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view @@ -14,3 +14,7 @@ cdef extern from "cudf/strings/convert/convert_floats.hpp" namespace \ cdef unique_ptr[column] from_floats( column_view input_col) except + + + cdef unique_ptr[column] is_float( + column_view source_strings + ) except + diff --git a/python/cudf/cudf/_lib/cpp/strings/convert/convert_integers.pxd b/python/cudf/cudf/_lib/cpp/strings/convert/convert_integers.pxd index 92f99a2f5cb..6e45d4ba869 100644 --- a/python/cudf/cudf/_lib/cpp/strings/convert/convert_integers.pxd +++ b/python/cudf/cudf/_lib/cpp/strings/convert/convert_integers.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2021, NVIDIA CORPORATION. from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view @@ -15,6 +15,10 @@ cdef extern from "cudf/strings/convert/convert_integers.hpp" namespace \ cdef unique_ptr[column] from_integers( column_view input_col) except + + cdef unique_ptr[column] is_integer( + column_view source_strings + ) except + + cdef unique_ptr[column] hex_to_integers( column_view input_col, data_type output_type) except + diff --git a/python/cudf/cudf/_lib/strings/char_types.pyx b/python/cudf/cudf/_lib/strings/char_types.pyx index 5d8d1522418..1890e98f956 100644 --- a/python/cudf/cudf/_lib/strings/char_types.pyx +++ b/python/cudf/cudf/_lib/strings/char_types.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2021, NVIDIA CORPORATION. from libcpp cimport bool from libcpp.memory cimport unique_ptr @@ -14,8 +14,6 @@ from cudf._lib.cpp.strings.char_types cimport ( all_characters_of_type as cpp_all_characters_of_type, filter_characters_of_type as cpp_filter_characters_of_type, string_character_types as string_character_types, - is_integer as cpp_is_integer, - is_float as cpp_is_float, ) @@ -191,35 +189,3 @@ def is_space(Column source_strings): )) return Column.from_unique_ptr(move(c_result)) - - -def is_integer(Column source_strings): - """ - Returns a Column of boolean values with True for `source_strings` - that have intergers. - """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_is_integer( - source_view - )) - - return Column.from_unique_ptr(move(c_result)) - - -def is_float(Column source_strings): - """ - Returns a Column of boolean values with True for `source_strings` - that have floats. - """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_is_float( - source_view - )) - - return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_lib/strings/convert/convert_floats.pyx b/python/cudf/cudf/_lib/strings/convert/convert_floats.pyx new file mode 100644 index 00000000000..195d9b71f6e --- /dev/null +++ b/python/cudf/cudf/_lib/strings/convert/convert_floats.pyx @@ -0,0 +1,29 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. + +from libcpp cimport bool +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move + +from cudf._lib.cpp.column.column_view cimport column_view +from cudf._lib.column cimport Column +from cudf._lib.cpp.column.column cimport column + +from cudf._lib.cpp.strings.convert.convert_floats cimport ( + is_float as cpp_is_float, +) + + +def is_float(Column source_strings): + """ + Returns a Column of boolean values with True for `source_strings` + that have floats. + """ + cdef unique_ptr[column] c_result + cdef column_view source_view = source_strings.view() + + with nogil: + c_result = move(cpp_is_float( + source_view + )) + + return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_lib/strings/convert/convert_integers.pyx b/python/cudf/cudf/_lib/strings/convert/convert_integers.pyx new file mode 100644 index 00000000000..d1bae1edd37 --- /dev/null +++ b/python/cudf/cudf/_lib/strings/convert/convert_integers.pyx @@ -0,0 +1,29 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. + +from libcpp cimport bool +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move + +from cudf._lib.cpp.column.column_view cimport column_view +from cudf._lib.column cimport Column +from cudf._lib.cpp.column.column cimport column + +from cudf._lib.cpp.strings.convert.convert_integers cimport ( + is_integer as cpp_is_integer, +) + + +def is_integer(Column source_strings): + """ + Returns a Column of boolean values with True for `source_strings` + that have intergers. + """ + cdef unique_ptr[column] c_result + cdef column_view source_view = source_strings.view() + + with nogil: + c_result = move(cpp_is_integer( + source_view + )) + + return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index ea01aa07b91..11dd7556812 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -70,13 +70,15 @@ is_alpha as cpp_is_alpha, is_decimal as cpp_is_decimal, is_digit as cpp_is_digit, - is_float as cpp_is_float, - is_integer as cpp_is_integer, is_lower as cpp_is_lower, is_numeric as cpp_is_numeric, is_space as cpp_isspace, is_upper as cpp_is_upper, ) +from cudf._lib.strings.convert.convert_integers import ( + is_integer as cpp_is_integer, +) +from cudf._lib.strings.convert.convert_floats import is_float as cpp_is_float from cudf._lib.strings.combine import ( concatenate as cpp_concatenate, join as cpp_join, diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 4e5e4ce1987..535e497e8dc 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -8,7 +8,9 @@ from pandas.core.tools.datetimes import _unit_map import cudf -from cudf._lib.strings.char_types import is_integer as cpp_is_integer +from cudf._lib.strings.convert.convert_integers import ( + is_integer as cpp_is_integer, +) from cudf.core import column from cudf.core.index import as_index from cudf.utils.dtypes import is_scalar From 168c489a9415ae7bbbec5ef600b0d3dcde44b583 Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Wed, 17 Mar 2021 20:37:11 -0500 Subject: [PATCH 2/3] Fix Series/Dataframe Mixed Arithmetic (#7491) Fixes https://github.com/rapidsai/cudf/issues/7385 Authors: - @brandon-b-miller Approvers: - GALI PREM SAGAR (@galipremsagar) - Michael Wang (@isVoid) URL: https://github.com/rapidsai/cudf/pull/7491 --- python/cudf/cudf/core/dataframe.py | 8 ++--- python/cudf/cudf/core/series.py | 4 +-- python/cudf/cudf/tests/test_dataframe.py | 42 ++++++++++++------------ 3 files changed, 24 insertions(+), 30 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 25f57748765..9672ab3002f 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1518,11 +1518,7 @@ def fallback(col, fn): else: if col not in df_cols: r_opr = other_cols[col] - l_opr = Series( - column_empty( - len(self), masked=True, dtype=other.dtype - ) - ) + l_opr = Series(as_column(np.nan, length=len(self))) if col not in other_cols_keys: r_opr = None l_opr = self[col] @@ -2198,7 +2194,7 @@ def rpow(self, other, axis="columns", level=None, fill_value=None): return self._apply_op("rpow", other, fill_value) def __rpow__(self, other): - return self._apply_op("__pow__", other) + return self._apply_op("__rpow__", other) def floordiv(self, other, axis="columns", level=None, fill_value=None): """ diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 5e7121c0488..b06fef178f6 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1501,9 +1501,7 @@ def _binaryop( If ``reflect`` is ``True``, swap the order of the operands. """ if isinstance(other, cudf.DataFrame): - # TODO: fn is not the same as arg expected by _apply_op - # e.g. for fn = 'and', _apply_op equivalent is '__and__' - return other._apply_op(self, fn) + return NotImplemented result_name = utils.get_result_name(self, other) if isinstance(other, Series): diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 77548b95277..5f4d571e8c5 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -4996,13 +4996,13 @@ def test_cov_nans(): @pytest.mark.parametrize( "gsr", [ - cudf.Series([1, 2, 3]), - cudf.Series([1, 2, 3], index=["a", "b", "c"]), - cudf.Series([1, 2, 3], index=["a", "b", "d"]), - cudf.Series([1, 2], index=["a", "b"]), - cudf.Series([1, 2, 3], index=cudf.core.index.RangeIndex(0, 3)), + cudf.Series([4, 2, 3]), + cudf.Series([4, 2, 3], index=["a", "b", "c"]), + cudf.Series([4, 2, 3], index=["a", "b", "d"]), + cudf.Series([4, 2], index=["a", "b"]), + cudf.Series([4, 2, 3], index=cudf.core.index.RangeIndex(0, 3)), pytest.param( - cudf.Series([1, 2, 3, 4, 5], index=["a", "b", "d", "0", "12"]), + cudf.Series([4, 2, 3, 4, 5], index=["a", "b", "d", "0", "12"]), marks=pytest.mark.xfail, ), ], @@ -5017,32 +5017,32 @@ def test_cov_nans(): operator.truediv, operator.mod, operator.pow, - # comparison ops will temporarily XFAIL - # see PR https://github.com/rapidsai/cudf/pull/7491 - pytest.param(operator.eq, marks=pytest.mark.xfail()), - pytest.param(operator.lt, marks=pytest.mark.xfail()), - pytest.param(operator.le, marks=pytest.mark.xfail()), - pytest.param(operator.gt, marks=pytest.mark.xfail()), - pytest.param(operator.ge, marks=pytest.mark.xfail()), - pytest.param(operator.ne, marks=pytest.mark.xfail()), + operator.eq, + operator.lt, + operator.le, + operator.gt, + operator.ge, + operator.ne, ], ) def test_df_sr_binop(gsr, colnames, op): - data = [[0, 2, 5], [3, None, 5], [6, 7, np.nan]] + data = [[3.0, 2.0, 5.0], [3.0, None, 5.0], [6.0, 7.0, np.nan]] data = dict(zip(colnames, data)) + gsr = gsr.astype("float64") + gdf = cudf.DataFrame(data) - pdf = pd.DataFrame.from_dict(data) + pdf = gdf.to_pandas(nullable=True) - psr = gsr.to_pandas() + psr = gsr.to_pandas(nullable=True) expect = op(pdf, psr) - got = op(gdf, gsr) - assert_eq(expect.astype(float), got.astype(float)) + got = op(gdf, gsr).to_pandas(nullable=True) + assert_eq(expect, got, check_dtype=False) expect = op(psr, pdf) - got = op(psr, pdf) - assert_eq(expect.astype(float), got.astype(float)) + got = op(gsr, gdf).to_pandas(nullable=True) + assert_eq(expect, got, check_dtype=False) @pytest.mark.parametrize( From 99001d2c8d9b3898e58c74d7979ab6204c5e5bee Mon Sep 17 00:00:00 2001 From: Alfred Xu Date: Thu, 18 Mar 2021 09:57:30 +0800 Subject: [PATCH 3/3] Java support on explode_outer (#7625) This pull request aims to enable `cudf::explode_outer` and `cudf::explode_outer_position` in Java package. Authors: - Alfred Xu (@sperlingxx) Approvers: - Robert (Bobby) Evans (@revans2) URL: https://github.com/rapidsai/cudf/pull/7625 --- java/src/main/java/ai/rapids/cudf/Table.java | 141 +++++++++++++++--- java/src/main/native/src/TableJni.cpp | 28 ++++ .../test/java/ai/rapids/cudf/TableTest.java | 86 +++++++++-- 3 files changed, 218 insertions(+), 37 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index 7385b55d0df..d0e59fdc105 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -516,6 +516,10 @@ private static native long[] repeatColumnCount(long tableHandle, private static native long[] explodePosition(long tableHandle, int index); + private static native long[] explodeOuter(long tableHandle, int index); + + private static native long[] explodeOuterPosition(long tableHandle, int index); + private static native long createCudfTableView(long[] nativeColumnViewHandles); private static native long[] columnViewsFromPacked(ByteBuffer metadata, long dataAddress); @@ -1725,7 +1729,7 @@ public ContiguousTable[] contiguousSplit(int... indices) { * Example: * input: [[5,10,15], 100], * [[20,25], 200], - * [[30], 300], + * [[30], 300] * index: 0 * output: [5, 100], * [10, 100], @@ -1737,12 +1741,12 @@ public ContiguousTable[] contiguousSplit(int... indices) { * * Nulls propagate in different ways depending on what is null. * - * [[5,null,15], 100], - * [null, 200] - * returns: - * [5, 100], - * [null, 100], - * [15, 100] + * input: [[5,null,15], 100], + * [null, 200] + * index: 0 + * output: [5, 100], + * [null, 100], + * [15, 100] * * Note that null lists are completely removed from the output * and nulls inside lists are pulled out and remain. @@ -1763,27 +1767,26 @@ public Table explode(int index) { * in the output. The corresponding rows for other columns in the input are duplicated. A position * column is added that has the index inside the original list for each row. Example: * - * [[5,10,15], 100], - * [[20,25], 200], - * [[30], 300], - * returns - * [0, 5, 100], - * [1, 10, 100], - * [2, 15, 100], - * [0, 20, 200], - * [1, 25, 200], - * [0, 30, 300], + * input: [[5,10,15], 100], + * [[20,25], 200], + * [[30], 300] + * index: 0 + * output: [0, 5, 100], + * [1, 10, 100], + * [2, 15, 100], + * [0, 20, 200], + * [1, 25, 200], + * [0, 30, 300] * * * Nulls and empty lists propagate in different ways depending on what is null or empty. * - * [[5,null,15], 100], - * [null, 200], - * [[], 300], - * returns - * [0, 5, 100], - * [1, null, 100], - * [2, 15, 100], + * input: [[5,null,15], 100], + * [null, 200] + * index: 0 + * output: [5, 100], + * [null, 100], + * [15, 100] * * * Note that null lists are not included in the resulting table, but nulls inside @@ -1799,6 +1802,96 @@ public Table explodePosition(int index) { return new Table(explodePosition(nativeHandle, index)); } + /** + * Explodes a list column's elements. + * + * Any list is exploded, which means the elements of the list in each row are expanded + * into new rows in the output. The corresponding rows for other columns in the input + * are duplicated. + * + * + * Example: + * input: [[5,10,15], 100], + * [[20,25], 200], + * [[30], 300], + * index: 0 + * output: [5, 100], + * [10, 100], + * [15, 100], + * [20, 200], + * [25, 200], + * [30, 300] + * + * + * Nulls propagate in different ways depending on what is null. + * + * input: [[5,null,15], 100], + * [null, 200] + * index: 0 + * output: [5, 100], + * [null, 100], + * [15, 100], + * [null, 200] + * + * Note that null lists are completely removed from the output + * and nulls inside lists are pulled out and remain. + * + * @param index Column index to explode inside the table. + * @return A new table with explode_col exploded. + */ + public Table explodeOuter(int index) { + assert 0 <= index && index < columns.length : "Column index is out of range"; + assert columns[index].getType().equals(DType.LIST) : "Column to explode must be of type LIST"; + return new Table(explodeOuter(nativeHandle, index)); + } + + /** + * Explodes a list column's elements retaining any null entries or empty lists and includes a + * position column. + * + * Any list is exploded, which means the elements of the list in each row are expanded into new rows + * in the output. The corresponding rows for other columns in the input are duplicated. A position + * column is added that has the index inside the original list for each row. Example: + * + * + * Example: + * input: [[5,10,15], 100], + * [[20,25], 200], + * [[30], 300], + * index: 0 + * output: [0, 5, 100], + * [1, 10, 100], + * [2, 15, 100], + * [0, 20, 200], + * [1, 25, 200], + * [0, 30, 300] + * + * + * Nulls and empty lists propagate as null entries in the result. + * + * input: [[5,null,15], 100], + * [null, 200], + * [[], 300] + * index: 0 + * output: [0, 5, 100], + * [1, null, 100], + * [2, 15, 100], + * [0, null, 200], + * [0, null, 300] + * + * + * returns + * + * @param index Column index to explode inside the table. + * @return A new table with exploded value and position. The column order of return table is + * [cols before explode_input, explode_position, explode_value, cols after explode_input]. + */ + public Table explodeOuterPosition(int index) { + assert 0 <= index && index < columns.length : "Column index is out of range"; + assert columns[index].getType().equals(DType.LIST) : "Column to explode must be of type LIST"; + return new Table(explodeOuterPosition(nativeHandle, index)); + } + /** * Gathers the rows of this table according to `gatherMap` such that row "i" * in the resulting table's columns will contain row "gatherMap[i]" from this table. diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 4548156055a..02385a453d0 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -2052,4 +2052,32 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_explodePosition(JNIEnv *e CATCH_STD(env, 0); } +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_explodeOuter(JNIEnv *env, jclass, + jlong input_jtable, + jint column_index) { + JNI_NULL_CHECK(env, input_jtable, "explode: input table is null", 0); + try { + cudf::jni::auto_set_device(env); + cudf::table_view *input_table = reinterpret_cast(input_jtable); + cudf::size_type col_index = static_cast(column_index); + std::unique_ptr exploded = cudf::explode_outer(*input_table, col_index); + return cudf::jni::convert_table_for_return(env, exploded); + } + CATCH_STD(env, 0); +} + +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_explodeOuterPosition(JNIEnv *env, jclass, + jlong input_jtable, + jint column_index) { + JNI_NULL_CHECK(env, input_jtable, "explode: input table is null", 0); + try { + cudf::jni::auto_set_device(env); + cudf::table_view *input_table = reinterpret_cast(input_jtable); + cudf::size_type col_index = static_cast(column_index); + std::unique_ptr exploded = cudf::explode_outer_position(*input_table, col_index); + return cudf::jni::convert_table_for_return(env, exploded); + } + CATCH_STD(env, 0); +} + } // extern "C" diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index 626f7828012..c2e28e1cad8 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -4635,7 +4635,7 @@ private Table[] buildExplodeTestTableWithPrimitiveTypes(boolean pos, boolean out } } - private Table[] buildExplodeTestTableWithNestedTypes(boolean pos) { + private Table[] buildExplodeTestTableWithNestedTypes(boolean pos, boolean outer) { StructType nestedType = new StructType(true, new BasicType(false, DType.INT32), new BasicType(false, DType.STRING)); try (Table input = new Table.TestBuilder() @@ -4644,23 +4644,42 @@ private Table[] buildExplodeTestTableWithNestedTypes(boolean pos) { Arrays.asList(struct(4, "k4"), struct(5, "k5")), Arrays.asList(struct(6, "k6")), Arrays.asList(new HostColumnVector.StructData((List) null)), - Arrays.asList()) + null) .column("s1", "s2", "s3", "s4", "s5") .column(1, 3, 5, 7, 9) .column(12.0, 14.0, 13.0, 11.0, 15.0) .build()) { Table.TestBuilder expectedBuilder = new Table.TestBuilder(); if (pos) { - expectedBuilder.column(0, 1, 2, 0, 1, 0, 0); + if (!outer) + expectedBuilder.column(0, 1, 2, 0, 1, 0, 0); + else + expectedBuilder.column(0, 1, 2, 0, 1, 0, 0, 0); } - try (Table expected = expectedBuilder - .column(nestedType, + List expectedData = new ArrayList(){{ + if (!outer) { + this.add(new HostColumnVector.StructData[]{ + struct(1, "k1"), struct(2, "k2"), struct(3, "k3"), + struct(4, "k4"), struct(5, "k5"), struct(6, "k6"), + new HostColumnVector.StructData((List) null)}); + this.add(new String[]{"s1", "s1", "s1", "s2", "s2", "s3", "s4"}); + this.add(new Integer[]{1, 1, 1, 3, 3, 5, 7}); + this.add(new Double[]{12.0, 12.0, 12.0, 14.0, 14.0, 13.0, 11.0}); + } else { + this.add(new HostColumnVector.StructData[]{ struct(1, "k1"), struct(2, "k2"), struct(3, "k3"), struct(4, "k4"), struct(5, "k5"), struct(6, "k6"), - new HostColumnVector.StructData((List) null)) - .column("s1", "s1", "s1", "s2", "s2", "s3", "s4") - .column(1, 1, 1, 3, 3, 5, 7) - .column(12.0, 12.0, 12.0, 14.0, 14.0, 13.0, 11.0) + new HostColumnVector.StructData((List) null), null}); + this.add(new String[]{"s1", "s1", "s1", "s2", "s2", "s3", "s4", "s5"}); + this.add(new Integer[]{1, 1, 1, 3, 3, 5, 7, 9}); + this.add(new Double[]{12.0, 12.0, 12.0, 14.0, 14.0, 13.0, 11.0, 15.0}); + } + }}; + try (Table expected = expectedBuilder + .column(nestedType, (HostColumnVector.StructData[]) expectedData.get(0)) + .column((String[]) expectedData.get(1)) + .column((Integer[]) expectedData.get(2)) + .column((Double[]) expectedData.get(3)) .build()) { return new Table[]{new Table(input.getColumns()), new Table(expected.getColumns())}; } @@ -4679,7 +4698,7 @@ void testExplode() { } // Child is nested type - Table[] testTables2 = buildExplodeTestTableWithNestedTypes(false); + Table[] testTables2 = buildExplodeTestTableWithNestedTypes(false, false); try (Table input = testTables2[0]; Table expected = testTables2[1]) { try (Table exploded = input.explode(0)) { @@ -4689,7 +4708,7 @@ void testExplode() { } @Test - void testPosExplode() { + void testExplodePosition() { // Child is primitive type Table[] testTables = buildExplodeTestTableWithPrimitiveTypes(true, false); try (Table input = testTables[0]; @@ -4699,8 +4718,8 @@ void testPosExplode() { } } - // Child is primitive type - Table[] testTables2 = buildExplodeTestTableWithNestedTypes(true); + // Child is nested type + Table[] testTables2 = buildExplodeTestTableWithNestedTypes(true, false); try (Table input = testTables2[0]; Table expected = testTables2[1]) { try (Table exploded = input.explodePosition(0)) { @@ -4709,4 +4728,45 @@ void testPosExplode() { } } + @Test + void testExplodeOuter() { + // Child is primitive type + Table[] testTables = buildExplodeTestTableWithPrimitiveTypes(false, true); + try (Table input = testTables[0]; + Table expected = testTables[1]) { + try (Table exploded = input.explodeOuter(0)) { + assertTablesAreEqual(expected, exploded); + } + } + + // Child is nested type + Table[] testTables2 = buildExplodeTestTableWithNestedTypes(false, true); + try (Table input = testTables2[0]; + Table expected = testTables2[1]) { + try (Table exploded = input.explodeOuter(0)) { + assertTablesAreEqual(expected, exploded); + } + } + } + + @Test + void testExplodeOuterPosition() { + // Child is primitive type + Table[] testTables = buildExplodeTestTableWithPrimitiveTypes(true, true); + try (Table input = testTables[0]; + Table expected = testTables[1]) { + try (Table exploded = input.explodeOuterPosition(0)) { + assertTablesAreEqual(expected, exploded); + } + } + + // Child is nested type + Table[] testTables2 = buildExplodeTestTableWithNestedTypes(true, true); + try (Table input = testTables2[0]; + Table expected = testTables2[1]) { + try (Table exploded = input.explodeOuterPosition(0)) { + assertTablesAreEqual(expected, exploded); + } + } + } }