From c732cef58a10ccfc7bdf7370ddd218cc02f96476 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Wed, 19 May 2021 18:28:47 -0700 Subject: [PATCH 01/27] Support create lists column from a `list_scalar` (#8185) This PR adds support to `make_column_from_scalar` for `list_scalar`. For 0-length columns, a well-formed `LIST` type column, whose child column has the same column hierarchy as the row data stored in `list_scalar`, is returned. Example: ``` s.data = [1, 2, 3] // An integer list of 1, 2, 3, `data` is an INT column make_column_from_scalar(s, 2) // List<INT> column: {[1, 2, 3], [1, 2, 3]}, whose child column is an `INT` column. s.data = [[1, 2], [3]] // A list of integer lists, `data` is a List<INT> column make_column_from_scalar(s, 0) // Well formed, 0-length List<List<INT>> column, whose child column is a List<INT> column. ``` Closes #8088 Authors: - Michael Wang (https://github.com/isVoid) Approvers: - AJ Schmidt (https://github.com/ajschmidt8) - Devavret Makkar (https://github.com/devavret) - Mark Harris (https://github.com/harrism) URL: https://github.com/rapidsai/cudf/pull/8185 --- conda/recipes/libcudf/meta.yaml | 1 + cpp/include/cudf/column/column_factories.hpp | 3 +- .../cudf/lists/lists_column_factories.hpp | 42 +++ cpp/src/column/column_factories.cu | 8 +- cpp/src/lists/lists_column_factories.cu | 67 +++- cpp/tests/column/factories_test.cpp | 296 ++++++++++++++++++ 6 files changed, 413 insertions(+), 4 deletions(-) create mode 100644 cpp/include/cudf/lists/lists_column_factories.hpp diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index ea2fda399fd..0fcf62a2606 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -135,6 +135,7 @@ test: - test -f $PREFIX/include/cudf/join.hpp - test -f $PREFIX/include/cudf/lists/detail/concatenate.hpp - test -f $PREFIX/include/cudf/lists/detail/copying.hpp + - test -f $PREFIX/include/cudf/lists/lists_column_factories.hpp - test -f 
$PREFIX/include/cudf/lists/detail/drop_list_duplicates.hpp - test -f $PREFIX/include/cudf/lists/detail/interleave_columns.hpp - test -f $PREFIX/include/cudf/lists/detail/sorting.hpp diff --git a/cpp/include/cudf/column/column_factories.hpp b/cpp/include/cudf/column/column_factories.hpp index 43c2407d629..e5424f0fc44 100644 --- a/cpp/include/cudf/column/column_factories.hpp +++ b/cpp/include/cudf/column/column_factories.hpp @@ -541,7 +541,8 @@ std::unique_ptr make_structs_column( * * The output column will have the same type as `s.type()` * The output column will contain all null rows if `s.invalid()==false` - * The output column will be empty if `size==0`. + * The output column will be empty if `size==0`. For LIST scalars, the column hierarchy + * from @p s is preserved. * * @param[in] s The scalar to use for values in the column. * @param[in] size The number of rows for the output column. diff --git a/cpp/include/cudf/lists/lists_column_factories.hpp b/cpp/include/cudf/lists/lists_column_factories.hpp new file mode 100644 index 00000000000..bdf06cfa9e7 --- /dev/null +++ b/cpp/include/cudf/lists/lists_column_factories.hpp @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +namespace cudf { +namespace lists { +namespace detail { + +/** + * @brief Internal API to construct a lists column from a `list_scalar`, for public + * use, use `cudf::make_column_from_scalar`. + * + * @param[in] value The `list_scalar` to construct from + * @param[in] size The number of rows for the output column. + * @param[in] stream CUDA stream used for device memory operations and kernel launches. + * @param[in] mr Device memory resource used to allocate the returned column's device memory. + */ +std::unique_ptr make_lists_column_from_scalar( + list_scalar const& value, + size_type size, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +} // namespace detail +} // namespace lists +} // namespace cudf diff --git a/cpp/src/column/column_factories.cu b/cpp/src/column/column_factories.cu index 60e642ea3d5..6ba8497b320 100644 --- a/cpp/src/column/column_factories.cu +++ b/cpp/src/column/column_factories.cu @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -32,6 +33,7 @@ struct column_from_scalar_dispatch { rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const { + if (size == 0) return make_empty_column(value.type()); if (!value.is_valid()) return make_fixed_width_column(value.type(), size, mask_state::ALL_NULL, stream, mr); auto output_column = @@ -49,6 +51,7 @@ std::unique_ptr column_from_scalar_dispatch::operator() column_from_scalar_dispatch::operator()(&value); + return lists::detail::make_lists_column_from_scalar(*lv, size, stream, mr); } template <> @@ -94,6 +98,7 @@ std::unique_ptr column_from_scalar_dispatch::operator() const&>(value); auto iter = thrust::make_constant_iterator(0); @@ -117,7 +122,6 @@ std::unique_ptr make_column_from_scalar(scalar const& s, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - if (size == 0) return make_empty_column(s.type()); 
return type_dispatcher(s.type(), column_from_scalar_dispatch{}, s, size, stream, mr); } diff --git a/cpp/src/lists/lists_column_factories.cu b/cpp/src/lists/lists_column_factories.cu index ebf5e07f76a..3291aeb9f22 100644 --- a/cpp/src/lists/lists_column_factories.cu +++ b/cpp/src/lists/lists_column_factories.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,10 +16,75 @@ #include #include +#include +#include #include +#include + +#include +#include namespace cudf { +namespace lists { +namespace detail { + +std::unique_ptr make_lists_column_from_scalar(list_scalar const& value, + size_type size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + if (size == 0) { + return make_lists_column(0, + make_empty_column(data_type{type_to_id()}), + empty_like(value.view()), + 0, + cudf::detail::create_null_mask(0, mask_state::UNALLOCATED, stream, mr), + stream, + mr); + } + auto mr_final = size == 1 ? mr : rmm::mr::get_current_device_resource(); + + // Handcraft a 1-row column + auto offsets = make_numeric_column( + data_type{type_to_id()}, 2, mask_state::UNALLOCATED, stream, mr_final); + auto m_offsets = offsets->mutable_view(); + thrust::sequence(rmm::exec_policy(stream), + m_offsets.begin(), + m_offsets.end(), + 0, + value.view().size()); + size_type null_count = value.is_valid(stream) ? 0 : 1; + auto null_mask_state = null_count ? 
mask_state::ALL_NULL : mask_state::UNALLOCATED; + auto null_mask = cudf::detail::create_null_mask(1, null_mask_state, stream, mr_final); + + if (size == 1) { + auto child = std::make_unique(value.view(), stream, mr_final); + return make_lists_column( + 1, std::move(offsets), std::move(child), null_count, std::move(null_mask), stream, mr_final); + } + + auto children_views = std::vector{offsets->view(), value.view()}; + auto one_row_col_view = column_view(data_type{type_id::LIST}, + 1, + nullptr, + static_cast(null_mask.data()), + null_count, + 0, + children_views); + + auto begin = thrust::make_constant_iterator(0); + auto res = cudf::detail::gather(table_view({one_row_col_view}), + begin, + begin + size, + out_of_bounds_policy::DONT_CHECK, + stream, + mr_final); + return std::move(res->release()[0]); +} + +} // namespace detail +} // namespace lists /** * @copydoc cudf::make_lists_column diff --git a/cpp/tests/column/factories_test.cpp b/cpp/tests/column/factories_test.cpp index 71f65eedd91..f9e83311b1b 100644 --- a/cpp/tests/column/factories_test.cpp +++ b/cpp/tests/column/factories_test.cpp @@ -20,7 +20,9 @@ #include #include +#include #include +#include #include #include @@ -462,6 +464,300 @@ TEST_F(ColumnFactoryTest, DictionaryFromStringScalarError) EXPECT_THROW(cudf::make_dictionary_from_scalar(value, 1), cudf::logic_error); } +template +class ListsFixedWidthLeafTest : public ColumnFactoryTest { +}; + +TYPED_TEST_CASE(ListsFixedWidthLeafTest, cudf::test::FixedWidthTypes); + +TYPED_TEST(ListsFixedWidthLeafTest, FromNonNested) +{ + using FCW = cudf::test::fixed_width_column_wrapper; + using LCW = cudf::test::lists_column_wrapper; + using valid_t = std::vector; + + auto s = cudf::make_list_scalar(FCW({1, -1, 3}, {1, 0, 1})); + auto col = cudf::make_column_from_scalar(*s, 3); + + auto expected = LCW{LCW({1, 2, 3}, valid_t{1, 0, 1}.begin()), + LCW({1, 2, 3}, valid_t{1, 0, 1}.begin()), + LCW({1, 2, 3}, valid_t{1, 0, 1}.begin())}; + 
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*col, expected); +} + +TYPED_TEST(ListsFixedWidthLeafTest, FromNested) +{ + using LCW = cudf::test::lists_column_wrapper; + using valid_t = std::vector; + +#define row_data \ + LCW({LCW({-1, -1, 3}, valid_t{0, 0, 1}.begin()), LCW{}, LCW{}}, valid_t{1, 0, 1}.begin()) + + auto s = cudf::make_list_scalar(row_data); + auto col = cudf::make_column_from_scalar(*s, 5); + + auto expected = LCW{row_data, row_data, row_data, row_data, row_data}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*col, expected); + +#undef row_data +} + +template +class ListsDictionaryLeafTest : public ColumnFactoryTest { +}; + +TYPED_TEST_CASE(ListsDictionaryLeafTest, cudf::test::FixedWidthTypes); + +TYPED_TEST(ListsDictionaryLeafTest, FromNonNested) +{ + using DCW = cudf::test::dictionary_column_wrapper; + using offset_t = cudf::test::fixed_width_column_wrapper; + + auto s = cudf::make_list_scalar(DCW({1, 3, -1, 1, 3}, {1, 1, 0, 1, 1})); + auto col = cudf::make_column_from_scalar(*s, 2); + + DCW leaf({1, 3, -1, 1, 3, 1, 3, -1, 1, 3}, {1, 1, 0, 1, 1, 1, 1, 0, 1, 1}); + offset_t offsets{0, 5, 10}; + auto mask = cudf::create_null_mask(2, cudf::mask_state::UNALLOCATED); + + auto expected = cudf::make_lists_column(2, offsets.release(), leaf.release(), 0, std::move(mask)); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*col, *expected); +} + +TYPED_TEST(ListsDictionaryLeafTest, FromNested) +{ + using DCW = cudf::test::dictionary_column_wrapper; + using offset_t = cudf::test::fixed_width_column_wrapper; + + DCW leaf({1, 3, -1, 1, 3, 1, 3, -1, 1, 3}, {1, 1, 0, 1, 1, 1, 1, 0, 1, 1}); + offset_t offsets{0, 3, 3, 6, 6, 10}; + auto mask = cudf::create_null_mask(5, cudf::mask_state::ALL_VALID); + cudf::set_null_mask(static_cast(mask.data()), 1, 2, false); + auto data = cudf::make_lists_column(5, offsets.release(), leaf.release(), 0, std::move(mask)); + + auto s = cudf::make_list_scalar(*data); + auto col = cudf::make_column_from_scalar(*s, 3); + + DCW leaf2( + {1, 3, -1, 1, 3, 1, 3, -1, 1, 3, 1, 3, -1, 
1, 3, + 1, 3, -1, 1, 3, 1, 3, -1, 1, 3, 1, 3, -1, 1, 3}, + {1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1}); + offset_t offsets2{0, 3, 3, 6, 6, 10, 13, 13, 16, 16, 20, 23, 23, 26, 26, 30}; + auto mask2 = cudf::create_null_mask(15, cudf::mask_state::ALL_VALID); + cudf::set_null_mask(static_cast(mask2.data()), 1, 2, false); + cudf::set_null_mask(static_cast(mask2.data()), 6, 7, false); + cudf::set_null_mask(static_cast(mask2.data()), 11, 12, false); + auto nested = + cudf::make_lists_column(15, offsets2.release(), leaf2.release(), 3, std::move(mask2)); + + offset_t offsets3{0, 5, 10, 15}; + auto mask3 = cudf::create_null_mask(3, cudf::mask_state::UNALLOCATED); + auto expected = + cudf::make_lists_column(3, offsets3.release(), std::move(nested), 0, std::move(mask3)); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*col, *expected); +} + +class ListsStringLeafTest : public ColumnFactoryTest { +}; + +TEST_F(ListsStringLeafTest, FromNonNested) +{ + using SCW = cudf::test::strings_column_wrapper; + using LCW = cudf::test::lists_column_wrapper; + using valid_t = std::vector; + + auto s = cudf::make_list_scalar(SCW({"xx", "", "z"}, {true, false, true})); + auto col = cudf::make_column_from_scalar(*s, 4); + + auto expected = LCW{LCW({"xx", "", "z"}, valid_t{1, 0, 1}.begin()), + LCW({"xx", "", "z"}, valid_t{1, 0, 1}.begin()), + LCW({"xx", "", "z"}, valid_t{1, 0, 1}.begin()), + LCW({"xx", "", "z"}, valid_t{1, 0, 1}.begin())}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*col, expected); +} + +TEST_F(ListsStringLeafTest, FromNested) +{ + using LCW = cudf::test::lists_column_wrapper; + using valid_t = std::vector; + +#define row_data \ + LCW({LCW{}, \ + LCW({"@@", "rapids", "", "四", "ら"}, valid_t{1, 1, 0, 1, 1}.begin()), \ + LCW{}, \ + LCW({"hello", ""}, valid_t{1, 0}.begin())}, \ + valid_t{0, 1, 1, 1}.begin()) + + auto s = cudf::make_list_scalar(row_data); + + auto col = cudf::make_column_from_scalar(*s, 3); + + auto expected = LCW{row_data, row_data, 
row_data}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*col, expected); +#undef row_data +} + +template +class ListsStructsLeafTest : public ColumnFactoryTest { + protected: + using SCW = cudf::test::structs_column_wrapper; + /** + * @brief Create a structs column that contains 3 fields: int, string, List + */ + template + SCW make_test_structs_column(cudf::test::fixed_width_column_wrapper field1, + cudf::test::strings_column_wrapper field2, + cudf::test::lists_column_wrapper field3, + MaskIterator mask) + { + return SCW{{field1, field2, field3}, mask}; + } +}; + +TYPED_TEST_CASE(ListsStructsLeafTest, cudf::test::FixedWidthTypes); + +TYPED_TEST(ListsStructsLeafTest, FromNonNested) +{ + using LCWinner_t = cudf::test::lists_column_wrapper; + using StringCW = cudf::test::strings_column_wrapper; + using offset_t = cudf::test::fixed_width_column_wrapper; + using valid_t = std::vector; + + auto data = this->make_test_structs_column( + {{1, 3, 5, 2, 4}, {1, 0, 1, 0, 1}}, + StringCW({"fleur", "flower", "", "花", "はな"}, {true, true, false, true, true}), + LCWinner_t({{1, 2}, {}, {4, 5}, {-1}, {}}, valid_t{1, 1, 1, 1, 0}.begin()), + valid_t{1, 1, 1, 0, 1}.begin()); + auto s = cudf::make_list_scalar(data); + auto col = cudf::make_column_from_scalar(*s, 2); + + auto leaf = this->make_test_structs_column( + {{1, 3, 5, 2, 4, 1, 3, 5, 2, 4}, {1, 0, 1, 0, 1, 1, 0, 1, 0, 1}}, + StringCW({"fleur", "flower", "", "花", "はな", "fleur", "flower", "", "花", "はな"}, + {true, true, false, true, true, true, true, false, true, true}), + LCWinner_t({{1, 2}, {}, {4, 5}, {-1}, {}, {1, 2}, {}, {4, 5}, {-1}, {}}, + valid_t{1, 1, 1, 1, 0, 1, 1, 1, 1, 0}.begin()), + valid_t{1, 1, 1, 0, 1, 1, 1, 1, 0, 1}.begin()); + auto expected = cudf::make_lists_column(2, + offset_t{0, 5, 10}.release(), + leaf.release(), + 0, + cudf::create_null_mask(2, cudf::mask_state::UNALLOCATED)); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*col, *expected); +} + +TYPED_TEST(ListsStructsLeafTest, FromNested) +{ + using LCWinner_t = 
cudf::test::lists_column_wrapper; + using StringCW = cudf::test::strings_column_wrapper; + using offset_t = cudf::test::fixed_width_column_wrapper; + using valid_t = std::vector; + auto leaf = this->make_test_structs_column( + {{1, 2}, {0, 1}}, + StringCW({"étoile", "星"}, {true, true}), + LCWinner_t({LCWinner_t{}, LCWinner_t{42}}, valid_t{1, 1}.begin()), + valid_t{0, 1}.begin()); + auto mask = cudf::create_null_mask(3, cudf::mask_state::ALL_VALID); + cudf::set_null_mask(static_cast(mask.data()), 0, 1, false); + auto data = + cudf::make_lists_column(3, offset_t{0, 0, 1, 2}.release(), leaf.release(), 1, std::move(mask)); + auto s = cudf::make_list_scalar(*data); + + auto col = cudf::make_column_from_scalar(*s, 3); + + auto leaf2 = this->make_test_structs_column( + {{1, 2, 1, 2, 1, 2}, {0, 1, 0, 1, 0, 1}}, + StringCW({"étoile", "星", "étoile", "星", "étoile", "星"}, + {true, true, true, true, true, true}), + LCWinner_t( + {LCWinner_t{}, LCWinner_t{42}, LCWinner_t{}, LCWinner_t{42}, LCWinner_t{}, LCWinner_t{42}}, + valid_t{1, 1, 1, 1, 1, 1}.begin()), + valid_t{0, 1, 0, 1, 0, 1}.begin()); + auto mask2 = cudf::create_null_mask(9, cudf::mask_state::ALL_VALID); + cudf::set_null_mask(static_cast(mask2.data()), 0, 1, false); + cudf::set_null_mask(static_cast(mask2.data()), 3, 4, false); + cudf::set_null_mask(static_cast(mask2.data()), 6, 7, false); + auto data2 = cudf::make_lists_column( + 9, offset_t{0, 0, 1, 2, 2, 3, 4, 4, 5, 6}.release(), leaf2.release(), 3, std::move(mask2)); + auto expected = cudf::make_lists_column(3, + offset_t{0, 3, 6, 9}.release(), + std::move(data2), + 0, + cudf::create_null_mask(3, cudf::mask_state::UNALLOCATED)); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*col, *expected); +} + +class ListsZeroLengthColumnTest : public ColumnFactoryTest { + protected: + using StructsCW = cudf::test::structs_column_wrapper; + StructsCW make_test_structs_column(cudf::test::fixed_width_column_wrapper field1, + cudf::test::strings_column_wrapper field2, + 
cudf::test::lists_column_wrapper field3) + { + return StructsCW{field1, field2, field3}; + } +}; + +TEST_F(ListsZeroLengthColumnTest, MixedTypes) +{ + using FCW = cudf::test::fixed_width_column_wrapper; + using StringCW = cudf::test::strings_column_wrapper; + using LCW = cudf::test::lists_column_wrapper; + using offset_t = cudf::test::fixed_width_column_wrapper; + { + auto s = cudf::make_list_scalar(FCW{1, 2, 3}); + auto got = cudf::make_column_from_scalar(*s, 0); + auto expected = + cudf::make_lists_column(0, + offset_t{}.release(), + FCW{}.release(), + 0, + cudf::create_null_mask(0, cudf::mask_state::UNALLOCATED)); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*got, *expected); + } + + { + auto s = cudf::make_list_scalar(LCW{LCW{1, 2, 3}, LCW{}, LCW{5, 6}}); + auto got = cudf::make_column_from_scalar(*s, 0); + auto nested = cudf::make_lists_column(0, + offset_t{}.release(), + FCW{}.release(), + 0, + cudf::create_null_mask(0, cudf::mask_state::UNALLOCATED)); + auto expected = + cudf::make_lists_column(0, + offset_t{}.release(), + std::move(nested), + 0, + cudf::create_null_mask(0, cudf::mask_state::UNALLOCATED)); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*got, *expected); + } + + { + auto s = cudf::make_list_scalar( + this->make_test_structs_column({1, 2, 3}, StringCW({"x", "", "y"}), LCW{{5, 6}, {}, {7}})); + auto got = cudf::make_column_from_scalar(*s, 0); + + std::vector> children; + children.emplace_back(FCW{}.release()); + children.emplace_back(StringCW{}.release()); + children.emplace_back(LCW{}.release()); + auto nested = cudf::make_structs_column( + 0, std::move(children), 0, cudf::create_null_mask(0, cudf::mask_state::UNALLOCATED)); + + auto expected = + cudf::make_lists_column(0, + offset_t{}.release(), + std::move(nested), + 0, + cudf::create_null_mask(0, cudf::mask_state::UNALLOCATED)); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*got, *expected); + } +} + void struct_from_scalar(bool is_valid) { using LCW = cudf::test::lists_column_wrapper; From 
2da847320cccb4c603d006dd1dbb96d7aacad1ff Mon Sep 17 00:00:00 2001 From: Liangcai Li Date: Thu, 20 May 2021 09:52:20 +0800 Subject: [PATCH 02/27] Create a String column from UTF8 String byte arrays (#8257) This PR is to support creating a `ColumnVector` from the byte arrays of UTF8 Strings. It also lets the `Struct` children creation support UTF8 Strings. Closes https://github.com/rapidsai/cudf/issues/8137 Signed-off-by: Firestarman Authors: - Liangcai Li (https://github.com/firestarman) Approvers: - Allen Xu (https://github.com/wjxiz1992) - Jason Lowe (https://github.com/jlowe) - Robert (Bobby) Evans (https://github.com/revans2) - Alfred Xu (https://github.com/sperlingxx) URL: https://github.com/rapidsai/cudf/pull/8257 --- .../java/ai/rapids/cudf/ColumnVector.java | 10 +++++ .../java/ai/rapids/cudf/HostColumnVector.java | 39 ++++++++++++++++++- .../java/ai/rapids/cudf/ColumnVectorTest.java | 14 +++++++ 3 files changed, 62 insertions(+), 1 deletion(-) diff --git a/java/src/main/java/ai/rapids/cudf/ColumnVector.java b/java/src/main/java/ai/rapids/cudf/ColumnVector.java index ea93a2daf36..a7e589ac890 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnVector.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnVector.java @@ -1276,6 +1276,16 @@ public static ColumnVector fromStrings(String... values) { } } + /** + * Create a new string vector from the given values. This API + * supports inline nulls. + */ + public static ColumnVector fromUTF8Strings(byte[]... values) { + try (HostColumnVector host = HostColumnVector.fromUTF8Strings(values)) { + return host.copyToDevice(); + } + } + /** * Create a new vector from the given values. This API supports inline nulls, * but is much slower than building from primitive array of unscaledValues. 
diff --git a/java/src/main/java/ai/rapids/cudf/HostColumnVector.java b/java/src/main/java/ai/rapids/cudf/HostColumnVector.java index 846bcb3b635..46255428c1c 100644 --- a/java/src/main/java/ai/rapids/cudf/HostColumnVector.java +++ b/java/src/main/java/ai/rapids/cudf/HostColumnVector.java @@ -29,6 +29,7 @@ import java.util.Objects; import java.util.Optional; import java.util.StringJoiner; +import java.util.function.BiConsumer; import java.util.function.Consumer; /** @@ -577,6 +578,40 @@ public static HostColumnVector fromStrings(String... values) { }); } + /** + * Create a new string vector from the given values. This API + * supports inline nulls. + */ + public static HostColumnVector fromUTF8Strings(byte[]... values) { + int rows = values.length; + long nullCount = 0; + long bufferSize = 0; + // How many bytes do we need to hold the data. + for (byte[] s: values) { + if (s == null) { + nullCount++; + } else { + bufferSize += s.length; + } + } + + BiConsumer appendUTF8 = nullCount == 0 ? + (b, s) -> b.appendUTF8String(s) : + (b, s) -> { + if (s == null) { + b.appendNull(); + } else { + b.appendUTF8String(s); + } + }; + + return build(rows, bufferSize, (b) -> { + for (byte[] s: values) { + appendUTF8.accept(b, s); + } + }); + } + /** * Create a new vector from the given values. This API supports inline nulls, * but is much slower than building from primitive array of unscaledValues. 
@@ -1085,9 +1120,11 @@ private void appendChildOrNull(ColumnBuilder childBuilder, Object listElement) { } else if (listElement instanceof BigDecimal) { childBuilder.append((BigDecimal) listElement); } else if (listElement instanceof List) { - childBuilder.append((List) listElement); + childBuilder.append((List) listElement); } else if (listElement instanceof StructData) { childBuilder.append((StructData) listElement); + } else if (listElement instanceof byte[]) { + childBuilder.appendUTF8String((byte[]) listElement); } else { throw new IllegalStateException("Unexpected element type: " + listElement.getClass()); } diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index 09ddef633e3..83795799a24 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -29,6 +29,7 @@ import java.math.BigDecimal; import java.math.RoundingMode; +import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; @@ -176,6 +177,19 @@ void testStringCreation() { } } + @Test + void testUTF8StringCreation() { + try (ColumnVector cv = ColumnVector.fromUTF8Strings( + "d".getBytes(StandardCharsets.UTF_8), + "sd".getBytes(StandardCharsets.UTF_8), + "sde".getBytes(StandardCharsets.UTF_8), + null, + "END".getBytes(StandardCharsets.UTF_8)); + ColumnVector expected = ColumnVector.fromStrings("d", "sd", "sde", null, "END")) { + TableTest.assertColumnsAreEqual(expected, cv); + } + } + @Test void testRefCountLeak() throws InterruptedException { assumeTrue(Boolean.getBoolean("ai.rapids.cudf.flaky-tests-enabled")); From 48647aaee388387681dd310aef63756beae3a28c Mon Sep 17 00:00:00 2001 From: Liangcai Li Date: Thu, 20 May 2021 16:30:58 +0800 Subject: [PATCH 03/27] Java: Support creating a scalar from utf8 string (#8294) This is a small PR to support creating a scalar from an array of utf8 bytes. 
Since the PR https://github.com/rapidsai/cudf/pull/8257 added the support for ColumnVector creation, I think we'd better add it for scalar creation as well, to avoid conversions between utf8 strings and Java strings when used in Spark. Signed-off-by: Firestarman Authors: - Liangcai Li (https://github.com/firestarman) Approvers: - Bobby Wang (https://github.com/wbo4958) URL: https://github.com/rapidsai/cudf/pull/8294 --- java/src/main/java/ai/rapids/cudf/Scalar.java | 11 ++++++++++- .../test/java/ai/rapids/cudf/ScalarTest.java | 17 +++++++++++++++++ 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/java/src/main/java/ai/rapids/cudf/Scalar.java b/java/src/main/java/ai/rapids/cudf/Scalar.java index 62dd9bda13b..7794b57c3f9 100644 --- a/java/src/main/java/ai/rapids/cudf/Scalar.java +++ b/java/src/main/java/ai/rapids/cudf/Scalar.java @@ -329,10 +329,19 @@ public static Scalar timestampFromLong(DType type, Long value) { } public static Scalar fromString(String value) { + return fromUTF8String(value == null ? null : value.getBytes(StandardCharsets.UTF_8)); + } + + /** + * Creates a String scalar from an array of UTF8 bytes. 
+ * @param value the array of UTF8 bytes + * @return a String scalar + */ + public static Scalar fromUTF8String(byte[] value) { if (value == null) { return fromNull(DType.STRING); } - return new Scalar(DType.STRING, makeStringScalar(value.getBytes(StandardCharsets.UTF_8), true)); + return new Scalar(DType.STRING, makeStringScalar(value, true)); } /** diff --git a/java/src/test/java/ai/rapids/cudf/ScalarTest.java b/java/src/test/java/ai/rapids/cudf/ScalarTest.java index b09850bc3d9..a1078f2546b 100644 --- a/java/src/test/java/ai/rapids/cudf/ScalarTest.java +++ b/java/src/test/java/ai/rapids/cudf/ScalarTest.java @@ -27,6 +27,7 @@ import org.junit.jupiter.api.Test; import java.math.BigDecimal; +import java.nio.charset.StandardCharsets; import java.util.Arrays; import static ai.rapids.cudf.TableTest.assertColumnsAreEqual; @@ -244,6 +245,22 @@ public void testString() { } } + @Test + public void testUTF8String() { + try (Scalar s = Scalar.fromUTF8String("TEST".getBytes(StandardCharsets.UTF_8))) { + assertEquals(DType.STRING, s.getType()); + assertTrue(s.isValid()); + assertEquals("TEST", s.getJavaString()); + assertArrayEquals(new byte[]{'T', 'E', 'S', 'T'}, s.getUTF8()); + } + try (Scalar s = Scalar.fromUTF8String("".getBytes(StandardCharsets.UTF_8))) { + assertEquals(DType.STRING, s.getType()); + assertTrue(s.isValid()); + assertEquals("", s.getJavaString()); + assertArrayEquals(new byte[]{}, s.getUTF8()); + } + } + @Test public void testList() { // list of int From 0ebf7e6105893056129760b15ccca318d387d9ac Mon Sep 17 00:00:00 2001 From: Rong Ou Date: Thu, 20 May 2021 09:07:36 -0700 Subject: [PATCH 04/27] support RMM aligned resource adapter in JNI (#8266) Depends on https://github.com/rapidsai/rmm/pull/768. 
Authors: - Rong Ou (https://github.com/rongou) Approvers: - Jason Lowe (https://github.com/jlowe) URL: https://github.com/rapidsai/cudf/pull/8266 --- java/src/main/java/ai/rapids/cudf/Rmm.java | 38 ++++++++++++++++++++-- java/src/main/native/src/RmmJni.cpp | 30 +++++++++-------- 2 files changed, 51 insertions(+), 17 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/Rmm.java b/java/src/main/java/ai/rapids/cudf/Rmm.java index 8d63d2aeefc..97813182deb 100755 --- a/java/src/main/java/ai/rapids/cudf/Rmm.java +++ b/java/src/main/java/ai/rapids/cudf/Rmm.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -173,6 +173,36 @@ public static synchronized void initialize(int allocationMode, LogConf logConf, */ public static synchronized void initialize(int allocationMode, LogConf logConf, long poolSize, long maxPoolSize) throws RmmException { + initialize(allocationMode, logConf, poolSize, maxPoolSize, 0, 0); + } + + /** + * Initialize memory manager state and storage. This will always initialize + * the CUDA context for the calling thread if it is not already set. The + * caller is responsible for setting the desired CUDA device prior to this + * call if a specific device is already set. + *

NOTE: All cudf methods will set the chosen CUDA device in the CUDA + * context of the calling thread after this returns. + * @param allocationMode Allocation strategy to use. Bit set using + * {@link RmmAllocationMode#CUDA_DEFAULT}, + * {@link RmmAllocationMode#POOL}, + * {@link RmmAllocationMode#ARENA} and + * {@link RmmAllocationMode#CUDA_MANAGED_MEMORY} + * @param logConf How to do logging or null if you don't want to + * @param poolSize The initial pool size in bytes + * @param maxPoolSize The maximum size the pool is allowed to grow. If the specified value + * is <= 0 then the pool size will not be artificially limited. + * @param allocationAlignment The size to which allocations are aligned. + * @param alignmentThreshold Only allocations with size larger than or equal to this threshold + * are aligned with `allocationAlignment`. + * @throws IllegalStateException if RMM has already been initialized + * @throws IllegalArgumentException if a max pool size is specified but the allocation mode + * is not {@link RmmAllocationMode#POOL} or + * {@link RmmAllocationMode#ARENA}, or the maximum pool size is + * below the initial size. 
+ */ + public static synchronized void initialize(int allocationMode, LogConf logConf, long poolSize, + long maxPoolSize, long allocationAlignment, long alignmentThreshold) throws RmmException { if (initialized) { throw new IllegalStateException("RMM is already initialized"); } @@ -195,7 +225,8 @@ public static synchronized void initialize(int allocationMode, LogConf logConf, loc = logConf.loc; } - initializeInternal(allocationMode, loc.internalId, path, poolSize, maxPoolSize); + initializeInternal(allocationMode, loc.internalId, path, poolSize, maxPoolSize, + allocationAlignment, alignmentThreshold); MemoryCleaner.setDefaultGpu(Cuda.getDevice()); initialized = true; } @@ -241,7 +272,8 @@ private static long[] sortThresholds(long[] thresholds) { } private static native void initializeInternal(int allocationMode, int logTo, String path, - long poolSize, long maxPoolSize) throws RmmException; + long poolSize, long maxPoolSize, long allocationAlignment, long alignmentThreshold) + throws RmmException; /** * Shut down any initialized RMM instance. This should be used very rarely. It does not need to diff --git a/java/src/main/native/src/RmmJni.cpp b/java/src/main/native/src/RmmJni.cpp index 7f11e19fce8..e604fc7dd46 100644 --- a/java/src/main/native/src/RmmJni.cpp +++ b/java/src/main/native/src/RmmJni.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -20,6 +20,7 @@ #include #include +#include #include #include #include @@ -332,7 +333,9 @@ extern "C" { JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_initializeInternal(JNIEnv *env, jclass clazz, jint allocation_mode, jint log_to, jstring jpath, jlong pool_size, - jlong max_pool_size) { + jlong max_pool_size, + jlong allocation_alignment, + jlong alignment_threshold) { try { // make sure the CUDA device is setup in the context cudaError_t cuda_status = cudaFree(0); @@ -351,13 +354,9 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_initializeInternal(JNIEnv *env, j if (use_managed_mem) { Initialized_resource = rmm::mr::make_owning_wrapper( std::make_shared(), pool_size, pool_limit); - auto wrapped = make_tracking_adaptor(Initialized_resource.get(), RMM_ALLOC_SIZE_ALIGNMENT); - Tracking_memory_resource.reset(wrapped); } else { Initialized_resource = rmm::mr::make_owning_wrapper( std::make_shared(), pool_size, pool_limit); - auto wrapped = make_tracking_adaptor(Initialized_resource.get(), RMM_ALLOC_SIZE_ALIGNMENT); - Tracking_memory_resource.reset(wrapped); } } else if (use_arena_alloc) { std::size_t pool_limit = (max_pool_size > 0) ? 
static_cast(max_pool_size) : @@ -365,23 +364,26 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_initializeInternal(JNIEnv *env, j if (use_managed_mem) { Initialized_resource = rmm::mr::make_owning_wrapper( std::make_shared(), pool_size, pool_limit); - auto wrapped = make_tracking_adaptor(Initialized_resource.get(), RMM_ALLOC_SIZE_ALIGNMENT); - Tracking_memory_resource.reset(wrapped); } else { Initialized_resource = rmm::mr::make_owning_wrapper( std::make_shared(), pool_size, pool_limit); - auto wrapped = make_tracking_adaptor(Initialized_resource.get(), RMM_ALLOC_SIZE_ALIGNMENT); - Tracking_memory_resource.reset(wrapped); } } else if (use_managed_mem) { Initialized_resource = std::make_shared(); - auto wrapped = make_tracking_adaptor(Initialized_resource.get(), RMM_ALLOC_SIZE_ALIGNMENT); - Tracking_memory_resource.reset(wrapped); } else { Initialized_resource = std::make_shared(); - auto wrapped = make_tracking_adaptor(Initialized_resource.get(), RMM_ALLOC_SIZE_ALIGNMENT); - Tracking_memory_resource.reset(wrapped); } + + if (allocation_alignment != 0) { + Initialized_resource = rmm::mr::make_owning_wrapper( + Initialized_resource, allocation_alignment, alignment_threshold); + } + + auto wrapped = make_tracking_adaptor( + Initialized_resource.get(), + std::max(RMM_ALLOC_SIZE_ALIGNMENT, static_cast(allocation_alignment))); + Tracking_memory_resource.reset(wrapped); + auto resource = Tracking_memory_resource.get(); rmm::mr::set_current_device_resource(resource); From deee1f62d1f1887b625ddd6b8a7cbd393f04c264 Mon Sep 17 00:00:00 2001 From: AJ Schmidt Date: Thu, 20 May 2021 15:52:28 -0400 Subject: [PATCH 05/27] update changelog (#8297) --- CHANGELOG.md | 318 ++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 315 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 21ab8ed3274..a6f43296419 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,318 @@ -# cuDF 0.19.0 (Date TBD) - -Please see 
https://github.com/rapidsai/cudf/releases/tag/v0.19.0a for the latest changes to this development branch. +# cuDF 0.19.0 (21 Apr 2021) + +## 🚨 Breaking Changes + +- Allow hash_partition to take a seed value ([#7771](https://github.com/rapidsai/cudf/pull/7771)) [@magnatelee](https://github.com/magnatelee) +- Allow merging index column with data column using keyword "on" ([#7736](https://github.com/rapidsai/cudf/pull/7736)) [@skirui-source](https://github.com/skirui-source) +- Change JNI API to avoid loading native dependencies when creating sort order classes. ([#7729](https://github.com/rapidsai/cudf/pull/7729)) [@revans2](https://github.com/revans2) +- Replace device_vector with device_uvector in null_mask ([#7715](https://github.com/rapidsai/cudf/pull/7715)) [@harrism](https://github.com/harrism) +- Don't identify decimals as strings. ([#7710](https://github.com/rapidsai/cudf/pull/7710)) [@vyasr](https://github.com/vyasr) +- Fix Java Parquet write after writer API changes ([#7655](https://github.com/rapidsai/cudf/pull/7655)) [@revans2](https://github.com/revans2) +- Convert cudf::concatenate APIs to use spans and device_uvector ([#7621](https://github.com/rapidsai/cudf/pull/7621)) [@harrism](https://github.com/harrism) +- Update missing docstring examples in python public APIs ([#7546](https://github.com/rapidsai/cudf/pull/7546)) [@galipremsagar](https://github.com/galipremsagar) +- Remove unneeded step parameter from strings::detail::copy_slice ([#7525](https://github.com/rapidsai/cudf/pull/7525)) [@davidwendt](https://github.com/davidwendt) +- Rename ARROW_STATIC_LIB because it conflicts with one in FindArrow.cmake ([#7518](https://github.com/rapidsai/cudf/pull/7518)) [@trxcllnt](https://github.com/trxcllnt) +- Match Pandas logic for comparing two objects with nulls ([#7490](https://github.com/rapidsai/cudf/pull/7490)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Add struct support to parquet writer 
([#7461](https://github.com/rapidsai/cudf/pull/7461)) [@devavret](https://github.com/devavret) +- Join APIs that return gathermaps ([#7454](https://github.com/rapidsai/cudf/pull/7454)) [@shwina](https://github.com/shwina) +- `fixed_point` + `cudf::binary_operation` API Changes ([#7435](https://github.com/rapidsai/cudf/pull/7435)) [@codereport](https://github.com/codereport) +- Fix BUG: Exception when PYTHONOPTIMIZE=2 ([#7434](https://github.com/rapidsai/cudf/pull/7434)) [@skirui-source](https://github.com/skirui-source) +- Change nvtext::load_vocabulary_file to return a unique ptr ([#7424](https://github.com/rapidsai/cudf/pull/7424)) [@davidwendt](https://github.com/davidwendt) +- Refactor strings column factories ([#7397](https://github.com/rapidsai/cudf/pull/7397)) [@harrism](https://github.com/harrism) +- Use CMAKE_CUDA_ARCHITECTURES ([#7391](https://github.com/rapidsai/cudf/pull/7391)) [@robertmaynard](https://github.com/robertmaynard) +- Upgrade pandas to 1.2 ([#7375](https://github.com/rapidsai/cudf/pull/7375)) [@galipremsagar](https://github.com/galipremsagar) +- Rename `logical_cast` to `bit_cast` and allow additional conversions ([#7373](https://github.com/rapidsai/cudf/pull/7373)) [@ttnghia](https://github.com/ttnghia) +- Rework libcudf CMakeLists.txt to export targets for CPM ([#7107](https://github.com/rapidsai/cudf/pull/7107)) [@trxcllnt](https://github.com/trxcllnt) + +## 🐛 Bug Fixes + +- Fix a `NameError` in meta dispatch API ([#7996](https://github.com/rapidsai/cudf/pull/7996)) [@galipremsagar](https://github.com/galipremsagar) +- Reindex in `DataFrame.__setitem__` ([#7957](https://github.com/rapidsai/cudf/pull/7957)) [@galipremsagar](https://github.com/galipremsagar) +- jitify direct-to-cubin compilation and caching. 
([#7919](https://github.com/rapidsai/cudf/pull/7919)) [@cwharris](https://github.com/cwharris) +- Use dynamic cudart for nvcomp in java build ([#7896](https://github.com/rapidsai/cudf/pull/7896)) [@abellina](https://github.com/abellina) +- fix "incompatible redefinition" warnings ([#7894](https://github.com/rapidsai/cudf/pull/7894)) [@cwharris](https://github.com/cwharris) +- cudf consistently specifies the cuda runtime ([#7887](https://github.com/rapidsai/cudf/pull/7887)) [@robertmaynard](https://github.com/robertmaynard) +- disable verbose output for jitify_preprocess ([#7886](https://github.com/rapidsai/cudf/pull/7886)) [@cwharris](https://github.com/cwharris) +- CMake jit_preprocess_files function only runs when needed ([#7872](https://github.com/rapidsai/cudf/pull/7872)) [@robertmaynard](https://github.com/robertmaynard) +- Push DeviceScalar construction into cython for list.contains ([#7864](https://github.com/rapidsai/cudf/pull/7864)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- cudf now sets an install rpath of $ORIGIN ([#7863](https://github.com/rapidsai/cudf/pull/7863)) [@robertmaynard](https://github.com/robertmaynard) +- Don't install Thrust examples, tests, docs, and python files ([#7811](https://github.com/rapidsai/cudf/pull/7811)) [@robertmaynard](https://github.com/robertmaynard) +- Sort by index in groupby tests more consistently ([#7802](https://github.com/rapidsai/cudf/pull/7802)) [@shwina](https://github.com/shwina) +- Revert "Update conda recipes pinning of repo dependencies (#7743)" ([#7793](https://github.com/rapidsai/cudf/pull/7793)) [@raydouglass](https://github.com/raydouglass) +- Add decimal column handling in copy_type_metadata ([#7788](https://github.com/rapidsai/cudf/pull/7788)) [@shwina](https://github.com/shwina) +- Add column names validation in parquet writer ([#7786](https://github.com/rapidsai/cudf/pull/7786)) [@galipremsagar](https://github.com/galipremsagar) +- Fix Java explode outer unit tests 
([#7782](https://github.com/rapidsai/cudf/pull/7782)) [@jlowe](https://github.com/jlowe) +- Fix compiler warning about non-POD types passed through ellipsis ([#7781](https://github.com/rapidsai/cudf/pull/7781)) [@jrhemstad](https://github.com/jrhemstad) +- User resource fix for replace_nulls ([#7769](https://github.com/rapidsai/cudf/pull/7769)) [@magnatelee](https://github.com/magnatelee) +- Fix type dispatch for columnar replace_nulls ([#7768](https://github.com/rapidsai/cudf/pull/7768)) [@jlowe](https://github.com/jlowe) +- Add `ignore_order` parameter to dask-cudf concat dispatch ([#7765](https://github.com/rapidsai/cudf/pull/7765)) [@galipremsagar](https://github.com/galipremsagar) +- Fix slicing and arrow representations of decimal columns ([#7755](https://github.com/rapidsai/cudf/pull/7755)) [@vyasr](https://github.com/vyasr) +- Fixing issue with explode_outer position not nulling position entries of null rows ([#7754](https://github.com/rapidsai/cudf/pull/7754)) [@hyperbolic2346](https://github.com/hyperbolic2346) +- Implement scatter for struct columns ([#7752](https://github.com/rapidsai/cudf/pull/7752)) [@ttnghia](https://github.com/ttnghia) +- Fix data corruption in string columns ([#7746](https://github.com/rapidsai/cudf/pull/7746)) [@galipremsagar](https://github.com/galipremsagar) +- Fix string length in stripe dictionary building ([#7744](https://github.com/rapidsai/cudf/pull/7744)) [@kaatish](https://github.com/kaatish) +- Update conda recipes pinning of repo dependencies ([#7743](https://github.com/rapidsai/cudf/pull/7743)) [@mike-wendt](https://github.com/mike-wendt) +- Enable dask dispatch to cuDF's `is_categorical_dtype` for cuDF objects ([#7740](https://github.com/rapidsai/cudf/pull/7740)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Fix dictionary size computation in ORC writer ([#7737](https://github.com/rapidsai/cudf/pull/7737)) [@vuule](https://github.com/vuule) +- Fix `cudf::cast` overflow for `decimal64` to `int32_t` or 
smaller in certain cases ([#7733](https://github.com/rapidsai/cudf/pull/7733)) [@codereport](https://github.com/codereport) +- Change JNI API to avoid loading native dependencies when creating sort order classes. ([#7729](https://github.com/rapidsai/cudf/pull/7729)) [@revans2](https://github.com/revans2) +- Disable column_view data accessors for unsupported types ([#7725](https://github.com/rapidsai/cudf/pull/7725)) [@jrhemstad](https://github.com/jrhemstad) +- Materialize `RangeIndex` when `index=True` in parquet writer ([#7711](https://github.com/rapidsai/cudf/pull/7711)) [@galipremsagar](https://github.com/galipremsagar) +- Don't identify decimals as strings. ([#7710](https://github.com/rapidsai/cudf/pull/7710)) [@vyasr](https://github.com/vyasr) +- Fix return type of `DataFrame.argsort` ([#7706](https://github.com/rapidsai/cudf/pull/7706)) [@galipremsagar](https://github.com/galipremsagar) +- Fix/correct cudf installed package requirements ([#7688](https://github.com/rapidsai/cudf/pull/7688)) [@robertmaynard](https://github.com/robertmaynard) +- Fix SparkMurmurHash3_32 hash inconsistencies with Apache Spark ([#7672](https://github.com/rapidsai/cudf/pull/7672)) [@jlowe](https://github.com/jlowe) +- Fix ORC reader issue with reading empty string columns ([#7656](https://github.com/rapidsai/cudf/pull/7656)) [@rgsl888prabhu](https://github.com/rgsl888prabhu) +- Fix Java Parquet write after writer API changes ([#7655](https://github.com/rapidsai/cudf/pull/7655)) [@revans2](https://github.com/revans2) +- Fixing empty null lists throwing explode_outer for a loop. 
([#7649](https://github.com/rapidsai/cudf/pull/7649)) [@hyperbolic2346](https://github.com/hyperbolic2346) +- Fix internal compiler error during JNI Docker build ([#7645](https://github.com/rapidsai/cudf/pull/7645)) [@jlowe](https://github.com/jlowe) +- Fix Debug build break with device_uvectors in grouped_rolling.cu ([#7633](https://github.com/rapidsai/cudf/pull/7633)) [@mythrocks](https://github.com/mythrocks) +- Parquet reader: Fix issue when using skip_rows on non-nested columns containing nulls ([#7627](https://github.com/rapidsai/cudf/pull/7627)) [@nvdbaranec](https://github.com/nvdbaranec) +- Fix ORC reader for empty DataFrame/Table ([#7624](https://github.com/rapidsai/cudf/pull/7624)) [@rgsl888prabhu](https://github.com/rgsl888prabhu) +- Fix specifying GPU architecture in JNI build ([#7612](https://github.com/rapidsai/cudf/pull/7612)) [@jlowe](https://github.com/jlowe) +- Fix ORC writer OOM issue ([#7605](https://github.com/rapidsai/cudf/pull/7605)) [@vuule](https://github.com/vuule) +- Fix 0.18 --> 0.19 automerge ([#7589](https://github.com/rapidsai/cudf/pull/7589)) [@kkraus14](https://github.com/kkraus14) +- Fix ORC issue with incorrect timestamp nanosecond values ([#7581](https://github.com/rapidsai/cudf/pull/7581)) [@vuule](https://github.com/vuule) +- Fix missing Dask imports ([#7580](https://github.com/rapidsai/cudf/pull/7580)) [@kkraus14](https://github.com/kkraus14) +- CMAKE_CUDA_ARCHITECTURES doesn't change when build-system invokes cmake ([#7579](https://github.com/rapidsai/cudf/pull/7579)) [@robertmaynard](https://github.com/robertmaynard) +- Another fix for offsets_end() iterator in lists_column_view ([#7575](https://github.com/rapidsai/cudf/pull/7575)) [@ttnghia](https://github.com/ttnghia) +- Fix ORC writer output corruption with string columns ([#7565](https://github.com/rapidsai/cudf/pull/7565)) [@vuule](https://github.com/vuule) +- Fix cudf::lists::sort_lists failing for sliced column ([#7564](https://github.com/rapidsai/cudf/pull/7564)) 
[@ttnghia](https://github.com/ttnghia) +- FIX Fix Anaconda upload args ([#7558](https://github.com/rapidsai/cudf/pull/7558)) [@dillon-cullinan](https://github.com/dillon-cullinan) +- Fix index mismatch issue in equality related APIs ([#7555](https://github.com/rapidsai/cudf/pull/7555)) [@galipremsagar](https://github.com/galipremsagar) +- FIX Revert gpuci_conda_retry on conda file output locations ([#7552](https://github.com/rapidsai/cudf/pull/7552)) [@dillon-cullinan](https://github.com/dillon-cullinan) +- Fix offset_end iterator for lists_column_view, which was not correctl… ([#7551](https://github.com/rapidsai/cudf/pull/7551)) [@ttnghia](https://github.com/ttnghia) +- Fix no such file dlpack.h error when build libcudf ([#7549](https://github.com/rapidsai/cudf/pull/7549)) [@chenrui17](https://github.com/chenrui17) +- Update missing docstring examples in python public APIs ([#7546](https://github.com/rapidsai/cudf/pull/7546)) [@galipremsagar](https://github.com/galipremsagar) +- Decimal32 Build Fix ([#7544](https://github.com/rapidsai/cudf/pull/7544)) [@razajafri](https://github.com/razajafri) +- FIX Retry conda output location ([#7540](https://github.com/rapidsai/cudf/pull/7540)) [@dillon-cullinan](https://github.com/dillon-cullinan) +- fix missing renames of dask git branches from master to main ([#7535](https://github.com/rapidsai/cudf/pull/7535)) [@kkraus14](https://github.com/kkraus14) +- Remove detail from device_span ([#7533](https://github.com/rapidsai/cudf/pull/7533)) [@rwlee](https://github.com/rwlee) +- Change dask and distributed branch to main ([#7532](https://github.com/rapidsai/cudf/pull/7532)) [@dantegd](https://github.com/dantegd) +- Update JNI build to use CUDF_USE_ARROW_STATIC ([#7526](https://github.com/rapidsai/cudf/pull/7526)) [@jlowe](https://github.com/jlowe) +- Make sure rmm::rmm CMake target is visible to cudf users ([#7524](https://github.com/rapidsai/cudf/pull/7524)) [@robertmaynard](https://github.com/robertmaynard) +- Fix 
contiguous_split not properly handling output partitions > 2 GB. ([#7515](https://github.com/rapidsai/cudf/pull/7515)) [@nvdbaranec](https://github.com/nvdbaranec) +- Change jit launch to safe_launch ([#7510](https://github.com/rapidsai/cudf/pull/7510)) [@devavret](https://github.com/devavret) +- Fix comparison between Datetime/Timedelta columns and NULL scalars ([#7504](https://github.com/rapidsai/cudf/pull/7504)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Fix off-by-one error in char-parallel string scalar replace ([#7502](https://github.com/rapidsai/cudf/pull/7502)) [@jlowe](https://github.com/jlowe) +- Fix JNI deprecation of all, put it on the wrong version before ([#7501](https://github.com/rapidsai/cudf/pull/7501)) [@revans2](https://github.com/revans2) +- Fix Series/Dataframe Mixed Arithmetic ([#7491](https://github.com/rapidsai/cudf/pull/7491)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Fix JNI build after removal of libcudf sub-libraries ([#7486](https://github.com/rapidsai/cudf/pull/7486)) [@jlowe](https://github.com/jlowe) +- Correctly compile benchmarks ([#7485](https://github.com/rapidsai/cudf/pull/7485)) [@robertmaynard](https://github.com/robertmaynard) +- Fix bool column corruption with ORC Reader ([#7483](https://github.com/rapidsai/cudf/pull/7483)) [@rgsl888prabhu](https://github.com/rgsl888prabhu) +- Fix `__repr__` for categorical dtype ([#7476](https://github.com/rapidsai/cudf/pull/7476)) [@galipremsagar](https://github.com/galipremsagar) +- Java cleaner synchronization ([#7474](https://github.com/rapidsai/cudf/pull/7474)) [@abellina](https://github.com/abellina) +- Fix java float/double parsing tests ([#7473](https://github.com/rapidsai/cudf/pull/7473)) [@revans2](https://github.com/revans2) +- Pass stream and user resource to make_default_constructed_scalar ([#7469](https://github.com/rapidsai/cudf/pull/7469)) [@magnatelee](https://github.com/magnatelee) +- Improve stability of dask_cudf.DataFrame.var and 
dask_cudf.DataFrame.std ([#7453](https://github.com/rapidsai/cudf/pull/7453)) [@rjzamora](https://github.com/rjzamora) +- Missing `device_storage_dispatch` change affecting `cudf::gather` ([#7449](https://github.com/rapidsai/cudf/pull/7449)) [@codereport](https://github.com/codereport) +- fix cuFile JNI compile errors ([#7445](https://github.com/rapidsai/cudf/pull/7445)) [@rongou](https://github.com/rongou) +- Support `Series.__setitem__` with key to a new row ([#7443](https://github.com/rapidsai/cudf/pull/7443)) [@isVoid](https://github.com/isVoid) +- Fix BUG: Exception when PYTHONOPTIMIZE=2 ([#7434](https://github.com/rapidsai/cudf/pull/7434)) [@skirui-source](https://github.com/skirui-source) +- Make inclusive scan safe for cases with leading nulls ([#7432](https://github.com/rapidsai/cudf/pull/7432)) [@magnatelee](https://github.com/magnatelee) +- Fix typo in list_device_view::pair_rep_end() ([#7423](https://github.com/rapidsai/cudf/pull/7423)) [@mythrocks](https://github.com/mythrocks) +- Fix string to double conversion and row equivalent comparison ([#7410](https://github.com/rapidsai/cudf/pull/7410)) [@ttnghia](https://github.com/ttnghia) +- Fix thrust failure when transferring data from device_vector to host_vector with vectors of size 1 ([#7382](https://github.com/rapidsai/cudf/pull/7382)) [@ttnghia](https://github.com/ttnghia) +- Fix std::exception catch-by-reference gcc9 compile error ([#7380](https://github.com/rapidsai/cudf/pull/7380)) [@davidwendt](https://github.com/davidwendt) +- Fix skiprows issue with ORC Reader ([#7359](https://github.com/rapidsai/cudf/pull/7359)) [@rgsl888prabhu](https://github.com/rgsl888prabhu) +- fix Arrow CMake file ([#7358](https://github.com/rapidsai/cudf/pull/7358)) [@rongou](https://github.com/rongou) +- Fix lists::contains() for NaN and Decimals ([#7349](https://github.com/rapidsai/cudf/pull/7349)) [@mythrocks](https://github.com/mythrocks) +- Handle cupy array in `Dataframe.__setitem__` 
([#7340](https://github.com/rapidsai/cudf/pull/7340)) [@galipremsagar](https://github.com/galipremsagar) +- Fix invalid-device-fn error in cudf::strings::replace_re with multiple regex's ([#7336](https://github.com/rapidsai/cudf/pull/7336)) [@davidwendt](https://github.com/davidwendt) +- FIX Add codecov upload block to gpu script ([#6860](https://github.com/rapidsai/cudf/pull/6860)) [@dillon-cullinan](https://github.com/dillon-cullinan) + +## 📖 Documentation + +- Fix join API doxygen ([#7890](https://github.com/rapidsai/cudf/pull/7890)) [@shwina](https://github.com/shwina) +- Add Resources to README. ([#7697](https://github.com/rapidsai/cudf/pull/7697)) [@bdice](https://github.com/bdice) +- Add `isin` examples in Docstring ([#7479](https://github.com/rapidsai/cudf/pull/7479)) [@galipremsagar](https://github.com/galipremsagar) +- Resolving unlinked type shorthands in cudf doc ([#7416](https://github.com/rapidsai/cudf/pull/7416)) [@isVoid](https://github.com/isVoid) +- Fix typo in regex.md doc page ([#7363](https://github.com/rapidsai/cudf/pull/7363)) [@davidwendt](https://github.com/davidwendt) +- Fix incorrect strings_column_view::chars_size documentation ([#7360](https://github.com/rapidsai/cudf/pull/7360)) [@jlowe](https://github.com/jlowe) + +## 🚀 New Features + +- Enable basic reductions for decimal columns ([#7776](https://github.com/rapidsai/cudf/pull/7776)) [@ChrisJar](https://github.com/ChrisJar) +- Enable join on decimal columns ([#7764](https://github.com/rapidsai/cudf/pull/7764)) [@ChrisJar](https://github.com/ChrisJar) +- Allow merging index column with data column using keyword "on" ([#7736](https://github.com/rapidsai/cudf/pull/7736)) [@skirui-source](https://github.com/skirui-source) +- Implement DecimalColumn + Scalar and add cudf.Scalars of Decimal64Dtype ([#7732](https://github.com/rapidsai/cudf/pull/7732)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Add support for `unique` groupby aggregation 
([#7726](https://github.com/rapidsai/cudf/pull/7726)) [@shwina](https://github.com/shwina) +- Expose libcudf's label_bins function to cudf ([#7724](https://github.com/rapidsai/cudf/pull/7724)) [@vyasr](https://github.com/vyasr) +- Adding support for equi-join on struct ([#7720](https://github.com/rapidsai/cudf/pull/7720)) [@hyperbolic2346](https://github.com/hyperbolic2346) +- Add decimal column comparison operations ([#7716](https://github.com/rapidsai/cudf/pull/7716)) [@isVoid](https://github.com/isVoid) +- Implement scan operations for decimal columns ([#7707](https://github.com/rapidsai/cudf/pull/7707)) [@ChrisJar](https://github.com/ChrisJar) +- Enable typecasting between decimal and int ([#7691](https://github.com/rapidsai/cudf/pull/7691)) [@ChrisJar](https://github.com/ChrisJar) +- Enable decimal support in parquet writer ([#7673](https://github.com/rapidsai/cudf/pull/7673)) [@devavret](https://github.com/devavret) +- Adds `list.unique` API ([#7664](https://github.com/rapidsai/cudf/pull/7664)) [@isVoid](https://github.com/isVoid) +- Fix NaN handling in drop_list_duplicates ([#7662](https://github.com/rapidsai/cudf/pull/7662)) [@ttnghia](https://github.com/ttnghia) +- Add `lists.sort_values` API ([#7657](https://github.com/rapidsai/cudf/pull/7657)) [@isVoid](https://github.com/isVoid) +- Add is_integer API that can check for the validity of a string-to-integer conversion ([#7642](https://github.com/rapidsai/cudf/pull/7642)) [@ttnghia](https://github.com/ttnghia) +- Adds `explode` API ([#7607](https://github.com/rapidsai/cudf/pull/7607)) [@isVoid](https://github.com/isVoid) +- Adds `list.take`, python binding for `cudf::lists::segmented_gather` ([#7591](https://github.com/rapidsai/cudf/pull/7591)) [@isVoid](https://github.com/isVoid) +- Implement cudf::label_bins() ([#7554](https://github.com/rapidsai/cudf/pull/7554)) [@vyasr](https://github.com/vyasr) +- Add Python bindings for `lists::contains` ([#7547](https://github.com/rapidsai/cudf/pull/7547)) 
[@skirui-source](https://github.com/skirui-source) +- cudf::row_bit_count() support. ([#7534](https://github.com/rapidsai/cudf/pull/7534)) [@nvdbaranec](https://github.com/nvdbaranec) +- Implement drop_list_duplicates ([#7528](https://github.com/rapidsai/cudf/pull/7528)) [@ttnghia](https://github.com/ttnghia) +- Add Python bindings for `lists::extract_lists_element` ([#7505](https://github.com/rapidsai/cudf/pull/7505)) [@skirui-source](https://github.com/skirui-source) +- Add explode_outer and explode_outer_position ([#7499](https://github.com/rapidsai/cudf/pull/7499)) [@hyperbolic2346](https://github.com/hyperbolic2346) +- Match Pandas logic for comparing two objects with nulls ([#7490](https://github.com/rapidsai/cudf/pull/7490)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Add struct support to parquet writer ([#7461](https://github.com/rapidsai/cudf/pull/7461)) [@devavret](https://github.com/devavret) +- Enable type conversion from float to decimal type ([#7450](https://github.com/rapidsai/cudf/pull/7450)) [@ChrisJar](https://github.com/ChrisJar) +- Add cython for converting strings/fixed-point functions ([#7429](https://github.com/rapidsai/cudf/pull/7429)) [@davidwendt](https://github.com/davidwendt) +- Add struct column support to cudf::sort and cudf::sorted_order ([#7422](https://github.com/rapidsai/cudf/pull/7422)) [@karthikeyann](https://github.com/karthikeyann) +- Implement groupby collect_set ([#7420](https://github.com/rapidsai/cudf/pull/7420)) [@ttnghia](https://github.com/ttnghia) +- Merge branch-0.18 into branch-0.19 ([#7411](https://github.com/rapidsai/cudf/pull/7411)) [@raydouglass](https://github.com/raydouglass) +- Refactor strings column factories ([#7397](https://github.com/rapidsai/cudf/pull/7397)) [@harrism](https://github.com/harrism) +- Add groupby scan operations (sort groupby) ([#7387](https://github.com/rapidsai/cudf/pull/7387)) [@karthikeyann](https://github.com/karthikeyann) +- Add cudf::explode_position 
([#7376](https://github.com/rapidsai/cudf/pull/7376)) [@hyperbolic2346](https://github.com/hyperbolic2346) +- Add string conversion to/from decimal values libcudf APIs ([#7364](https://github.com/rapidsai/cudf/pull/7364)) [@davidwendt](https://github.com/davidwendt) +- Add groupby SUM_OF_SQUARES support ([#7362](https://github.com/rapidsai/cudf/pull/7362)) [@karthikeyann](https://github.com/karthikeyann) +- Add `Series.drop` api ([#7304](https://github.com/rapidsai/cudf/pull/7304)) [@isVoid](https://github.com/isVoid) +- get_json_object() implementation ([#7286](https://github.com/rapidsai/cudf/pull/7286)) [@nvdbaranec](https://github.com/nvdbaranec) +- Python API for `LIstMethods.len()` ([#7283](https://github.com/rapidsai/cudf/pull/7283)) [@isVoid](https://github.com/isVoid) +- Support null_policy::EXCLUDE for COLLECT rolling aggregation ([#7264](https://github.com/rapidsai/cudf/pull/7264)) [@mythrocks](https://github.com/mythrocks) +- Add support for special tokens in nvtext::subword_tokenizer ([#7254](https://github.com/rapidsai/cudf/pull/7254)) [@davidwendt](https://github.com/davidwendt) +- Fix inplace update of data and add Series.update ([#7201](https://github.com/rapidsai/cudf/pull/7201)) [@galipremsagar](https://github.com/galipremsagar) +- Implement `cudf::group_by` (hash) for `decimal32` and `decimal64` ([#7190](https://github.com/rapidsai/cudf/pull/7190)) [@codereport](https://github.com/codereport) +- Adding support to specify "level" parameter for `Dataframe.rename` ([#7135](https://github.com/rapidsai/cudf/pull/7135)) [@skirui-source](https://github.com/skirui-source) + +## 🛠️ Improvements + +- fix GDS include path for version 0.95 ([#7877](https://github.com/rapidsai/cudf/pull/7877)) [@rongou](https://github.com/rongou) +- Update `dask` + `distributed` to `2021.4.0` ([#7858](https://github.com/rapidsai/cudf/pull/7858)) [@jakirkham](https://github.com/jakirkham) +- Add ability to extract include dirs from `CUDF_HOME` 
([#7848](https://github.com/rapidsai/cudf/pull/7848)) [@galipremsagar](https://github.com/galipremsagar) +- Add USE_GDS as an option in build script ([#7833](https://github.com/rapidsai/cudf/pull/7833)) [@pxLi](https://github.com/pxLi) +- add an allocate method with stream in java DeviceMemoryBuffer ([#7826](https://github.com/rapidsai/cudf/pull/7826)) [@rongou](https://github.com/rongou) +- Constrain dask and distributed versions to 2021.3.1 ([#7825](https://github.com/rapidsai/cudf/pull/7825)) [@shwina](https://github.com/shwina) +- Revert dask versioning of concat dispatch ([#7823](https://github.com/rapidsai/cudf/pull/7823)) [@galipremsagar](https://github.com/galipremsagar) +- add copy methods in Java memory buffer ([#7791](https://github.com/rapidsai/cudf/pull/7791)) [@rongou](https://github.com/rongou) +- Update README and CONTRIBUTING for 0.19 ([#7778](https://github.com/rapidsai/cudf/pull/7778)) [@robertmaynard](https://github.com/robertmaynard) +- Allow hash_partition to take a seed value ([#7771](https://github.com/rapidsai/cudf/pull/7771)) [@magnatelee](https://github.com/magnatelee) +- Turn on NVTX by default in java build ([#7761](https://github.com/rapidsai/cudf/pull/7761)) [@tgravescs](https://github.com/tgravescs) +- Add Java bindings to join gather map APIs ([#7751](https://github.com/rapidsai/cudf/pull/7751)) [@jlowe](https://github.com/jlowe) +- Add replacements column support for Java replaceNulls ([#7750](https://github.com/rapidsai/cudf/pull/7750)) [@jlowe](https://github.com/jlowe) +- Add Java bindings for row_bit_count ([#7749](https://github.com/rapidsai/cudf/pull/7749)) [@jlowe](https://github.com/jlowe) +- Remove unused JVM array creation ([#7748](https://github.com/rapidsai/cudf/pull/7748)) [@jlowe](https://github.com/jlowe) +- Added JNI support for new is_integer ([#7739](https://github.com/rapidsai/cudf/pull/7739)) [@revans2](https://github.com/revans2) +- Create and promote library aliases in libcudf installations 
([#7734](https://github.com/rapidsai/cudf/pull/7734)) [@trxcllnt](https://github.com/trxcllnt) +- Support groupby operations for decimal dtypes ([#7731](https://github.com/rapidsai/cudf/pull/7731)) [@vyasr](https://github.com/vyasr) +- Memory map the input file only when GDS compatibility mode is not used ([#7717](https://github.com/rapidsai/cudf/pull/7717)) [@vuule](https://github.com/vuule) +- Replace device_vector with device_uvector in null_mask ([#7715](https://github.com/rapidsai/cudf/pull/7715)) [@harrism](https://github.com/harrism) +- Struct hashing support for SerialMurmur3 and SparkMurmur3 ([#7714](https://github.com/rapidsai/cudf/pull/7714)) [@jlowe](https://github.com/jlowe) +- Add gbenchmark for nvtext replace-tokens function ([#7708](https://github.com/rapidsai/cudf/pull/7708)) [@davidwendt](https://github.com/davidwendt) +- Use stream in groupby calls ([#7705](https://github.com/rapidsai/cudf/pull/7705)) [@karthikeyann](https://github.com/karthikeyann) +- Update codeowners file ([#7701](https://github.com/rapidsai/cudf/pull/7701)) [@ajschmidt8](https://github.com/ajschmidt8) +- Cleanup groupby to use host_span, device_span, device_uvector ([#7698](https://github.com/rapidsai/cudf/pull/7698)) [@karthikeyann](https://github.com/karthikeyann) +- Add gbenchmark for nvtext ngrams functions ([#7693](https://github.com/rapidsai/cudf/pull/7693)) [@davidwendt](https://github.com/davidwendt) +- Misc Python/Cython optimizations ([#7686](https://github.com/rapidsai/cudf/pull/7686)) [@shwina](https://github.com/shwina) +- Add gbenchmark for nvtext tokenize functions ([#7684](https://github.com/rapidsai/cudf/pull/7684)) [@davidwendt](https://github.com/davidwendt) +- Add column_device_view to orc writer ([#7676](https://github.com/rapidsai/cudf/pull/7676)) [@kaatish](https://github.com/kaatish) +- cudf_kafka now uses cuDF CMake export targets (CPM) ([#7674](https://github.com/rapidsai/cudf/pull/7674)) [@robertmaynard](https://github.com/robertmaynard) +- Add 
gbenchmark for nvtext normalize functions ([#7668](https://github.com/rapidsai/cudf/pull/7668)) [@davidwendt](https://github.com/davidwendt) +- Resolve unnecessary import of thrust/optional.hpp in types.hpp ([#7667](https://github.com/rapidsai/cudf/pull/7667)) [@vyasr](https://github.com/vyasr) +- Feature/optimize accessor copy ([#7660](https://github.com/rapidsai/cudf/pull/7660)) [@vyasr](https://github.com/vyasr) +- Fix `find_package(cudf)` ([#7658](https://github.com/rapidsai/cudf/pull/7658)) [@trxcllnt](https://github.com/trxcllnt) +- Work-around for gcc7 compile error on Centos7 ([#7652](https://github.com/rapidsai/cudf/pull/7652)) [@davidwendt](https://github.com/davidwendt) +- Add in JNI support for count_elements ([#7651](https://github.com/rapidsai/cudf/pull/7651)) [@revans2](https://github.com/revans2) +- Fix issues with building cudf in a non-conda environment ([#7647](https://github.com/rapidsai/cudf/pull/7647)) [@galipremsagar](https://github.com/galipremsagar) +- Refactor ConfigureCUDA to not conditionally insert compiler flags ([#7643](https://github.com/rapidsai/cudf/pull/7643)) [@robertmaynard](https://github.com/robertmaynard) +- Add gbenchmark for converting strings to/from timestamps ([#7641](https://github.com/rapidsai/cudf/pull/7641)) [@davidwendt](https://github.com/davidwendt) +- Handle constructing a `cudf.Scalar` from a `cudf.Scalar` ([#7639](https://github.com/rapidsai/cudf/pull/7639)) [@shwina](https://github.com/shwina) +- Add in JNI support for table partition ([#7637](https://github.com/rapidsai/cudf/pull/7637)) [@revans2](https://github.com/revans2) +- Add explicit fixed_point merge test ([#7635](https://github.com/rapidsai/cudf/pull/7635)) [@codereport](https://github.com/codereport) +- Add JNI support for IDENTITY hash partitioning ([#7626](https://github.com/rapidsai/cudf/pull/7626)) [@revans2](https://github.com/revans2) +- Java support on explode_outer ([#7625](https://github.com/rapidsai/cudf/pull/7625)) 
[@sperlingxx](https://github.com/sperlingxx) +- Java support of casting string from/to decimal ([#7623](https://github.com/rapidsai/cudf/pull/7623)) [@sperlingxx](https://github.com/sperlingxx) +- Convert cudf::concatenate APIs to use spans and device_uvector ([#7621](https://github.com/rapidsai/cudf/pull/7621)) [@harrism](https://github.com/harrism) +- Add gbenchmark for cudf::strings::translate function ([#7617](https://github.com/rapidsai/cudf/pull/7617)) [@davidwendt](https://github.com/davidwendt) +- Use file(COPY ) over file(INSTALL ) so cmake output is reduced ([#7616](https://github.com/rapidsai/cudf/pull/7616)) [@robertmaynard](https://github.com/robertmaynard) +- Use rmm::device_uvector in place of rmm::device_vector for ORC reader/writer and cudf::io::column_buffer ([#7614](https://github.com/rapidsai/cudf/pull/7614)) [@vuule](https://github.com/vuule) +- Refactor Java host-side buffer concatenation to expose separate steps ([#7610](https://github.com/rapidsai/cudf/pull/7610)) [@jlowe](https://github.com/jlowe) +- Add gbenchmarks for string substrings functions ([#7603](https://github.com/rapidsai/cudf/pull/7603)) [@davidwendt](https://github.com/davidwendt) +- Refactor string conversion check ([#7599](https://github.com/rapidsai/cudf/pull/7599)) [@ttnghia](https://github.com/ttnghia) +- JNI: Pass names of children struct columns to native Arrow IPC writer ([#7598](https://github.com/rapidsai/cudf/pull/7598)) [@firestarman](https://github.com/firestarman) +- Revert "ENH Fix stale GHA and prevent duplicates " ([#7595](https://github.com/rapidsai/cudf/pull/7595)) [@mike-wendt](https://github.com/mike-wendt) +- ENH Fix stale GHA and prevent duplicates ([#7594](https://github.com/rapidsai/cudf/pull/7594)) [@mike-wendt](https://github.com/mike-wendt) +- Fix auto-detecting GPU architectures ([#7593](https://github.com/rapidsai/cudf/pull/7593)) [@trxcllnt](https://github.com/trxcllnt) +- Reduce cudf library size 
([#7583](https://github.com/rapidsai/cudf/pull/7583)) [@robertmaynard](https://github.com/robertmaynard) +- Optimize cudf::make_strings_column for long strings ([#7576](https://github.com/rapidsai/cudf/pull/7576)) [@davidwendt](https://github.com/davidwendt) +- Always build and export the cudf::cudftestutil target ([#7574](https://github.com/rapidsai/cudf/pull/7574)) [@trxcllnt](https://github.com/trxcllnt) +- Eliminate literal parameters to uvector::set_element_async and device_scalar::set_value ([#7563](https://github.com/rapidsai/cudf/pull/7563)) [@harrism](https://github.com/harrism) +- Add gbenchmark for strings::concatenate ([#7560](https://github.com/rapidsai/cudf/pull/7560)) [@davidwendt](https://github.com/davidwendt) +- Update Changelog Link ([#7550](https://github.com/rapidsai/cudf/pull/7550)) [@ajschmidt8](https://github.com/ajschmidt8) +- Add gbenchmarks for strings replace regex functions ([#7541](https://github.com/rapidsai/cudf/pull/7541)) [@davidwendt](https://github.com/davidwendt) +- Add `__repr__` for Column and ColumnAccessor ([#7531](https://github.com/rapidsai/cudf/pull/7531)) [@shwina](https://github.com/shwina) +- Support Decimal DIV changes in cudf ([#7527](https://github.com/rapidsai/cudf/pull/7527)) [@razajafri](https://github.com/razajafri) +- Remove unneeded step parameter from strings::detail::copy_slice ([#7525](https://github.com/rapidsai/cudf/pull/7525)) [@davidwendt](https://github.com/davidwendt) +- Use device_uvector, device_span in sort groupby ([#7523](https://github.com/rapidsai/cudf/pull/7523)) [@karthikeyann](https://github.com/karthikeyann) +- Add gbenchmarks for strings extract function ([#7522](https://github.com/rapidsai/cudf/pull/7522)) [@davidwendt](https://github.com/davidwendt) +- Rename ARROW_STATIC_LIB because it conflicts with one in FindArrow.cmake ([#7518](https://github.com/rapidsai/cudf/pull/7518)) [@trxcllnt](https://github.com/trxcllnt) +- Reduce compile time/size for scan.cu 
([#7516](https://github.com/rapidsai/cudf/pull/7516)) [@davidwendt](https://github.com/davidwendt) +- Change device_vector to device_uvector in nvtext source files ([#7512](https://github.com/rapidsai/cudf/pull/7512)) [@davidwendt](https://github.com/davidwendt) +- Removed unneeded includes from traits.hpp ([#7509](https://github.com/rapidsai/cudf/pull/7509)) [@davidwendt](https://github.com/davidwendt) +- FIX Remove random build directory generation for ccache ([#7508](https://github.com/rapidsai/cudf/pull/7508)) [@dillon-cullinan](https://github.com/dillon-cullinan) +- xfail failing pytest in pandas 1.2.3 ([#7507](https://github.com/rapidsai/cudf/pull/7507)) [@galipremsagar](https://github.com/galipremsagar) +- JNI bit cast ([#7493](https://github.com/rapidsai/cudf/pull/7493)) [@revans2](https://github.com/revans2) +- Combine rolling window function tests ([#7480](https://github.com/rapidsai/cudf/pull/7480)) [@mythrocks](https://github.com/mythrocks) +- Prepare Changelog for Automation ([#7477](https://github.com/rapidsai/cudf/pull/7477)) [@ajschmidt8](https://github.com/ajschmidt8) +- Java support for explode position ([#7471](https://github.com/rapidsai/cudf/pull/7471)) [@sperlingxx](https://github.com/sperlingxx) +- Update 0.18 changelog entry ([#7463](https://github.com/rapidsai/cudf/pull/7463)) [@ajschmidt8](https://github.com/ajschmidt8) +- JNI: Support skipping nulls for collect aggregation ([#7457](https://github.com/rapidsai/cudf/pull/7457)) [@firestarman](https://github.com/firestarman) +- Join APIs that return gathermaps ([#7454](https://github.com/rapidsai/cudf/pull/7454)) [@shwina](https://github.com/shwina) +- Remove dependence on managed memory for multimap test ([#7451](https://github.com/rapidsai/cudf/pull/7451)) [@jrhemstad](https://github.com/jrhemstad) +- Use cuFile for Parquet IO when available ([#7444](https://github.com/rapidsai/cudf/pull/7444)) [@vuule](https://github.com/vuule) +- Statistics cleanup 
([#7439](https://github.com/rapidsai/cudf/pull/7439)) [@kaatish](https://github.com/kaatish) +- Add gbenchmarks for strings filter functions ([#7438](https://github.com/rapidsai/cudf/pull/7438)) [@davidwendt](https://github.com/davidwendt) +- `fixed_point` + `cudf::binary_operation` API Changes ([#7435](https://github.com/rapidsai/cudf/pull/7435)) [@codereport](https://github.com/codereport) +- Improve string gather performance ([#7433](https://github.com/rapidsai/cudf/pull/7433)) [@jlowe](https://github.com/jlowe) +- Don't use user resource for a temporary allocation in sort_by_key ([#7431](https://github.com/rapidsai/cudf/pull/7431)) [@magnatelee](https://github.com/magnatelee) +- Detail APIs for datetime functions ([#7430](https://github.com/rapidsai/cudf/pull/7430)) [@magnatelee](https://github.com/magnatelee) +- Replace thrust::max_element with thrust::reduce in strings findall_re ([#7428](https://github.com/rapidsai/cudf/pull/7428)) [@davidwendt](https://github.com/davidwendt) +- Add gbenchmark for strings split/split_record functions ([#7427](https://github.com/rapidsai/cudf/pull/7427)) [@davidwendt](https://github.com/davidwendt) +- Update JNI build to use CMAKE_CUDA_ARCHITECTURES ([#7425](https://github.com/rapidsai/cudf/pull/7425)) [@jlowe](https://github.com/jlowe) +- Change nvtext::load_vocabulary_file to return a unique ptr ([#7424](https://github.com/rapidsai/cudf/pull/7424)) [@davidwendt](https://github.com/davidwendt) +- Simplify type dispatch with `device_storage_dispatch` ([#7419](https://github.com/rapidsai/cudf/pull/7419)) [@codereport](https://github.com/codereport) +- Java support for casting of nested child columns ([#7417](https://github.com/rapidsai/cudf/pull/7417)) [@razajafri](https://github.com/razajafri) +- Improve scalar string replace performance for long strings ([#7415](https://github.com/rapidsai/cudf/pull/7415)) [@jlowe](https://github.com/jlowe) +- Remove unneeded temporary device vector for strings scatter specialization 
([#7409](https://github.com/rapidsai/cudf/pull/7409)) [@davidwendt](https://github.com/davidwendt) +- bitmask_or implementation with bitmask refactor ([#7406](https://github.com/rapidsai/cudf/pull/7406)) [@rwlee](https://github.com/rwlee) +- Add other cudf::strings::replace functions to current strings replace gbenchmark ([#7403](https://github.com/rapidsai/cudf/pull/7403)) [@davidwendt](https://github.com/davidwendt) +- Clean up included headers in `device_operators.cuh` ([#7401](https://github.com/rapidsai/cudf/pull/7401)) [@codereport](https://github.com/codereport) +- Move nullable index iterator to indexalator factory ([#7399](https://github.com/rapidsai/cudf/pull/7399)) [@davidwendt](https://github.com/davidwendt) +- ENH Pass ccache variables to conda recipe & use Ninja in CI ([#7398](https://github.com/rapidsai/cudf/pull/7398)) [@Ethyling](https://github.com/Ethyling) +- upgrade maven-antrun-plugin to support maven parallel builds ([#7393](https://github.com/rapidsai/cudf/pull/7393)) [@rongou](https://github.com/rongou) +- Add gbenchmark for strings find/contains functions ([#7392](https://github.com/rapidsai/cudf/pull/7392)) [@davidwendt](https://github.com/davidwendt) +- Use CMAKE_CUDA_ARCHITECTURES ([#7391](https://github.com/rapidsai/cudf/pull/7391)) [@robertmaynard](https://github.com/robertmaynard) +- Refactor libcudf strings::replace to use make_strings_children utility ([#7384](https://github.com/rapidsai/cudf/pull/7384)) [@davidwendt](https://github.com/davidwendt) +- Added in JNI support for out of core sort algorithm ([#7381](https://github.com/rapidsai/cudf/pull/7381)) [@revans2](https://github.com/revans2) +- Upgrade pandas to 1.2 ([#7375](https://github.com/rapidsai/cudf/pull/7375)) [@galipremsagar](https://github.com/galipremsagar) +- Rename `logical_cast` to `bit_cast` and allow additional conversions ([#7373](https://github.com/rapidsai/cudf/pull/7373)) [@ttnghia](https://github.com/ttnghia) +- jitify 2 support 
([#7372](https://github.com/rapidsai/cudf/pull/7372)) [@cwharris](https://github.com/cwharris) +- compile_udf: Cache PTX for similar functions ([#7371](https://github.com/rapidsai/cudf/pull/7371)) [@gmarkall](https://github.com/gmarkall) +- Add string scalar replace benchmark ([#7369](https://github.com/rapidsai/cudf/pull/7369)) [@jlowe](https://github.com/jlowe) +- Add gbenchmark for strings contains_re/count_re functions ([#7366](https://github.com/rapidsai/cudf/pull/7366)) [@davidwendt](https://github.com/davidwendt) +- Update orc reader and writer fuzz tests ([#7357](https://github.com/rapidsai/cudf/pull/7357)) [@galipremsagar](https://github.com/galipremsagar) +- Improve url_decode performance for long strings ([#7353](https://github.com/rapidsai/cudf/pull/7353)) [@jlowe](https://github.com/jlowe) +- `cudf::ast` Small Refactorings ([#7352](https://github.com/rapidsai/cudf/pull/7352)) [@codereport](https://github.com/codereport) +- Remove std::cout and print in the scatter test function EmptyListsOfNullableStrings. ([#7342](https://github.com/rapidsai/cudf/pull/7342)) [@ttnghia](https://github.com/ttnghia) +- Use `cudf::detail::make_counting_transform_iterator` ([#7338](https://github.com/rapidsai/cudf/pull/7338)) [@codereport](https://github.com/codereport) +- Change block size parameter from a global to a template param. 
([#7333](https://github.com/rapidsai/cudf/pull/7333)) [@nvdbaranec](https://github.com/nvdbaranec) +- Partial clean up of ORC writer ([#7324](https://github.com/rapidsai/cudf/pull/7324)) [@vuule](https://github.com/vuule) +- Add gbenchmark for cudf::strings::to_lower ([#7316](https://github.com/rapidsai/cudf/pull/7316)) [@davidwendt](https://github.com/davidwendt) +- Update Java bindings version to 0.19-SNAPSHOT ([#7307](https://github.com/rapidsai/cudf/pull/7307)) [@pxLi](https://github.com/pxLi) +- Move `cudf::test::make_counting_transform_iterator` to `cudf/detail/iterator.cuh` ([#7306](https://github.com/rapidsai/cudf/pull/7306)) [@codereport](https://github.com/codereport) +- Use string literals in `fixed_point` `release_assert`s ([#7303](https://github.com/rapidsai/cudf/pull/7303)) [@codereport](https://github.com/codereport) +- Fix merge conflicts for #7295 ([#7297](https://github.com/rapidsai/cudf/pull/7297)) [@ajschmidt8](https://github.com/ajschmidt8) +- Add UTF-8 chars to create_random_column<string_view> benchmark utility ([#7292](https://github.com/rapidsai/cudf/pull/7292)) [@davidwendt](https://github.com/davidwendt) +- Abstracting block reduce and block scan from cuIO kernels with `cub` apis ([#7278](https://github.com/rapidsai/cudf/pull/7278)) [@rgsl888prabhu](https://github.com/rgsl888prabhu) +- Build.sh use cmake --build to drive build system invocation ([#7270](https://github.com/rapidsai/cudf/pull/7270)) [@robertmaynard](https://github.com/robertmaynard) +- Refactor dictionary support for reductions any/all ([#7242](https://github.com/rapidsai/cudf/pull/7242)) [@davidwendt](https://github.com/davidwendt) +- Replace stream.value() with stream for stream_view args ([#7236](https://github.com/rapidsai/cudf/pull/7236)) [@karthikeyann](https://github.com/karthikeyann) +- Interval index and interval_range ([#7182](https://github.com/rapidsai/cudf/pull/7182)) [@marlenezw](https://github.com/marlenezw) +- avro reader integration tests 
([#7156](https://github.com/rapidsai/cudf/pull/7156)) [@cwharris](https://github.com/cwharris) +- Rework libcudf CMakeLists.txt to export targets for CPM ([#7107](https://github.com/rapidsai/cudf/pull/7107)) [@trxcllnt](https://github.com/trxcllnt) +- Adding Interval Dtype ([#6984](https://github.com/rapidsai/cudf/pull/6984)) [@marlenezw](https://github.com/marlenezw) +- Cleaning up `for` loops with `make_(counting_)transform_iterator` ([#6546](https://github.com/rapidsai/cudf/pull/6546)) [@codereport](https://github.com/codereport) # cuDF 0.18.0 (24 Feb 2021) From 74270491acf3becffd9e4937eefe88afaf0047be Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 20 May 2021 12:54:37 -0700 Subject: [PATCH 06/27] Remove abc inheritance from Serializable (#8254) Currently the Serializable class provides `serialize` and `deserialize` as `abstractmethod`s via the mechanisms afforded by inheritance from `abc.ABC`. Since this class is purely internal to `cudf` and is not describing an abstract interface in a manner useful to consumers of our code, the benefits of the abstract base class concept are outweighed by the performance and maintenance costs. In particular, `isinstance` checks on subclasses of `abc.ABC` are much more expensive than for normal classes (due to an expensive implementation of `__instancecheck__`), and (for better or worse) our code base currently makes use of these checks extensively. In addition, in certain places we can benefit from the use of custom metaclasses in `cudf`, but their usage becomes more cumbersome with `ABC` because metaclasses then also have to inherit from `ABCMeta` (which brings along any associated complexities). This PR removes that inheritance, replacing it with a much simpler approach that simply implements `serialize` and `deserialize` as raising `NotImplementedError`. 
Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Ashwin Srinath (https://github.com/shwina) - https://github.com/brandon-b-miller URL: https://github.com/rapidsai/cudf/pull/8254 --- python/cudf/cudf/core/abc.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/core/abc.py b/python/cudf/cudf/core/abc.py index 0439f0d24b8..d3da544f8b5 100644 --- a/python/cudf/cudf/core/abc.py +++ b/python/cudf/cudf/core/abc.py @@ -1,9 +1,7 @@ # Copyright (c) 2020-2021, NVIDIA CORPORATION. """Common abstract base classes for cudf.""" -import abc import sys -from abc import abstractmethod import rmm @@ -18,7 +16,7 @@ import pickle # type: ignore -class Serializable(abc.ABC): +class Serializable: """A serializable object composed of device memory buffers. This base class defines a standard serialization protocol for objects @@ -32,7 +30,6 @@ class Serializable(abc.ABC): latter converts back from that representation into an equivalent object. """ - @abstractmethod def serialize(self): """Generate an equivalent serializable representation of an object. @@ -53,10 +50,11 @@ def serialize(self): :meta private: """ - pass + raise NotImplementedError( + "Subclasses of Serializable must implement serialize" + ) @classmethod - @abstractmethod def deserialize(cls, header, frames): """Generate an object from a serialized representation. @@ -80,7 +78,9 @@ class can be constructed from a serialized representation generalized :meta private: """ - pass + raise NotImplementedError( + "Subclasses of Serializable must implement deserialize" + ) def device_serialize(self): """Serialize data and metadata associated with device memory. From 944e93210565940516fda6372650b6316fc684ac Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 20 May 2021 13:57:47 -0600 Subject: [PATCH 07/27] Implement `lists::concatenate_list_elements` (#8231) This PR implements `lists::concatenate_list_elements` for list type. 
Given a lists column in which each row is a list of lists, the output column is generated by concatenating all lists in the same row into a single list. Example: ``` l = [ [{1, 2}, {3, 4}, {5}], [{6}, {}, {7, 8, 9}] ] r = lists::concatenate_list_elements(l); r is [ {1, 2, 3, 4, 5}, {6, 7, 8, 9} ] ``` This closes #8164. In addition, `lists::concatenate_rows` is rewritten using `lists::interleave_columns` followed by `lists::concatenate_list_elements`, which is significantly shorter. Authors: - Nghia Truong (https://github.com/ttnghia) Approvers: - Robert Maynard (https://github.com/robertmaynard) - Jason Lowe (https://github.com/jlowe) - AJ Schmidt (https://github.com/ajschmidt8) - GALI PREM SAGAR (https://github.com/galipremsagar) - Devavret Makkar (https://github.com/devavret) URL: https://github.com/rapidsai/cudf/pull/8231 --- conda/recipes/libcudf/meta.yaml | 3 +- cpp/CMakeLists.txt | 3 +- .../{concatenate_rows.hpp => combine.hpp} | 37 +- cpp/include/cudf/lists/detail/combine.hpp | 49 ++ cpp/include/doxygen_groups.h | 2 +- .../combine/concatenate_list_elements.cu | 264 ++++++++++ cpp/src/lists/combine/concatenate_rows.cu | 105 ++++ cpp/src/lists/concatenate_rows.cu | 441 ---------------- cpp/tests/CMakeLists.txt | 3 +- .../concatenate_list_elements_tests.cpp | 496 ++++++++++++++++++ .../{ => combine}/concatenate_rows_tests.cpp | 2 +- java/src/main/native/src/ColumnVectorJni.cpp | 2 +- .../{concatenate_rows.pxd => combine.pxd} | 2 +- python/cudf/cudf/_lib/lists.pyx | 2 +- 14 files changed, 959 insertions(+), 452 deletions(-) rename cpp/include/cudf/lists/{concatenate_rows.hpp => combine.hpp} (57%) create mode 100644 cpp/include/cudf/lists/detail/combine.hpp create mode 100644 cpp/src/lists/combine/concatenate_list_elements.cu create mode 100644 cpp/src/lists/combine/concatenate_rows.cu delete mode 100644 cpp/src/lists/concatenate_rows.cu create mode 100644 cpp/tests/lists/combine/concatenate_list_elements_tests.cpp rename cpp/tests/lists/{ => 
combine}/concatenate_rows_tests.cpp (99%) rename python/cudf/cudf/_lib/cpp/lists/{concatenate_rows.pxd => combine.pxd} (83%) diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 0fcf62a2606..d42daf3194c 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -133,13 +133,14 @@ test: - test -f $PREFIX/include/cudf/io/types.hpp - test -f $PREFIX/include/cudf/ipc.hpp - test -f $PREFIX/include/cudf/join.hpp + - test -f $PREFIX/include/cudf/lists/detail/combine.hpp - test -f $PREFIX/include/cudf/lists/detail/concatenate.hpp - test -f $PREFIX/include/cudf/lists/detail/copying.hpp - test -f $PREFIX/include/cudf/lists/lists_column_factories.hpp - test -f $PREFIX/include/cudf/lists/detail/drop_list_duplicates.hpp - test -f $PREFIX/include/cudf/lists/detail/interleave_columns.hpp - test -f $PREFIX/include/cudf/lists/detail/sorting.hpp - - test -f $PREFIX/include/cudf/lists/concatenate_rows.hpp + - test -f $PREFIX/include/cudf/lists/combine.hpp - test -f $PREFIX/include/cudf/lists/count_elements.hpp - test -f $PREFIX/include/cudf/lists/explode.hpp - test -f $PREFIX/include/cudf/lists/drop_list_duplicates.hpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index abad4d7bbca..af6f60b031d 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -266,7 +266,8 @@ add_library(cudf src/join/join.cu src/join/semi_join.cu src/lists/contains.cu - src/lists/concatenate_rows.cu + src/lists/combine/concatenate_list_elements.cu + src/lists/combine/concatenate_rows.cu src/lists/copying/concatenate.cu src/lists/copying/copying.cu src/lists/copying/gather.cu diff --git a/cpp/include/cudf/lists/concatenate_rows.hpp b/cpp/include/cudf/lists/combine.hpp similarity index 57% rename from cpp/include/cudf/lists/concatenate_rows.hpp rename to cpp/include/cudf/lists/combine.hpp index 1d93de418f8..a9407ed57ca 100644 --- a/cpp/include/cudf/lists/concatenate_rows.hpp +++ b/cpp/include/cudf/lists/combine.hpp @@ -21,7 +21,7 @@ namespace 
cudf { namespace lists { /** - * @addtogroup lists_concatenate_rows + * @addtogroup lists_combine * @{ * @file */ @@ -53,16 +53,47 @@ enum class concatenate_null_policy { IGNORE, NULLIFY_OUTPUT_ROW }; * * @param input Table of lists to be concatenated. * @param null_policy The parameter to specify whether a null list element will be ignored from - * concatenation, or any concatenation involving a null list element will result in a null list. + * concatenation, or any concatenation involving a null element will result in a null list. * @param mr Device memory resource used to allocate the returned column's device memory. * @return A new column in which each row is a list resulted from concatenating all list elements in - * the corresponding row of the input table. + * the corresponding row of the input table. */ std::unique_ptr concatenate_rows( table_view const& input, concatenate_null_policy null_policy = concatenate_null_policy::IGNORE, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Concatenating multiple lists on the same row of a lists column into a single list. + * + * Given a lists column where each row in the column is a list of lists of entries, an output lists + * column is generated by concatenating all the list elements at the same row together. If any row + * contains null list elements, the concatenation process will either ignore those null elements, or + * will simply set the entire resulting row to be a null element. + * + * @code{.pseudo} + * l = [ [{1, 2}, {3, 4}, {5}], [{6}, {}, {7, 8, 9}] ] + * r = lists::concatenate_list_elements(l); + * r is [ {1, 2, 3, 4, 5}, {6, 7, 8, 9} ] + * @endcode + * + * @throws cudf::logic_error if the input column is not at least two-level depth lists column (i.e., + * each row must be a list of list). + * @throws cudf::logic_error if the input lists column contains nested typed entries that are not + * lists. 
+ * + * @param input The lists column containing lists of list elements to concatenate. + * @param null_policy The parameter to specify whether a null list element will be ignored from + * concatenation, or any concatenation involving a null element will result in a null list. + * @param mr Device memory resource used to allocate the returned column's device memory. + * @return A new column in which each row is a list resulted from concatenating all list elements in + * the corresponding row of the input lists column. + */ +std::unique_ptr concatenate_list_elements( + column_view const& input, + concatenate_null_policy null_policy = concatenate_null_policy::IGNORE, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** @} */ // end of group } // namespace lists } // namespace cudf diff --git a/cpp/include/cudf/lists/detail/combine.hpp b/cpp/include/cudf/lists/detail/combine.hpp new file mode 100644 index 00000000000..9f28074173a --- /dev/null +++ b/cpp/include/cudf/lists/detail/combine.hpp @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include + +namespace cudf { +namespace lists { +namespace detail { +/** + * @copydoc cudf::lists::concatenate_rows + * + * @param stream CUDA stream used for device memory operations and kernel launches. 
+ */ +std::unique_ptr concatenate_rows( + table_view const& input, + concatenate_null_policy null_policy, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @copydoc cudf::lists::concatenate_list_elements + * + * @param stream CUDA stream used for device memory operations and kernel launches. + */ +std::unique_ptr concatenate_list_elements( + column_view const& input, + concatenate_null_policy null_policy, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +} // namespace detail +} // namespace lists +} // namespace cudf diff --git a/cpp/include/doxygen_groups.h b/cpp/include/doxygen_groups.h index 11b907e7f16..dda8ce87432 100644 --- a/cpp/include/doxygen_groups.h +++ b/cpp/include/doxygen_groups.h @@ -143,7 +143,7 @@ * @} * @defgroup lists_apis Lists * @{ - * @defgroup lists_concatenate_rows Combining + * @defgroup lists_combine Combining * @defgroup lists_extract Extracting * @defgroup lists_contains Searching * @defgroup lists_gather Gathering diff --git a/cpp/src/lists/combine/concatenate_list_elements.cu b/cpp/src/lists/combine/concatenate_list_elements.cu new file mode 100644 index 00000000000..b76cd19d94b --- /dev/null +++ b/cpp/src/lists/combine/concatenate_list_elements.cu @@ -0,0 +1,264 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +namespace cudf { +namespace lists { +namespace detail { +namespace { +/** + * @brief Concatenate lists within the same row into one list, ignoring any null list during + * concatenation. + */ +std::unique_ptr concatenate_lists_ignore_null(column_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto const num_rows = input.size(); + + static_assert(std::is_same_v && std::is_same_v); + auto out_offsets = make_numeric_column( + data_type{type_id::INT32}, num_rows + 1, mask_state::UNALLOCATED, stream, mr); + + auto const d_out_offsets = out_offsets->mutable_view().template begin(); + auto const d_row_offsets = lists_column_view(input).offsets_begin(); + auto const d_list_offsets = lists_column_view(lists_column_view(input).child()).offsets_begin(); + + // Concatenating the lists at the same row by converting the entry offsets from the child column + // into row offsets of the root column. Those entry offsets are subtracted by the first entry + // offset to output zero-based offsets. + auto const iter = thrust::make_counting_iterator(0); + thrust::transform(rmm::exec_policy(stream), + iter, + iter + num_rows + 1, + d_out_offsets, + [d_row_offsets, d_list_offsets] __device__(auto const idx) { + auto const start_offset = d_list_offsets[d_row_offsets[0]]; + return d_list_offsets[d_row_offsets[idx]] - start_offset; + }); + + // The child column of the output lists column is just copied from the input column. 
+ auto out_entries = std::make_unique( + lists_column_view(lists_column_view(input).get_sliced_child(stream)).get_sliced_child(stream)); + + return make_lists_column(num_rows, + std::move(out_offsets), + std::move(out_entries), + input.null_count(), + cudf::detail::copy_bitmask(input, stream, mr), + stream, + mr); +} + +/** + * @brief Generate list offsets and list validities for the output lists column. + * + * This function is called only when (has_null_list == true and null_policy == NULLIFY_OUTPUT_ROW). + */ +std::pair, rmm::device_uvector> +generate_list_offsets_and_validities(column_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto const num_rows = input.size(); + + static_assert(std::is_same_v && std::is_same_v); + auto out_offsets = make_numeric_column( + data_type{type_id::INT32}, num_rows + 1, mask_state::UNALLOCATED, stream, mr); + + auto const lists_of_lists_dv_ptr = column_device_view::create(input); + auto const lists_dv_ptr = column_device_view::create(lists_column_view(input).child()); + auto const d_out_offsets = out_offsets->mutable_view().template begin(); + auto const d_row_offsets = lists_column_view(input).offsets_begin(); + auto const d_list_offsets = lists_column_view(lists_column_view(input).child()).offsets_begin(); + + // The array of int8_t stores validities for the output list elements. + auto validities = rmm::device_uvector(num_rows, stream); + + // Compute output list sizes and validities. + auto const iter = thrust::make_counting_iterator(0); + thrust::transform( + rmm::exec_policy(stream), + iter, + iter + num_rows, + d_out_offsets, + [lists_of_lists_dv = *lists_of_lists_dv_ptr, + lists_dv = *lists_dv_ptr, + d_row_offsets, + d_list_offsets, + d_validities = validities.begin(), + iter] __device__(auto const idx) { + if (d_row_offsets[idx] == d_row_offsets[idx + 1]) { // This is a null/empty row. 
+ d_validities[idx] = static_cast(lists_of_lists_dv.is_valid(idx)); + return size_type{0}; + } + // The output row will not be null only if all lists on the input row are not null. + auto const is_valid = + thrust::all_of(thrust::seq, + iter + d_row_offsets[idx], + iter + d_row_offsets[idx + 1], + [&] __device__(auto const list_idx) { return lists_dv.is_valid(list_idx); }); + d_validities[idx] = static_cast(is_valid); + if (!is_valid) { return size_type{0}; } + + // Compute size of the output list as sum of sizes of all lists in the current input row. + return d_list_offsets[d_row_offsets[idx + 1]] - d_list_offsets[d_row_offsets[idx]]; + }); + + // Compute offsets from sizes. + thrust::exclusive_scan( + rmm::exec_policy(stream), d_out_offsets, d_out_offsets + num_rows + 1, d_out_offsets); + + return {std::move(out_offsets), std::move(validities)}; +} + +/** + * @brief Gather entries from the input lists column, ignoring rows that have null list elements. + * + * This function is called only when (has_null_list == true and null_policy == NULLIFY_OUTPUT_ROW). + */ +std::unique_ptr gather_list_entries(column_view const& input, + column_view const& output_list_offsets, + size_type num_rows, + size_type num_output_entries, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto const child_col = lists_column_view(input).child(); + auto const entry_col = lists_column_view(child_col).child(); + auto const d_row_offsets = lists_column_view(input).offsets_begin(); + auto const d_list_offsets = lists_column_view(child_col).offsets_begin(); + auto gather_map = rmm::device_uvector(num_output_entries, stream); + + // Fill the gather map with indices of the lists from the child column of the input column. 
+ thrust::for_each_n( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + num_rows, + [d_row_offsets, + d_list_offsets, + d_indices = gather_map.begin(), + d_out_list_offsets = + output_list_offsets.template begin()] __device__(size_type const idx) { + // The output row has been identified as a null/empty list during list size computation. + if (d_out_list_offsets[idx + 1] == d_out_list_offsets[idx]) { return; } + + // The indices of the list elements on the row `idx` of the input column. + thrust::sequence(thrust::seq, + d_indices + d_out_list_offsets[idx], + d_indices + d_out_list_offsets[idx + 1], + d_list_offsets[d_row_offsets[idx]]); + }); + + auto result = cudf::detail::gather(table_view{{entry_col}}, + gather_map.begin(), + gather_map.end(), + out_of_bounds_policy::DONT_CHECK, + stream, + mr); + return std::move(result->release()[0]); +} + +std::unique_ptr concatenate_lists_nullifying_rows(column_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + // Generate offsets and validities of the output lists column. + auto [list_offsets, list_validities] = generate_list_offsets_and_validities(input, stream, mr); + auto const offsets_view = list_offsets->view(); + + auto const num_rows = input.size(); + auto const num_output_entries = + cudf::detail::get_value(offsets_view, num_rows, stream); + + auto list_entries = + gather_list_entries(input, offsets_view, num_rows, num_output_entries, stream, mr); + auto [null_mask, null_count] = cudf::detail::valid_if( + list_validities.begin(), list_validities.end(), thrust::identity{}, stream, mr); + + return make_lists_column(num_rows, + std::move(list_offsets), + std::move(list_entries), + null_count, + null_count ? std::move(null_mask) : rmm::device_buffer{}, + stream, + mr); +} + +} // namespace + +/** + * @copydoc cudf::lists::concatenate_list_elements + * + * @param stream CUDA stream used for device memory operations and kernel launches. 
+ */ +std::unique_ptr concatenate_list_elements(column_view const& input, + concatenate_null_policy null_policy, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto type = input.type(); // Column that is lists of lists. + CUDF_EXPECTS(type.id() == type_id::LIST, "Input column must be a lists column."); + + auto col = lists_column_view(input).child(); // Rows, which are lists. + type = col.type(); + CUDF_EXPECTS(type.id() == type_id::LIST, "Rows of the input column must be lists."); + + col = lists_column_view(col).child(); // The last level entries what we need to check. + type = col.type(); + CUDF_EXPECTS(type.id() == type_id::LIST || !cudf::is_nested(type), + "Entry of the input lists column must be of list or non-nested types."); + + if (input.size() == 0) { return cudf::empty_like(input); } + + return (null_policy == concatenate_null_policy::IGNORE || + !lists_column_view(input).child().has_nulls()) + ? concatenate_lists_ignore_null(input, stream, mr) + : concatenate_lists_nullifying_rows(input, stream, mr); +} + +} // namespace detail + +/** + * @copydoc cudf::lists::concatenate_list_elements + */ +std::unique_ptr concatenate_list_elements(column_view const& input, + concatenate_null_policy null_policy, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::concatenate_list_elements(input, null_policy, rmm::cuda_stream_default, mr); +} + +} // namespace lists +} // namespace cudf diff --git a/cpp/src/lists/combine/concatenate_rows.cu b/cpp/src/lists/combine/concatenate_rows.cu new file mode 100644 index 00000000000..fdd71aea7bf --- /dev/null +++ b/cpp/src/lists/combine/concatenate_rows.cu @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +namespace cudf { +namespace lists { +namespace detail { +/** + * @copydoc cudf::lists::concatenate_rows + * + * @param stream CUDA stream used for device memory operations and kernel launches. + */ +std::unique_ptr concatenate_rows(table_view const& input, + concatenate_null_policy null_policy, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS(input.num_columns() > 0, "The input table must have at least one column."); + + auto const entry_type = lists_column_view(*input.begin()).child().type(); + for (auto const& col : input) { + CUDF_EXPECTS(col.type().id() == type_id::LIST, + "All columns of the input table must be of lists column type."); + + auto const child_col = lists_column_view(col).child(); + CUDF_EXPECTS(not cudf::is_nested(child_col.type()), "Nested types are not supported."); + CUDF_EXPECTS(entry_type == child_col.type(), + "The types of entries in the input columns must be the same."); + } + + auto const num_rows = input.num_rows(); + auto const num_cols = input.num_columns(); + if (num_rows == 0) { return cudf::empty_like(input.column(0)); } + if (num_cols == 1) { return std::make_unique(*(input.begin()), stream, mr); } + + // Memory resource for temporary data. + auto const default_mr = rmm::mr::get_current_device_resource(); + + // Interleave the input table into one column. 
+ auto const has_null_mask = std::any_of( + std::cbegin(input), std::cend(input), [](auto const& col) { return col.nullable(); }); + auto interleaved_columns = detail::interleave_columns(input, has_null_mask, stream, default_mr); + + // Generate a lists column which has child column is the interleaved_columns. + // The new nested lists column will have each row is a list of `num_cols` list elements. + static_assert(std::is_same_v and std::is_same_v); + auto list_offsets = make_numeric_column( + data_type{type_id::INT32}, num_rows + 1, mask_state::UNALLOCATED, stream, default_mr); + thrust::transform(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_rows + 1), + list_offsets->mutable_view().template begin(), + [num_cols] __device__(auto const idx) { return idx * num_cols; }); + auto const nested_lists_col = make_lists_column(num_rows, + std::move(list_offsets), + std::move(interleaved_columns), + 0, + rmm::device_buffer{}, + stream, + default_mr); + + // Concatenate lists on each row of the nested lists column, producing the desired output. + return concatenate_list_elements(nested_lists_col->view(), null_policy, stream, mr); +} + +} // namespace detail + +/** + * @copydoc cudf::lists::concatenate_rows + */ +std::unique_ptr concatenate_rows(table_view const& lists_columns, + concatenate_null_policy null_policy, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::concatenate_rows(lists_columns, null_policy, rmm::cuda_stream_default, mr); +} + +} // namespace lists +} // namespace cudf diff --git a/cpp/src/lists/concatenate_rows.cu b/cpp/src/lists/concatenate_rows.cu deleted file mode 100644 index 8528a7680f7..00000000000 --- a/cpp/src/lists/concatenate_rows.cu +++ /dev/null @@ -1,441 +0,0 @@ -/* - * Copyright (c) 2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include -#include -#include -#include - -namespace cudf { -namespace lists { -namespace detail { -namespace { -/** - * @brief Concatenate lists within the same row into one list, ignoring any null list during - * concatenation. - */ -std::unique_ptr concatenate_rows_ignore_null(table_view const& input, - bool has_null_mask, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - auto const num_output_lists = input.num_rows(); - auto const table_dv_ptr = table_device_view::create(input); - - // Interleave the list element from the input table, thus all the lists at the same row now stay - // next to each other. - auto interleaved_columns = detail::interleave_columns(input, has_null_mask, stream); - - // Modify the list offsets to combine lists of the same input row. - static_assert(sizeof(offset_type) == sizeof(int32_t)); - static_assert(sizeof(size_type) == sizeof(int32_t)); - auto list_offsets = make_numeric_column( - data_type{type_id::INT32}, num_output_lists + 1, mask_state::UNALLOCATED, stream, mr); - auto const d_offsets = list_offsets->mutable_view().template begin(); - - // The array of int8_t to store validities for list elements. - // Since we combine multiple lists, we may need to recompute list validities. - auto validities = rmm::device_uvector(has_null_mask ? 
num_output_lists : 0, stream); - - // For an input table of `n` columns, if after interleaving we have the list offsets are - // [ i_0, i_1, ..., i_n, i_n+1, ..., i_2n, ... ] then to concatenate them just modify the offsets - // to be [ i_0, i_n, i_2n, i_3n, ... ]. - auto const d_interleaved_offsets = lists_column_view(interleaved_columns->view()).offsets_begin(); - thrust::transform( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_output_lists + 1), - d_offsets, - [d_interleaved_offsets, - num_cols = input.num_columns(), - table_dv = *table_dv_ptr, - d_validities = validities.begin(), - has_null_mask] __device__(auto const idx) { - if (has_null_mask) { - auto const any_valid = thrust::any_of( - thrust::seq, table_dv.begin(), table_dv.end(), [idx](auto const& list_col) { - return list_col.is_valid(idx); - }); - d_validities[idx] = static_cast(any_valid); - } - return d_interleaved_offsets[idx * num_cols]; - }); - - auto [null_mask, null_count] = [&] { - return has_null_mask - ? cudf::detail::valid_if( - validities.begin(), validities.end(), thrust::identity{}, stream, mr) - : std::make_pair(rmm::device_buffer{}, size_type{0}); - }(); - - // The child column containing list entries is taken from the `interleaved_columns` column. - auto interleaved_columns_content = interleaved_columns->release(); - - return make_lists_column( - num_output_lists, - std::move(list_offsets), - std::move(interleaved_columns_content.children[lists_column_view::child_column_index]), - null_count, - null_count > 0 ? std::move(null_mask) : rmm::device_buffer{}, - stream, - mr); -} - -/** - * @brief Generate list offsets and list validities for the output lists column from the table_view - * of the input lists columns. - * - * This function is called only when (has_null_mask == true and null_policy == NULLIFY_OUTPUT_ROW). 
- */ -std::pair, rmm::device_uvector> -generate_list_offsets_and_validities(table_view const& input, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - auto const num_output_lists = input.num_rows(); - auto const table_dv_ptr = table_device_view::create(input); - - // The output offsets column. - static_assert(sizeof(offset_type) == sizeof(int32_t)); - static_assert(sizeof(size_type) == sizeof(int32_t)); - auto list_offsets = make_numeric_column( - data_type{type_id::INT32}, num_output_lists + 1, mask_state::UNALLOCATED, stream, mr); - auto const d_offsets = list_offsets->mutable_view().template begin(); - - // The array of int8_t to store validities for list elements. - auto validities = rmm::device_uvector(num_output_lists, stream); - - // Compute list sizes and validities. - thrust::transform( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_output_lists), - d_offsets, - [table_dv = *table_dv_ptr, d_validities = validities.begin()] __device__(size_type const idx) { - auto const all_valid = - thrust::all_of(thrust::seq, table_dv.begin(), table_dv.end(), [idx](auto const& list_col) { - return list_col.is_valid(idx); - }); - d_validities[idx] = static_cast(all_valid); - if (not all_valid) return size_type{0}; - - // Compute size of the output list as sum of sizes of input lists - return thrust::transform_reduce( - thrust::seq, - table_dv.begin(), - table_dv.end(), - [idx] __device__(auto const& lists_col) { - auto const list_offsets = - lists_col.child(lists_column_view::offsets_column_index).template data() + - lists_col.offset(); - return list_offsets[idx + 1] - list_offsets[idx]; // list size - }, - size_type{0}, - thrust::plus{}); - }); - - // Compute offsets from sizes. 
- thrust::exclusive_scan( - rmm::exec_policy(stream), d_offsets, d_offsets + num_output_lists + 1, d_offsets); - - return {std::move(list_offsets), std::move(validities)}; -} - -/** - * @brief Compute string sizes, string validities, and concatenate string lists functor. - * - * This functor is called only when (has_null_mask == true and null_policy == NULLIFY_OUTPUT_ROW). - * It is executed twice. In the first pass, the sizes and validities of the output strings will be - * computed. In the second pass, this will concatenate the lists of strings on the same row from the - * given input table. - */ -struct compute_string_sizes_and_concatenate_lists_fn { - table_device_view const table_dv; - - // Store list offsets of the output lists column. - offset_type const* const dst_list_offsets; - - // Store offsets of the strings. - offset_type* d_offsets{nullptr}; - - // If d_chars == nullptr: only compute sizes and validities of the output strings. - // If d_chars != nullptr: only concatenate lists of strings. - char* d_chars{nullptr}; - - // We need to set `1` or `0` for the validities of the strings in the child column. - int8_t* d_validities{nullptr}; - - __device__ void operator()(size_type const idx) - { - // The current row contain null, which has been identified during offsets computation. - if (dst_list_offsets[idx + 1] == dst_list_offsets[idx]) { return; } - - // read_idx and write_idx are indices of string elements. - size_type write_idx = dst_list_offsets[idx]; - thrust::for_each( - thrust::seq, table_dv.begin(), table_dv.end(), [&] __device__(auto const& lists_col) { - auto const list_offsets = - lists_col.child(lists_column_view::offsets_column_index).template data() + - lists_col.offset(); - auto const& str_col = lists_col.child(lists_column_view::child_column_index); - auto const str_offsets = - str_col.child(strings_column_view::offsets_column_index).template data(); - - // The range of indices of the strings within the source list. 
- auto const start_str_idx = list_offsets[idx]; - auto const end_str_idx = list_offsets[idx + 1]; - - if (not d_chars) { // just compute sizes of strings within a list - for (auto read_idx = start_str_idx; read_idx < end_str_idx; ++read_idx, ++write_idx) { - d_validities[write_idx] = static_cast(str_col.is_valid(read_idx)); - d_offsets[write_idx] = str_offsets[read_idx + 1] - str_offsets[read_idx]; - } - } else { // just copy the entire memory region containing all strings in the list - // start_byte and end_byte are indices of character of the string elements. - auto const start_byte = str_offsets[start_str_idx]; - auto const end_byte = str_offsets[end_str_idx]; - if (start_byte < end_byte) { - auto const input_ptr = - str_col.child(strings_column_view::chars_column_index).template data() + - start_byte; - auto const output_ptr = d_chars + d_offsets[write_idx]; - thrust::copy(thrust::seq, input_ptr, input_ptr + end_byte - start_byte, output_ptr); - } - write_idx += end_str_idx - start_str_idx; - } - }); - } -}; - -/** - * @brief Struct used in type_dispatcher to interleave list entries of the input lists columns and - * output the results into a destination column. - * - * This functor is called only when (has_null_mask == true and null_policy == NULLIFY_OUTPUT_ROW). - */ -struct concatenate_lists_fn { - template - std::enable_if_t, std::unique_ptr> operator()( - table_view const& input, - column_view const& output_list_offsets, - size_type num_output_lists, - size_type num_output_entries, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const noexcept - { - auto const table_dv_ptr = table_device_view::create(input); - auto const comp_fn = compute_string_sizes_and_concatenate_lists_fn{ - *table_dv_ptr, output_list_offsets.template begin()}; - - // Generate a null mask because the input table has nullable column. 
- auto [offsets_column, chars_column, null_mask, null_count] = - cudf::strings::detail::make_strings_children_with_null_mask( - comp_fn, num_output_lists, num_output_entries, stream, mr); - - return make_strings_column(num_output_entries, - std::move(offsets_column), - std::move(chars_column), - null_count, - std::move(null_mask), - stream, - mr); - } - - template - std::enable_if_t(), std::unique_ptr> operator()( - table_view const& input, - column_view const& output_list_offsets, - size_type num_output_lists, - size_type num_output_entries, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const noexcept - { - auto const table_dv_ptr = table_device_view::create(input); - - // The output child column. - auto const child_col = lists_column_view(*input.begin()).child(); - auto output = - allocate_like(child_col, num_output_entries, mask_allocation_policy::NEVER, stream, mr); - auto output_dv_ptr = mutable_column_device_view::create(*output); - - // The array of int8_t to store entry validities. - auto validities = rmm::device_uvector(num_output_entries, stream); - - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - num_output_lists, - [num_cols = input.num_columns(), - table_dv = *table_dv_ptr, - d_validities = validities.begin(), - dst_list_offsets = output_list_offsets.template begin(), - d_output = output_dv_ptr->template begin()] __device__(size_type const idx) { - // The output row has been identified as a null list during list size computation. 
- if (dst_list_offsets[idx + 1] == dst_list_offsets[idx]) { return; } - - auto write_start = dst_list_offsets[idx]; - thrust::for_each( - thrust::seq, table_dv.begin(), table_dv.end(), [&] __device__(auto const& lists_col) { - auto const list_offsets = lists_col.child(lists_column_view::offsets_column_index) - .template data() + - lists_col.offset(); - auto const& data_col = lists_col.child(lists_column_view::child_column_index); - - // The range of indices of the entries within the source list. - auto const start_idx = list_offsets[idx]; - auto const end_idx = list_offsets[idx + 1]; - - // Fill the validities array. - for (auto read_idx = start_idx, write_idx = write_start; read_idx < end_idx; - ++read_idx, ++write_idx) { - d_validities[write_idx] = static_cast(data_col.is_valid(read_idx)); - } - // Do a copy for the entire list entries. - auto const input_ptr = - reinterpret_cast(data_col.template data() + start_idx); - auto const output_ptr = reinterpret_cast(&d_output[write_start]); - thrust::copy( - thrust::seq, input_ptr, input_ptr + sizeof(T) * (end_idx - start_idx), output_ptr); - write_start += end_idx - start_idx; - }); - }); - - auto [null_mask, null_count] = cudf::detail::valid_if( - validities.begin(), validities.end(), thrust::identity{}, stream, mr); - if (null_count > 0) { output->set_null_mask(null_mask, null_count); } - - return output; - } - - template - std::enable_if_t and not cudf::is_fixed_width(), - std::unique_ptr> - operator()(table_view const&, - column_view const&, - size_type, - size_type, - rmm::cuda_stream_view, - rmm::mr::device_memory_resource*) const - { - // Currently, only support string_view and fixed-width types - CUDF_FAIL("Called `concatenate_lists_fn()` on non-supported types."); - } -}; - -std::unique_ptr concatenate_with_nullifying_rows(table_view const& input, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - // Generate offsets of the output lists column. 
- auto [list_offsets, list_validities] = generate_list_offsets_and_validities(input, stream, mr); - auto const offsets_view = list_offsets->view(); - - // Copy entries from the input lists columns to the output lists column - this needed to be - // specialized for different types. - auto const num_output_lists = input.num_rows(); - auto const num_output_entries = - cudf::detail::get_value(offsets_view, num_output_lists, stream); - auto list_entries = - type_dispatcher(lists_column_view(*input.begin()).child().type(), - concatenate_lists_fn{}, - input, - offsets_view, - num_output_lists, - num_output_entries, - stream, - mr); - - auto [null_mask, null_count] = cudf::detail::valid_if( - list_validities.begin(), list_validities.end(), thrust::identity{}, stream, mr); - return make_lists_column(num_output_lists, - std::move(list_offsets), - std::move(list_entries), - null_count, - null_count ? std::move(null_mask) : rmm::device_buffer{}, - stream, - mr); -} - -} // namespace - -/** - * @copydoc cudf::lists::concatenate_rows - * - * @param stream CUDA stream used for device memory operations and kernel launches. 
- */ -std::unique_ptr concatenate_rows(table_view const& input, - concatenate_null_policy null_policy, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - CUDF_EXPECTS(input.num_columns() > 0, "The input table must have at least one column."); - - auto const entry_type = lists_column_view(*input.begin()).child().type(); - for (auto const& col : input) { - CUDF_EXPECTS(col.type().id() == type_id::LIST, - "All columns of the input table must be of lists column type."); - - auto const child_col = lists_column_view(col).child(); - CUDF_EXPECTS(not cudf::is_nested(child_col.type()), "Nested types are not supported."); - CUDF_EXPECTS(entry_type == child_col.type(), - "The types of entries in the input columns must be the same."); - } - - if (input.num_rows() == 0) { return cudf::empty_like(input.column(0)); } - if (input.num_columns() == 1) { return std::make_unique(*(input.begin()), stream, mr); } - - // List concatenation can be implemented by simply interleaving the lists columns, then modify the - // list offsets. - auto const has_null_mask = std::any_of( - std::cbegin(input), std::cend(input), [](auto const& col) { return col.nullable(); }); - if (not has_null_mask or null_policy == concatenate_null_policy::IGNORE) { - return concatenate_rows_ignore_null(input, has_null_mask, stream, mr); - } - - // Both conditions satisfied: has_null_mask == true and - // null_policy == NULLIFY_OUTPUT_ROW. 
- return concatenate_with_nullifying_rows(input, stream, mr); -} - -} // namespace detail - -/** - * @copydoc cudf::lists::concatenate_rows - */ -std::unique_ptr concatenate_rows(table_view const& lists_columns, - concatenate_null_policy null_policy, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::concatenate_rows(lists_columns, null_policy, rmm::cuda_stream_default, mr); -} - -} // namespace lists -} // namespace cudf diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index d87b4b81bdc..f36ec70479b 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -407,7 +407,8 @@ ConfigureTest(AST_TEST ast/transform_tests.cpp) ################################################################################################### # - lists tests ---------------------------------------------------------------------------------- ConfigureTest(LISTS_TEST - lists/concatenate_rows_tests.cpp + lists/combine/concatenate_list_elements_tests.cpp + lists/combine/concatenate_rows_tests.cpp lists/contains_tests.cpp lists/count_elements_tests.cpp lists/drop_list_duplicates_tests.cpp diff --git a/cpp/tests/lists/combine/concatenate_list_elements_tests.cpp b/cpp/tests/lists/combine/concatenate_list_elements_tests.cpp new file mode 100644 index 00000000000..de6307471a9 --- /dev/null +++ b/cpp/tests/lists/combine/concatenate_list_elements_tests.cpp @@ -0,0 +1,496 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include + +namespace { +using StrListsCol = cudf::test::lists_column_wrapper; +using IntListsCol = cudf::test::lists_column_wrapper; +using IntCol = cudf::test::fixed_width_column_wrapper; + +constexpr bool print_all{false}; // For debugging +constexpr int32_t null{0}; + +template +auto build_lists_col(T& list, Ts&... lists) +{ + return T(std::initializer_list{std::move(list), std::move(lists)...}); +} + +auto all_nulls() { return cudf::test::iterator_all_nulls(); } + +auto null_at(cudf::size_type idx) { return cudf::test::iterator_with_null_at(idx); } + +auto null_at(std::vector const& indices) +{ + return cudf::test::iterator_with_null_at(cudf::host_span{indices}); +} + +} // namespace + +struct ConcatenateListElementsTest : public cudf::test::BaseFixture { +}; + +TEST_F(ConcatenateListElementsTest, InvalidInput) +{ + // Input lists is not a 2-level depth lists column. + { + auto const col = IntCol{}; + EXPECT_THROW(cudf::lists::concatenate_list_elements(col), cudf::logic_error); + } + + // Input lists is not at least 2-level depth lists column. 
+ { + auto const col = IntListsCol{1, 2, 3}; + EXPECT_THROW(cudf::lists::concatenate_list_elements(col), cudf::logic_error); + } +} + +template +struct ConcatenateListElementsTypedTest : public cudf::test::BaseFixture { +}; + +using TypesForTest = cudf::test::Concat; +TYPED_TEST_CASE(ConcatenateListElementsTypedTest, TypesForTest); + +TYPED_TEST(ConcatenateListElementsTypedTest, SimpleInputNoNull) +{ + using ListsCol = cudf::test::lists_column_wrapper; + + auto row0 = ListsCol{{1, 2}, {3}, {4, 5, 6}}; + auto row1 = ListsCol{ListsCol{}}; + auto row2 = ListsCol{{7, 8}, {9, 10}}; + auto const col = build_lists_col(row0, row1, row2); + auto const results = cudf::lists::concatenate_list_elements(col); + auto const expected = ListsCol{{1, 2, 3, 4, 5, 6}, {}, {7, 8, 9, 10}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, print_all); +} + +TYPED_TEST(ConcatenateListElementsTypedTest, SimpleInputNestedManyLevelsNoNull) +{ + using ListsCol = cudf::test::lists_column_wrapper; + + auto row00 = ListsCol{{1, 2}, {3}, {4, 5, 6}}; + auto row01 = ListsCol{ListsCol{}}; + auto row02 = ListsCol{{7, 8}, {9, 10}}; + auto row0 = build_lists_col(row00, row01, row02); + + auto row10 = ListsCol{{1, 2}, {3}, {4, 5, 6}}; + auto row11 = ListsCol{ListsCol{}}; + auto row12 = ListsCol{{7, 8}, {9, 10}}; + auto row1 = build_lists_col(row10, row11, row12); + + auto row20 = ListsCol{{1, 2}, {3}, {4, 5, 6}}; + auto row21 = ListsCol{ListsCol{}}; + auto row22 = ListsCol{{7, 8}, {9, 10}}; + auto row2 = build_lists_col(row20, row21, row22); + + auto const col = build_lists_col(row0, row1, row2); + auto const results = cudf::lists::concatenate_list_elements(col); + auto const expected = ListsCol{ListsCol{{1, 2}, {3}, {4, 5, 6}, {}, {7, 8}, {9, 10}}, + ListsCol{{1, 2}, {3}, {4, 5, 6}, {}, {7, 8}, {9, 10}}, + ListsCol{{1, 2}, {3}, {4, 5, 6}, {}, {7, 8}, {9, 10}}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, print_all); +} + +TEST_F(ConcatenateListElementsTest, SimpleInputStringsColumnNoNull) 
+{ + auto row0 = StrListsCol{StrListsCol{"Tomato", "Apple"}, StrListsCol{"Orange"}}; + auto row1 = StrListsCol{StrListsCol{"Banana", "Kiwi", "Cherry"}, StrListsCol{"Lemon", "Peach"}}; + auto row2 = StrListsCol{StrListsCol{"Coconut"}, StrListsCol{}}; + auto const col = build_lists_col(row0, row1, row2); + auto const results = cudf::lists::concatenate_list_elements(col); + auto const expected = StrListsCol{StrListsCol{"Tomato", "Apple", "Orange"}, + StrListsCol{"Banana", "Kiwi", "Cherry", "Lemon", "Peach"}, + StrListsCol{"Coconut"}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, print_all); +} + +TYPED_TEST(ConcatenateListElementsTypedTest, SimpleInputWithNulls) +{ + using ListsCol = cudf::test::lists_column_wrapper; + auto row0 = ListsCol{{ListsCol{{1, null, 3, 4}, null_at(1)}, + ListsCol{{10, 11, 12, null}, null_at(3)}, + ListsCol{} /*NULL*/}, + null_at(2)}; + auto row1 = ListsCol{ListsCol{{null, 2, 3, 4}, null_at(0)}, + ListsCol{{13, 14, 15, 16, 17, null}, null_at(5)}, + ListsCol{{20, null}, null_at(1)}}; + auto row2 = ListsCol{{ListsCol{{null, 2, 3, 4}, null_at(0)}, + ListsCol{} /*NULL*/, + ListsCol{{null, 21, null, null}, null_at({0, 2, 3})}}, + null_at(1)}; + auto row3 = ListsCol{{ListsCol{} /*NULL*/, ListsCol{{null, 18}, null_at(0)}}, null_at(0)}; + auto row4 = ListsCol{ListsCol{{1, 2, null, 4}, null_at(2)}, + ListsCol{{19, 20, null}, null_at(2)}, + ListsCol{22, 23, 24, 25}}; + auto row5 = ListsCol{ListsCol{{1, 2, 3, null}, null_at(3)}, + ListsCol{{null}, null_at(0)}, + ListsCol{{null, null, null, null, null}, all_nulls()}}; + auto const col = build_lists_col(row0, row1, row2, row3, row4, row5); + + // Ignore null list elements. 
+ { + auto const results = cudf::lists::concatenate_list_elements(col); + auto const expected = + ListsCol{ListsCol{{1, null, 3, 4, 10, 11, 12, null}, null_at({1, 7})}, + ListsCol{{null, 2, 3, 4, 13, 14, 15, 16, 17, null, 20, null}, null_at({0, 9, 11})}, + ListsCol{{null, 2, 3, 4, null, 21, null, null}, null_at({0, 4, 6, 7})}, + ListsCol{{null, 18}, null_at(0)}, + ListsCol{{1, 2, null, 4, 19, 20, null, 22, 23, 24, 25}, null_at({2, 6})}, + ListsCol{{1, 2, 3, null, null, null, null, null, null, null}, + null_at({3, 4, 5, 6, 7, 8, 9})}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, print_all); + } + + // Null lists result in null rows. + { + auto const results = cudf::lists::concatenate_list_elements( + col, cudf::lists::concatenate_null_policy::NULLIFY_OUTPUT_ROW); + auto const expected = + ListsCol{{ListsCol{} /*NULL*/, + ListsCol{{null, 2, 3, 4, 13, 14, 15, 16, 17, null, 20, null}, null_at({0, 9, 11})}, + ListsCol{} /*NULL*/, + ListsCol{} /*NULL*/, + ListsCol{{1, 2, null, 4, 19, 20, null, 22, 23, 24, 25}, null_at({2, 6})}, + ListsCol{{1, 2, 3, null, null, null, null, null, null, null}, + null_at({3, 4, 5, 6, 7, 8, 9})}}, + null_at({0, 2, 3})}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, print_all); + } +} + +TYPED_TEST(ConcatenateListElementsTypedTest, SimpleInputNestedManyLevelsWithNulls) +{ + using ListsCol = cudf::test::lists_column_wrapper; + + auto row00 = ListsCol{{1, 2}, {3}, {4, 5, 6}}; + auto row01 = ListsCol{ListsCol{}}; /*NULL*/ + auto row02 = ListsCol{{7, 8}, {9, 10}}; + auto row0 = ListsCol{{std::move(row00), std::move(row01), std::move(row02)}, null_at(1)}; + + auto row10 = ListsCol{{{1, 2}, {3}, {4, 5, 6} /*NULL*/}, null_at(2)}; + auto row11 = ListsCol{ListsCol{}}; + auto row12 = ListsCol{{7, 8}, {9, 10}}; + auto row1 = build_lists_col(row10, row11, row12); + + auto row20 = ListsCol{{1, 2}, {3}, {4, 5, 6}}; + auto row21 = ListsCol{ListsCol{}}; + auto row22 = ListsCol{ListsCol{{null, 8}, null_at(0)}, {9, 10}}; + auto row2 = 
build_lists_col(row20, row21, row22); + + auto const col = build_lists_col(row0, row1, row2); + + // Ignore null list elements. + { + auto const results = cudf::lists::concatenate_list_elements(col); + auto const expected = + ListsCol{ListsCol{{1, 2}, {3}, {4, 5, 6}, {7, 8}, {9, 10}}, + ListsCol{{{1, 2}, {3}, {} /*NULL*/, {}, {7, 8}, {9, 10}}, null_at(2)}, + ListsCol{{1, 2}, {3}, {4, 5, 6}, {}, ListsCol{{null, 8}, null_at(0)}, {9, 10}}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, print_all); + } + + // Null lists result in null rows. + { + auto const results = cudf::lists::concatenate_list_elements( + col, cudf::lists::concatenate_null_policy::NULLIFY_OUTPUT_ROW); + auto const expected = + ListsCol{{ListsCol{ListsCol{}}, /*NULL*/ + ListsCol{{{1, 2}, {3}, {} /*NULL*/, {}, {7, 8}, {9, 10}}, null_at(2)}, + ListsCol{{1, 2}, {3}, {4, 5, 6}, {}, ListsCol{{null, 8}, null_at(0)}, {9, 10}}}, + null_at(0)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, print_all); + } +} + +TEST_F(ConcatenateListElementsTest, SimpleInputStringsColumnWithNulls) +{ + auto row0 = StrListsCol{ + StrListsCol{{"Tomato", "Bear" /*NULL*/, "Apple"}, null_at(1)}, + StrListsCol{{"Orange", "Dog" /*NULL*/, "Fox" /*NULL*/, "Duck" /*NULL*/}, null_at({1, 2, 3})}}; + auto row1 = StrListsCol{ + StrListsCol{{"Banana", "Pig" /*NULL*/, "Kiwi", "Cherry", "Whale" /*NULL*/}, null_at({1, 4})}, + StrListsCol{"Lemon", "Peach"}}; + auto row2 = StrListsCol{{StrListsCol{"Coconut"}, StrListsCol{} /*NULL*/}, null_at(1)}; + auto const col = build_lists_col(row0, row1, row2); + + // Ignore null list elements. 
+ { + auto const results = cudf::lists::concatenate_list_elements(col); + auto const expected = StrListsCol{ + StrListsCol{{"Tomato", "" /*NULL*/, "Apple", "Orange", "" /*NULL*/, "" /*NULL*/, "" + /*NULL*/}, + null_at({1, 4, 5, 6})}, + StrListsCol{{"Banana", "" /*NULL*/, "Kiwi", "Cherry", "" /*NULL*/, "Lemon", "Peach"}, + null_at({1, 4})}, + StrListsCol{"Coconut"}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, print_all); + } + + // Null lists result in null rows. + { + auto const results = cudf::lists::concatenate_list_elements( + col, cudf::lists::concatenate_null_policy::NULLIFY_OUTPUT_ROW); + auto const expected = StrListsCol{ + {StrListsCol{ + {"Tomato", "" /*NULL*/, "Apple", "Orange", "" /*NULL*/, "" /*NULL*/, "" /*NULL*/}, + null_at({1, 4, 5, 6})}, + StrListsCol{{"Banana", "" /*NULL*/, "Kiwi", "Cherry", "" /*NULL*/, "Lemon", "Peach"}, + null_at({1, 4})}, + StrListsCol{} /*NULL*/}, + null_at(2)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, print_all); + } +} +TEST_F(ConcatenateListElementsTest, SimpleInputStringsColumnWithEmptyStringsAndNulls) +{ + auto row0 = + StrListsCol{StrListsCol{"", "", ""}, + StrListsCol{{"Orange", "" /*NULL*/, "" /*NULL*/, "" /*NULL*/}, null_at({1, 2, 3})}}; + auto row1 = StrListsCol{ + StrListsCol{{"Banana", "" /*NULL*/, "Kiwi", "Cherry", "" /*NULL*/}, null_at({1, 4})}, + StrListsCol{""}}; + auto row2 = StrListsCol{{StrListsCol{"Coconut"}, StrListsCol{} /*NULL*/}, null_at(1)}; + auto const col = build_lists_col(row0, row1, row2); + + // Ignore null list elements. + { + auto const results = cudf::lists::concatenate_list_elements(col); + auto const expected = StrListsCol{ + StrListsCol{{"", "", "", "Orange", "" /*NULL*/, "" /*NULL*/, "" /*NULL*/}, + null_at({4, 5, 6})}, + StrListsCol{{"Banana", "" /*NULL*/, "Kiwi", "Cherry", "" /*NULL*/, ""}, null_at({1, 4})}, + StrListsCol{"Coconut"}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, print_all); + } + + // Null lists result in null rows. 
+ { + auto const results = cudf::lists::concatenate_list_elements( + col, cudf::lists::concatenate_null_policy::NULLIFY_OUTPUT_ROW); + auto const expected = StrListsCol{ + {StrListsCol{{"", "", "", "Orange", "" /*NULL*/, "" /*NULL*/, "" /*NULL*/}, + null_at({4, 5, 6})}, + StrListsCol{{"Banana", "" /*NULL*/, "Kiwi", "Cherry", "" /*NULL*/, ""}, null_at({1, 4})}, + StrListsCol{} /*NULL*/}, + null_at(2)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, print_all); + } +} + +TYPED_TEST(ConcatenateListElementsTypedTest, SlicedColumnsInputNoNull) +{ + using ListsCol = cudf::test::lists_column_wrapper; + + auto const col_original = ListsCol{ListsCol{{1, 2, 3}, {2, 3}}, + ListsCol{{3, 4, 5, 6}, {5, 6}, {}, {7}}, + ListsCol{{7, 7, 7}, {7, 8, 1, 0}, {1}}, + ListsCol{{9, 10, 11}}, + ListsCol{}, + ListsCol{{12, 13, 14, 15}, {16}, {17}}}; + + { + auto const col = cudf::slice(col_original, {0, 3})[0]; + auto const results = cudf::lists::concatenate_list_elements(col); + auto const expected = + ListsCol{{1, 2, 3, 2, 3}, {3, 4, 5, 6, 5, 6, 7}, {7, 7, 7, 7, 8, 1, 0, 1}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, print_all); + } + { + auto const col = cudf::slice(col_original, {1, 4})[0]; + auto const results = cudf::lists::concatenate_list_elements(col); + auto const expected = ListsCol{{3, 4, 5, 6, 5, 6, 7}, {7, 7, 7, 7, 8, 1, 0, 1}, {9, 10, 11}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, print_all); + } + { + auto const col = cudf::slice(col_original, {2, 5})[0]; + auto const results = cudf::lists::concatenate_list_elements(col); + auto const expected = ListsCol{{7, 7, 7, 7, 8, 1, 0, 1}, {9, 10, 11}, {}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, print_all); + } + { + auto const col = cudf::slice(col_original, {3, 6})[0]; + auto const results = cudf::lists::concatenate_list_elements(col); + auto const expected = ListsCol{{9, 10, 11}, {}, {12, 13, 14, 15, 16, 17}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, print_all); + } +} + 
+TYPED_TEST(ConcatenateListElementsTypedTest, SlicedColumnsInputWithNulls) +{ + using ListsCol = cudf::test::lists_column_wrapper; + + auto row0 = ListsCol{ListsCol{{null, 2, 3}, null_at(0)}, ListsCol{2, 3}}; + auto row1 = ListsCol{ListsCol{{3, null, null, 6}, null_at({1, 2})}, + ListsCol{{5, 6, null}, null_at(2)}, + ListsCol{}, + ListsCol{{7, null}, null_at(1)}}; + auto row2 = ListsCol{ListsCol{7, 7, 7}, ListsCol{{7, 8, null, 0}, null_at(2)}, ListsCol{1}}; + auto row3 = ListsCol{ListsCol{9, 10, 11}}; + auto row4 = ListsCol{ListsCol{}}; + auto row5 = ListsCol{ListsCol{{12, null, 14, 15}, null_at(1)}, ListsCol{16}, ListsCol{17}}; + auto const col_original = build_lists_col(row0, row1, row2, row3, row4, row5); + + { + auto const col = cudf::slice(col_original, {0, 3})[0]; + auto const results = cudf::lists::concatenate_list_elements(col); + auto const expected = + ListsCol{ListsCol{{null, 2, 3, 2, 3}, null_at(0)}, + ListsCol{{3, null, null, 6, 5, 6, null, 7, null}, null_at({1, 2, 6, 8})}, + ListsCol{{7, 7, 7, 7, 8, null, 0, 1}, null_at(5)}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, print_all); + } + { + auto const col = cudf::slice(col_original, {1, 4})[0]; + auto const results = cudf::lists::concatenate_list_elements(col); + auto const expected = + ListsCol{ListsCol{{3, null, null, 6, 5, 6, null, 7, null}, null_at({1, 2, 6, 8})}, + ListsCol{{7, 7, 7, 7, 8, null, 0, 1}, null_at(5)}, + ListsCol{9, 10, 11}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, print_all); + } + { + auto const col = cudf::slice(col_original, {2, 5})[0]; + auto const results = cudf::lists::concatenate_list_elements(col); + auto const expected = + ListsCol{ListsCol{{7, 7, 7, 7, 8, null, 0, 1}, null_at(5)}, ListsCol{9, 10, 11}, ListsCol{}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, print_all); + } + { + auto const col = cudf::slice(col_original, {3, 6})[0]; + auto const results = cudf::lists::concatenate_list_elements(col); + auto const expected = + 
ListsCol{ListsCol{9, 10, 11}, ListsCol{}, ListsCol{{12, null, 14, 15, 16, 17}, null_at(1)}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, print_all); + } +} + +TEST_F(ConcatenateListElementsTest, SlicedStringsColumnsInputWithNulls) +{ + auto row0 = StrListsCol{ + StrListsCol{{"Tomato", "Bear" /*NULL*/, "Apple"}, null_at(1)}, + StrListsCol{{"Banana", "Pig" /*NULL*/, "Kiwi", "Cherry", "Whale" /*NULL*/}, null_at({1, 4})}, + StrListsCol{"Coconut"}}; + auto row1 = StrListsCol{ + StrListsCol{{"Banana", "Pig" /*NULL*/, "Kiwi", "Cherry", "Whale" /*NULL*/}, null_at({1, 4})}, + StrListsCol{"Coconut"}, + StrListsCol{{"Orange", "Dog" /*NULL*/, "Fox" /*NULL*/, "Duck" /*NULL*/}, null_at({1, 2, 3})}}; + auto row2 = StrListsCol{ + StrListsCol{"Coconut"}, + StrListsCol{{"Orange", "Dog" /*NULL*/, "Fox" /*NULL*/, "Duck" /*NULL*/}, null_at({1, 2, 3})}, + StrListsCol{"Lemon", "Peach"}}; + auto row3 = StrListsCol{ + {StrListsCol{{"Orange", "Dog" /*NULL*/, "Fox" /*NULL*/, "Duck" /*NULL*/}, null_at({1, 2, 3})}, + StrListsCol{"Lemon", "Peach"}, + StrListsCol{} /*NULL*/}, + null_at(2)}; + auto const col_original = build_lists_col(row0, row1, row2, row3); + + { + auto const col = cudf::slice(col_original, {0, 2})[0]; + auto const results = cudf::lists::concatenate_list_elements(col); + auto const expected = StrListsCol{StrListsCol{{"Tomato", + "" /*NULL*/, + "Apple", + "Banana", + "" /*NULL*/, + "Kiwi", + "Cherry", + "" /*NULL*/, + "Coconut"}, + null_at({1, 4, 7})}, + StrListsCol{{"Banana", + "" /*NULL*/, + "Kiwi", + "Cherry", + "" /*NULL*/, + "Coconut", + "Orange", + "" /*NULL*/, + "" /*NULL*/, + "" /*NULL*/}, + null_at({1, 4, 7, 8, 9})}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, print_all); + } + { + auto const col = cudf::slice(col_original, {1, 3})[0]; + auto const results = cudf::lists::concatenate_list_elements(col); + auto const expected = StrListsCol{StrListsCol{{"Banana", + "" /*NULL*/, + "Kiwi", + "Cherry", + "" /*NULL*/, + "Coconut", + "Orange", + "" /*NULL*/, 
+ "" /*NULL*/, + "" /*NULL*/}, + null_at({1, 4, 7, 8, 9})}, + StrListsCol{{"Coconut", + "Orange", + "" /*NULL*/, + "" /*NULL*/, + "", /*NULL*/ + "Lemon", + "Peach"}, + null_at({2, 3, 4})}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, print_all); + } + { + auto const col = cudf::slice(col_original, {2, 4})[0]; + auto const results = cudf::lists::concatenate_list_elements(col); + auto const expected = StrListsCol{StrListsCol{{"Coconut", + "Orange", + "" /*NULL*/, + "" /*NULL*/, + "", /*NULL*/ + "Lemon", + "Peach"}, + null_at({2, 3, 4})}, + StrListsCol{{"Orange", + "" /*NULL*/, + "" /*NULL*/, + "", /*NULL*/ + "Lemon", + "Peach"}, + null_at({1, 2, 3})}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, print_all); + } + { + auto const col = cudf::slice(col_original, {2, 4})[0]; + auto const results = cudf::lists::concatenate_list_elements( + col, cudf::lists::concatenate_null_policy::NULLIFY_OUTPUT_ROW); + auto const expected = StrListsCol{{StrListsCol{{"Coconut", + "Orange", + "" /*NULL*/, + "" /*NULL*/, + "", /*NULL*/ + "Lemon", + "Peach"}, + null_at({2, 3, 4})}, + StrListsCol{} /*NULL*/}, + null_at(1)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, print_all); + } +} diff --git a/cpp/tests/lists/concatenate_rows_tests.cpp b/cpp/tests/lists/combine/concatenate_rows_tests.cpp similarity index 99% rename from cpp/tests/lists/concatenate_rows_tests.cpp rename to cpp/tests/lists/combine/concatenate_rows_tests.cpp index 5abaf99f739..3e085af7740 100644 --- a/cpp/tests/lists/concatenate_rows_tests.cpp +++ b/cpp/tests/lists/combine/concatenate_rows_tests.cpp @@ -19,7 +19,7 @@ #include #include -#include +#include namespace { using StrListsCol = cudf::test::lists_column_wrapper; diff --git a/java/src/main/native/src/ColumnVectorJni.cpp b/java/src/main/native/src/ColumnVectorJni.cpp index 85bbdd41b4a..a09de5c61e3 100644 --- a/java/src/main/native/src/ColumnVectorJni.cpp +++ b/java/src/main/native/src/ColumnVectorJni.cpp @@ -23,7 +23,7 @@ #include #include 
#include -#include +#include #include #include #include diff --git a/python/cudf/cudf/_lib/cpp/lists/concatenate_rows.pxd b/python/cudf/cudf/_lib/cpp/lists/combine.pxd similarity index 83% rename from python/cudf/cudf/_lib/cpp/lists/concatenate_rows.pxd rename to python/cudf/cudf/_lib/cpp/lists/combine.pxd index 8c4dabf5168..ea9ade178e2 100644 --- a/python/cudf/cudf/_lib/cpp/lists/concatenate_rows.pxd +++ b/python/cudf/cudf/_lib/cpp/lists/combine.pxd @@ -5,7 +5,7 @@ from libcpp.memory cimport unique_ptr from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.table.table_view cimport table_view -cdef extern from "cudf/lists/concatenate_rows.hpp" namespace \ +cdef extern from "cudf/lists/combine.hpp" namespace \ "cudf::lists" nogil: cdef unique_ptr[column] concatenate_rows( const table_view input_table diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx index 46f034dc525..7d8909610dc 100644 --- a/python/cudf/cudf/_lib/lists.pyx +++ b/python/cudf/cudf/_lib/lists.pyx @@ -16,7 +16,7 @@ from cudf._lib.cpp.lists.drop_list_duplicates cimport ( from cudf._lib.cpp.lists.sorting cimport ( sort_lists as cpp_sort_lists ) -from cudf._lib.cpp.lists.concatenate_rows cimport ( +from cudf._lib.cpp.lists.combine cimport ( concatenate_rows as cpp_concatenate_rows ) from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view From 75e12d157ccc7c6c6d92b81c4f5d270d593b8e64 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath <3190405+shwina@users.noreply.github.com> Date: Thu, 20 May 2021 16:02:00 -0400 Subject: [PATCH 08/27] Actually test equality in assert_groupby_results_equal (#8272) Authors: - Ashwin Srinath (https://github.com/shwina) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Keith Kraus (https://github.com/kkraus14) - Michael Wang (https://github.com/isVoid) - Christopher Harris (https://github.com/cwharris) - Gera Shegalov (https://github.com/gerashegalov) URL: https://github.com/rapidsai/cudf/pull/8272 --- 
python/cudf/cudf/core/groupby/groupby.py | 8 +++-- python/cudf/cudf/tests/test_groupby.py | 46 +++++++++++++++++++----- 2 files changed, 43 insertions(+), 11 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 61fe20636f0..c1060d5f505 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -110,6 +110,7 @@ def cumcount(self): ) .groupby(self.grouping, sort=self._sort) .agg("cumcount") + .reset_index(drop=True) ) @cached_property @@ -225,9 +226,10 @@ def nth(self, n): """ Return the nth row from each group. """ - result = self.agg(lambda x: x.nth(n)) - sizes = self.size() - return result[n < sizes] + result = self.agg(lambda x: x.nth(n)).sort_index() + sizes = self.size().sort_index() + + return result[sizes > n] def serialize(self): header = {} diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index d1458c72770..2430b0da5ef 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -30,14 +30,28 @@ _index_type_aggs = {"count", "idxmin", "idxmax", "cumcount"} -def assert_groupby_results_equal(expect, got, sort=True, **kwargs): +def assert_groupby_results_equal( + expect, got, sort=True, as_index=True, by=None, **kwargs +): # Because we don't sort by index by default in groupby, # sort expect and got by index before comparing if sort: - expect = expect.sort_index() - got = got.sort_index() - else: - assert_eq(expect.sort_index(), got.sort_index(), **kwargs) + if as_index: + expect = expect.sort_index() + got = got.sort_index() + else: + assert by is not None + if isinstance(expect, (pd.DataFrame, cudf.DataFrame)): + expect = expect.sort_values(by=by).reset_index(drop=True) + else: + expect = expect.sort_values().reset_index(drop=True) + + if isinstance(got, cudf.DataFrame): + got = got.sort_values(by=by).reset_index(drop=True) + else: + got = 
got.sort_values().reset_index(drop=True) + + assert_eq(expect, got, **kwargs) def make_frame( @@ -201,10 +215,16 @@ def test_groupby_getitem_getattr(as_index): pdf = pd.DataFrame({"x": [1, 3, 1], "y": [1, 2, 3], "z": [1, 4, 5]}) gdf = cudf.from_pandas(pdf) assert_groupby_results_equal( - pdf.groupby("x")["y"].sum(), gdf.groupby("x")["y"].sum(), + pdf.groupby("x")["y"].sum(), + gdf.groupby("x")["y"].sum(), + as_index=as_index, + by="x", ) assert_groupby_results_equal( - pdf.groupby("x").y.sum(), gdf.groupby("x").y.sum(), + pdf.groupby("x").y.sum(), + gdf.groupby("x").y.sum(), + as_index=as_index, + by="x", ) assert_groupby_results_equal( pdf.groupby("x")[["y"]].sum(), gdf.groupby("x")[["y"]].sum(), @@ -212,6 +232,8 @@ def test_groupby_getitem_getattr(as_index): assert_groupby_results_equal( pdf.groupby(["x", "y"], as_index=as_index).sum(), gdf.groupby(["x", "y"], as_index=as_index).sum(), + as_index=as_index, + by=["x", "y"], ) @@ -1088,7 +1110,13 @@ def test_groupby_datetime(nelem, as_index, agg): else: pdres = pdg.agg({"datetime": agg}) gdres = gdg.agg({"datetime": agg}) - assert_groupby_results_equal(pdres, gdres, check_dtype=check_dtype) + assert_groupby_results_equal( + pdres, + gdres, + check_dtype=check_dtype, + as_index=as_index, + by=["datetime"], + ) def test_groupby_dropna(): @@ -1349,6 +1377,8 @@ def test_reset_index_after_empty_groupby(): assert_groupby_results_equal( pdf.groupby("a").sum().reset_index(), gdf.groupby("a").sum().reset_index(), + as_index=False, + by="a", ) From 3975f1039861ad714c66c921c5521264b747aa35 Mon Sep 17 00:00:00 2001 From: AJ Schmidt Date: Thu, 20 May 2021 17:12:45 -0400 Subject: [PATCH 09/27] Update `CHANGELOG.md` links for calver (#8303) This PR updates the `0.20` references in `CHANGELOG.md` to be `21.06`. 
Authors: - AJ Schmidt (https://github.com/ajschmidt8) Approvers: - https://github.com/jakirkham URL: https://github.com/rapidsai/cudf/pull/8303 --- CHANGELOG.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e932eb8249e..195a8732b71 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,6 @@ -# cuDF 0.20.0 (Date TBD) +# cuDF 21.06.00 (Date TBD) -Please see https://github.com/rapidsai/cudf/releases/tag/v0.20.0a for the latest changes to this development branch. +Please see https://github.com/rapidsai/cudf/releases/tag/v21.06.00a for the latest changes to this development branch. # cuDF 0.19.0 (21 Apr 2021) From 2a1075e462be8df207180c872e60cd4fbeef88d9 Mon Sep 17 00:00:00 2001 From: Rong Ou Date: Thu, 20 May 2021 15:18:08 -0700 Subject: [PATCH 10/27] use address and length for GDS reads/writes (#8301) Since we want GDS reads/writes to be 4 KiB aligned, sometimes we can't use the `DeviceMemoryBuffer` as is and need to adjust the size written. This change makes the JNI APIs more flexible to accommodate those. Authors: - Rong Ou (https://github.com/rongou) Approvers: - Jason Lowe (https://github.com/jlowe) URL: https://github.com/rapidsai/cudf/pull/8301 --- java/src/main/java/ai/rapids/cudf/CuFile.java | 50 +++++++++++++++++-- 1 file changed, 46 insertions(+), 4 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/CuFile.java b/java/src/main/java/ai/rapids/cudf/CuFile.java index 00c9cdb9fd5..4baad834570 100644 --- a/java/src/main/java/ai/rapids/cudf/CuFile.java +++ b/java/src/main/java/ai/rapids/cudf/CuFile.java @@ -78,11 +78,25 @@ public static boolean libraryLoaded() { * @param path The file path to copy to. * @param file_offset The file offset from which to write the buffer. * @param buffer The device buffer to copy from. - * @return The file offset from which the buffer was appended. 
*/ public static void writeDeviceBufferToFile(File path, long file_offset, BaseDeviceMemoryBuffer buffer) { - writeToFile(path.getAbsolutePath(), file_offset, buffer.getAddress(), buffer.getLength()); + writeDeviceMemoryToFile(path, file_offset, buffer.getAddress(), buffer.getLength()); + } + + /** + * Write device memory to a given file path synchronously. + *
<p>
+ * This method is NOT thread safe if the path points to the same file on disk. + * + * @param path The file path to copy to. + * @param file_offset The file offset from which to write the buffer. + * @param address The device memory address to copy from. + * @param length The length to copy. + */ + public static void writeDeviceMemoryToFile(File path, long file_offset, long address, + long length) { + writeToFile(path.getAbsolutePath(), file_offset, address, length); } /** @@ -95,7 +109,21 @@ public static void writeDeviceBufferToFile(File path, long file_offset, * @return The file offset from which the buffer was appended. */ public static long appendDeviceBufferToFile(File path, BaseDeviceMemoryBuffer buffer) { - return appendToFile(path.getAbsolutePath(), buffer.getAddress(), buffer.getLength()); + return appendDeviceMemoryToFile(path, buffer.getAddress(), buffer.getLength()); + } + + /** + * Append device memory to a given file path synchronously. + *
<p>
+ * This method is NOT thread safe if the path points to the same file on disk. + * + * @param path The file path to copy to. + * @param address The device memory address to copy from. + * @param length The length to copy. + * @return The file offset from which the buffer was appended. + */ + public static long appendDeviceMemoryToFile(File path, long address, long length) { + return appendToFile(path.getAbsolutePath(), address, length); } /** @@ -109,7 +137,21 @@ public static long appendDeviceBufferToFile(File path, BaseDeviceMemoryBuffer bu */ public static void readFileToDeviceBuffer(BaseDeviceMemoryBuffer buffer, File path, long fileOffset) { - readFromFile(buffer.getAddress(), buffer.getLength(), path.getAbsolutePath(), fileOffset); + readFileToDeviceMemory(buffer.getAddress(), buffer.getLength(), path, fileOffset); + } + + /** + * Read a file into device memory synchronously. + *
<p>
+ * This method is NOT thread safe if the path points to the same file on disk. + * + * @param address The device memory address to read into. + * @param length The length to read. + * @param path The file path to copy from. + * @param fileOffset The file offset from which to copy the content. + */ + public static void readFileToDeviceMemory(long address, long length, File path, long fileOffset) { + readFromFile(address, length, path.getAbsolutePath(), fileOffset); } private static native void writeToFile(String path, long file_offset, long address, long length); From b5531448243794974fae6987957d65d3339ee2ef Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Thu, 20 May 2021 17:26:02 -0500 Subject: [PATCH 11/27] Return python lists for __getitem__ calls to list type series (#8265) Make it so that this works: ``` x = cudf.Series([[1,2,None]]) x[0] # [1, 2, ] ``` Authors: - https://github.com/brandon-b-miller Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Michael Wang (https://github.com/isVoid) URL: https://github.com/rapidsai/cudf/pull/8265 --- python/cudf/cudf/_lib/cpp/scalar/scalar.pxd | 6 ++ python/cudf/cudf/_lib/scalar.pyx | 63 +++++++++++++++++++-- python/cudf/cudf/core/indexing.py | 6 +- python/cudf/cudf/tests/test_list.py | 18 ++++++ 4 files changed, 87 insertions(+), 6 deletions(-) diff --git a/python/cudf/cudf/_lib/cpp/scalar/scalar.pxd b/python/cudf/cudf/_lib/cpp/scalar/scalar.pxd index fec1c6382e6..de5cb05447c 100644 --- a/python/cudf/cudf/_lib/cpp/scalar/scalar.pxd +++ b/python/cudf/cudf/_lib/cpp/scalar/scalar.pxd @@ -9,6 +9,9 @@ from libcpp.string cimport string from cudf._lib.cpp.types cimport data_type from cudf._lib.cpp.wrappers.decimals cimport scale_type +from cudf._lib.cpp.column.column_view cimport column_view + + cdef extern from "cudf/scalar/scalar.hpp" namespace "cudf" nogil: cdef cppclass scalar: scalar() except + @@ -60,3 +63,6 @@ cdef extern from 
"cudf/scalar/scalar.hpp" namespace "cudf" nogil: bool is_valid) except + int64_t value() except + # TODO: Figure out how to add an int32 overload of value() + + cdef cppclass list_scalar(scalar): + column_view view() except + diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index 9f8a8ee6b1e..cb355a15f15 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -18,9 +18,18 @@ from libcpp.utility cimport move from libcpp cimport bool import cudf -from cudf._lib.types import cudf_to_np_types, duration_unit_map +from cudf.core.dtypes import ListDtype +from cudf._lib.types import ( + cudf_to_np_types, + duration_unit_map +) from cudf._lib.types import datetime_unit_map -from cudf._lib.types cimport underlying_type_t_type_id +from cudf._lib.types cimport underlying_type_t_type_id, dtype_from_column_view + +from cudf._lib.column cimport Column +from cudf._lib.cpp.column.column_view cimport column_view +from cudf._lib.table cimport Table +from cudf._lib.interop import to_arrow from cudf._lib.cpp.wrappers.timestamps cimport ( timestamp_s, @@ -41,12 +50,12 @@ from cudf._lib.cpp.scalar.scalar cimport ( timestamp_scalar, duration_scalar, string_scalar, - fixed_point_scalar + fixed_point_scalar, + list_scalar, ) -from cudf.utils.dtypes import _decimal_to_int64 +from cudf.utils.dtypes import _decimal_to_int64, is_list_dtype cimport cudf._lib.cpp.types as libcudf_types - cdef class DeviceScalar: def __init__(self, value, dtype): @@ -97,6 +106,8 @@ cdef class DeviceScalar: def _to_host_scalar(self): if isinstance(self.dtype, cudf.Decimal64Dtype): result = _get_py_decimal_from_fixed_point(self.c_value) + elif is_list_dtype(self.dtype): + result = _get_py_list_from_list(self.c_value) elif pd.api.types.is_string_dtype(self.dtype): result = _get_py_string_from_string(self.c_value) elif pd.api.types.is_numeric_dtype(self.dtype): @@ -159,6 +170,22 @@ cdef class DeviceScalar: raise TypeError( "Must pass a dtype when 
constructing from a fixed-point scalar" ) + elif cdtype.id() == libcudf_types.LIST: + if ( + s.get_raw_ptr() + )[0].view().type().id() == libcudf_types.LIST: + s._dtype = dtype_from_column_view( + (s.get_raw_ptr())[0].view() + ) + else: + s._dtype = ListDtype( + cudf_to_np_types[ + ( + (s.get_raw_ptr())[0] + .view().type().id() + ) + ] + ) else: if dtype is not None: s._dtype = dtype @@ -268,6 +295,19 @@ cdef _set_decimal64_from_scalar(unique_ptr[scalar]& s, ) ) +cdef _get_py_list_from_list(unique_ptr[scalar]& s): + + if not s.get()[0].is_valid(): + return cudf.NA + + cdef column_view list_col_view = (s.get()).view() + cdef Column list_col = Column.from_column_view(list_col_view, None) + cdef Table to_arrow_table = Table({"col": list_col}) + + arrow_table = to_arrow(to_arrow_table, [["col", []]]) + result = arrow_table['col'].to_pylist() + return _nested_na_replace(result) + cdef _get_py_string_from_string(unique_ptr[scalar]& s): if not s.get()[0].is_valid(): return cudf.NA @@ -440,3 +480,16 @@ def _create_proxy_nat_scalar(dtype): return result else: raise TypeError('NAT only valid for datetime and timedelta') + + +def _nested_na_replace(input_list): + ''' + Replace `None` with `cudf.NA` in the result of + `__getitem__` calls to list type columns + ''' + for idx, value in enumerate(input_list): + if isinstance(value, list): + _nested_na_replace(value) + elif value is None: + input_list[idx] = cudf.NA + return input_list diff --git a/python/cudf/cudf/core/indexing.py b/python/cudf/cudf/core/indexing.py index 7de1aaf9726..21d075ae67d 100755 --- a/python/cudf/cudf/core/indexing.py +++ b/python/cudf/cudf/core/indexing.py @@ -85,7 +85,11 @@ def __getitem__(self, arg): arg = list(arg) data = self._sr._column[arg] - if is_scalar(data) or _is_null_host_scalar(data): + if ( + isinstance(data, list) + or is_scalar(data) + or _is_null_host_scalar(data) + ): return data index = self._sr.index.take(arg) return self._sr._copy_construct(data=data, index=index) diff --git 
a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py index 5dcecc6c9e1..7edcb08a7c8 100644 --- a/python/cudf/cudf/tests/test_list.py +++ b/python/cudf/cudf/tests/test_list.py @@ -7,6 +7,7 @@ import pytest import cudf +from cudf import NA from cudf.tests.utils import assert_eq @@ -332,3 +333,20 @@ def test_concatenate_list_with_nonlist(): gdf1 = cudf.DataFrame({"A": [["a", "c"], ["b", "d"], ["c", "d"]]}) gdf2 = cudf.DataFrame({"A": ["a", "b", "c"]}) gdf1["A"] + gdf2["A"] + + +@pytest.mark.parametrize( + "indata,expect", + [ + ([1], [1]), + ([1, 2, 3], [1, 2, 3]), + ([[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]]), + ([None], [NA]), + ([1, None, 3], [1, NA, 3]), + ([[1, None, 3], [None, 5, 6]], [[1, NA, 3], [NA, 5, 6]]), + ], +) +def test_list_getitem(indata, expect): + list_sr = cudf.Series([indata]) + # __getitem__ shall fill None with cudf.NA + assert list_sr[0] == expect From c7d052426d6ceceff732307df13bebfbc15b046a Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Thu, 20 May 2021 16:09:23 -0700 Subject: [PATCH 12/27] Copy nested types upon construction (#8244) Closes #7561 This PR makes sure upon constructing cudf object, nested types from the pyarrow array is copied to cudf object. This should handle arbitrary nesting of `Lists`, `Structs`. For decimal types, precision is copied from the array. 
Authors: - Michael Wang (https://github.com/isVoid) - Keith Kraus (https://github.com/kkraus14) Approvers: - Keith Kraus (https://github.com/kkraus14) URL: https://github.com/rapidsai/cudf/pull/8244 --- python/cudf/cudf/core/column/column.py | 64 ++++++++++++++- python/cudf/cudf/core/dtypes.py | 8 +- python/cudf/cudf/tests/test_dtypes.py | 104 ++++++++++++++++++++++++- 3 files changed, 169 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 20f302f7e59..4bf4b2b87f2 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -40,7 +40,12 @@ from cudf._typing import BinaryOperand, ColumnLike, Dtype, ScalarLike from cudf.core.abc import Serializable from cudf.core.buffer import Buffer -from cudf.core.dtypes import CategoricalDtype, IntervalDtype +from cudf.core.dtypes import ( + CategoricalDtype, + IntervalDtype, + ListDtype, + StructDtype, +) from cudf.utils import ioutils, utils from cudf.utils.dtypes import ( check_cast_unsupported_dtype, @@ -291,8 +296,7 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase: "None" ] - if isinstance(result.dtype, cudf.Decimal64Dtype): - result.dtype.precision = array.type.precision + result = _copy_type_metadata_from_arrow(array, result) return result def _get_mask_as_column(self) -> ColumnBase: @@ -2230,6 +2234,60 @@ def full(size: int, fill_value: ScalarLike, dtype: Dtype = None) -> ColumnBase: return ColumnBase.from_scalar(cudf.Scalar(fill_value, dtype), size) +def _copy_type_metadata_from_arrow( + arrow_array: pa.array, cudf_column: ColumnBase +) -> ColumnBase: + """ + Similar to `Column._copy_type_metadata`, except copies type metadata + from arrow array into a cudf column. Recursive for every level. + * When `arrow_array` is struct type and `cudf_column` is StructDtype, copy + field names. + * When `arrow_array` is decimal type and `cudf_column` is + Decimal64Dtype, copy precisions. 
+ """ + if pa.types.is_decimal(arrow_array.type) and isinstance( + cudf_column, cudf.core.column.DecimalColumn + ): + cudf_column.dtype.precision = arrow_array.type.precision + elif pa.types.is_struct(arrow_array.type) and isinstance( + cudf_column, cudf.core.column.StructColumn + ): + base_children = tuple( + _copy_type_metadata_from_arrow(arrow_array.field(i), col_child) + for i, col_child in enumerate(cudf_column.base_children) + ) + cudf_column.set_base_children(base_children) + return cudf.core.column.StructColumn( + data=None, + size=cudf_column.base_size, + dtype=StructDtype.from_arrow(arrow_array.type), + mask=cudf_column.base_mask, + offset=cudf_column.offset, + null_count=cudf_column.null_count, + children=base_children, + ) + elif pa.types.is_list(arrow_array.type) and isinstance( + cudf_column, cudf.core.column.ListColumn + ): + if arrow_array.values and cudf_column.base_children: + base_children = ( + cudf_column.base_children[0], + _copy_type_metadata_from_arrow( + arrow_array.values, cudf_column.base_children[1] + ), + ) + return cudf.core.column.ListColumn( + size=cudf_column.base_size, + dtype=ListDtype.from_arrow(arrow_array.type), + mask=cudf_column.base_mask, + offset=cudf_column.offset, + null_count=cudf_column.null_count, + children=base_children, + ) + + return cudf_column + + def _concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase: """Concatenate a sequence of columns.""" if len(objs) == 0: diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 7db8ba15caa..f0b0dbba4a5 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -143,6 +143,8 @@ def __init__(self, element_type: Any) -> None: def element_type(self) -> Dtype: if isinstance(self._typ.value_type, pa.ListType): return ListDtype.from_arrow(self._typ.value_type) + elif isinstance(self._typ.value_type, pa.StructType): + return StructDtype.from_arrow(self._typ.value_type) else: return 
np.dtype(self._typ.value_type.to_pandas_dtype()).name @@ -176,10 +178,10 @@ def __eq__(self, other): return self._typ.equals(other._typ) def __repr__(self): - if isinstance(self.element_type, ListDtype): - return f"ListDtype({self.element_type.__repr__()})" + if isinstance(self.element_type, (ListDtype, StructDtype)): + return f"{type(self).__name__}({self.element_type.__repr__()})" else: - return f"ListDtype({self.element_type})" + return f"{type(self).__name__}({self.element_type})" def __hash__(self): return hash(self._typ) diff --git a/python/cudf/cudf/tests/test_dtypes.py b/python/cudf/cudf/tests/test_dtypes.py index b6e2aac0304..a5895caf49f 100644 --- a/python/cudf/cudf/tests/test_dtypes.py +++ b/python/cudf/cudf/tests/test_dtypes.py @@ -6,14 +6,16 @@ import pytest import cudf +from cudf.core.column import ColumnBase from cudf.core.dtypes import ( CategoricalDtype, Decimal64Dtype, + IntervalDtype, ListDtype, StructDtype, - IntervalDtype, ) from cudf.tests.utils import assert_eq +from cudf.utils.dtypes import np_to_pa_dtype def test_cdt_basic(): @@ -155,3 +157,103 @@ def test_interval_dtype_pyarrow_round_trip(fields, closed): expect = pa_array got = IntervalDtype.from_arrow(expect).to_arrow() assert expect.equals(got) + + +def assert_column_array_dtype_equal(column: ColumnBase, array: pa.array): + """ + In cudf, each column holds its dtype. And since column may have child + columns, child columns also holds their datatype. This method tests + that every level of `column` matches the type of the given `array` + recursively. 
+ """ + + if isinstance(column.dtype, ListDtype): + return array.type.equals( + column.dtype.to_arrow() + ) and assert_column_array_dtype_equal( + column.base_children[1], array.values + ) + elif isinstance(column.dtype, StructDtype): + return array.type.equals(column.dtype.to_arrow()) and all( + [ + assert_column_array_dtype_equal(child, array.field(i)) + for i, child in enumerate(column.base_children) + ] + ) + elif isinstance(column.dtype, Decimal64Dtype): + return array.type.equals(column.dtype.to_arrow()) + elif isinstance(column.dtype, CategoricalDtype): + raise NotImplementedError() + else: + return array.type.equals(np_to_pa_dtype(column.dtype)) + + +@pytest.mark.parametrize( + "data", + [ + [[{"name": 123}]], + [ + [ + { + "IsLeapYear": False, + "data": {"Year": 1999, "Month": 7}, + "names": ["Mike", None], + }, + { + "IsLeapYear": True, + "data": {"Year": 2004, "Month": 12}, + "names": None, + }, + { + "IsLeapYear": False, + "data": {"Year": 1996, "Month": 2}, + "names": ["Rose", "Richard"], + }, + ] + ], + [ + [None, {"human?": True, "deets": {"weight": 2.4, "age": 27}}], + [ + {"human?": None, "deets": {"weight": 5.3, "age": 25}}, + {"human?": False, "deets": {"weight": 8.0, "age": 31}}, + {"human?": False, "deets": None}, + ], + [], + None, + [{"human?": None, "deets": {"weight": 6.9, "age": None}}], + ], + [ + { + "name": "var0", + "val": [ + {"name": "var1", "val": None, "type": "optional"} + ], + "type": "list", + }, + {}, + { + "name": "var2", + "val": [ + { + "name": "var3", + "val": {"field": 42}, + "type": "optional", + }, + { + "name": "var4", + "val": {"field": 3.14}, + "type": "optional", + }, + ], + "type": "list", + }, + None, + ], + ], +) +def test_lists_of_structs_dtype(data): + got = cudf.Series(data) + expected = pa.array(data) + + assert_column_array_dtype_equal(got._column, expected) + assert expected.equals(got._column.to_arrow()) From 9a85b3baf0742b89ebce8389309efade89bdca3f Mon Sep 17 00:00:00 2001 From: pxLi Date: Fri, 21 May 2021 
08:52:26 +0800 Subject: [PATCH 13/27] Update cudfjni version to 21.06.0 (#8292) Signed-off-by: Peixin Li supplement to #8267, as discussed, cudf JNI and plugin will follow pattern YY.MM.P Authors: - pxLi (https://github.com/pxLi) Approvers: - Jason Lowe (https://github.com/jlowe) - Robert (Bobby) Evans (https://github.com/revans2) URL: https://github.com/rapidsai/cudf/pull/8292 --- java/ci/README.md | 2 +- java/pom.xml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/java/ci/README.md b/java/ci/README.md index 458a76bcd04..968ce279a2c 100644 --- a/java/ci/README.md +++ b/java/ci/README.md @@ -49,5 +49,5 @@ scl enable devtoolset-9 "java/ci/build-in-docker.sh" ### The output -You can find the cuDF jar in java/target/ like cudf-21.06-SNAPSHOT-cuda11.jar. +You can find the cuDF jar in java/target/ like cudf-21.06.0-SNAPSHOT-cuda11.jar. diff --git a/java/pom.xml b/java/pom.xml index cec20ec04af..fe2d9a453f7 100755 --- a/java/pom.xml +++ b/java/pom.xml @@ -21,7 +21,7 @@ ai.rapids cudf - 21.06-SNAPSHOT + 21.06.0-SNAPSHOT cudfjni From b84c7923519cb7b64c247a9d010686e0ed4bf1fc Mon Sep 17 00:00:00 2001 From: Alfred Xu Date: Fri, 21 May 2021 22:42:33 +0800 Subject: [PATCH 14/27] Fix concatenate_lists_ignore_null on rows of all_nulls (#8312) After the rework of `cudf::lists::concatenate_rows`, a change in null handling broke the [corresponding cuDF Java tests](https://github.com/rapidsai/cudf/blob/branch-21.06/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java#L2234). Specifically, when we apply `concatenate_null_policy::IGNORE`, the output lists are always null-free, even if the input data contains rows consisting entirely of nulls. In my opinion, we should create a null mask for input rows that are `all_nulls`, to stay aligned with single-column concatenation.
Signed-off-by: sperlingxx Authors: - Alfred Xu (https://github.com/sperlingxx) Approvers: - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/8312 --- .../combine/concatenate_list_elements.cu | 40 ++++++++++++++++--- .../concatenate_list_elements_tests.cpp | 25 +++++++----- .../lists/combine/concatenate_rows_tests.cpp | 36 ++++++++++------- 3 files changed, 70 insertions(+), 31 deletions(-) diff --git a/cpp/src/lists/combine/concatenate_list_elements.cu b/cpp/src/lists/combine/concatenate_list_elements.cu index b76cd19d94b..c5a28a8ec5f 100644 --- a/cpp/src/lists/combine/concatenate_list_elements.cu +++ b/cpp/src/lists/combine/concatenate_list_elements.cu @@ -41,6 +41,7 @@ namespace { * concatenation. */ std::unique_ptr concatenate_lists_ignore_null(column_view const& input, + bool build_null_mask, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { @@ -50,9 +51,13 @@ std::unique_ptr concatenate_lists_ignore_null(column_view const& input, auto out_offsets = make_numeric_column( data_type{type_id::INT32}, num_rows + 1, mask_state::UNALLOCATED, stream, mr); + // The array of int8_t stores validities for the output list elements. + auto validities = rmm::device_uvector(build_null_mask ? num_rows : 0, stream); + auto const d_out_offsets = out_offsets->mutable_view().template begin(); auto const d_row_offsets = lists_column_view(input).offsets_begin(); auto const d_list_offsets = lists_column_view(lists_column_view(input).child()).offsets_begin(); + auto const lists_dv_ptr = column_device_view::create(lists_column_view(input).child()); // Concatenating the lists at the same row by converting the entry offsets from the child column // into row offsets of the root column. 
Those entry offsets are subtracted by the first entry @@ -62,7 +67,22 @@ std::unique_ptr concatenate_lists_ignore_null(column_view const& input, iter, iter + num_rows + 1, d_out_offsets, - [d_row_offsets, d_list_offsets] __device__(auto const idx) { + [d_row_offsets, + d_list_offsets, + lists_dv = *lists_dv_ptr, + d_validities = validities.begin(), + build_null_mask, + iter] __device__(auto const idx) { + if (build_null_mask) { + // The output row will be null only if all lists on the input row are null. + auto const is_valid = thrust::any_of(thrust::seq, + iter + d_row_offsets[idx], + iter + d_row_offsets[idx + 1], + [&] __device__(auto const list_idx) { + return lists_dv.is_valid(list_idx); + }); + d_validities[idx] = static_cast(is_valid); + } auto const start_offset = d_list_offsets[d_row_offsets[0]]; return d_list_offsets[d_row_offsets[idx]] - start_offset; }); @@ -71,11 +91,18 @@ std::unique_ptr concatenate_lists_ignore_null(column_view const& input, auto out_entries = std::make_unique( lists_column_view(lists_column_view(input).get_sliced_child(stream)).get_sliced_child(stream)); + auto [null_mask, null_count] = [&] { + return build_null_mask + ? cudf::detail::valid_if( + validities.begin(), validities.end(), thrust::identity{}, stream, mr) + : std::make_pair(cudf::detail::copy_bitmask(input, stream, mr), input.null_count()); + }(); + return make_lists_column(num_rows, std::move(out_offsets), std::move(out_entries), - input.null_count(), - cudf::detail::copy_bitmask(input, stream, mr), + null_count, + null_count > 0 ? std::move(null_mask) : rmm::device_buffer{}, stream, mr); } @@ -241,9 +268,10 @@ std::unique_ptr concatenate_list_elements(column_view const& input, if (input.size() == 0) { return cudf::empty_like(input); } - return (null_policy == concatenate_null_policy::IGNORE || - !lists_column_view(input).child().has_nulls()) - ? 
concatenate_lists_ignore_null(input, stream, mr) + bool has_null_list = lists_column_view(input).child().has_nulls(); + + return (null_policy == concatenate_null_policy::IGNORE || !has_null_list) + ? concatenate_lists_ignore_null(input, has_null_list, stream, mr) : concatenate_lists_nullifying_rows(input, stream, mr); } diff --git a/cpp/tests/lists/combine/concatenate_list_elements_tests.cpp b/cpp/tests/lists/combine/concatenate_list_elements_tests.cpp index de6307471a9..7d79cf4aebe 100644 --- a/cpp/tests/lists/combine/concatenate_list_elements_tests.cpp +++ b/cpp/tests/lists/combine/concatenate_list_elements_tests.cpp @@ -147,19 +147,23 @@ TYPED_TEST(ConcatenateListElementsTypedTest, SimpleInputWithNulls) auto row5 = ListsCol{ListsCol{{1, 2, 3, null}, null_at(3)}, ListsCol{{null}, null_at(0)}, ListsCol{{null, null, null, null, null}, all_nulls()}}; - auto const col = build_lists_col(row0, row1, row2, row3, row4, row5); + auto row6 = + ListsCol{{ListsCol{} /*NULL*/, ListsCol{} /*NULL*/, ListsCol{} /*NULL*/}, all_nulls()}; + auto const col = build_lists_col(row0, row1, row2, row3, row4, row5, row6); // Ignore null list elements. 
{ auto const results = cudf::lists::concatenate_list_elements(col); auto const expected = - ListsCol{ListsCol{{1, null, 3, 4, 10, 11, 12, null}, null_at({1, 7})}, - ListsCol{{null, 2, 3, 4, 13, 14, 15, 16, 17, null, 20, null}, null_at({0, 9, 11})}, - ListsCol{{null, 2, 3, 4, null, 21, null, null}, null_at({0, 4, 6, 7})}, - ListsCol{{null, 18}, null_at(0)}, - ListsCol{{1, 2, null, 4, 19, 20, null, 22, 23, 24, 25}, null_at({2, 6})}, - ListsCol{{1, 2, 3, null, null, null, null, null, null, null}, - null_at({3, 4, 5, 6, 7, 8, 9})}}; + ListsCol{{ListsCol{{1, null, 3, 4, 10, 11, 12, null}, null_at({1, 7})}, + ListsCol{{null, 2, 3, 4, 13, 14, 15, 16, 17, null, 20, null}, null_at({0, 9, 11})}, + ListsCol{{null, 2, 3, 4, null, 21, null, null}, null_at({0, 4, 6, 7})}, + ListsCol{{null, 18}, null_at(0)}, + ListsCol{{1, 2, null, 4, 19, 20, null, 22, 23, 24, 25}, null_at({2, 6})}, + ListsCol{{1, 2, 3, null, null, null, null, null, null, null}, + null_at({3, 4, 5, 6, 7, 8, 9})}, + ListsCol{} /*NULL*/}, + null_at(6)}; CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, print_all); } @@ -174,8 +178,9 @@ TYPED_TEST(ConcatenateListElementsTypedTest, SimpleInputWithNulls) ListsCol{} /*NULL*/, ListsCol{{1, 2, null, 4, 19, 20, null, 22, 23, 24, 25}, null_at({2, 6})}, ListsCol{{1, 2, 3, null, null, null, null, null, null, null}, - null_at({3, 4, 5, 6, 7, 8, 9})}}, - null_at({0, 2, 3})}; + null_at({3, 4, 5, 6, 7, 8, 9})}, + ListsCol{} /*NULL*/}, + null_at({0, 2, 3, 6})}; CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, print_all); } } diff --git a/cpp/tests/lists/combine/concatenate_rows_tests.cpp b/cpp/tests/lists/combine/concatenate_rows_tests.cpp index 3e085af7740..af22f329634 100644 --- a/cpp/tests/lists/combine/concatenate_rows_tests.cpp +++ b/cpp/tests/lists/combine/concatenate_rows_tests.cpp @@ -184,24 +184,27 @@ TYPED_TEST(ListConcatenateRowsTypedTest, SimpleInputWithNulls) ListsCol{{null, 2, 3, 4}, null_at(0)}, ListsCol{} /*NULL*/, ListsCol{{1, 2, null, 4}, null_at(2)}, - 
ListsCol{{1, 2, 3, null}, null_at(3)}}, - null_at(3)} + ListsCol{{1, 2, 3, null}, null_at(3)}, + ListsCol{} /*NULL*/}, + null_at({3, 6})} .release(); auto const col2 = ListsCol{{ListsCol{{10, 11, 12, null}, null_at(3)}, ListsCol{{13, 14, 15, 16, 17, null}, null_at(5)}, ListsCol{} /*NULL*/, ListsCol{{null, 18}, null_at(0)}, ListsCol{{19, 20, null}, null_at(2)}, - ListsCol{{null}, null_at(0)}}, - null_at(2)} + ListsCol{{null}, null_at(0)}, + ListsCol{} /*NULL*/}, + null_at({2, 6})} .release(); auto const col3 = ListsCol{{ListsCol{} /*NULL*/, ListsCol{{20, null}, null_at(1)}, ListsCol{{null, 21, null, null}, null_at({0, 2, 3})}, ListsCol{}, ListsCol{22, 23, 24, 25}, - ListsCol{{null, null, null, null, null}, all_nulls()}}, - null_at(0)} + ListsCol{{null, null, null, null, null}, all_nulls()}, + ListsCol{} /*NULL*/}, + null_at({0, 6})} .release(); // Ignore null list elements @@ -209,13 +212,15 @@ TYPED_TEST(ListConcatenateRowsTypedTest, SimpleInputWithNulls) auto const results = cudf::lists::concatenate_rows(TView{{col1->view(), col2->view(), col3->view()}}); auto const expected = - ListsCol{ListsCol{{1, null, 3, 4, 10, 11, 12, null}, null_at({1, 7})}, - ListsCol{{null, 2, 3, 4, 13, 14, 15, 16, 17, null, 20, null}, null_at({0, 9, 11})}, - ListsCol{{null, 2, 3, 4, null, 21, null, null}, null_at({0, 4, 6, 7})}, - ListsCol{{null, 18}, null_at(0)}, - ListsCol{{1, 2, null, 4, 19, 20, null, 22, 23, 24, 25}, null_at({2, 6})}, - ListsCol{{1, 2, 3, null, null, null, null, null, null, null}, - null_at({3, 4, 5, 6, 7, 8, 9})}} + ListsCol{{ListsCol{{1, null, 3, 4, 10, 11, 12, null}, null_at({1, 7})}, + ListsCol{{null, 2, 3, 4, 13, 14, 15, 16, 17, null, 20, null}, null_at({0, 9, 11})}, + ListsCol{{null, 2, 3, 4, null, 21, null, null}, null_at({0, 4, 6, 7})}, + ListsCol{{null, 18}, null_at(0)}, + ListsCol{{1, 2, null, 4, 19, 20, null, 22, 23, 24, 25}, null_at({2, 6})}, + ListsCol{{1, 2, 3, null, null, null, null, null, null, null}, + null_at({3, 4, 5, 6, 7, 8, 9})}, + ListsCol{} 
/*NULL*/}, + null_at(6)} .release(); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*expected, *results, print_all); } @@ -232,8 +237,9 @@ TYPED_TEST(ListConcatenateRowsTypedTest, SimpleInputWithNulls) ListsCol{} /*NULL*/, ListsCol{{1, 2, null, 4, 19, 20, null, 22, 23, 24, 25}, null_at({2, 6})}, ListsCol{{1, 2, 3, null, null, null, null, null, null, null}, - null_at({3, 4, 5, 6, 7, 8, 9})}}, - null_at({0, 2, 3})} + null_at({3, 4, 5, 6, 7, 8, 9})}, + ListsCol{} /*NULL*/}, + null_at({0, 2, 3, 6})} .release(); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*expected, *results, print_all); } From 6920f9be9237c77258972aab9bfebd1566ac11aa Mon Sep 17 00:00:00 2001 From: Ray Douglass <3107146+raydouglass@users.noreply.github.com> Date: Fri, 21 May 2021 13:17:20 -0400 Subject: [PATCH 15/27] Update readme with correct CUDA versions (#8315) Replaces CUDA 10.1/10.2 with 11.0/11.2. Authors: - Ray Douglass (https://github.com/raydouglass) Approvers: - AJ Schmidt (https://github.com/ajschmidt8) URL: https://github.com/rapidsai/cudf/pull/8315 --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 733d1c7897b..587f18d2603 100644 --- a/README.md +++ b/README.md @@ -67,13 +67,13 @@ cuDF can be installed with conda ([miniconda](https://conda.io/miniconda.html), For `cudf version == 21.06` : ```bash -# for CUDA 10.1 +# for CUDA 11.0 conda install -c rapidsai -c nvidia -c numba -c conda-forge \ - cudf=21.06 python=3.7 cudatoolkit=10.1 + cudf=21.06 python=3.7 cudatoolkit=11.0 -# or, for CUDA 10.2 +# or, for CUDA 11.2 conda install -c rapidsai -c nvidia -c numba -c conda-forge \ - cudf=21.06 python=3.7 cudatoolkit=10.2 + cudf=21.06 python=3.7 cudatoolkit=11.2 ``` From 5c6b92a38c5a82ee259b6414a8bbc568d8e78389 Mon Sep 17 00:00:00 2001 From: MithunR Date: Fri, 21 May 2021 10:44:27 -0700 Subject: [PATCH 16/27] COLLECT_LIST support returning empty output columns. (#8279) Fixes the group-by portion of #7611. 
When `COLLECT_LIST()` or `COLLECT_SET()` aggregations are called on a grouped input, if the input column is empty, then one sees the following failure: ``` C++ exception with description "cuDF failure at: .../cpp/src/column/column_factories.cpp:67: make_empty_column is invalid to call on nested types" thrown in the test body. ``` The operation should have resulted in an empty `LIST` column. `make_empty_column()` does not support `LIST` types (in part because the `data_type` parameter does not capture the types of the child columns). This commit fixes this by constructing the output column from the specified `values` input, but only for `COLLECT_LIST()` and `COLLECT_SET()`; other aggregation types are unchanged. Authors: - MithunR (https://github.com/mythrocks) Approvers: - Conor Hoekstra (https://github.com/codereport) - Nghia Truong (https://github.com/ttnghia) - https://github.com/nvdbaranec URL: https://github.com/rapidsai/cudf/pull/8279 --- cpp/src/groupby/groupby.cu | 41 +++++++++++++- cpp/tests/groupby/collect_list_tests.cpp | 70 ++++++++++++++++++++++++ cpp/tests/groupby/collect_set_tests.cpp | 3 +- cpp/tests/groupby/nth_element_tests.cpp | 40 ++++++++++++++ 4 files changed, 151 insertions(+), 3 deletions(-) diff --git a/cpp/src/groupby/groupby.cu b/cpp/src/groupby/groupby.cu index a5fd6d6f9bb..f132d6b1511 100644 --- a/cpp/src/groupby/groupby.cu +++ b/cpp/src/groupby/groupby.cu @@ -79,6 +79,44 @@ std::pair, std::vector> groupby::disp groupby::~groupby() = default; namespace { + +/** + * @brief Factory to construct empty result columns. + * + * Adds special handling for COLLECT_LIST/COLLECT_SET, because: + * 1. `make_empty_column()` does not support construction of nested columns. + * 2. Empty lists need empty child columns, to persist type information. 
+ */ +struct empty_column_constructor { + column_view values; + + template + std::unique_ptr operator()() const + { + using namespace cudf; + using namespace cudf::detail; + + if constexpr (k == aggregation::Kind::COLLECT_LIST || k == aggregation::Kind::COLLECT_SET) { + return make_lists_column( + 0, make_empty_column(data_type{type_to_id()}), empty_like(values), 0, {}); + } + + // If `values` is LIST typed, and the aggregation results match the type, + // construct empty results based on `values`. + // Most generally, this applies if input type matches output type. + // + // Note: `target_type_t` is not recursive, and `ValuesType` does not consider children. + // It is important that `COLLECT_LIST` and `COLLECT_SET` are handled before this + // point, because `COLLECT_LIST(LIST)` produces `LIST`, but `target_type_t` + // wouldn't know the difference. + if constexpr (std::is_same_v, ValuesType>) { + return empty_like(values); + } + + return make_empty_column(target_type(values.type(), k)); + } +}; + /// Make an empty table with appropriate types for requested aggs auto empty_results(host_span requests) { @@ -93,7 +131,8 @@ auto empty_results(host_span requests) request.aggregations.end(), std::back_inserter(results), [&request](auto const& agg) { - return make_empty_column(cudf::detail::target_type(request.values.type(), agg->kind)); + return cudf::detail::dispatch_type_and_aggregation( + request.values.type(), agg->kind, empty_column_constructor{request.values}); }); return aggregation_result{std::move(results)}; diff --git a/cpp/tests/groupby/collect_list_tests.cpp b/cpp/tests/groupby/collect_list_tests.cpp index 7580c1c4e3b..9d2141c913c 100644 --- a/cpp/tests/groupby/collect_list_tests.cpp +++ b/cpp/tests/groupby/collect_list_tests.cpp @@ -86,6 +86,21 @@ TYPED_TEST(groupby_collect_list_test, CollectWithNullExclusion) test_single_agg(keys, values, expect_keys, expect_vals, std::move(agg)); } +TYPED_TEST(groupby_collect_list_test, CollectOnEmptyInput) +{ + using K 
= int32_t; + using V = TypeParam; + + fixed_width_column_wrapper keys{}; + fixed_width_column_wrapper values{}; + + fixed_width_column_wrapper expect_keys{}; + lists_column_wrapper expect_vals{}; + + auto agg = cudf::make_collect_list_aggregation(null_policy::EXCLUDE); + test_single_agg(keys, values, expect_keys, expect_vals, std::move(agg)); +} + TYPED_TEST(groupby_collect_list_test, CollectLists) { using K = int32_t; @@ -124,6 +139,61 @@ TYPED_TEST(groupby_collect_list_test, CollectListsWithNullExclusion) test_single_agg(keys, values, expect_keys, expect_vals, std::move(agg)); } +TYPED_TEST(groupby_collect_list_test, CollectOnEmptyInputLists) +{ + using K = int32_t; + using V = TypeParam; + + using LCW = cudf::test::lists_column_wrapper; + + auto offsets = data_type{type_to_id()}; + + fixed_width_column_wrapper keys{}; + auto values = cudf::make_lists_column(0, make_empty_column(offsets), LCW{}.release(), 0, {}); + + fixed_width_column_wrapper expect_keys{}; + + auto expect_child = + cudf::make_lists_column(0, make_empty_column(offsets), LCW{}.release(), 0, {}); + auto expect_values = + cudf::make_lists_column(0, make_empty_column(offsets), std::move(expect_child), 0, {}); + + auto agg = cudf::make_collect_list_aggregation(); + test_single_agg(keys, values->view(), expect_keys, expect_values->view(), std::move(agg)); +} + +TYPED_TEST(groupby_collect_list_test, CollectOnEmptyInputListsOfStructs) +{ + using K = int32_t; + using V = TypeParam; + + using LCW = cudf::test::lists_column_wrapper; + + fixed_width_column_wrapper keys{}; + auto struct_child = LCW{}; + auto struct_column = structs_column_wrapper{{struct_child}}; + + auto values = cudf::make_lists_column( + 0, make_empty_column(data_type{type_to_id()}), struct_column.release(), 0, {}); + + fixed_width_column_wrapper expect_keys{}; + + auto expect_struct_child = LCW{}; + auto expect_struct_column = structs_column_wrapper{{expect_struct_child}}; + + auto expect_child = + cudf::make_lists_column(0, + 
make_empty_column(data_type{type_to_id()}), + expect_struct_column.release(), + 0, + {}); + auto expect_values = cudf::make_lists_column( + 0, make_empty_column(data_type{type_to_id()}), std::move(expect_child), 0, {}); + + auto agg = cudf::make_collect_list_aggregation(); + test_single_agg(keys, values->view(), expect_keys, expect_values->view(), std::move(agg)); +} + TYPED_TEST(groupby_collect_list_test, dictionary) { using K = int32_t; diff --git a/cpp/tests/groupby/collect_set_tests.cpp b/cpp/tests/groupby/collect_set_tests.cpp index ce3a9a49372..d5a881a1993 100644 --- a/cpp/tests/groupby/collect_set_tests.cpp +++ b/cpp/tests/groupby/collect_set_tests.cpp @@ -58,8 +58,7 @@ TYPED_TEST_CASE(CollectSetTypedTest, FixedWidthTypesNotBool); TYPED_TEST(CollectSetTypedTest, TrivialInput) { // Empty input - // TODO: Enable this test after issue#7611 has been fixed - // test_single_agg(COL_K{}, COL_V{}, COL_K{}, COL_V{}, COLLECT_SET); + test_single_agg(COL_K{}, COL_V{}, COL_K{}, LCL_V{}, CollectSetTest::collect_set()); // Single key input { diff --git a/cpp/tests/groupby/nth_element_tests.cpp b/cpp/tests/groupby/nth_element_tests.cpp index ec0265a3023..5630cba09da 100644 --- a/cpp/tests/groupby/nth_element_tests.cpp +++ b/cpp/tests/groupby/nth_element_tests.cpp @@ -362,5 +362,45 @@ TEST_F(groupby_nth_element_string_test, dictionary) keys, vals, expect_keys, expect_vals->view(), cudf::make_nth_element_aggregation(2)); } +template +struct groupby_nth_element_lists_test : BaseFixture { +}; + +TYPED_TEST_CASE(groupby_nth_element_lists_test, FixedWidthTypesWithoutFixedPoint); + +TYPED_TEST(groupby_nth_element_lists_test, Basics) +{ + using K = int32_t; + using V = TypeParam; + + using lists = cudf::test::lists_column_wrapper; + + auto keys = fixed_width_column_wrapper{1, 1, 2, 2, 3, 3}; + auto values = lists{{1, 2}, {3, 4}, {5, 6, 7}, lists{}, {9, 10}, {11}}; + + auto expected_keys = fixed_width_column_wrapper{1, 2, 3}; + auto expected_values = lists{{1, 2}, {5, 6, 7}, {9, 
10}}; + + test_single_agg( + keys, values, expected_keys, expected_values, cudf::make_nth_element_aggregation(0)); +} + +TYPED_TEST(groupby_nth_element_lists_test, EmptyInput) +{ + using K = int32_t; + using V = TypeParam; + + using lists = cudf::test::lists_column_wrapper; + + auto keys = fixed_width_column_wrapper{}; + auto values = lists{}; + + auto expected_keys = fixed_width_column_wrapper{}; + auto expected_values = lists{}; + + test_single_agg( + keys, values, expected_keys, expected_values, cudf::make_nth_element_aggregation(2)); +} + } // namespace test } // namespace cudf From de579a59714f960fe33440811b4c49e5efeb3f3f Mon Sep 17 00:00:00 2001 From: Kumar Aatish Date: Fri, 21 May 2021 16:05:18 -0400 Subject: [PATCH 17/27] Added decimal writing for CSV writer (#8296) Addresses #7110 column_to_strings_fn was specialized for fixed point type to enable support for csv writer. A test was added to validate output file created by csv writer for decimal type column. Authors: - Kumar Aatish (https://github.com/kaatish) Approvers: - Nghia Truong (https://github.com/ttnghia) - David Wendt (https://github.com/davidwendt) - Vukasin Milovanovic (https://github.com/vuule) - Devavret Makkar (https://github.com/devavret) URL: https://github.com/rapidsai/cudf/pull/8296 --- cpp/src/io/csv/writer_impl.cu | 12 +++- cpp/tests/io/csv_test.cpp | 104 ++++++++++++++++++++++++++++++++++ 2 files changed, 115 insertions(+), 1 deletion(-) diff --git a/cpp/src/io/csv/writer_impl.cu b/cpp/src/io/csv/writer_impl.cu index d2b6be5eead..13760381373 100644 --- a/cpp/src/io/csv/writer_impl.cu +++ b/cpp/src/io/csv/writer_impl.cu @@ -119,7 +119,8 @@ struct column_to_strings_fn { return not((std::is_same::value) || (std::is_integral::value) || (std::is_floating_point::value) || - (cudf::is_timestamp()) || (cudf::is_duration())); + (cudf::is_fixed_point()) || (cudf::is_timestamp()) || + (cudf::is_duration())); } explicit column_to_strings_fn( @@ -189,6 +190,15 @@ struct column_to_strings_fn { 
return cudf::strings::detail::from_floats(column, stream_, mr_); } + // fixed point: + // + template + std::enable_if_t(), std::unique_ptr> operator()( + column_view const& column) const + { + return cudf::strings::detail::from_fixed_point(column, stream_, mr_); + } + // timestamps: // template diff --git a/cpp/tests/io/csv_test.cpp b/cpp/tests/io/csv_test.cpp index 6bc08cf24a6..e45b67505ba 100644 --- a/cpp/tests/io/csv_test.cpp +++ b/cpp/tests/io/csv_test.cpp @@ -22,9 +22,11 @@ #include #include +#include #include #include #include +#include #include #include #include @@ -61,6 +63,16 @@ using table_view = cudf::table_view; auto const temp_env = static_cast( ::testing::AddGlobalTestEnvironment(new cudf::test::TempDirTestEnvironment)); +// Base test fixture for tests +struct CsvWriterTest : public cudf::test::BaseFixture { +}; + +template +struct CsvFixedPointWriterTest : public CsvWriterTest { +}; + +TYPED_TEST_CASE(CsvFixedPointWriterTest, cudf::test::FixedPointTypes); + // Base test fixture for tests struct CsvReaderTest : public cudf::test::BaseFixture { }; @@ -307,6 +319,98 @@ TYPED_TEST(CsvReaderNumericTypeTest, SingleColumn) expect_column_data_equal(std::vector(sequence, sequence + num_rows), view.column(0)); } +TYPED_TEST(CsvFixedPointWriterTest, SingleColumnNegativeScale) +{ + std::vector reference_strings = { + "1.23", "-8.76", "5.43", "-0.12", "0.25", "-0.23", "-0.27", "0.00", "0.00"}; + + auto validity = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return (i % 2 == 0) ? 
true : false; }); + cudf::test::strings_column_wrapper strings( + reference_strings.begin(), reference_strings.end(), validity); + + std::vector valid_reference_strings; + thrust::copy_if(thrust::host, + reference_strings.begin(), + reference_strings.end(), + thrust::make_counting_iterator(0), + std::back_inserter(valid_reference_strings), + validity.functor()); + reference_strings = valid_reference_strings; + + using DecimalType = TypeParam; + auto input_column = cudf::strings::to_fixed_point( + cudf::strings_column_view(strings), + cudf::data_type{cudf::type_to_id(), numeric::scale_type{-2}}); + + auto input_table = cudf::table_view{std::vector{*input_column}}; + + auto filepath = temp_env->get_temp_dir() + "FixedPointSingleColumnNegativeScale.csv"; + + cudf_io::csv_writer_options writer_options = + cudf_io::csv_writer_options::builder(cudf_io::sink_info(filepath), input_table); + + cudf_io::write_csv(writer_options); + + std::vector result_strings; + result_strings.reserve(reference_strings.size()); + + std::ifstream read_result_file(filepath); + assert(read_result_file.is_open()); + + std::copy(std::istream_iterator(read_result_file), + std::istream_iterator(), + std::back_inserter(result_strings)); + + EXPECT_EQ(result_strings, reference_strings); +} + +TYPED_TEST(CsvFixedPointWriterTest, SingleColumnPositiveScale) +{ + std::vector reference_strings = { + "123000", "-876000", "543000", "-12000", "25000", "-23000", "-27000", "0000", "0000"}; + + auto validity = cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return (i % 2 == 0) ? 
true : false; }); + cudf::test::strings_column_wrapper strings( + reference_strings.begin(), reference_strings.end(), validity); + + std::vector valid_reference_strings; + thrust::copy_if(thrust::host, + reference_strings.begin(), + reference_strings.end(), + thrust::make_counting_iterator(0), + std::back_inserter(valid_reference_strings), + validity.functor()); + reference_strings = valid_reference_strings; + + using DecimalType = TypeParam; + auto input_column = cudf::strings::to_fixed_point( + cudf::strings_column_view(strings), + cudf::data_type{cudf::type_to_id(), numeric::scale_type{3}}); + + auto input_table = cudf::table_view{std::vector{*input_column}}; + + auto filepath = temp_env->get_temp_dir() + "FixedPointSingleColumnPositiveScale.csv"; + + cudf_io::csv_writer_options writer_options = + cudf_io::csv_writer_options::builder(cudf_io::sink_info(filepath), input_table); + + cudf_io::write_csv(writer_options); + + std::vector result_strings; + result_strings.reserve(reference_strings.size()); + + std::ifstream read_result_file(filepath); + assert(read_result_file.is_open()); + + std::copy(std::istream_iterator(read_result_file), + std::istream_iterator(), + std::back_inserter(result_strings)); + + EXPECT_EQ(result_strings, reference_strings); +} + TEST_F(CsvReaderTest, MultiColumn) { constexpr auto num_rows = 10; From 696902d236eb580f947a89ddd147d1c6b7fd1c89 Mon Sep 17 00:00:00 2001 From: ChrisJar Date: Sun, 23 May 2021 06:33:51 -0500 Subject: [PATCH 18/27] Enable implicit casting when concatenating mixed types (#8276) This enables implicit casting when decimal columns are concatenated with numeric columns by casting the numeric columns to decimal columns. 
Closes #8264 Authors: - https://github.com/ChrisJar Approvers: - Ashwin Srinath (https://github.com/shwina) - https://github.com/brandon-b-miller URL: https://github.com/rapidsai/cudf/pull/8276 --- python/cudf/cudf/core/frame.py | 19 +- python/cudf/cudf/core/series.py | 9 +- python/cudf/cudf/tests/test_concat.py | 265 ++++++++++++++++++++++++++ python/cudf/cudf/utils/dtypes.py | 26 ++- 4 files changed, 291 insertions(+), 28 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index f59954aaf08..cda4e8cbd4c 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -32,6 +32,7 @@ is_numerical_dtype, is_scalar, min_scalar_type, + find_common_type, ) T = TypeVar("T", bound="Frame") @@ -4029,8 +4030,11 @@ def _find_common_dtypes_and_categories(non_null_columns, dtypes): # default to the first non-null dtype dtypes[idx] = cols[0].dtype # If all the non-null dtypes are int/float, find a common dtype - if all(is_numerical_dtype(col.dtype) for col in cols): - dtypes[idx] = np.find_common_type([col.dtype for col in cols], []) + if all( + is_numerical_dtype(col.dtype) or is_decimal_dtype(col.dtype) + for col in cols + ): + dtypes[idx] = find_common_type([col.dtype for col in cols]) # If all categorical dtypes, combine the categories elif all( isinstance(col, cudf.core.column.CategoricalColumn) for col in cols @@ -4045,17 +4049,6 @@ def _find_common_dtypes_and_categories(non_null_columns, dtypes): # Set the column dtype to the codes' dtype. 
The categories # will be re-assigned at the end dtypes[idx] = min_scalar_type(len(categories[idx])) - elif all( - isinstance(col, cudf.core.column.DecimalColumn) for col in cols - ): - # Find the largest scale and the largest difference between - # precision and scale of the columns to be concatenated - s = max([col.dtype.scale for col in cols]) - lhs = max([col.dtype.precision - col.dtype.scale for col in cols]) - # Combine to get the necessary precision and clip at the maximum - # precision - p = min(cudf.Decimal64Dtype.MAX_PRECISION, s + lhs) - dtypes[idx] = cudf.Decimal64Dtype(p, s) # Otherwise raise an error if columns have different dtypes elif not all(is_dtype_equal(c.dtype, dtypes[idx]) for c in cols): raise ValueError("All columns must be the same type") diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index d812214caf8..a894baf8235 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -45,7 +45,6 @@ from cudf.utils import cudautils, docutils, ioutils from cudf.utils.docutils import copy_docstring from cudf.utils.dtypes import ( - _decimal_normalize_types, can_convert_to_column, is_decimal_dtype, is_list_dtype, @@ -53,7 +52,7 @@ is_mixed_with_object_dtype, is_scalar, min_scalar_type, - numeric_normalize_types, + find_common_type, ) from cudf.utils.utils import ( get_appropriate_dispatched_func, @@ -2402,10 +2401,8 @@ def _concat(cls, objs, axis=0, index=True): ) if dtype_mismatch: - if isinstance(objs[0]._column, cudf.core.column.DecimalColumn): - objs = _decimal_normalize_types(*objs) - else: - objs = numeric_normalize_types(*objs) + common_dtype = find_common_type([obj.dtype for obj in objs]) + objs = [obj.astype(common_dtype) for obj in objs] col = _concat_columns([o._column for o in objs]) diff --git a/python/cudf/cudf/tests/test_concat.py b/python/cudf/cudf/tests/test_concat.py index 31dc6012905..5c4c121db4d 100644 --- a/python/cudf/cudf/tests/test_concat.py +++ 
b/python/cudf/cudf/tests/test_concat.py @@ -5,6 +5,7 @@ import numpy as np import pandas as pd import pytest +from decimal import Decimal import cudf as gd from cudf.tests.utils import assert_eq, assert_exceptions_equal @@ -1262,3 +1263,267 @@ def test_concat_decimal_series(ltype, rtype): expected = pd.concat([ps1, ps2]) assert_eq(expected, got) + + +@pytest.mark.parametrize( + "df1, df2, df3, expected", + [ + ( + gd.DataFrame( + {"val": [Decimal("42.5"), Decimal("8.7")]}, + dtype=Decimal64Dtype(5, 2), + ), + gd.DataFrame( + {"val": [Decimal("9.23"), Decimal("-67.49")]}, + dtype=Decimal64Dtype(6, 4), + ), + gd.DataFrame({"val": [8, -5]}, dtype="int32"), + gd.DataFrame( + { + "val": [ + Decimal("42.5"), + Decimal("8.7"), + Decimal("9.23"), + Decimal("-67.49"), + Decimal("8"), + Decimal("-5"), + ] + }, + dtype=Decimal64Dtype(7, 4), + index=[0, 1, 0, 1, 0, 1], + ), + ), + ( + gd.DataFrame( + {"val": [Decimal("95.2"), Decimal("23.4")]}, + dtype=Decimal64Dtype(5, 2), + ), + gd.DataFrame({"val": [54, 509]}, dtype="uint16"), + gd.DataFrame({"val": [24, -48]}, dtype="int32"), + gd.DataFrame( + { + "val": [ + Decimal("95.2"), + Decimal("23.4"), + Decimal("54"), + Decimal("509"), + Decimal("24"), + Decimal("-48"), + ] + }, + dtype=Decimal64Dtype(5, 2), + index=[0, 1, 0, 1, 0, 1], + ), + ), + ( + gd.DataFrame( + {"val": [Decimal("36.56"), Decimal("-59.24")]}, + dtype=Decimal64Dtype(9, 4), + ), + gd.DataFrame({"val": [403.21, 45.13]}, dtype="float32"), + gd.DataFrame({"val": [52.262, -49.25]}, dtype="float64"), + gd.DataFrame( + { + "val": [ + Decimal("36.56"), + Decimal("-59.24"), + Decimal("403.21"), + Decimal("45.13"), + Decimal("52.262"), + Decimal("-49.25"), + ] + }, + dtype=Decimal64Dtype(9, 4), + index=[0, 1, 0, 1, 0, 1], + ), + ), + ( + gd.DataFrame( + {"val": [Decimal("9563.24"), Decimal("236.633")]}, + dtype=Decimal64Dtype(9, 4), + ), + gd.DataFrame({"val": [5393, -95832]}, dtype="int64"), + gd.DataFrame({"val": [-29.234, -31.945]}, dtype="float64"), + gd.DataFrame( 
+ { + "val": [ + Decimal("9563.24"), + Decimal("236.633"), + Decimal("5393"), + Decimal("-95832"), + Decimal("-29.234"), + Decimal("-31.945"), + ] + }, + dtype=Decimal64Dtype(9, 4), + index=[0, 1, 0, 1, 0, 1], + ), + ), + ], +) +def test_concat_decimal_numeric_dataframe(df1, df2, df3, expected): + df = gd.concat([df1, df2, df3]) + assert_eq(df, expected) + assert_eq(df.val.dtype, expected.val.dtype) + + +@pytest.mark.parametrize( + "s1, s2, s3, expected", + [ + ( + gd.Series( + [Decimal("32.8"), Decimal("-87.7")], dtype=Decimal64Dtype(6, 2) + ), + gd.Series( + [Decimal("101.243"), Decimal("-92.449")], + dtype=Decimal64Dtype(9, 6), + ), + gd.Series([94, -22], dtype="int32"), + gd.Series( + [ + Decimal("32.8"), + Decimal("-87.7"), + Decimal("101.243"), + Decimal("-92.449"), + Decimal("94"), + Decimal("-22"), + ], + dtype=Decimal64Dtype(10, 6), + index=[0, 1, 0, 1, 0, 1], + ), + ), + ( + gd.Series( + [Decimal("7.2"), Decimal("122.1")], dtype=Decimal64Dtype(5, 2) + ), + gd.Series([33, 984], dtype="uint32"), + gd.Series([593, -702], dtype="int32"), + gd.Series( + [ + Decimal("7.2"), + Decimal("122.1"), + Decimal("33"), + Decimal("984"), + Decimal("593"), + Decimal("-702"), + ], + dtype=Decimal64Dtype(5, 2), + index=[0, 1, 0, 1, 0, 1], + ), + ), + ( + gd.Series( + [Decimal("982.94"), Decimal("-493.626")], + dtype=Decimal64Dtype(9, 4), + ), + gd.Series([847.98, 254.442], dtype="float32"), + gd.Series([5299.262, -2049.25], dtype="float64"), + gd.Series( + [ + Decimal("982.94"), + Decimal("-493.626"), + Decimal("847.98"), + Decimal("254.442"), + Decimal("5299.262"), + Decimal("-2049.25"), + ], + dtype=Decimal64Dtype(9, 4), + index=[0, 1, 0, 1, 0, 1], + ), + ), + ( + gd.Series( + [Decimal("492.204"), Decimal("-72824.455")], + dtype=Decimal64Dtype(9, 4), + ), + gd.Series([8438, -27462], dtype="int64"), + gd.Series([-40.292, 49202.953], dtype="float64"), + gd.Series( + [ + Decimal("492.204"), + Decimal("-72824.455"), + Decimal("8438"), + Decimal("-27462"), + 
Decimal("-40.292"), + Decimal("49202.953"), + ], + dtype=Decimal64Dtype(9, 4), + index=[0, 1, 0, 1, 0, 1], + ), + ), + ], +) +def test_concat_decimal_numeric_series(s1, s2, s3, expected): + s = gd.concat([s1, s2, s3]) + assert_eq(s, expected) + + +@pytest.mark.parametrize( + "s1, s2, expected", + [ + ( + gd.Series( + [Decimal("955.22"), Decimal("8.2")], dtype=Decimal64Dtype(5, 2) + ), + gd.Series(["2007-06-12", "2006-03-14"], dtype="datetime64"), + gd.Series( + [ + "955.22", + "8.20", + "2007-06-12 00:00:00", + "2006-03-14 00:00:00", + ], + index=[0, 1, 0, 1], + ), + ), + ( + gd.Series( + [Decimal("-52.44"), Decimal("365.22")], + dtype=Decimal64Dtype(5, 2), + ), + gd.Series( + np.arange( + "2005-02-01T12", "2005-02-01T15", dtype="datetime64[h]" + ), + dtype="datetime64[s]", + ), + gd.Series( + [ + "-52.44", + "365.22", + "2005-02-01 12:00:00", + "2005-02-01 13:00:00", + "2005-02-01 14:00:00", + ], + index=[0, 1, 0, 1, 2], + ), + ), + ( + gd.Series( + [Decimal("753.0"), Decimal("94.22")], + dtype=Decimal64Dtype(5, 2), + ), + gd.Series([np.timedelta64(111, "s"), np.timedelta64(509, "s")]), + gd.Series( + ["753.00", "94.22", "0 days 00:01:51", "0 days 00:08:29"], + index=[0, 1, 0, 1], + ), + ), + ( + gd.Series( + [Decimal("753.0"), Decimal("94.22")], + dtype=Decimal64Dtype(5, 2), + ), + gd.Series( + [np.timedelta64(940252, "s"), np.timedelta64(758385, "s")] + ), + gd.Series( + ["753.00", "94.22", "10 days 21:10:52", "8 days 18:39:45"], + index=[0, 1, 0, 1], + ), + ), + ], +) +def test_concat_decimal_non_numeric(s1, s2, expected): + s = gd.concat([s1, s2]) + assert_eq(s, expected) diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 16c35bab4b1..0b59116f8e6 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -290,13 +290,15 @@ def is_decimal_dtype(obj): ) -def _decimal_normalize_types(*args): - s = max([a.dtype.scale for a in args]) - lhs = max([a.dtype.precision - a.dtype.scale for a in args]) +def 
_find_common_type_decimal(dtypes): + # Find the largest scale and the largest difference between + # precision and scale of the columns to be concatenated + s = max([dtype.scale for dtype in dtypes]) + lhs = max([dtype.precision - dtype.scale for dtype in dtypes]) + # Combine to get the necessary precision and clip at the maximum + # precision p = min(cudf.Decimal64Dtype.MAX_PRECISION, s + lhs) - dtype = cudf.Decimal64Dtype(p, s) - - return [a.astype(dtype) for a in args] + return cudf.Decimal64Dtype(p, s) def cudf_dtype_from_pydata_dtype(dtype): @@ -690,9 +692,15 @@ def find_common_type(dtypes): dtypes = set(dtypes) if any(is_decimal_dtype(dtype) for dtype in dtypes): - raise NotImplementedError( - "DecimalDtype is not yet supported in find_common_type" - ) + if all( + is_decimal_dtype(dtype) or is_numerical_dtype(dtype) + for dtype in dtypes + ): + return _find_common_type_decimal( + [dtype for dtype in dtypes if is_decimal_dtype(dtype)] + ) + else: + return np.dtype("O") # Corner case 1: # Resort to np.result_type to handle "M" and "m" types separately From ef20706d2f66ba6b32611f99c7b265c26d543d11 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Mon, 24 May 2021 07:22:37 -0400 Subject: [PATCH 19/27] Add separator-on-null parameter to strings concatenate APIs (#8282) Closes #4728 This PR adds a new parameter to the `cudf::strings::concatenate` APIs to specify if separators should be added between null entries when the null-replacement (narep) parameter is valid. If the narep scalar is invalid (i.e. null itself) then the entire output row becomes null. If not, separators are added between each element. 
Examples: ``` s1 = ['a', 'b', null, 'dd', null] s2 = ['A', null, 'CC', 'D', null] concatenate( {s1, s2}, sep='+', narep=invalid ) -> ['a+A', null, null, 'dd+D', null] concatenate( {s1, s2}, sep='+', narep='@' ) -> ['a+A', 'b+@', '@+CC', 'dd+D', '@+@'] concatenate( {s1, s2}, sep='+', narep='' ) -> ['a+A', 'b+', '+CC', 'dd+D', '+'] ``` The new parameter is an enum `separator_on_nulls` which has `YES` or `NO` settings. The default parameter value will be `YES` to keep the current behavior as expected by Python cudf and for consistency with Pandas behavior. Specifying `NO` here will suppress the separator for null elements (when narep is valid). ``` concatenate( {s1, s2}, sep='+', narep='', NO ) -> ['a+A', 'b', 'CC', 'dd+D', ''] ``` This PR also changes the name of the `cudf::strings::concatenate_list_elements` API to `cudf::strings::join_list_elements` instead. The API pattern and behavior more closely mimic `cudf::strings::join_strings` than the concatenate functions. Also, these are called by the Python `join` functions so the rename makes it more consistent with cudf. This is a breaking change in order to make these APIs more consistent. Previously, the separators column version was returning nulls only for an all-null row. This has been changed to honor the `separator_on_nulls` parameter instead. Currently there is no Python cudf API calling this version. Only the rename required minor changes to the Cython layer. The gtests were updated to reflect the new behavior. None of the pytests required any changes since the default parameter value matches the original behavior for those APIs that cudf actually calls.
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Robert Maynard (https://github.com/robertmaynard) - Nghia Truong (https://github.com/ttnghia) - Keith Kraus (https://github.com/kkraus14) - Thomas Graves (https://github.com/tgravescs) - Christopher Harris (https://github.com/cwharris) URL: https://github.com/rapidsai/cudf/pull/8282 --- cpp/CMakeLists.txt | 2 +- cpp/include/cudf/strings/combine.hpp | 134 ++++++++----- cpp/include/cudf/strings/detail/combine.hpp | 4 +- cpp/src/io/csv/writer_impl.cu | 15 +- cpp/src/strings/combine/concatenate.cu | 177 +++++++++--------- ...list_elements.cu => join_list_elements.cu} | 128 +++++++------ cpp/tests/CMakeLists.txt | 2 +- .../strings/combine/concatenate_tests.cpp | 125 ++++++++++--- ...tests.cpp => join_list_elements_tests.cpp} | 117 +++++++----- python/cudf/cudf/_lib/cpp/strings/combine.pxd | 4 +- python/cudf/cudf/_lib/strings/combine.pyx | 6 +- 11 files changed, 445 insertions(+), 269 deletions(-) rename cpp/src/strings/combine/{concatenate_list_elements.cu => join_list_elements.cu} (64%) rename cpp/tests/strings/combine/{concatenate_list_elements_tests.cpp => join_list_elements_tests.cpp} (82%) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index af6f60b031d..aa3b4406320 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -333,8 +333,8 @@ add_library(cudf src/strings/char_types/char_cases.cu src/strings/char_types/char_types.cu src/strings/combine/concatenate.cu - src/strings/combine/concatenate_list_elements.cu src/strings/combine/join.cu + src/strings/combine/join_list_elements.cu src/strings/contains.cu src/strings/convert/convert_booleans.cu src/strings/convert/convert_datetime.cu diff --git a/cpp/include/cudf/strings/combine.hpp b/cpp/include/cudf/strings/combine.hpp index 6887ef0e670..360efe15303 100644 --- a/cpp/include/cudf/strings/combine.hpp +++ b/cpp/include/cudf/strings/combine.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. 
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -30,12 +30,21 @@ namespace strings { * @brief Strings APIs for concatenate and join */ +/** + * @brief Setting for specifying how separators are added with + * null strings elements. + */ +enum class separator_on_nulls { + YES, ///< Always add separators between elements + NO ///< Do not add separators if an element is null +}; + /** * @brief Concatenates all strings in the column into one new string delimited * by an optional separator string. * * This returns a column with one string. Any null entries are ignored unless - * the narep parameter specifies a replacement string. + * the @p narep parameter specifies a replacement string. * * @code{.pseudo} * Example: @@ -70,11 +79,9 @@ std::unique_ptr join_strings( * * - If row separator for a given row is null, output column for that row is null, unless * there is a valid @p separator_narep - * - If all column values for a given row is null, output column for that row is null, unless - * there is a valid @p col_narep - * - null column values for a given row are skipped, if the column replacement isn't valid - * - The separator is only applied between two valid column values - * - If valid @p separator_narep and @p col_narep are provided, the output column is always + * - The separator is applied between two output row values if the @p separate_nulls + * is `YES` or only between valid rows if @p separate_nulls is `NO`. 
+ * - If @p separator_narep and @p col_narep are both valid, the output column is always * non nullable * * @code{.pseudo} @@ -83,16 +90,25 @@ std::unique_ptr join_strings( * c1 = [null, 'cc', 'dd', null, null, 'gg'] * c2 = ['bb', '', null, null, null, 'hh'] * sep = ['::', '%%', '^^', '!', '*', null] - * out0 = concatenate([c0, c1, c2], sep) - * out0 is ['aa::bb', 'cc%%', '^^dd', 'ee', null, null] + * out = concatenate({c0, c1, c2}, sep) + * // all rows have at least one null or sep[i]==null + * out is [null, null, null, null, null, null] * * sep_rep = '+' - * out1 = concatenate([c0, c1, c2], sep, sep_rep) - * out1 is ['aa::bb', 'cc%%', '^^dd', 'ee', null, 'ff+gg+hh'] - * - * col_rep = '-' - * out2 = concatenate([c0, c1, c2], sep, invalid_sep_rep, col_rep) - * out2 is ['aa::-::bb', '-%%cc%%', '^^dd^^-', 'ee!-!-', '-*-*-', null] + * out = concatenate({c0, c1, c2}, sep, sep_rep) + * // all rows with at least one null output as null + * out is [null, null, null, null, null, 'ff+gg+hh'] + * + * col_narep = '-' + * sep_na = non-valid scalar + * out = concatenate({c0, c1, c2}, sep, sep_na, col_narep) + * // only the null entry in the sep column produces a null row + * out is ['aa::-::bb', '-%%cc%%', '^^dd^^-', 'ee!-!-', '-*-*-', null] + * + * col_narep = '' + * out = concatenate({c0, c1, c2}, sep, sep_rep, col_narep, separator_on_nulls:NO) + * // parameter suppresses separator for null rows + * out is ['aa::bb', 'cc%%', '^^dd', 'ee', '', 'ff+gg+hh'] * @endcode * * @throw cudf::logic_error if no input columns are specified - table view is empty @@ -108,6 +124,8 @@ std::unique_ptr join_strings( * @param col_narep String that should be used in place of any null strings * found in any column. Default of invalid-scalar means no null column value replacements. * Default is an invalid string. + * @param separate_nulls If YES, then the separator is included for null rows + * if `col_narep` is valid. * @param mr Resource for allocating device memory. 
* @return New column with concatenated results. */ @@ -116,15 +134,9 @@ std::unique_ptr concatenate( strings_column_view const& separators, string_scalar const& separator_narep = string_scalar("", false), string_scalar const& col_narep = string_scalar("", false), + separator_on_nulls separate_nulls = separator_on_nulls::YES, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); -/** - * @addtogroup strings_combine - * @{ - * @file strings/combine.hpp - * @brief Strings APIs for concatenate and join - */ - /** * @brief Row-wise concatenates the given list of strings columns and * returns a single strings column result. @@ -136,20 +148,30 @@ std::unique_ptr concatenate( * row to be null entry unless a narep string is specified to be used * in its place. * - * The number of strings in the columns provided must be the same. + * If @p separate_nulls is set to `NO` and @p narep is valid then + * separators are not added to the output between null elements. + * Otherwise, separators are always added if @p narep is valid. + * + * More than one column must be specified in the input @p strings_columns + * table. * * @code{.pseudo} * Example: - * s1 = ['aa', null, '', 'aa'] - * s2 = ['', 'bb', 'bb', null] - * r1 = concatenate([s1,s2]) - * r1 is ['aa', null, 'bb', null] - * r2 = concatenate([s1,s2],':','_') - * r2 is ['aa:', '_:bb', ':bb', 'aa:_'] + * s1 = ['aa', null, '', 'dd'] + * s2 = ['', 'bb', 'cc', null] + * out = concatenate({s1, s2}) + * out is ['aa', null, 'cc', null] + * + * out = concatenate({s1, s2}, ':', '_') + * out is ['aa:', '_:bb', ':cc', 'dd:_'] + * + * out = concatenate({s1, s2}, ':', '', separator_on_nulls::NO) + * out is ['aa:', 'bb', ':cc', 'dd'] * @endcode * * @throw cudf::logic_error if input columns are not all strings columns. * @throw cudf::logic_error if separator is not valid. + * @throw cudf::logic_error if only one column is specified * * @param strings_columns List of string columns to concatenate. 
* @param separator String that should inserted between each string from each row. @@ -157,6 +179,7 @@ std::unique_ptr concatenate( * @param narep String that should be used in place of any null strings * found in any column. Default of invalid-scalar means any null entry in any column will * produces a null result for that row. + * @param separate_nulls If YES, then the separator is included for null rows if `narep` is valid. * @param mr Device memory resource used to allocate the returned column's device memory. * @return New column with concatenated results. */ @@ -164,6 +187,7 @@ std::unique_ptr concatenate( table_view const& strings_columns, string_scalar const& separator = string_scalar(""), string_scalar const& narep = string_scalar("", false), + separator_on_nulls separate_nulls = separator_on_nulls::YES, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -171,24 +195,30 @@ std::unique_ptr concatenate( * within each row and returns a single strings column result. * * Each new string is created by concatenating the strings from the same row (same list element) - * delimited by the row separator provided in the `separators` strings column. + * delimited by the row separator provided in the @p separators strings column. * * A null list row will always result in a null string in the output row. Any non-null list row * having a null element will result in the corresponding output row to be null unless a valid - * `string_narep` scalar is provided to be used in its place. Any null row in the `separators` - * column will also result in a null output row unless a valid `separator_narep` scalar is provided + * @p string_narep scalar is provided to be used in its place. Any null row in the @p separators + * column will also result in a null output row unless a valid @p separator_narep scalar is provided * to be used in place of the null separators. 
* + * If @p separate_nulls is set to `NO` and @p narep is valid then separators are not added to the + * output between null elements. Otherwise, separators are always added if @p narep is valid. + * * @code{.pseudo} * Example: - * s = [ {'aa', 'bb', 'cc'}, null, {'', 'dd'}, {'ee', null}, {'ff', 'gg'} ] + * s = [ ['aa', 'bb', 'cc'], null, ['', 'dd'], ['ee', null], ['ff', 'gg'] ] * sep = ['::', '%%', '!', '*', null] * - * r1 = strings::concatenate_list_elements(s, sep) - * r1 is ['aa::bb::cc', null, '!dd', null, null] + * out = join_list_elements(s, sep) + * out is ['aa::bb::cc', null, '!dd', null, null] + * + * out = join_list_elements(s, sep, ':', '_') + * out is ['aa::bb::cc', null, '!dd', 'ee*_', 'ff:gg'] * - * r2 = strings::concatenate_list_elements(s, sep, ':', '_') - * r2 is ['aa::bb::cc', null, '!dd', 'ee*_', 'ff:gg'] + * out = join_list_elements(s, sep, ':', '', separator_on_nulls::NO) + * out is ['aa::bb::cc', null, '!dd', 'ee', 'ff:gg'] * @endcode * * @throw cudf::logic_error if input column is not lists of strings column. @@ -203,14 +233,16 @@ std::unique_ptr concatenate( * @param string_narep String that should be used to replace null strings in any non-null list row, * default is an invalid-scalar denoting that list rows containing null strings will result * in null string in the corresponding output rows. + * @param separate_nulls If YES, then the separator is included for null rows if `narep` is valid. * @param mr Device memory resource used to allocate the returned column's device memory. * @return New strings column with concatenated results. 
*/ -std::unique_ptr concatenate_list_elements( +std::unique_ptr join_list_elements( const lists_column_view& lists_strings_column, const strings_column_view& separators, string_scalar const& separator_narep = string_scalar("", false), string_scalar const& string_narep = string_scalar("", false), + separator_on_nulls separate_nulls = separator_on_nulls::YES, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -218,21 +250,27 @@ std::unique_ptr concatenate_list_elements( * within each row and returns a single strings column result. * * Each new string is created by concatenating the strings from the same row (same list element) - * delimited by the separator provided. + * delimited by the @p separator provided. * * A null list row will always result in a null string in the output row. Any non-null list row - * having a null elenent will result in the corresponding output row to be null unless a narep - * string is specified to be used in its place. + * having a null elenent will result in the corresponding output row to be null unless a + * @p narep string is specified to be used in its place. + * + * If @p separate_nulls is set to `NO` and @p narep is valid then separators are not added to the + * output between null elements. Otherwise, separators are always added if @p narep is valid. 
* * @code{.pseudo} * Example: - * s = [ {'aa', 'bb', 'cc'}, null, {'', 'dd'}, {'ee', null}, {'ff'} ] + * s = [ ['aa', 'bb', 'cc'], null, ['', 'dd'], ['ee', null], ['ff'] ] + * + * out = join_list_elements(s) + * out is ['aabbcc', null, 'dd', null, 'ff'] * - * r1 = strings::concatenate_list_elements(s) - * r1 is ['aabbcc', null, 'dd', null, 'ff'] + * out = join_list_elements(s, ':', '_') + * out is ['aa:bb:cc', null, ':dd', 'ee:_', 'ff'] * - * r2 = strings::concatenate_list_elements(s, ':', '_') - * r2 is ['aa:bb:cc', null, ':dd', 'ee:_', 'ff'] + * out = join_list_elements(s, ':', '', separator_on_nulls::NO) + * out is ['aa:bb:cc', null, ':dd', 'ee', 'ff'] * @endcode * * @throw cudf::logic_error if input column is not lists of strings column. @@ -244,13 +282,15 @@ std::unique_ptr concatenate_list_elements( * @param narep String that should be used to replace null strings in any non-null list row, default * is an invalid-scalar denoting that list rows containing null strings will result in null * string in the corresponding output rows. + * @param separate_nulls If YES, then the separator is included for null rows if `narep` is valid. * @param mr Device memory resource used to allocate the returned column's device memory. * @return New strings column with concatenated results. 
*/ -std::unique_ptr concatenate_list_elements( +std::unique_ptr join_list_elements( const lists_column_view& lists_strings_column, string_scalar const& separator = string_scalar(""), string_scalar const& narep = string_scalar("", false), + separator_on_nulls separate_nulls = separator_on_nulls::YES, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/detail/combine.hpp b/cpp/include/cudf/strings/detail/combine.hpp index 6e25a4dfa38..d6bdf398886 100644 --- a/cpp/include/cudf/strings/detail/combine.hpp +++ b/cpp/include/cudf/strings/detail/combine.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ #include #include +#include #include #include @@ -36,6 +37,7 @@ std::unique_ptr concatenate( table_view const& strings_columns, string_scalar const& separator, string_scalar const& narep, + separator_on_nulls separate_nulls = separator_on_nulls::YES, rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); diff --git a/cpp/src/io/csv/writer_impl.cu b/cpp/src/io/csv/writer_impl.cu index 13760381373..bc0e1243d4f 100644 --- a/cpp/src/io/csv/writer_impl.cu +++ b/cpp/src/io/csv/writer_impl.cu @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -414,11 +415,19 @@ void writer::impl::write(table_view const& table, auto str_table_view = str_table_ptr->view(); // concatenate columns in each row into one big string column - //(using null representation and delimiter): + // (using null representation and delimiter): // std::string delimiter_str{options_.get_inter_column_delimiter()}; - auto str_concat_col = cudf::strings::detail::concatenate( - str_table_view, 
delimiter_str, options_.get_na_rep(), stream); + auto str_concat_col = [&] { + if (str_table_view.num_columns() > 1) + return cudf::strings::detail::concatenate(str_table_view, + delimiter_str, + options_.get_na_rep(), + strings::separator_on_nulls::YES, + stream); + cudf::string_scalar narep{options_.get_na_rep()}; + return cudf::strings::detail::replace_nulls(str_table_view.column(0), narep, stream); + }(); write_chunked(str_concat_col->view(), metadata, stream); } diff --git a/cpp/src/strings/combine/concatenate.cu b/cpp/src/strings/combine/concatenate.cu index 5d7b9152ff3..1329ad3113f 100644 --- a/cpp/src/strings/combine/concatenate.cu +++ b/cpp/src/strings/combine/concatenate.cu @@ -41,67 +41,93 @@ namespace strings { namespace detail { namespace { -/** - * @brief Concatenate strings functor - * - * This will concatenate the strings from each row of the given table - * and apply the separator. The null-replacement string `d_narep` is - * used in place of any string in a row that contains a null entry. - */ -struct concat_strings_fn { +struct concat_strings_base { table_device_view const d_table; - string_view const d_separator; string_scalar_device_view const d_narep; + separator_on_nulls separate_nulls; offset_type* d_offsets{}; char* d_chars{}; - __device__ void operator()(size_type idx) + /** + * @brief Concatenate each table row to a single output string. + * + * This will concatenate the strings from each row of the given table + * and apply the separator. The null-replacement string `d_narep` is + * used in place of any string in a row that contains a null entry. 
+ * + * @param idx The current row to process + * @param d_separator String to place in between each column's row + */ + __device__ void process_row(size_type idx, string_view const d_separator) { - bool const null_element = - thrust::any_of(thrust::seq, d_table.begin(), d_table.end(), [idx](auto const& col) { - return col.is_null(idx); - }); - // handle a null row - if (null_element && !d_narep.is_valid()) { + if (!d_narep.is_valid() && + thrust::any_of(thrust::seq, d_table.begin(), d_table.end(), [idx](auto const& col) { + return col.is_null(idx); + })) { if (!d_chars) d_offsets[idx] = 0; return; } - char* d_buffer = d_chars ? d_chars + d_offsets[idx] : nullptr; - size_type bytes = 0; + char* d_buffer = d_chars ? d_chars + d_offsets[idx] : nullptr; + offset_type bytes = 0; + bool write_separator = false; + for (auto itr = d_table.begin(); itr < d_table.end(); ++itr) { - auto const d_column = *itr; - auto const d_str = - d_column.is_null(idx) ? d_narep.value() : d_column.element(idx); - if (d_buffer) d_buffer = detail::copy_string(d_buffer, d_str); - bytes += d_str.size_bytes(); - // separator goes only in between elements - if (itr + 1 < d_table.end()) { + auto const d_column = *itr; + bool const null_element = d_column.is_null(idx); + + if (write_separator && (separate_nulls == separator_on_nulls::YES || !null_element)) { if (d_buffer) d_buffer = detail::copy_string(d_buffer, d_separator); bytes += d_separator.size_bytes(); + write_separator = false; } + + // write out column's row data (or narep if the row is null) + auto const d_str = null_element ? 
d_narep.value() : d_column.element(idx); + if (d_buffer) d_buffer = detail::copy_string(d_buffer, d_str); + bytes += d_str.size_bytes(); + + write_separator = + write_separator || (separate_nulls == separator_on_nulls::YES) || !null_element; } + if (!d_chars) d_offsets[idx] = bytes; } }; +/** + * @brief Single separator concatenate functor + */ +struct concat_strings_fn : concat_strings_base { + string_view const d_separator; + + concat_strings_fn(table_device_view const& d_table, + string_view const& d_separator, + string_scalar_device_view const& d_narep, + separator_on_nulls separate_nulls) + : concat_strings_base{d_table, d_narep, separate_nulls}, d_separator(d_separator) + { + } + + __device__ void operator()(size_type idx) { process_row(idx, d_separator); } +}; + } // namespace std::unique_ptr concatenate(table_view const& strings_columns, string_scalar const& separator, string_scalar const& narep, + separator_on_nulls separate_nulls, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { auto const num_columns = strings_columns.num_columns(); - CUDF_EXPECTS(num_columns > 0, "At least one column must be specified"); + CUDF_EXPECTS(num_columns > 1, "At least two columns must be specified"); // check all columns are of type string CUDF_EXPECTS(std::all_of(strings_columns.begin(), strings_columns.end(), [](auto c) { return c.type().id() == type_id::STRING; }), "All columns must be of type string"); - if (num_columns == 1) // single strings column returns a copy - return std::make_unique(*(strings_columns.begin()), stream, mr); auto const strings_count = strings_columns.num_rows(); if (strings_count == 0) // empty begets empty return detail::make_empty_strings_column(stream, mr); @@ -112,7 +138,7 @@ std::unique_ptr concatenate(table_view const& strings_columns, // Create device views from the strings columns. 
auto d_table = table_device_view::create(strings_columns, stream); - concat_strings_fn fn{*d_table, d_separator, d_narep}; + concat_strings_fn fn{*d_table, d_separator, d_narep, separate_nulls}; auto children = make_strings_children(fn, strings_count, stream, mr); // create resulting null mask @@ -120,9 +146,9 @@ std::unique_ptr concatenate(table_view const& strings_columns, thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count), [d_table = *d_table, d_narep] __device__(size_type idx) { - bool null_element = thrust::any_of( + if (d_narep.is_valid()) return true; + return !thrust::any_of( thrust::seq, d_table.begin(), d_table.end(), [idx](auto col) { return col.is_null(idx); }); - return (!null_element || d_narep.is_valid()); }, stream, mr); @@ -145,68 +171,42 @@ namespace { * when a separator row is null `d_separator_narep`. The `d_narep` is * used in place of a null entry in the strings columns. */ -struct multi_separator_concat_fn { - table_device_view const d_table; +struct multi_separator_concat_fn : concat_strings_base { column_device_view const d_separators; string_scalar_device_view const d_separator_narep; - string_scalar_device_view const d_narep; - offset_type* d_offsets{}; - char* d_chars{}; - __device__ void operator()(size_type idx) + multi_separator_concat_fn(table_device_view const& d_table, + column_device_view const& d_separators, + string_scalar_device_view const& d_separator_narep, + string_scalar_device_view const& d_narep, + separator_on_nulls separate_nulls) + : concat_strings_base{d_table, d_narep, separate_nulls}, + d_separators(d_separators), + d_separator_narep(d_separator_narep) { - bool const all_nulls = - thrust::all_of(thrust::seq, d_table.begin(), d_table.end(), [idx](auto const& col) { - return col.is_null(idx); - }); + } - if ((d_separators.is_null(idx) && !d_separator_narep.is_valid()) || - (all_nulls && !d_narep.is_valid())) { + __device__ void operator()(size_type idx) + { + if (d_separators.is_null(idx) 
&& !d_separator_narep.is_valid()) { if (!d_chars) d_offsets[idx] = 0; return; } - // point to output location - char* d_buffer = d_chars ? d_chars + d_offsets[idx] : nullptr; - offset_type bytes = 0; - - // there is at least one non-null column value auto const d_separator = d_separators.is_valid(idx) ? d_separators.element(idx) : d_separator_narep.value(); - auto const d_null_rep = d_narep.is_valid() ? d_narep.value() : string_view{}; - - // write output entry for this row - bool colval_written = false; // state variable for writing separators - for (auto const d_column : d_table) { - // if the row is null and if there is no replacement, skip it - if (d_column.is_null(idx) && !d_narep.is_valid()) continue; - - // separator in this row is written only after the first output - if (colval_written) { - if (d_buffer) d_buffer = detail::copy_string(d_buffer, d_separator); - bytes += d_separator.size_bytes(); - } - - // write out column's row data (or narep if the row is null) - string_view const d_str = - d_column.is_null(idx) ? d_null_rep : d_column.element(idx); - if (d_buffer) d_buffer = detail::copy_string(d_buffer, d_str); - bytes += d_str.size_bytes(); - - // column's string or narep could by empty so we need this flag - // to know we got this far even if no actual bytes were copied - colval_written = true; // use the separator before the next column - } - - if (!d_chars) d_offsets[idx] = bytes; + // base class utility function handles the rest + process_row(idx, d_separator); } }; + } // namespace std::unique_ptr concatenate(table_view const& strings_columns, strings_column_view const& separators, string_scalar const& separator_narep, string_scalar const& col_narep, + separator_on_nulls separate_nulls, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { @@ -234,20 +234,19 @@ std::unique_ptr concatenate(table_view const& strings_columns, // Create device views from the strings columns. 
auto d_table = table_device_view::create(strings_columns, stream); - multi_separator_concat_fn mscf{*d_table, separator_col_view, separator_rep, col_rep}; + multi_separator_concat_fn mscf{ + *d_table, separator_col_view, separator_rep, col_rep, separate_nulls}; auto children = make_strings_children(mscf, strings_count, stream, mr); // Create resulting null mask auto [null_mask, null_count] = cudf::detail::valid_if( thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count), - [d_table = *d_table, separator_col_view, separator_rep, col_rep] __device__(size_type ridx) { - if (!separator_col_view.is_valid(ridx) && !separator_rep.is_valid()) return false; - bool all_nulls = - thrust::all_of(thrust::seq, d_table.begin(), d_table.end(), [ridx](auto const& col) { - return col.is_null(ridx); - }); - return all_nulls ? col_rep.is_valid() : true; + [d_table = *d_table, separator_col_view, separator_rep, col_rep] __device__(size_type idx) { + if (!separator_col_view.is_valid(idx) && !separator_rep.is_valid()) return false; + if (col_rep.is_valid()) return true; + return !thrust::any_of( + thrust::seq, d_table.begin(), d_table.end(), [idx](auto col) { return col.is_null(idx); }); }, stream, mr); @@ -268,21 +267,29 @@ std::unique_ptr concatenate(table_view const& strings_columns, std::unique_ptr concatenate(table_view const& strings_columns, string_scalar const& separator, string_scalar const& narep, + separator_on_nulls separate_nulls, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::concatenate(strings_columns, separator, narep, rmm::cuda_stream_default, mr); + return detail::concatenate( + strings_columns, separator, narep, separate_nulls, rmm::cuda_stream_default, mr); } std::unique_ptr concatenate(table_view const& strings_columns, strings_column_view const& separators, string_scalar const& separator_narep, string_scalar const& col_narep, + separator_on_nulls separate_nulls, rmm::mr::device_memory_resource* mr) { 
CUDF_FUNC_RANGE(); - return detail::concatenate( - strings_columns, separators, separator_narep, col_narep, rmm::cuda_stream_default, mr); + return detail::concatenate(strings_columns, + separators, + separator_narep, + col_narep, + separate_nulls, + rmm::cuda_stream_default, + mr); } } // namespace strings diff --git a/cpp/src/strings/combine/concatenate_list_elements.cu b/cpp/src/strings/combine/join_list_elements.cu similarity index 64% rename from cpp/src/strings/combine/concatenate_list_elements.cu rename to cpp/src/strings/combine/join_list_elements.cu index 1157b8f3fce..7a83097566c 100644 --- a/cpp/src/strings/combine/concatenate_list_elements.cu +++ b/cpp/src/strings/combine/join_list_elements.cu @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -51,6 +52,7 @@ struct compute_size_and_concatenate_fn { offset_type const* const list_offsets; column_device_view const strings_dv; string_scalar_device_view const string_narep_dv; + separator_on_nulls const separate_nulls; offset_type* d_offsets{nullptr}; @@ -72,33 +74,38 @@ struct compute_size_and_concatenate_fn { return; } - auto const separator = func.separator(idx); - auto const separator_size = separator.size_bytes(); - auto size_bytes = size_type{0}; - bool written = false; - char* output_ptr = d_chars ? d_chars + d_offsets[idx] : nullptr; + auto const separator = func.separator(idx); + auto size_bytes = size_type{0}; + char* output_ptr = d_chars ? 
d_chars + d_offsets[idx] : nullptr; + bool write_separator = false; for (size_type str_idx = list_offsets[idx], idx_end = list_offsets[idx + 1]; str_idx < idx_end; ++str_idx) { - if (not d_chars and (strings_dv.is_null(str_idx) and not string_narep_dv.is_valid())) { + bool null_element = strings_dv.is_null(str_idx); + + if (not d_chars and (null_element and not string_narep_dv.is_valid())) { d_offsets[idx] = 0; d_validities[idx] = false; return; // early termination: the entire list of strings will result in a null string } - auto const d_str = strings_dv.is_null(str_idx) ? string_narep_dv.value() - : strings_dv.element(str_idx); - size_bytes += separator_size + d_str.size_bytes(); - if (output_ptr) { - // Separator is inserted only in between strings - if (written) { output_ptr = detail::copy_string(output_ptr, separator); } - output_ptr = detail::copy_string(output_ptr, d_str); - written = true; + + if (write_separator && (separate_nulls == separator_on_nulls::YES || !null_element)) { + if (output_ptr) output_ptr = detail::copy_string(output_ptr, separator); + size_bytes += separator.size_bytes(); + write_separator = false; } + + auto const d_str = + null_element ? 
string_narep_dv.value() : strings_dv.element(str_idx); + if (output_ptr) output_ptr = detail::copy_string(output_ptr, d_str); + size_bytes += d_str.size_bytes(); + + write_separator = + write_separator || (separate_nulls == separator_on_nulls::YES) || !null_element; } - // Separator is inserted only in between strings if (not d_chars) { - d_offsets[idx] = static_cast(size_bytes - separator_size); + d_offsets[idx] = size_bytes; d_validities[idx] = true; } } @@ -123,11 +130,12 @@ struct scalar_separator_fn { } // namespace -std::unique_ptr concatenate_list_elements(lists_column_view const& lists_strings_column, - string_scalar const& separator, - string_scalar const& narep, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::unique_ptr join_list_elements(lists_column_view const& lists_strings_column, + string_scalar const& separator, + string_scalar const& narep, + separator_on_nulls separate_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(lists_strings_column.child().type().id() == type_id::STRING, "The input column must be a column of lists of strings"); @@ -146,14 +154,14 @@ std::unique_ptr concatenate_list_elements(lists_column_view const& lists auto const sep_dv = get_scalar_device_view(const_cast(separator)); auto const string_narep_dv = get_scalar_device_view(const_cast(narep)); - auto const func = scalar_separator_fn{sep_dv}; - auto const comp_fn = compute_size_and_concatenate_fn{ - func, - *lists_dv_ptr, - lists_strings_column.offsets_begin(), - *strings_dv_ptr, - string_narep_dv, - }; + auto const func = scalar_separator_fn{sep_dv}; + auto const comp_fn = + compute_size_and_concatenate_fn{func, + *lists_dv_ptr, + lists_strings_column.offsets_begin(), + *strings_dv_ptr, + string_narep_dv, + separate_nulls}; auto [offsets_column, chars_column, null_mask, null_count] = make_strings_children_with_null_mask(comp_fn, num_rows, num_rows, stream, mr); @@ -191,12 +199,13 @@ struct 
column_separators_fn { } // namespace -std::unique_ptr concatenate_list_elements(lists_column_view const& lists_strings_column, - strings_column_view const& separators, - string_scalar const& separator_narep, - string_scalar const& string_narep, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::unique_ptr join_list_elements(lists_column_view const& lists_strings_column, + strings_column_view const& separators, + string_scalar const& separator_narep, + string_scalar const& string_narep, + separator_on_nulls separate_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(lists_strings_column.child().type().id() == type_id::STRING, "The input column must be a column of lists of strings"); @@ -217,14 +226,14 @@ std::unique_ptr concatenate_list_elements(lists_column_view const& lists auto const sep_dv_ptr = column_device_view::create(separators.parent(), stream); auto const sep_narep_dv = get_scalar_device_view(const_cast(separator_narep)); - auto const func = column_separators_fn{*sep_dv_ptr, sep_narep_dv}; - auto const comp_fn = compute_size_and_concatenate_fn{ - func, - *lists_dv_ptr, - lists_strings_column.offsets_begin(), - *strings_dv_ptr, - string_narep_dv, - }; + auto const func = column_separators_fn{*sep_dv_ptr, sep_narep_dv}; + auto const comp_fn = + compute_size_and_concatenate_fn{func, + *lists_dv_ptr, + lists_strings_column.offsets_begin(), + *strings_dv_ptr, + string_narep_dv, + separate_nulls}; auto [offsets_column, chars_column, null_mask, null_count] = make_strings_children_with_null_mask(comp_fn, num_rows, num_rows, stream, mr); @@ -239,25 +248,32 @@ std::unique_ptr concatenate_list_elements(lists_column_view const& lists } // namespace detail -std::unique_ptr concatenate_list_elements(lists_column_view const& lists_strings_column, - string_scalar const& separator, - string_scalar const& narep, - rmm::mr::device_memory_resource* mr) +std::unique_ptr join_list_elements(lists_column_view 
const& lists_strings_column, + string_scalar const& separator, + string_scalar const& narep, + separator_on_nulls separate_nulls, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::concatenate_list_elements( - lists_strings_column, separator, narep, rmm::cuda_stream_default, mr); + return detail::join_list_elements( + lists_strings_column, separator, narep, separate_nulls, rmm::cuda_stream_default, mr); } -std::unique_ptr concatenate_list_elements(lists_column_view const& lists_strings_column, - strings_column_view const& separators, - string_scalar const& separator_narep, - string_scalar const& string_narep, - rmm::mr::device_memory_resource* mr) +std::unique_ptr join_list_elements(lists_column_view const& lists_strings_column, + strings_column_view const& separators, + string_scalar const& separator_narep, + string_scalar const& string_narep, + separator_on_nulls separate_nulls, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::concatenate_list_elements( - lists_strings_column, separators, separator_narep, string_narep, rmm::cuda_stream_default, mr); + return detail::join_list_elements(lists_strings_column, + separators, + separator_narep, + string_narep, + separate_nulls, + rmm::cuda_stream_default, + mr); } } // namespace strings diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index f36ec70479b..bbcfd69a52b 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -328,8 +328,8 @@ ConfigureTest(STRINGS_TEST strings/booleans_tests.cpp strings/case_tests.cpp strings/chars_types_tests.cpp - strings/combine/concatenate_list_elements_tests.cpp strings/combine/concatenate_tests.cpp + strings/combine/join_list_elements_tests.cpp strings/combine/join_strings_tests.cpp strings/concatenate_tests.cpp strings/contains_tests.cpp diff --git a/cpp/tests/strings/combine/concatenate_tests.cpp b/cpp/tests/strings/combine/concatenate_tests.cpp index c1c390e8a82..d91f669e42d 100644 --- 
a/cpp/tests/strings/combine/concatenate_tests.cpp +++ b/cpp/tests/strings/combine/concatenate_tests.cpp @@ -95,6 +95,58 @@ TEST_F(StringsCombineTest, Concatenate) } } +TEST_F(StringsCombineTest, ConcatenateSkipNulls) +{ + cudf::test::strings_column_wrapper strings1({"eee", "", "", "", "aa", "bbb", "ééé"}, + {1, 0, 0, 1, 1, 1, 1}); + cudf::test::strings_column_wrapper strings2({"xyz", "", "d", "éa", "", "", "f"}, + {1, 0, 1, 1, 1, 0, 1}); + cudf::test::strings_column_wrapper strings3({"q", "", "s", "t", "u", "", "w"}, + {1, 1, 1, 1, 1, 0, 1}); + + cudf::table_view table({strings1, strings2, strings3}); + + { + cudf::test::strings_column_wrapper expected( + {"eee+xyz+q", "++", "+d+s", "+éa+t", "aa++u", "bbb++", "ééé+f+w"}); + auto results = cudf::strings::concatenate(table, + cudf::string_scalar("+"), + cudf::string_scalar(""), + cudf::strings::separator_on_nulls::YES); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); + } + { + cudf::test::strings_column_wrapper expected( + {"eee+xyz+q", "", "d+s", "+éa+t", "aa++u", "bbb", "ééé+f+w"}); + auto results = cudf::strings::concatenate(table, + cudf::string_scalar("+"), + cudf::string_scalar(""), + cudf::strings::separator_on_nulls::NO); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); + } + { + cudf::test::strings_column_wrapper expected( + {"eee+xyz+q", "", "", "+éa+t", "aa++u", "", "ééé+f+w"}, {1, 0, 0, 1, 1, 0, 1}); + auto results = cudf::strings::concatenate(table, + cudf::string_scalar("+"), + cudf::string_scalar("", false), + cudf::strings::separator_on_nulls::NO); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); + } + { + cudf::test::strings_column_wrapper sep_col({"+", "-", ".", "@", "*", "^^", "#"}); + auto results = cudf::strings::concatenate(table, + cudf::strings_column_view(sep_col), + cudf::string_scalar(""), + cudf::string_scalar(""), + cudf::strings::separator_on_nulls::NO); + + cudf::test::strings_column_wrapper expected( + {"eee+xyz+q", "", "d.s", "@éa@t", "aa**u", "bbb", 
"ééé#f#w"}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); + } +} + TEST_F(StringsCombineTest, ConcatZeroSizeStringsColumns) { cudf::column_view zero_size_strings_column( @@ -107,6 +159,12 @@ TEST_F(StringsCombineTest, ConcatZeroSizeStringsColumns) cudf::test::expect_strings_empty(results->view()); } +TEST_F(StringsCombineTest, SingleColumnErrorCheck) +{ + cudf::column_view col0(cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0); + EXPECT_THROW(cudf::strings::concatenate(cudf::table_view{{col0}}), cudf::logic_error); +} + struct StringsConcatenateWithColSeparatorTest : public cudf::test::BaseFixture { }; @@ -157,7 +215,6 @@ TEST_F(StringsConcatenateWithColSeparatorTest, SingleColumnEmptyAndNullStringsNo auto exp_results = cudf::test::strings_column_wrapper({"", "", "", ""}, {false, true, false, false}); - auto results = cudf::strings::concatenate(cudf::table_view{{col0}}, cudf::strings_column_view(sep_col)); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results, true); @@ -295,12 +352,20 @@ TEST_F(StringsConcatenateWithColSeparatorTest, MultiColumnEmptyAndNullStringsNoR auto sep_col = cudf::test::strings_column_wrapper( {"", "", "", "", "", "", "", ""}, {true, false, true, false, true, false, true, false}); - auto exp_results = cudf::test::strings_column_wrapper( - {"", "", "", "", "", "", "", ""}, {false, false, true, false, true, false, true, false}); - + auto exp_results1 = cudf::test::strings_column_wrapper( + {"", "", "", "", "", "", "", ""}, {false, false, true, false, false, false, false, false}); auto results = cudf::strings::concatenate(cudf::table_view{{col0, col1}}, cudf::strings_column_view(sep_col)); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results, true); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results1, true); + + auto exp_results2 = cudf::test::strings_column_wrapper( + {"", "", "", "", "", "", "", ""}, {true, false, true, false, true, false, true, false}); + results = 
cudf::strings::concatenate(cudf::table_view{{col0, col1}}, + cudf::strings_column_view(sep_col), + cudf::string_scalar("", false), + cudf::string_scalar(""), + cudf::strings::separator_on_nulls::NO); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results2, true); } TEST_F(StringsConcatenateWithColSeparatorTest, MultiColumnStringMixNoReplacements) @@ -315,13 +380,23 @@ TEST_F(StringsConcatenateWithColSeparatorTest, MultiColumnStringMixNoReplacement {"", "~~~", "", "@", "", "", "", "^^^^", "", "--", "*****", "######"}, {true, true, false, true, false, true, false, true, true, true, true, true}); - auto exp_results = cudf::test::strings_column_wrapper( - {"eeexyzfoo", "~~~", "", "éééf", "", "", "", "valid", "doo", "", "", ""}, - {true, true, false, true, false, true, false, true, true, false, false, false}); + auto exp_results1 = cudf::test::strings_column_wrapper( + {"eeexyzfoo", "~~~", "", "", "", "", "", "", "", "", "", ""}, + {true, true, false, false, false, false, false, false, false, false, false, false}); auto results = cudf::strings::concatenate(cudf::table_view{{col0, col1}}, cudf::strings_column_view(sep_col)); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results, true); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results1, true); + + auto exp_results2 = cudf::test::strings_column_wrapper( + {"eeexyzfoo", "~~~", "", "éééf", "", "", "", "valid", "doo", "", "", ""}, + {true, true, false, true, false, true, false, true, true, true, true, true}); + results = cudf::strings::concatenate(cudf::table_view{{col0, col1}}, + cudf::strings_column_view(sep_col), + cudf::string_scalar("", false), + cudf::string_scalar(""), + cudf::strings::separator_on_nulls::NO); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results2, true); } TEST_F(StringsConcatenateWithColSeparatorTest, MultiColumnStringMixSeparatorReplacement) @@ -335,26 +410,26 @@ TEST_F(StringsConcatenateWithColSeparatorTest, MultiColumnStringMixSeparatorRepl auto sep_col = 
cudf::test::strings_column_wrapper( {"", "~~~", "", "@", "", "", "", "^^^^", "", "--", "*****", "######"}, {true, true, false, true, false, true, false, true, true, true, true, true}); - auto sep_rep = cudf::string_scalar("!!!!!!!!!!"); + auto sep_rep = cudf::string_scalar("!!!!!!!"); - auto exp_results = cudf::test::strings_column_wrapper( - {"eeexyzfoo", - "~~~", - "!!!!!!!!!!éaff", - "éééf", - "éa", - "", - "éaff", - "valid", - "doo", - "", - "", - ""}, - {true, true, true, true, true, true, true, true, true, false, false, false}); + auto exp_results1 = cudf::test::strings_column_wrapper( + {"eeexyzfoo", "~~~", "!!!!!!!éaff", "éééf", "éa", "", "éaff", "valid", "doo", "", "", ""}, + {true, true, true, false, false, false, false, false, false, false, false, false}); auto results = cudf::strings::concatenate( cudf::table_view{{col0, col1}}, cudf::strings_column_view(sep_col), sep_rep); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results, true); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results1, true); + + auto exp_results2 = cudf::test::strings_column_wrapper( + {"eeexyzfoo", "~~~", "!!!!!!!éaff", "éééf", "éa", "", "éaff", "valid", "doo", "", "", ""}, + {true, true, true, true, true, true, true, true, true, true, true, true}); + + results = cudf::strings::concatenate(cudf::table_view{{col0, col1}}, + cudf::strings_column_view(sep_col), + sep_rep, + cudf::string_scalar(""), + cudf::strings::separator_on_nulls::NO); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results2, true); } TEST_F(StringsConcatenateWithColSeparatorTest, MultiColumnStringMixColumnReplacement) diff --git a/cpp/tests/strings/combine/concatenate_list_elements_tests.cpp b/cpp/tests/strings/combine/join_list_elements_tests.cpp similarity index 82% rename from cpp/tests/strings/combine/concatenate_list_elements_tests.cpp rename to cpp/tests/strings/combine/join_list_elements_tests.cpp index b6afd588dfb..e2f7c3e36a2 100644 --- a/cpp/tests/strings/combine/concatenate_list_elements_tests.cpp 
+++ b/cpp/tests/strings/combine/join_list_elements_tests.cpp @@ -58,7 +58,7 @@ TEST_F(StringsListsConcatenateTest, InvalidInput) { auto const string_lists = INT_LISTS{{1, 2, 3}, {4, 5, 6}}.release(); auto const string_lv = cudf::lists_column_view(string_lists->view()); - EXPECT_THROW(cudf::strings::concatenate_list_elements(string_lv), cudf::logic_error); + EXPECT_THROW(cudf::strings::join_list_elements(string_lv), cudf::logic_error); } // Invalid scalar separator @@ -66,9 +66,8 @@ TEST_F(StringsListsConcatenateTest, InvalidInput) auto const string_lists = STR_LISTS{STR_LISTS{""}, STR_LISTS{"", "", ""}, STR_LISTS{"", ""}}.release(); auto const string_lv = cudf::lists_column_view(string_lists->view()); - EXPECT_THROW( - cudf::strings::concatenate_list_elements(string_lv, cudf::string_scalar("", false)), - cudf::logic_error); + EXPECT_THROW(cudf::strings::join_list_elements(string_lv, cudf::string_scalar("", false)), + cudf::logic_error); } // Invalid column separators @@ -77,7 +76,7 @@ TEST_F(StringsListsConcatenateTest, InvalidInput) STR_LISTS{STR_LISTS{""}, STR_LISTS{"", "", ""}, STR_LISTS{"", ""}}.release(); auto const string_lv = cudf::lists_column_view(string_lists->view()); auto const separators = STR_COL{"+++"}.release(); // size doesn't match with lists column size - EXPECT_THROW(cudf::strings::concatenate_list_elements(string_lv, separators->view()), + EXPECT_THROW(cudf::strings::join_list_elements(string_lv, separators->view()), cudf::logic_error); } } @@ -87,26 +86,26 @@ TEST_F(StringsListsConcatenateTest, EmptyInput) auto const string_lists = STR_LISTS{}.release(); auto const string_lv = cudf::lists_column_view(string_lists->view()); auto const expected = STR_COL{}; - auto results = cudf::strings::concatenate_list_elements(string_lv); + auto results = cudf::strings::join_list_elements(string_lv); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); auto const separators = STR_COL{}.release(); - results = 
cudf::strings::concatenate_list_elements(string_lv, separators->view()); + results = cudf::strings::join_list_elements(string_lv, separators->view()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); } TEST_F(StringsListsConcatenateTest, ZeroSizeStringsInput) { auto const string_lists = - STR_LISTS{STR_LISTS{""}, STR_LISTS{"", "", ""}, STR_LISTS{"", ""}}.release(); + STR_LISTS{STR_LISTS{""}, STR_LISTS{"", "", ""}, STR_LISTS{"", ""}, STR_LISTS{}}.release(); auto const string_lv = cudf::lists_column_view(string_lists->view()); - auto const expected = STR_COL{"", "", ""}; + auto const expected = STR_COL{"", "", "", ""}; - auto results = cudf::strings::concatenate_list_elements(string_lv); + auto results = cudf::strings::join_list_elements(string_lv); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); - auto const separators = STR_COL{"", "", ""}.release(); - results = cudf::strings::concatenate_list_elements(string_lv, separators->view()); + auto const separators = STR_COL{"", "", "", ""}.release(); + results = cudf::strings::join_list_elements(string_lv, separators->view()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); } @@ -120,29 +119,35 @@ TEST_F(StringsListsConcatenateTest, AllNullsStringsInput) auto const string_lv = cudf::lists_column_view(string_lists->view()); auto const expected = STR_COL{{"", "", ""}, all_nulls()}; - auto results = cudf::strings::concatenate_list_elements(string_lv); + auto results = cudf::strings::join_list_elements(string_lv); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); auto const separators = STR_COL{{"", "", ""}, all_nulls()}.release(); - results = cudf::strings::concatenate_list_elements(string_lv, separators->view()); + results = cudf::strings::join_list_elements(string_lv, separators->view()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); } +auto null_at(std::initializer_list indices) +{ + return cudf::detail::make_counting_transform_iterator( + 0, 
[indices](auto i) { return std::find(indices.begin(), indices.end(), i) == indices.end(); }); +} + TEST_F(StringsListsConcatenateTest, ScalarSeparator) { auto const string_lists = STR_LISTS{{STR_LISTS{{"a", "bb" /*NULL*/, "ccc"}, null_at(1)}, STR_LISTS{}, /*NULL*/ STR_LISTS{{"ddd" /*NULL*/, "efgh", "ijk"}, null_at(0)}, - STR_LISTS{"zzz", "xxxxx"}}, + STR_LISTS{"zzz", "xxxxx"}, + STR_LISTS{{"v", "", "", "w"}, null_at({1, 2})}}, null_at(1)} .release(); auto const string_lv = cudf::lists_column_view(string_lists->view()); // No null replacement { - auto const results = - cudf::strings::concatenate_list_elements(string_lv, cudf::string_scalar("+++")); - std::vector h_expected{nullptr, nullptr, nullptr, "zzz+++xxxxx"}; + auto const results = cudf::strings::join_list_elements(string_lv, cudf::string_scalar("+++")); + std::vector h_expected{nullptr, nullptr, nullptr, "zzz+++xxxxx", nullptr}; auto const expected = STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); @@ -150,10 +155,22 @@ TEST_F(StringsListsConcatenateTest, ScalarSeparator) // With null replacement { - auto const results = cudf::strings::concatenate_list_elements( + auto const results = cudf::strings::join_list_elements( string_lv, cudf::string_scalar("+++"), cudf::string_scalar("___")); std::vector h_expected{ - "a+++___+++ccc", nullptr, "___+++efgh+++ijk", "zzz+++xxxxx"}; + "a+++___+++ccc", nullptr, "___+++efgh+++ijk", "zzz+++xxxxx", "v+++___+++___+++w"}; + auto const expected = + STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); + } + + // Turn off separator-on-nulls + { + auto const results = cudf::strings::join_list_elements(string_lv, + cudf::string_scalar("+++"), + cudf::string_scalar(""), + cudf::strings::separator_on_nulls::NO); + std::vector h_expected{"a+++ccc", nullptr, "efgh+++ijk", "zzz+++xxxxx", "v+++w"}; auto 
const expected = STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); @@ -181,8 +198,7 @@ TEST_F(StringsListsConcatenateTest, SlicedListsWithScalarSeparator) // Sliced the entire lists column, no null replacement { auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {0, 11})[0]); - auto const results = - cudf::strings::concatenate_list_elements(string_lv, cudf::string_scalar("+++")); + auto const results = cudf::strings::join_list_elements(string_lv, cudf::string_scalar("+++")); std::vector h_expected{nullptr, nullptr, nullptr, @@ -202,7 +218,7 @@ TEST_F(StringsListsConcatenateTest, SlicedListsWithScalarSeparator) // Sliced the entire lists column, with null replacement { auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {0, 11})[0]); - auto const results = cudf::strings::concatenate_list_elements( + auto const results = cudf::strings::join_list_elements( string_lv, cudf::string_scalar("+++"), cudf::string_scalar("___")); std::vector h_expected{"a+++___+++ccc", nullptr, @@ -223,8 +239,7 @@ TEST_F(StringsListsConcatenateTest, SlicedListsWithScalarSeparator) // Sliced the first half of the lists column, no null replacement { auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {0, 4})[0]); - auto const results = - cudf::strings::concatenate_list_elements(string_lv, cudf::string_scalar("+++")); + auto const results = cudf::strings::join_list_elements(string_lv, cudf::string_scalar("+++")); std::vector h_expected{nullptr, nullptr, nullptr, "zzz+++xxxxx"}; auto const expected = STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; @@ -234,7 +249,7 @@ TEST_F(StringsListsConcatenateTest, SlicedListsWithScalarSeparator) // Sliced the first half of the lists column, with null replacement { auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {0, 
4})[0]); - auto const results = cudf::strings::concatenate_list_elements( + auto const results = cudf::strings::join_list_elements( string_lv, cudf::string_scalar("+++"), cudf::string_scalar("___")); std::vector h_expected{ "a+++___+++ccc", nullptr, "___+++efgh+++ijk", "zzz+++xxxxx"}; @@ -246,8 +261,7 @@ TEST_F(StringsListsConcatenateTest, SlicedListsWithScalarSeparator) // Sliced the second half of the lists column, no null replacement { auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {5, 11})[0]); - auto const results = - cudf::strings::concatenate_list_elements(string_lv, cudf::string_scalar("+++")); + auto const results = cudf::strings::join_list_elements(string_lv, cudf::string_scalar("+++")); std::vector h_expected{ nullptr, nullptr, "0a0b0c+++5x5y5z", nullptr, "ééé+++12345abcdef", "aaaééébbbéééccc+++12345"}; auto const expected = @@ -258,7 +272,7 @@ TEST_F(StringsListsConcatenateTest, SlicedListsWithScalarSeparator) // Sliced the second half of the lists column, with null replacement { auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {5, 11})[0]); - auto const results = cudf::strings::concatenate_list_elements( + auto const results = cudf::strings::join_list_elements( string_lv, cudf::string_scalar("+++"), cudf::string_scalar("___")); std::vector h_expected{"abcdef+++012345+++___+++xxx000", "___+++11111+++00000", @@ -274,8 +288,7 @@ TEST_F(StringsListsConcatenateTest, SlicedListsWithScalarSeparator) // Sliced the middle part of the lists column, no null replacement { auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {3, 8})[0]); - auto const results = - cudf::strings::concatenate_list_elements(string_lv, cudf::string_scalar("+++")); + auto const results = cudf::strings::join_list_elements(string_lv, cudf::string_scalar("+++")); std::vector h_expected{ "zzz+++xxxxx", nullptr, nullptr, nullptr, "0a0b0c+++5x5y5z"}; auto const expected = @@ -286,7 +299,7 @@ 
TEST_F(StringsListsConcatenateTest, SlicedListsWithScalarSeparator) // Sliced the middle part of the lists column, with null replacement { auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {3, 8})[0]); - auto const results = cudf::strings::concatenate_list_elements( + auto const results = cudf::strings::join_list_elements( string_lv, cudf::string_scalar("+++"), cudf::string_scalar("___")); std::vector h_expected{"zzz+++xxxxx", nullptr, @@ -318,7 +331,7 @@ TEST_F(StringsListsConcatenateTest, ColumnSeparators) // No null replacement { - auto const results = cudf::strings::concatenate_list_elements(string_lv, separators->view()); + auto const results = cudf::strings::join_list_elements(string_lv, separators->view()); std::vector h_expected{nullptr, nullptr, nullptr, nullptr, nullptr, "zzz^^^xxxxx"}; auto const expected = STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; @@ -327,8 +340,8 @@ TEST_F(StringsListsConcatenateTest, ColumnSeparators) // With null replacement for separators { - auto const results = cudf::strings::concatenate_list_elements( - string_lv, separators->view(), cudf::string_scalar("|||")); + auto const results = + cudf::strings::join_list_elements(string_lv, separators->view(), cudf::string_scalar("|||")); std::vector h_expected{ nullptr, nullptr, "0a0b0c|||xyzééé", nullptr, nullptr, "zzz^^^xxxxx"}; auto const expected = @@ -338,7 +351,7 @@ TEST_F(StringsListsConcatenateTest, ColumnSeparators) // With null replacement for strings { - auto const results = cudf::strings::concatenate_list_elements( + auto const results = cudf::strings::join_list_elements( string_lv, separators->view(), cudf::string_scalar("", false), cudf::string_scalar("XXXXX")); std::vector h_expected{ "a+++XXXXX+++ccc", nullptr, nullptr, nullptr, "XXXXX%%%ááá%%%ííí", "zzz^^^xxxxx"}; @@ -349,7 +362,7 @@ TEST_F(StringsListsConcatenateTest, ColumnSeparators) // With null replacement for both separators and strings { - auto 
const results = cudf::strings::concatenate_list_elements( + auto const results = cudf::strings::join_list_elements( string_lv, separators->view(), cudf::string_scalar("|||"), cudf::string_scalar("XXXXX")); std::vector h_expected{"a+++XXXXX+++ccc", nullptr, @@ -361,6 +374,20 @@ TEST_F(StringsListsConcatenateTest, ColumnSeparators) STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); } + + // Turn off separator-on-nulls + { + auto const results = cudf::strings::join_list_elements(string_lv, + separators->view(), + cudf::string_scalar("+++"), + cudf::string_scalar(""), + cudf::strings::separator_on_nulls::NO); + std::vector h_expected{ + "a+++ccc", nullptr, "0a0b0c+++xyzééé", "efgh+++ijk", "ááá%%%ííí", "zzz^^^xxxxx"}; + auto const expected = + STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); + } } TEST_F(StringsListsConcatenateTest, SlicedListsWithColumnSeparators) @@ -390,7 +417,7 @@ TEST_F(StringsListsConcatenateTest, SlicedListsWithColumnSeparators) { auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {0, 11})[0]); auto const sep_col = cudf::strings_column_view(cudf::slice(separators->view(), {0, 11})[0]); - auto const results = cudf::strings::concatenate_list_elements(string_lv, sep_col); + auto const results = cudf::strings::join_list_elements(string_lv, sep_col); std::vector h_expected{nullptr, nullptr, nullptr, @@ -411,7 +438,7 @@ TEST_F(StringsListsConcatenateTest, SlicedListsWithColumnSeparators) { auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {0, 11})[0]); auto const sep_col = cudf::strings_column_view(cudf::slice(separators->view(), {0, 11})[0]); - auto const results = cudf::strings::concatenate_list_elements( + auto const results = cudf::strings::join_list_elements( string_lv, sep_col, 
cudf::string_scalar("|||"), cudf::string_scalar("___")); std::vector h_expected{"a+++___+++ccc", nullptr, @@ -433,7 +460,7 @@ TEST_F(StringsListsConcatenateTest, SlicedListsWithColumnSeparators) { auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {0, 4})[0]); auto const sep_col = cudf::strings_column_view(cudf::slice(separators->view(), {0, 4})[0]); - auto const results = cudf::strings::concatenate_list_elements(string_lv, sep_col); + auto const results = cudf::strings::join_list_elements(string_lv, sep_col); std::vector h_expected{nullptr, nullptr, nullptr, nullptr}; auto const expected = STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; @@ -444,7 +471,7 @@ TEST_F(StringsListsConcatenateTest, SlicedListsWithColumnSeparators) { auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {0, 4})[0]); auto const sep_col = cudf::strings_column_view(cudf::slice(separators->view(), {0, 4})[0]); - auto const results = cudf::strings::concatenate_list_elements( + auto const results = cudf::strings::join_list_elements( string_lv, sep_col, cudf::string_scalar("|||"), cudf::string_scalar("___")); std::vector h_expected{ "a+++___+++ccc", nullptr, "___|||efgh|||ijk", "zzz|||xxxxx"}; @@ -457,7 +484,7 @@ TEST_F(StringsListsConcatenateTest, SlicedListsWithColumnSeparators) { auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {5, 11})[0]); auto const sep_col = cudf::strings_column_view(cudf::slice(separators->view(), {5, 11})[0]); - auto const results = cudf::strings::concatenate_list_elements(string_lv, sep_col); + auto const results = cudf::strings::join_list_elements(string_lv, sep_col); std::vector h_expected{ nullptr, nullptr, "0a0b0c###5x5y5z", nullptr, "ééé-+-12345abcdef", "aaaééébbbéééccc=+=12345"}; auto const expected = @@ -469,7 +496,7 @@ TEST_F(StringsListsConcatenateTest, SlicedListsWithColumnSeparators) { auto const string_lv = 
cudf::lists_column_view(cudf::slice(string_lists->view(), {5, 11})[0]); auto const sep_col = cudf::strings_column_view(cudf::slice(separators->view(), {5, 11})[0]); - auto const results = cudf::strings::concatenate_list_elements( + auto const results = cudf::strings::join_list_elements( string_lv, sep_col, cudf::string_scalar("|||"), cudf::string_scalar("___")); std::vector h_expected{"abcdef^^^012345^^^___^^^xxx000", "___~!~11111~!~00000", @@ -486,7 +513,7 @@ TEST_F(StringsListsConcatenateTest, SlicedListsWithColumnSeparators) { auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {3, 8})[0]); auto const sep_col = cudf::strings_column_view(cudf::slice(separators->view(), {3, 8})[0]); - auto const results = cudf::strings::concatenate_list_elements(string_lv, sep_col); + auto const results = cudf::strings::join_list_elements(string_lv, sep_col); std::vector h_expected{nullptr, nullptr, nullptr, nullptr, "0a0b0c###5x5y5z"}; auto const expected = STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; @@ -497,7 +524,7 @@ TEST_F(StringsListsConcatenateTest, SlicedListsWithColumnSeparators) { auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {3, 8})[0]); auto const sep_col = cudf::strings_column_view(cudf::slice(separators->view(), {3, 8})[0]); - auto const results = cudf::strings::concatenate_list_elements( + auto const results = cudf::strings::join_list_elements( string_lv, sep_col, cudf::string_scalar("|||"), cudf::string_scalar("___")); std::vector h_expected{"zzz|||xxxxx", nullptr, diff --git a/python/cudf/cudf/_lib/cpp/strings/combine.pxd b/python/cudf/cudf/_lib/cpp/strings/combine.pxd index 250c6441882..51c706b68d0 100644 --- a/python/cudf/cudf/_lib/cpp/strings/combine.pxd +++ b/python/cudf/cudf/_lib/cpp/strings/combine.pxd @@ -18,13 +18,13 @@ cdef extern from "cudf/strings/combine.hpp" namespace "cudf::strings" nogil: string_scalar separator, string_scalar narep) except + - cdef 
unique_ptr[column] concatenate_list_elements( + cdef unique_ptr[column] join_list_elements( column_view lists_strings_column, column_view separators, string_scalar separator_narep, string_scalar string_narep) except + - cdef unique_ptr[column] concatenate_list_elements( + cdef unique_ptr[column] join_list_elements( column_view lists_strings_column, string_scalar separator, string_scalar narep) except + diff --git a/python/cudf/cudf/_lib/strings/combine.pyx b/python/cudf/cudf/_lib/strings/combine.pyx index 25619de3ed0..0d7dfb5c619 100644 --- a/python/cudf/cudf/_lib/strings/combine.pyx +++ b/python/cudf/cudf/_lib/strings/combine.pyx @@ -16,7 +16,7 @@ from cudf._lib.table cimport Table from cudf._lib.cpp.strings.combine cimport ( concatenate as cpp_concatenate, join_strings as cpp_join_strings, - concatenate_list_elements as cpp_concatenate_list_elements + join_list_elements as cpp_join_list_elements ) @@ -105,7 +105,7 @@ def join_lists_with_scalar( ) with nogil: - c_result = move(cpp_concatenate_list_elements( + c_result = move(cpp_join_list_elements( source_view, scalar_separator[0], scalar_narep[0] @@ -142,7 +142,7 @@ def join_lists_with_column( ) with nogil: - c_result = move(cpp_concatenate_list_elements( + c_result = move(cpp_join_list_elements( source_view, separator_view, scalar_separator_narep[0], From b9588d1fe570c09ad333ada52210ad1e1c998da7 Mon Sep 17 00:00:00 2001 From: Liangcai Li Date: Mon, 24 May 2021 21:11:03 +0800 Subject: [PATCH 20/27] JNI: Refactor the code of making column from scalar (#8310) This small PR is to replace the JNI implementation with the corresponding cudf API `make_column_from_scalar`. The PR https://github.com/rapidsai/cudf/pull/8185/ has added the support for nested type, so it is ok to do this now. 
Signed-off-by: Firestarman Authors: - Liangcai Li (https://github.com/firestarman) Approvers: - Bobby Wang (https://github.com/wbo4958) - Robert (Bobby) Evans (https://github.com/revans2) - Jason Lowe (https://github.com/jlowe) URL: https://github.com/rapidsai/cudf/pull/8310 --- java/src/main/native/src/ColumnVectorJni.cpp | 43 ++------------------ 1 file changed, 4 insertions(+), 39 deletions(-) diff --git a/java/src/main/native/src/ColumnVectorJni.cpp b/java/src/main/native/src/ColumnVectorJni.cpp index a09de5c61e3..2953a6221e8 100644 --- a/java/src/main/native/src/ColumnVectorJni.cpp +++ b/java/src/main/native/src/ColumnVectorJni.cpp @@ -220,49 +220,14 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_makeList(JNIEnv *env, j JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_fromScalar(JNIEnv *env, jclass, jlong j_scalar, jint row_count) { - using ScalarType = cudf::scalar_type_t; JNI_NULL_CHECK(env, j_scalar, "scalar is null", 0); try { cudf::jni::auto_set_device(env); auto scalar_val = reinterpret_cast(j_scalar); - auto dtype = scalar_val->type(); - cudf::mask_state mask_state = - scalar_val->is_valid() ? cudf::mask_state::UNALLOCATED : cudf::mask_state::ALL_NULL; std::unique_ptr col; - if (dtype.id() == cudf::type_id::LIST) { - // Neither 'cudf::make_empty_column' nor 'cudf::make_column_from_scalar' supports - // LIST type for now (https://github.com/rapidsai/cudf/issues/8088), so the list - // precedes the others and takes care of the empty column itself. 
- auto s_list = reinterpret_cast(scalar_val); - cudf::column_view s_val = s_list->view(); - - // Offsets: [0, list_size, list_size*2, ..., list_szie*row_count] - auto zero = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32)); - auto step = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32)); - zero->set_valid(true); - step->set_valid(true); - static_cast(zero.get())->set_value(0); - static_cast(step.get())->set_value(s_val.size()); - std::unique_ptr offsets = cudf::sequence(row_count + 1, *zero, *step); - // Data: - // Builds the data column by leveraging `cudf::concatenate` to repeat the 's_val' - // 'row_count' times, because 'cudf::make_column_from_scalar' does not support list - // type. - // (Assumes the `row_count` is not big, otherwise there would be a performance issue.) - // Checks the `row_count` because `cudf::concatenate` does not support no rows. - auto data_col = row_count > 0 - ? cudf::concatenate(std::vector(row_count, s_val)) - : cudf::empty_like(s_val); - col = cudf::make_lists_column(row_count, std::move(offsets), std::move(data_col), - cudf::state_null_count(mask_state, row_count), - cudf::create_null_mask(row_count, mask_state)); - } else if (row_count == 0) { - col = cudf::make_empty_column(dtype); - } else if (cudf::is_fixed_width(dtype)) { - col = cudf::make_fixed_width_column(dtype, row_count, mask_state); - auto mut_view = col->mutable_view(); - cudf::fill_in_place(mut_view, 0, row_count, *scalar_val); - } else if (dtype.id() == cudf::type_id::STRING) { + if (scalar_val->type().id() == cudf::type_id::STRING) { + // Tests fail when using the cudf implementation, complaining no child for string column. + // So here take care of the String type itself. 
// create a string column of all empty strings to fill (cheapest string column to create) auto offsets = cudf::make_numeric_column(cudf::data_type{cudf::type_id::INT32}, row_count + 1, cudf::mask_state::UNALLOCATED); @@ -273,7 +238,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_fromScalar(JNIEnv *env, col = cudf::fill(str_col->view(), 0, row_count, *scalar_val); } else { - JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "Invalid data type", 0); + col = cudf::make_column_from_scalar(*scalar_val, row_count); } return reinterpret_cast(col.release()); } From 936b02d3c8966c059317a6306a96297637fe545d Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Mon, 24 May 2021 06:47:55 -0700 Subject: [PATCH 21/27] Add description of the cuIO GDS integration (#8293) Adds a document to describe cuIO behavior with respect to the GDS library use. Also includes a disclaimer about the current state of the integration. Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Keith Kraus (https://github.com/kkraus14) URL: https://github.com/rapidsai/cudf/pull/8293 --- docs/cudf/source/io-gds-integration.rst | 22 ++++++++++++++++++++++ docs/cudf/source/io.rst | 3 ++- 2 files changed, 24 insertions(+), 1 deletion(-) create mode 100644 docs/cudf/source/io-gds-integration.rst diff --git a/docs/cudf/source/io-gds-integration.rst b/docs/cudf/source/io-gds-integration.rst new file mode 100644 index 00000000000..9ccf773b2e4 --- /dev/null +++ b/docs/cudf/source/io-gds-integration.rst @@ -0,0 +1,22 @@ +GPUDirect Storage Integration +============================= + +Many IO APIs can use GPUDirect Storage (GDS) library to optimize IO operations. +GDS enables a direct data path for direct memory access (DMA) transfers between GPU memory and storage, which avoids a bounce buffer through the CPU. +GDS also has a compatibility mode that allows the library to fall back to copying through a CPU bounce buffer. 
+The SDK is available for download `here `_. + +Use of GPUDirect Storage in cuDF is disabled by default, and can be enabled through environment variable ``LIBCUDF_CUFILE_POLICY``. +This variable also controls the GDS compatibility mode. There are two special values for the environment variable: + +- "GDS": Use of GDS is enabled; GDS compatibility mode is *off*. +- "ALWAYS": Use of GDS is enabled; GDS compatibility mode is *on*. + +Any other value (or no value set) will keep the GDS disabled for use in cuDF and IO will be done using cuDF's CPU bounce buffers. + +This environment variable also affects how cuDF treats GDS errors. +When ``LIBCUDF_CUFILE_POLICY`` is set to "GDS" and a GDS API call fails for any reason, cuDF falls back to the internal implementation with bounce buffers. +When ``LIBCUDF_CUFILE_POLICY`` is set to "ALWAYS" and a GDS API call fails for any reason (unlikely, given that the compatibility mode is on), +cuDF throws an exception to propagate the error to the user. + +NOTE: current GDS integration is not fully optimized and enabling GDS will not lead to performance improvements in all cases. \ No newline at end of file diff --git a/docs/cudf/source/io.rst b/docs/cudf/source/io.rst index 5186473ae10..e88162d8f52 100644 --- a/docs/cudf/source/io.rst +++ b/docs/cudf/source/io.rst @@ -8,4 +8,5 @@ This page contains Input / Output related APIs in cuDF. :maxdepth: 2 :caption: Contents: - io-supported-types.rst \ No newline at end of file + io-supported-types.rst + io-gds-integration.rst \ No newline at end of file From 259d69ba4916d62a9d345e741b6c2be5ae4183fd Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Mon, 24 May 2021 09:19:57 -0500 Subject: [PATCH 22/27] Revert "patch thrust to fix intmax num elements limitation in scan_by_key" (#8263) Reverts #8199 According to @allisonvacanti (NVIDIA/thrust#1424 (comment)) this patch will likely have adverse effect on performance. We should revert it until a better solution can be found.
Authors: - Christopher Harris (https://github.com/cwharris) Approvers: - David Wendt (https://github.com/davidwendt) - Keith Kraus (https://github.com/kkraus14) - Elias Stehle (https://github.com/elstehle) URL: https://github.com/rapidsai/cudf/pull/8263 --- cpp/cmake/thrust.patch | 22 ---------------------- 1 file changed, 22 deletions(-) diff --git a/cpp/cmake/thrust.patch b/cpp/cmake/thrust.patch index c14b8cdafe5..2f9201d8ab4 100644 --- a/cpp/cmake/thrust.patch +++ b/cpp/cmake/thrust.patch @@ -81,25 +81,3 @@ index c0c6d59..937ee31 100644 { typedef AgentScanPolicy< 128, 15, ///< Threads per block, items per thread -diff --git a/thrust/system/cuda/detail/scan_by_key.h b/thrust/system/cuda/detail/scan_by_key.h -index fe4b321c..b3974c69 100644 ---- a/thrust/system/cuda/detail/scan_by_key.h -+++ b/thrust/system/cuda/detail/scan_by_key.h -@@ -513,7 +513,7 @@ namespace __scan_by_key { - scan_op(scan_op_) - { - int tile_idx = blockIdx.x; -- Size tile_base = ITEMS_PER_TILE * tile_idx; -+ Size tile_base = ITEMS_PER_TILE * static_cast(tile_idx); - Size num_remaining = num_items - tile_base; - - if (num_remaining > ITEMS_PER_TILE) -@@ -734,7 +734,7 @@ namespace __scan_by_key { - ScanOp scan_op, - AddInitToScan add_init_to_scan) - { -- int num_items = static_cast(thrust::distance(keys_first, keys_last)); -+ size_t num_items = static_cast(thrust::distance(keys_first, keys_last)); - size_t storage_size = 0; - cudaStream_t stream = cuda_cub::stream(policy); - bool debug_sync = THRUST_DEBUG_SYNC_FLAG; From 3da0d121b0296d8baba92133f078fe108ac5b72c Mon Sep 17 00:00:00 2001 From: shaneding Date: Mon, 24 May 2021 10:23:06 -0400 Subject: [PATCH 23/27] added _is_homogeneous property (#8299) This PR closes #7067. This was implemented by adding the `_is_homogeneous` property to `DataFrame`. Included are appropriate test cases. 
Authors: - https://github.com/shaneding Approvers: - https://github.com/brandon-b-miller - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/8299 --- python/cudf/cudf/core/frame.py | 9 +++ python/cudf/cudf/tests/test_dataframe.py | 97 ++++++++++++++++++++++++ 2 files changed, 106 insertions(+) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index cda4e8cbd4c..1c6c1ed85e6 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -157,6 +157,15 @@ def size(self): """ return self._num_columns * self._num_rows + @property + def _is_homogeneous(self): + # make sure that the dataframe has columns + if not self._data.columns: + return True + + first_type = self._data.columns[0].dtype.name + return all(x.dtype.name == first_type for x in self._data.columns) + @property def empty(self): """ diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index e5e36ba7e21..0b73f32e94d 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -8579,3 +8579,100 @@ def test_dataframe_init_from_series(data, columns, index): actual, check_index_type=False if len(expected) == 0 else True, ) + + +@pytest.mark.parametrize( + "data, expected", + [ + ({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8], "c": [1.2, 1, 2, 3]}, False), + ({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}, True), + ({"a": ["a", "b", "c"], "b": [4, 5, 6], "c": [7, 8, 9]}, False), + ({"a": [True, False, False], "b": [False, False, True]}, True), + ({"a": [True, False, False]}, True), + ({"a": [[1, 2], [3, 4]]}, True), + ({"a": [[1, 2], [3, 4]], "b": ["a", "b"]}, False), + ({"a": [{"c": 5}, {"e": 5}], "b": [{"c": 5}, {"g": 7}]}, True), + ({}, True), + ], +) +def test_is_homogeneous_dataframe(data, expected): + actual = cudf.DataFrame(data)._is_homogeneous + + assert actual == expected + + +@pytest.mark.parametrize( + "data, indexes, expected", + [ 
+ ( + {"a": [1, 2, 3, 4], "b": [5, 6, 7, 8], "c": [1.2, 1, 2, 3]}, + ["a", "b"], + True, + ), + ( + { + "a": [1, 2, 3, 4], + "b": [5, 6, 7, 8], + "c": [1.2, 1, 2, 3], + "d": ["hello", "world", "cudf", "rapids"], + }, + ["a", "b"], + False, + ), + ( + { + "a": ["a", "b", "c"], + "b": [4, 5, 6], + "c": [7, 8, 9], + "d": [1, 2, 3], + }, + ["a", "b"], + True, + ), + ], +) +def test_is_homogeneous_multiIndex_dataframe(data, indexes, expected): + test_dataframe = cudf.DataFrame(data).set_index(indexes) + actual = cudf.DataFrame(test_dataframe)._is_homogeneous + + assert actual == expected + + +@pytest.mark.parametrize( + "data, expected", [([1, 2, 3, 4], True), ([True, False], True)] +) +def test_is_homogeneous_series(data, expected): + actual = cudf.Series(data)._is_homogeneous + + assert actual == expected + + +@pytest.mark.parametrize( + "levels, codes, expected", + [ + ( + [["lama", "cow", "falcon"], ["speed", "weight", "length"]], + [[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]], + True, + ), + ( + [[1, 2, 3], [True, False, True]], + [[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]], + False, + ), + ], +) +def test_is_homogeneous_multiIndex(levels, codes, expected): + actual = cudf.MultiIndex(levels=levels, codes=codes)._is_homogeneous + + assert actual == expected + + +@pytest.mark.parametrize( + "data, expected", + [([1, 2, 3], True), (["Hello", "World"], True), ([True, False], True)], +) +def test_is_homogeneous_index(data, expected): + actual = cudf.Index(data)._is_homogeneous + + assert actual == expected From 63faf2f14eb6114997eb7406b67eb174d26cfdf8 Mon Sep 17 00:00:00 2001 From: "Robert (Bobby) Evans" Date: Mon, 24 May 2021 10:03:29 -0500 Subject: [PATCH 24/27] Use empty_like in scatter (#8314) This prevents things like partition from working with deeply nested arrays. I marked this as non-breaking, but I am happy to change it to breaking because I removed a detailed API that is not used anywhere else and is flawed. 
Authors: - Robert (Bobby) Evans (https://github.com/revans2) Approvers: - https://github.com/nvdbaranec - Conor Hoekstra (https://github.com/codereport) - Jason Lowe (https://github.com/jlowe) - Jake Hemstad (https://github.com/jrhemstad) URL: https://github.com/rapidsai/cudf/pull/8314 --- cpp/include/cudf/lists/detail/copying.hpp | 17 +---------------- cpp/include/cudf/lists/detail/scatter.cuh | 5 +---- cpp/src/lists/copying/copying.cu | 13 ------------- cpp/tests/partitioning/partition_test.cpp | 20 ++++++++++++++++++++ 4 files changed, 22 insertions(+), 33 deletions(-) diff --git a/cpp/include/cudf/lists/detail/copying.hpp b/cpp/include/cudf/lists/detail/copying.hpp index 548fec7e7f6..3760294f079 100644 --- a/cpp/include/cudf/lists/detail/copying.hpp +++ b/cpp/include/cudf/lists/detail/copying.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -48,21 +48,6 @@ std::unique_ptr copy_slice(lists_column_view const& lists, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); -/** - * @brief Create a single-level empty lists column. - * - * An empty lists column contains empty children so the column's - * basic type is recorded. - * - * @param child_type The type used for the child column. - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New empty lists column. 
- */ -std::unique_ptr make_empty_lists_column(data_type child_type, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); - } // namespace detail } // namespace lists } // namespace cudf diff --git a/cpp/include/cudf/lists/detail/scatter.cuh b/cpp/include/cudf/lists/detail/scatter.cuh index b179ccf228b..aec45d260bf 100644 --- a/cpp/include/cudf/lists/detail/scatter.cuh +++ b/cpp/include/cudf/lists/detail/scatter.cuh @@ -526,10 +526,7 @@ struct list_child_constructor { if (num_child_rows == 0) { // make an empty lists column using the input child type - return make_empty_lists_column( - source_lists_column_view.child().child(lists_column_view::child_column_index).type(), - stream, - mr); + return empty_like(source_lists_column_view.child()); } auto child_list_views = rmm::device_uvector(num_child_rows, stream, mr); diff --git a/cpp/src/lists/copying/copying.cu b/cpp/src/lists/copying/copying.cu index 3275a496cfd..ff4649f4945 100644 --- a/cpp/src/lists/copying/copying.cu +++ b/cpp/src/lists/copying/copying.cu @@ -84,19 +84,6 @@ std::unique_ptr copy_slice(lists_column_view const& lists, std::move(null_mask)); } -std::unique_ptr make_empty_lists_column(data_type child_type, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - return cudf::make_lists_column(0, - make_empty_column(data_type{type_to_id()}), - make_empty_column(child_type), - 0, // Null count - rmm::device_buffer{0, stream, mr}, // Null mask - stream, - mr); -} - } // namespace detail } // namespace lists } // namespace cudf diff --git a/cpp/tests/partitioning/partition_test.cpp b/cpp/tests/partitioning/partition_test.cpp index bdd5e7bc780..669d406d80a 100644 --- a/cpp/tests/partitioning/partition_test.cpp +++ b/cpp/tests/partitioning/partition_test.cpp @@ -310,3 +310,23 @@ TEST_F(PartitionTestNotTyped, ListOfListOfIntEmpty) CUDF_TEST_EXPECT_TABLES_EQUAL(table_to_partition, result.first->view()); EXPECT_EQ(3, result.second.size()); } + +TEST_F(PartitionTestNotTyped, 
ListOfListOfListOfIntEmpty) +{ + cudf::test::lists_column_wrapper level_3_list{}; + + fixed_width_column_wrapper level_2_offsets{}; + std::unique_ptr level_2_list = + cudf::make_lists_column(0, level_2_offsets.release(), level_3_list.release(), 0, {}); + + fixed_width_column_wrapper level_1_offsets{0, 0}; + std::unique_ptr level_1_list = + cudf::make_lists_column(1, level_1_offsets.release(), std::move(level_2_list), 0, {}); + + auto table_to_partition = cudf::table_view{{*level_1_list}}; + fixed_width_column_wrapper map{0}; + + auto result = cudf::partition(table_to_partition, map, 2); + CUDF_TEST_EXPECT_TABLES_EQUAL(table_to_partition, result.first->view()); + EXPECT_EQ(3, result.second.size()); +} From e555643b00d166bc43d8fbfaeccc9513dd7f15e1 Mon Sep 17 00:00:00 2001 From: AJ Schmidt Date: Mon, 24 May 2021 11:47:44 -0400 Subject: [PATCH 25/27] Update environment variable used to determine `cuda_version` (#8321) This PR updates the environment variable that is used to determine the `cuda_version` variable in our conda recipes. The `CUDA` environment variable is explicitly set by the Ops team in our Jenkins jobs, whereas `CUDA_VERSION` comes from the `nvidia/cuda` Docker images that we base our images from. Authors: - AJ Schmidt (https://github.com/ajschmidt8) Approvers: - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/cudf/pull/8321 --- conda/recipes/cudf/meta.yaml | 2 +- conda/recipes/cudf_kafka/meta.yaml | 2 +- conda/recipes/custreamz/meta.yaml | 2 +- conda/recipes/dask-cudf/meta.yaml | 2 +- conda/recipes/libcudf/meta.yaml | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index 39f2ba3188c..631ebf16aea 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -3,7 +3,7 @@ {% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') + environ.get('VERSION_SUFFIX', '') %} {% set minor_version = version.split('.')[0] + '.'
+ version.split('.')[1] %} {% set py_version=environ.get('CONDA_PY', 36) %} -{% set cuda_version='.'.join(environ.get('CUDA_VERSION', '10.1').split('.')[:2]) %} +{% set cuda_version='.'.join(environ.get('CUDA', '10.1').split('.')[:2]) %} package: name: cudf diff --git a/conda/recipes/cudf_kafka/meta.yaml b/conda/recipes/cudf_kafka/meta.yaml index 35dfb1791d8..b59a49b0db7 100644 --- a/conda/recipes/cudf_kafka/meta.yaml +++ b/conda/recipes/cudf_kafka/meta.yaml @@ -3,7 +3,7 @@ {% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') + environ.get('VERSION_SUFFIX', '') %} {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} {% set py_version=environ.get('CONDA_PY', 36) %} -{% set cuda_version='.'.join(environ.get('CUDA_VERSION', '10.1').split('.')[:2]) %} +{% set cuda_version='.'.join(environ.get('CUDA', '10.1').split('.')[:2]) %} package: name: cudf_kafka diff --git a/conda/recipes/custreamz/meta.yaml b/conda/recipes/custreamz/meta.yaml index 0ae0ce830ad..bb5186d7057 100644 --- a/conda/recipes/custreamz/meta.yaml +++ b/conda/recipes/custreamz/meta.yaml @@ -3,7 +3,7 @@ {% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') + environ.get('VERSION_SUFFIX', '') %} {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} {% set py_version=environ.get('CONDA_PY', 36) %} -{% set cuda_version='.'.join(environ.get('CUDA_VERSION', '10.1').split('.')[:2]) %} +{% set cuda_version='.'.join(environ.get('CUDA', '10.1').split('.')[:2]) %} package: name: custreamz diff --git a/conda/recipes/dask-cudf/meta.yaml b/conda/recipes/dask-cudf/meta.yaml index e66b4c930ec..14376f54ba1 100644 --- a/conda/recipes/dask-cudf/meta.yaml +++ b/conda/recipes/dask-cudf/meta.yaml @@ -3,7 +3,7 @@ {% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') + environ.get('VERSION_SUFFIX', '') %} {% set minor_version = version.split('.')[0] + '.' 
+ version.split('.')[1] %} {% set py_version=environ.get('CONDA_PY', 36) %} -{% set cuda_version='.'.join(environ.get('CUDA_VERSION', '10.1').split('.')[:2]) %} +{% set cuda_version='.'.join(environ.get('CUDA', '10.1').split('.')[:2]) %} package: name: dask-cudf diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index d42daf3194c..a8abe5b09f0 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -2,7 +2,7 @@ {% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') + environ.get('VERSION_SUFFIX', '') %} {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} -{% set cuda_version='.'.join(environ.get('CUDA_VERSION', '10.1').split('.')[:2]) %} +{% set cuda_version='.'.join(environ.get('CUDA', '10.1').split('.')[:2]) %} package: name: libcudf From b1d7788edb41cc32965fa9d2b31347976ee4caec Mon Sep 17 00:00:00 2001 From: Thomas Graves Date: Mon, 24 May 2021 12:05:55 -0500 Subject: [PATCH 26/27] Update Java string concatenate test for single column (#8330) to stringConcatenate when using a scalar separator. Reference https://github.com/rapidsai/cudf/pull/8282 changed to throw an exception if only a single column is passed in to the stringConcatenate using scalar separator. Update our Java test for that functionality. 
Signed-off-by: Thomas Graves Authors: - Thomas Graves (https://github.com/tgravescs) Approvers: - Robert (Bobby) Evans (https://github.com/revans2) - Jason Lowe (https://github.com/jlowe) URL: https://github.com/rapidsai/cudf/pull/8330 --- .../java/ai/rapids/cudf/ColumnVectorTest.java | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index 83795799a24..8da70afc6f3 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -2099,15 +2099,16 @@ void testStringConcatWithNulls() { assertColumnsAreEqual(concat, e_concat); } - try (ColumnVector v = ColumnVector.fromStrings("a", "B", "cd", "\u0480\u0481", "E\tf", - "g\nH", "IJ\"\u0100\u0101\u0500\u0501", - "kl m", "Nop1", "\\qRs2", null, - "3tuV\'", "wX4Yz", "\ud720\ud721"); - Scalar emptyString = Scalar.fromString(""); - Scalar nullSubstitute = Scalar.fromString("NULL"); - ColumnVector concat = ColumnVector.stringConcatenate(emptyString, nullSubstitute, new ColumnView[]{v})) { - assertColumnsAreEqual(v, concat); - } + assertThrows(CudfException.class, () -> { + try (ColumnVector v = ColumnVector.fromStrings("a", "B", "cd", "\u0480\u0481", "E\tf", + "g\nH", "IJ\"\u0100\u0101\u0500\u0501", + "kl m", "Nop1", "\\qRs2", null, + "3tuV\'", "wX4Yz", "\ud720\ud721"); + Scalar emptyString = Scalar.fromString(""); + Scalar nullSubstitute = Scalar.fromString("NULL"); + ColumnVector concat = ColumnVector.stringConcatenate(emptyString, nullSubstitute, new ColumnView[]{v})) { + } + }); } @Test From 5c0a75b3ab23da656762b95cc984cfff1db88323 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Mon, 24 May 2021 12:24:16 -0500 Subject: [PATCH 27/27] Fix cudf release version in readme (#8331) As part of this commit https://github.com/rapidsai/cudf/commit/84065228e0c0b5d94cdc6a44518eb9c353290c89 we accidentally changed 
the release version of readme to `21.06`, whereas the stable version currently in `rapidsai` channel is `0.19`. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Keith Kraus (https://github.com/kkraus14) URL: https://github.com/rapidsai/cudf/pull/8331 --- README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 587f18d2603..545e3331681 100644 --- a/README.md +++ b/README.md @@ -65,15 +65,15 @@ Please see the [Demo Docker Repository](https://hub.docker.com/r/rapidsai/rapids cuDF can be installed with conda ([miniconda](https://conda.io/miniconda.html), or the full [Anaconda distribution](https://www.anaconda.com/download)) from the `rapidsai` channel: -For `cudf version == 21.06` : +For `cudf version == 0.19.2` : ```bash -# for CUDA 11.0 +# for CUDA 10.1 conda install -c rapidsai -c nvidia -c numba -c conda-forge \ - cudf=21.06 python=3.7 cudatoolkit=11.0 + cudf=0.19 python=3.7 cudatoolkit=10.1 -# or, for CUDA 11.2 +# or, for CUDA 10.2 conda install -c rapidsai -c nvidia -c numba -c conda-forge \ - cudf=21.06 python=3.7 cudatoolkit=11.2 + cudf=0.19 python=3.7 cudatoolkit=10.2 ```