From c732cef58a10ccfc7bdf7370ddd218cc02f96476 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Wed, 19 May 2021 18:28:47 -0700 Subject: [PATCH] Support create lists column from a `list_scalar` (#8185) This PR adds support to `make_column_from_scalar` for `list_scalar`. For 0-length columns, a well-formed `LIST` type column is returned, whose child column has the same column hierarchy as the row data stored in the `list_scalar`. Example: ``` slr.data = [1, 2, 3] // An integer list of 1, 2, 3, `data` is an INT column make_column_from_scalar(slr, 2) // List column: {[1, 2, 3], [1, 2, 3]}, whose child column is an `INT` column. slr.data = [[1, 2], [3]] // A list of integer lists, `data` is a List<INT> column make_column_from_scalar(slr, 0) // Well formed, 0-length List<List<INT>> column, whose child column is a List<INT> column. ``` Closes #8088 Authors: - Michael Wang (https://github.com/isVoid) Approvers: - AJ Schmidt (https://github.com/ajschmidt8) - Devavret Makkar (https://github.com/devavret) - Mark Harris (https://github.com/harrism) URL: https://github.com/rapidsai/cudf/pull/8185 --- conda/recipes/libcudf/meta.yaml | 1 + cpp/include/cudf/column/column_factories.hpp | 3 +- .../cudf/lists/lists_column_factories.hpp | 42 +++ cpp/src/column/column_factories.cu | 8 +- cpp/src/lists/lists_column_factories.cu | 67 +++- cpp/tests/column/factories_test.cpp | 296 ++++++++++++++++++ 6 files changed, 413 insertions(+), 4 deletions(-) create mode 100644 cpp/include/cudf/lists/lists_column_factories.hpp diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index ea2fda399fd..0fcf62a2606 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -135,6 +135,7 @@ test: - test -f $PREFIX/include/cudf/join.hpp - test -f $PREFIX/include/cudf/lists/detail/concatenate.hpp - test -f $PREFIX/include/cudf/lists/detail/copying.hpp + - test -f $PREFIX/include/cudf/lists/lists_column_factories.hpp - test -f
$PREFIX/include/cudf/lists/detail/drop_list_duplicates.hpp - test -f $PREFIX/include/cudf/lists/detail/interleave_columns.hpp - test -f $PREFIX/include/cudf/lists/detail/sorting.hpp diff --git a/cpp/include/cudf/column/column_factories.hpp b/cpp/include/cudf/column/column_factories.hpp index 43c2407d629..e5424f0fc44 100644 --- a/cpp/include/cudf/column/column_factories.hpp +++ b/cpp/include/cudf/column/column_factories.hpp @@ -541,7 +541,8 @@ std::unique_ptr make_structs_column( * * The output column will have the same type as `s.type()` * The output column will contain all null rows if `s.invalid()==false` - * The output column will be empty if `size==0`. + * The output column will be empty if `size==0`. For LIST scalars, the column hierarchy + * from @p s is preserved. * * @param[in] s The scalar to use for values in the column. * @param[in] size The number of rows for the output column. diff --git a/cpp/include/cudf/lists/lists_column_factories.hpp b/cpp/include/cudf/lists/lists_column_factories.hpp new file mode 100644 index 00000000000..bdf06cfa9e7 --- /dev/null +++ b/cpp/include/cudf/lists/lists_column_factories.hpp @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +namespace cudf { +namespace lists { +namespace detail { + +/** + * @brief Internal API to construct a lists column from a `list_scalar`, for public + * use, use `cudf::make_column_from_scalar`. + * + * @param[in] value The `list_scalar` to construct from + * @param[in] size The number of rows for the output column. + * @param[in] stream CUDA stream used for device memory operations and kernel launches. + * @param[in] mr Device memory resource used to allocate the returned column's device memory. + */ +std::unique_ptr make_lists_column_from_scalar( + list_scalar const& value, + size_type size, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +} // namespace detail +} // namespace lists +} // namespace cudf diff --git a/cpp/src/column/column_factories.cu b/cpp/src/column/column_factories.cu index 60e642ea3d5..6ba8497b320 100644 --- a/cpp/src/column/column_factories.cu +++ b/cpp/src/column/column_factories.cu @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -32,6 +33,7 @@ struct column_from_scalar_dispatch { rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const { + if (size == 0) return make_empty_column(value.type()); if (!value.is_valid()) return make_fixed_width_column(value.type(), size, mask_state::ALL_NULL, stream, mr); auto output_column = @@ -49,6 +51,7 @@ std::unique_ptr column_from_scalar_dispatch::operator() column_from_scalar_dispatch::operator()(&value); + return lists::detail::make_lists_column_from_scalar(*lv, size, stream, mr); } template <> @@ -94,6 +98,7 @@ std::unique_ptr column_from_scalar_dispatch::operator() const&>(value); auto iter = thrust::make_constant_iterator(0); @@ -117,7 +122,6 @@ std::unique_ptr make_column_from_scalar(scalar const& s, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - if (size == 0) return make_empty_column(s.type()); 
return type_dispatcher(s.type(), column_from_scalar_dispatch{}, s, size, stream, mr); } diff --git a/cpp/src/lists/lists_column_factories.cu b/cpp/src/lists/lists_column_factories.cu index ebf5e07f76a..3291aeb9f22 100644 --- a/cpp/src/lists/lists_column_factories.cu +++ b/cpp/src/lists/lists_column_factories.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,10 +16,75 @@ #include #include +#include +#include #include +#include + +#include +#include namespace cudf { +namespace lists { +namespace detail { + +std::unique_ptr make_lists_column_from_scalar(list_scalar const& value, + size_type size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + if (size == 0) { + return make_lists_column(0, + make_empty_column(data_type{type_to_id()}), + empty_like(value.view()), + 0, + cudf::detail::create_null_mask(0, mask_state::UNALLOCATED, stream, mr), + stream, + mr); + } + auto mr_final = size == 1 ? mr : rmm::mr::get_current_device_resource(); + + // Handcraft a 1-row column + auto offsets = make_numeric_column( + data_type{type_to_id()}, 2, mask_state::UNALLOCATED, stream, mr_final); + auto m_offsets = offsets->mutable_view(); + thrust::sequence(rmm::exec_policy(stream), + m_offsets.begin(), + m_offsets.end(), + 0, + value.view().size()); + size_type null_count = value.is_valid(stream) ? 0 : 1; + auto null_mask_state = null_count ? 
mask_state::ALL_NULL : mask_state::UNALLOCATED; + auto null_mask = cudf::detail::create_null_mask(1, null_mask_state, stream, mr_final); + + if (size == 1) { + auto child = std::make_unique(value.view(), stream, mr_final); + return make_lists_column( + 1, std::move(offsets), std::move(child), null_count, std::move(null_mask), stream, mr_final); + } + + auto children_views = std::vector{offsets->view(), value.view()}; + auto one_row_col_view = column_view(data_type{type_id::LIST}, + 1, + nullptr, + static_cast(null_mask.data()), + null_count, + 0, + children_views); + + auto begin = thrust::make_constant_iterator(0); + auto res = cudf::detail::gather(table_view({one_row_col_view}), + begin, + begin + size, + out_of_bounds_policy::DONT_CHECK, + stream, + mr_final); + return std::move(res->release()[0]); +} + +} // namespace detail +} // namespace lists /** * @copydoc cudf::make_lists_column diff --git a/cpp/tests/column/factories_test.cpp b/cpp/tests/column/factories_test.cpp index 71f65eedd91..f9e83311b1b 100644 --- a/cpp/tests/column/factories_test.cpp +++ b/cpp/tests/column/factories_test.cpp @@ -20,7 +20,9 @@ #include #include +#include #include +#include #include #include @@ -462,6 +464,300 @@ TEST_F(ColumnFactoryTest, DictionaryFromStringScalarError) EXPECT_THROW(cudf::make_dictionary_from_scalar(value, 1), cudf::logic_error); } +template +class ListsFixedWidthLeafTest : public ColumnFactoryTest { +}; + +TYPED_TEST_CASE(ListsFixedWidthLeafTest, cudf::test::FixedWidthTypes); + +TYPED_TEST(ListsFixedWidthLeafTest, FromNonNested) +{ + using FCW = cudf::test::fixed_width_column_wrapper; + using LCW = cudf::test::lists_column_wrapper; + using valid_t = std::vector; + + auto s = cudf::make_list_scalar(FCW({1, -1, 3}, {1, 0, 1})); + auto col = cudf::make_column_from_scalar(*s, 3); + + auto expected = LCW{LCW({1, 2, 3}, valid_t{1, 0, 1}.begin()), + LCW({1, 2, 3}, valid_t{1, 0, 1}.begin()), + LCW({1, 2, 3}, valid_t{1, 0, 1}.begin())}; + 
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*col, expected); +} + +TYPED_TEST(ListsFixedWidthLeafTest, FromNested) +{ + using LCW = cudf::test::lists_column_wrapper; + using valid_t = std::vector; + +#define row_data \ + LCW({LCW({-1, -1, 3}, valid_t{0, 0, 1}.begin()), LCW{}, LCW{}}, valid_t{1, 0, 1}.begin()) + + auto s = cudf::make_list_scalar(row_data); + auto col = cudf::make_column_from_scalar(*s, 5); + + auto expected = LCW{row_data, row_data, row_data, row_data, row_data}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*col, expected); + +#undef row_data +} + +template +class ListsDictionaryLeafTest : public ColumnFactoryTest { +}; + +TYPED_TEST_CASE(ListsDictionaryLeafTest, cudf::test::FixedWidthTypes); + +TYPED_TEST(ListsDictionaryLeafTest, FromNonNested) +{ + using DCW = cudf::test::dictionary_column_wrapper; + using offset_t = cudf::test::fixed_width_column_wrapper; + + auto s = cudf::make_list_scalar(DCW({1, 3, -1, 1, 3}, {1, 1, 0, 1, 1})); + auto col = cudf::make_column_from_scalar(*s, 2); + + DCW leaf({1, 3, -1, 1, 3, 1, 3, -1, 1, 3}, {1, 1, 0, 1, 1, 1, 1, 0, 1, 1}); + offset_t offsets{0, 5, 10}; + auto mask = cudf::create_null_mask(2, cudf::mask_state::UNALLOCATED); + + auto expected = cudf::make_lists_column(2, offsets.release(), leaf.release(), 0, std::move(mask)); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*col, *expected); +} + +TYPED_TEST(ListsDictionaryLeafTest, FromNested) +{ + using DCW = cudf::test::dictionary_column_wrapper; + using offset_t = cudf::test::fixed_width_column_wrapper; + + DCW leaf({1, 3, -1, 1, 3, 1, 3, -1, 1, 3}, {1, 1, 0, 1, 1, 1, 1, 0, 1, 1}); + offset_t offsets{0, 3, 3, 6, 6, 10}; + auto mask = cudf::create_null_mask(5, cudf::mask_state::ALL_VALID); + cudf::set_null_mask(static_cast(mask.data()), 1, 2, false); + auto data = cudf::make_lists_column(5, offsets.release(), leaf.release(), 0, std::move(mask)); + + auto s = cudf::make_list_scalar(*data); + auto col = cudf::make_column_from_scalar(*s, 3); + + DCW leaf2( + {1, 3, -1, 1, 3, 1, 3, -1, 1, 3, 1, 3, -1, 
1, 3, + 1, 3, -1, 1, 3, 1, 3, -1, 1, 3, 1, 3, -1, 1, 3}, + {1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1}); + offset_t offsets2{0, 3, 3, 6, 6, 10, 13, 13, 16, 16, 20, 23, 23, 26, 26, 30}; + auto mask2 = cudf::create_null_mask(15, cudf::mask_state::ALL_VALID); + cudf::set_null_mask(static_cast(mask2.data()), 1, 2, false); + cudf::set_null_mask(static_cast(mask2.data()), 6, 7, false); + cudf::set_null_mask(static_cast(mask2.data()), 11, 12, false); + auto nested = + cudf::make_lists_column(15, offsets2.release(), leaf2.release(), 3, std::move(mask2)); + + offset_t offsets3{0, 5, 10, 15}; + auto mask3 = cudf::create_null_mask(3, cudf::mask_state::UNALLOCATED); + auto expected = + cudf::make_lists_column(3, offsets3.release(), std::move(nested), 0, std::move(mask3)); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*col, *expected); +} + +class ListsStringLeafTest : public ColumnFactoryTest { +}; + +TEST_F(ListsStringLeafTest, FromNonNested) +{ + using SCW = cudf::test::strings_column_wrapper; + using LCW = cudf::test::lists_column_wrapper; + using valid_t = std::vector; + + auto s = cudf::make_list_scalar(SCW({"xx", "", "z"}, {true, false, true})); + auto col = cudf::make_column_from_scalar(*s, 4); + + auto expected = LCW{LCW({"xx", "", "z"}, valid_t{1, 0, 1}.begin()), + LCW({"xx", "", "z"}, valid_t{1, 0, 1}.begin()), + LCW({"xx", "", "z"}, valid_t{1, 0, 1}.begin()), + LCW({"xx", "", "z"}, valid_t{1, 0, 1}.begin())}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*col, expected); +} + +TEST_F(ListsStringLeafTest, FromNested) +{ + using LCW = cudf::test::lists_column_wrapper; + using valid_t = std::vector; + +#define row_data \ + LCW({LCW{}, \ + LCW({"@@", "rapids", "", "四", "ら"}, valid_t{1, 1, 0, 1, 1}.begin()), \ + LCW{}, \ + LCW({"hello", ""}, valid_t{1, 0}.begin())}, \ + valid_t{0, 1, 1, 1}.begin()) + + auto s = cudf::make_list_scalar(row_data); + + auto col = cudf::make_column_from_scalar(*s, 3); + + auto expected = LCW{row_data, row_data, 
row_data}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*col, expected); +#undef row_data +} + +template +class ListsStructsLeafTest : public ColumnFactoryTest { + protected: + using SCW = cudf::test::structs_column_wrapper; + /** + * @brief Create a structs column that contains 3 fields: int, string, List + */ + template + SCW make_test_structs_column(cudf::test::fixed_width_column_wrapper field1, + cudf::test::strings_column_wrapper field2, + cudf::test::lists_column_wrapper field3, + MaskIterator mask) + { + return SCW{{field1, field2, field3}, mask}; + } +}; + +TYPED_TEST_CASE(ListsStructsLeafTest, cudf::test::FixedWidthTypes); + +TYPED_TEST(ListsStructsLeafTest, FromNonNested) +{ + using LCWinner_t = cudf::test::lists_column_wrapper; + using StringCW = cudf::test::strings_column_wrapper; + using offset_t = cudf::test::fixed_width_column_wrapper; + using valid_t = std::vector; + + auto data = this->make_test_structs_column( + {{1, 3, 5, 2, 4}, {1, 0, 1, 0, 1}}, + StringCW({"fleur", "flower", "", "花", "はな"}, {true, true, false, true, true}), + LCWinner_t({{1, 2}, {}, {4, 5}, {-1}, {}}, valid_t{1, 1, 1, 1, 0}.begin()), + valid_t{1, 1, 1, 0, 1}.begin()); + auto s = cudf::make_list_scalar(data); + auto col = cudf::make_column_from_scalar(*s, 2); + + auto leaf = this->make_test_structs_column( + {{1, 3, 5, 2, 4, 1, 3, 5, 2, 4}, {1, 0, 1, 0, 1, 1, 0, 1, 0, 1}}, + StringCW({"fleur", "flower", "", "花", "はな", "fleur", "flower", "", "花", "はな"}, + {true, true, false, true, true, true, true, false, true, true}), + LCWinner_t({{1, 2}, {}, {4, 5}, {-1}, {}, {1, 2}, {}, {4, 5}, {-1}, {}}, + valid_t{1, 1, 1, 1, 0, 1, 1, 1, 1, 0}.begin()), + valid_t{1, 1, 1, 0, 1, 1, 1, 1, 0, 1}.begin()); + auto expected = cudf::make_lists_column(2, + offset_t{0, 5, 10}.release(), + leaf.release(), + 0, + cudf::create_null_mask(2, cudf::mask_state::UNALLOCATED)); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*col, *expected); +} + +TYPED_TEST(ListsStructsLeafTest, FromNested) +{ + using LCWinner_t = 
cudf::test::lists_column_wrapper; + using StringCW = cudf::test::strings_column_wrapper; + using offset_t = cudf::test::fixed_width_column_wrapper; + using valid_t = std::vector; + auto leaf = this->make_test_structs_column( + {{1, 2}, {0, 1}}, + StringCW({"étoile", "星"}, {true, true}), + LCWinner_t({LCWinner_t{}, LCWinner_t{42}}, valid_t{1, 1}.begin()), + valid_t{0, 1}.begin()); + auto mask = cudf::create_null_mask(3, cudf::mask_state::ALL_VALID); + cudf::set_null_mask(static_cast(mask.data()), 0, 1, false); + auto data = + cudf::make_lists_column(3, offset_t{0, 0, 1, 2}.release(), leaf.release(), 1, std::move(mask)); + auto s = cudf::make_list_scalar(*data); + + auto col = cudf::make_column_from_scalar(*s, 3); + + auto leaf2 = this->make_test_structs_column( + {{1, 2, 1, 2, 1, 2}, {0, 1, 0, 1, 0, 1}}, + StringCW({"étoile", "星", "étoile", "星", "étoile", "星"}, + {true, true, true, true, true, true}), + LCWinner_t( + {LCWinner_t{}, LCWinner_t{42}, LCWinner_t{}, LCWinner_t{42}, LCWinner_t{}, LCWinner_t{42}}, + valid_t{1, 1, 1, 1, 1, 1}.begin()), + valid_t{0, 1, 0, 1, 0, 1}.begin()); + auto mask2 = cudf::create_null_mask(9, cudf::mask_state::ALL_VALID); + cudf::set_null_mask(static_cast(mask2.data()), 0, 1, false); + cudf::set_null_mask(static_cast(mask2.data()), 3, 4, false); + cudf::set_null_mask(static_cast(mask2.data()), 6, 7, false); + auto data2 = cudf::make_lists_column( + 9, offset_t{0, 0, 1, 2, 2, 3, 4, 4, 5, 6}.release(), leaf2.release(), 3, std::move(mask2)); + auto expected = cudf::make_lists_column(3, + offset_t{0, 3, 6, 9}.release(), + std::move(data2), + 0, + cudf::create_null_mask(3, cudf::mask_state::UNALLOCATED)); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*col, *expected); +} + +class ListsZeroLengthColumnTest : public ColumnFactoryTest { + protected: + using StructsCW = cudf::test::structs_column_wrapper; + StructsCW make_test_structs_column(cudf::test::fixed_width_column_wrapper field1, + cudf::test::strings_column_wrapper field2, + 
cudf::test::lists_column_wrapper field3) + { + return StructsCW{field1, field2, field3}; + } +}; + +TEST_F(ListsZeroLengthColumnTest, MixedTypes) +{ + using FCW = cudf::test::fixed_width_column_wrapper; + using StringCW = cudf::test::strings_column_wrapper; + using LCW = cudf::test::lists_column_wrapper; + using offset_t = cudf::test::fixed_width_column_wrapper; + { + auto s = cudf::make_list_scalar(FCW{1, 2, 3}); + auto got = cudf::make_column_from_scalar(*s, 0); + auto expected = + cudf::make_lists_column(0, + offset_t{}.release(), + FCW{}.release(), + 0, + cudf::create_null_mask(0, cudf::mask_state::UNALLOCATED)); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*got, *expected); + } + + { + auto s = cudf::make_list_scalar(LCW{LCW{1, 2, 3}, LCW{}, LCW{5, 6}}); + auto got = cudf::make_column_from_scalar(*s, 0); + auto nested = cudf::make_lists_column(0, + offset_t{}.release(), + FCW{}.release(), + 0, + cudf::create_null_mask(0, cudf::mask_state::UNALLOCATED)); + auto expected = + cudf::make_lists_column(0, + offset_t{}.release(), + std::move(nested), + 0, + cudf::create_null_mask(0, cudf::mask_state::UNALLOCATED)); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*got, *expected); + } + + { + auto s = cudf::make_list_scalar( + this->make_test_structs_column({1, 2, 3}, StringCW({"x", "", "y"}), LCW{{5, 6}, {}, {7}})); + auto got = cudf::make_column_from_scalar(*s, 0); + + std::vector> children; + children.emplace_back(FCW{}.release()); + children.emplace_back(StringCW{}.release()); + children.emplace_back(LCW{}.release()); + auto nested = cudf::make_structs_column( + 0, std::move(children), 0, cudf::create_null_mask(0, cudf::mask_state::UNALLOCATED)); + + auto expected = + cudf::make_lists_column(0, + offset_t{}.release(), + std::move(nested), + 0, + cudf::create_null_mask(0, cudf::mask_state::UNALLOCATED)); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*got, *expected); + } +} + void struct_from_scalar(bool is_valid) { using LCW = cudf::test::lists_column_wrapper;