diff --git a/CHANGELOG.md b/CHANGELOG.md index 8d1b820035f..a6310550c86 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -56,6 +56,7 @@ - PR #5645 Enforce pd.NA and Pandas nullable dtype parity - PR #5729 Create nvtext normalize_characters API from the subword_tokenize internal function - PR #5572 Add `cudf::encode` API. +- PR #5753 Add `cudf::lists::extract_list_element` API - PR #5568 Add support for `Series.keys()` and `DataFrame.keys()` - PR #5782 Add Kafka support to custreamz - PR #5642 Add `GroupBy.groups()` diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 29eb4734249..e3a5c40b8fd 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -102,6 +102,7 @@ test: - test -f $PREFIX/include/cudf/ipc.hpp - test -f $PREFIX/include/cudf/join.hpp - test -f $PREFIX/include/cudf/lists/detail/concatenate.hpp + - test -f $PREFIX/include/cudf/lists/extract.hpp - test -f $PREFIX/include/cudf/lists/lists_column_view.hpp - test -f $PREFIX/include/cudf/merge.hpp - test -f $PREFIX/include/cudf/null_mask.hpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 94cf9486e96..0483e2c4cc5 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -558,6 +558,7 @@ add_library(cudf src/strings/substring.cu src/strings/translate.cu src/strings/utilities.cu + src/lists/extract.cu src/lists/lists_column_factories.cu src/lists/lists_column_view.cu src/lists/copying/concatenate.cu diff --git a/cpp/include/cudf/lists/extract.hpp b/cpp/include/cudf/lists/extract.hpp new file mode 100644 index 00000000000..44b124457be --- /dev/null +++ b/cpp/include/cudf/lists/extract.hpp @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include + +namespace cudf { +namespace lists { +/** + * @ingroup lists_extract + * @{ + */ + +/** + * @brief Create a column using values from row `index` from each + * sublist within the input `lists_column`. + * + * Output `column[i]` is set from element `lists_column[i][index]`. + * If `index` is out of bounds for the sublist at `lists_column[i]` (that is, + * `index >= size` or `index < -size`) then output `column[i] = null`. + * + * @code{.pseudo} + * l = { {1, 2, 3}, {4}, {5, 6} } + * r = extract_list_element(l, 1) + * r is now {2, null, 6} + * @endcode + * + * The `index` may also be negative, in which case the row retrieved is counted + * back from the end of each sublist (`-1` selects the last element). + * + * @code{.pseudo} + * l = { {"a"}, {"b", "c"}, {"d", "e", "f"} } + * r = extract_list_element(l, -1) + * r is now {"a", "c", "f"} + * @endcode + * + * Any input where `lists_column[i] == null` will produce + * output `column[i] = null`. Also, any element where + * `lists_column[i][index] == null` will produce + * output `column[i] = null`. + * + * @param lists_column Column to extract elements from. + * @param index The row within each sublist to retrieve; may be negative (see above). + * @param mr Device memory resource used to allocate the returned column's device memory. + * @return Column of extracted elements. 
+ */ +std::unique_ptr extract_list_element( + lists_column_view const& lists_column, + size_type index, + rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource()); + +/** @} */ // end of group +} // namespace lists +} // namespace cudf diff --git a/cpp/include/doxygen_groups.h b/cpp/include/doxygen_groups.h index fd446b032cc..a6238369afb 100644 --- a/cpp/include/doxygen_groups.h +++ b/cpp/include/doxygen_groups.h @@ -118,6 +118,10 @@ * @defgroup io_readers Readers * @defgroup io_writers Writers * @} + * @defgroup lists_apis Lists + * @{ + * @defgroup lists_extract Extracting + * @} * @defgroup nvtext_apis NVText * @{ * @defgroup nvtext_ngrams NGrams diff --git a/cpp/src/lists/extract.cu b/cpp/src/lists/extract.cu new file mode 100644 index 00000000000..565e5ef6ea4 --- /dev/null +++ b/cpp/src/lists/extract.cu @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include +#include +#include + +#include + +namespace cudf { +namespace lists { +namespace detail { + +namespace { + +/** + * @brief Convert index value for each sublist into a gather index for + * the lists column's child column. 
+ */ +template +struct map_index_fn { + column_device_view const d_offsets; // offsets to each sublist (including validity mask) + size_type const index; // index of element within each sublist + size_type const out_of_bounds; // value to use to indicate out-of-bounds + + __device__ int32_t operator()(size_type idx) + { + if (d_offsets.is_null(idx)) return out_of_bounds; /* a null list row maps to the out-of-bounds value */ + auto const offset = d_offsets.element(idx); + auto const length = d_offsets.element(idx + 1) - offset; /* sublist length is the difference of consecutive offsets */ + if (PositiveIndex) + return index < length ? index + offset : out_of_bounds; /* index counted from the start of the sublist */ + else + return index >= -length ? length + index + offset : out_of_bounds; /* negative index counted back from the end of the sublist */ + } +}; + +} // namespace + +/** + * @copydoc cudf::lists::extract_list_element + * + * @param stream CUDA stream used for device memory operations and kernel launches. + */ +std::unique_ptr extract_list_element(lists_column_view lists_column, + size_type index, + cudaStream_t stream, + rmm::mr::device_memory_resource* mr) +{ + if (lists_column.size() == 0) return empty_like(lists_column.parent()); /* NOTE(review): for zero rows this presumably returns an empty column typed like the parent list column, not like the child elements -- confirm this is intended */ + auto const offsets_column = lists_column.offsets(); + + // create a column_view with attributes of the parent and data from the offsets + column_view annotated_offsets(data_type{type_id::INT32}, + lists_column.size() + 1, + offsets_column.data(), + lists_column.null_mask(), + lists_column.null_count(), + lists_column.offset()); + + // create a gather map for extracting elements from the child column + auto gather_map = make_fixed_width_column( + data_type{type_id::INT32}, annotated_offsets.size() - 1, mask_state::UNALLOCATED, stream); /* annotated_offsets.size() - 1 equals lists_column.size(): one gather index per list row */ + auto d_gather_map = gather_map->mutable_view().data(); + auto const child_column = lists_column.child(); + + // build the gather map using the offsets and the provided index + auto const d_column = column_device_view::create(annotated_offsets, stream); + if (index < 0) + thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(gather_map->size()), 
+ d_gather_map, + map_index_fn{*d_column, index, child_column.size()}); /* child_column.size() is supplied as the out_of_bounds value */ + else + thrust::transform(rmm::exec_policy(stream)->on(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(gather_map->size()), + d_gather_map, + map_index_fn{*d_column, index, child_column.size()}); + + // call gather on the child column (out-of-bounds map entries rely on the nullify flag below) + auto result = cudf::detail::gather(table_view({child_column}), + d_gather_map, + d_gather_map + gather_map->size(), + true, // nullify-out-of-bounds + mr, + stream) + ->release(); + if (result.front()->null_count() == 0) + result.front()->set_null_mask(rmm::device_buffer{0, stream, mr}, 0); /* result reports no nulls: replace its mask with a zero-size buffer and null count 0 */ + return std::unique_ptr(std::move(result.front())); +} + +} // namespace detail + +/** + * @copydoc cudf::lists::extract_list_element + */ +std::unique_ptr extract_list_element(lists_column_view const& lists_column, + size_type index, + rmm::mr::device_memory_resource* mr) +{ + return detail::extract_list_element(lists_column, index, 0, mr); /* 0 is passed for the cudaStream_t stream parameter */ +} + +} // namespace lists +} // namespace cudf diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index b8cc3b09213..f3cdf2c66eb 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -583,6 +583,14 @@ set(ENCODE_TEST_SRC ConfigureTest(ENCODE_TEST "${ENCODE_TEST_SRC}") +################################################################################################### +# - lists tests ---------------------------------------------------------------------------------- + +set(LISTS_TEST_SRC + "${CMAKE_CURRENT_SOURCE_DIR}/lists/extract_tests.cpp") + +ConfigureTest(LISTS_TEST "${LISTS_TEST_SRC}") + ################################################################################################### ### enable testing ################################################################################ ################################################################################################### diff --git a/cpp/tests/lists/extract_tests.cpp b/cpp/tests/lists/extract_tests.cpp new 
file mode 100644 index 00000000000..c98da9e7dc6 --- /dev/null +++ b/cpp/tests/lists/extract_tests.cpp @@ -0,0 +1,255 @@ +/* + * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include +#include +#include + +#include +#include + +struct ListsExtractTest : public cudf::test::BaseFixture { +}; + +using NumericTypesNotBool = + cudf::test::Concat; + +template +class ListsExtractNumericsTest : public ListsExtractTest { +}; + +TYPED_TEST_CASE(ListsExtractNumericsTest, NumericTypesNotBool); + +TYPED_TEST(ListsExtractNumericsTest, ExtractElement) +{ + auto validity = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), [](auto i) { return i != 1; }); + using LCW = cudf::test::lists_column_wrapper; + LCW input({LCW{3, 2, 1}, LCW{}, LCW{30, 20, 10, 50}, LCW{100, 120}, LCW{0}}, validity); /* validity is false only at i == 1, so row 1 is expected to be null in every result below */ + + { + auto result = cudf::lists::extract_list_element(cudf::lists_column_view(input), 0); + cudf::test::fixed_width_column_wrapper expected({3, 0, 30, 100, 0}, {1, 0, 1, 1, 1}); + cudf::test::expect_columns_equal(expected, *result); + } + { + auto result = cudf::lists::extract_list_element(cudf::lists_column_view(input), 1); + cudf::test::fixed_width_column_wrapper expected({2, 0, 20, 120, 0}, {1, 0, 1, 1, 0}); + cudf::test::expect_columns_equal(expected, *result); + } + { + auto result = cudf::lists::extract_list_element(cudf::lists_column_view(input), 2); 
cudf::test::fixed_width_column_wrapper expected({1, 0, 10, 0, 0}, {1, 0, 1, 0, 0}); + cudf::test::expect_columns_equal(expected, *result); + } + { + auto result = cudf::lists::extract_list_element(cudf::lists_column_view(input), 3); + cudf::test::fixed_width_column_wrapper expected({0, 0, 50, 0, 0}, {0, 0, 1, 0, 0}); + cudf::test::expect_columns_equal(expected, *result); + } + { + auto result = cudf::lists::extract_list_element(cudf::lists_column_view(input), 4); /* index 4 is past the end of the longest sublist (4 elements): every row is expected null */ + cudf::test::fixed_width_column_wrapper expected({0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}); + cudf::test::expect_columns_equal(expected, *result); + } + { + auto result = cudf::lists::extract_list_element(cudf::lists_column_view(input), -1); + cudf::test::fixed_width_column_wrapper expected({1, 0, 50, 120, 0}, {1, 0, 1, 1, 1}); + cudf::test::expect_columns_equal(expected, *result); + } + { + auto result = cudf::lists::extract_list_element(cudf::lists_column_view(input), -2); + cudf::test::fixed_width_column_wrapper expected({2, 0, 10, 100, 0}, {1, 0, 1, 1, 0}); + cudf::test::expect_columns_equal(expected, *result); + } + { + auto result = cudf::lists::extract_list_element(cudf::lists_column_view(input), -3); + cudf::test::fixed_width_column_wrapper expected({3, 0, 20, 0, 0}, {1, 0, 1, 0, 0}); + cudf::test::expect_columns_equal(expected, *result); + } + { + auto result = cudf::lists::extract_list_element(cudf::lists_column_view(input), -4); + cudf::test::fixed_width_column_wrapper expected({0, 0, 30, 0, 0}, {0, 0, 1, 0, 0}); + cudf::test::expect_columns_equal(expected, *result); + } + { + auto result = cudf::lists::extract_list_element(cudf::lists_column_view(input), -5); /* index -5 reaches before the start of the longest sublist: every row is expected null */ + cudf::test::fixed_width_column_wrapper expected({0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}); + cudf::test::expect_columns_equal(expected, *result); + } +} + +TEST_F(ListsExtractTest, ExtractElementStrings) +{ + auto validity = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), [](auto i) { return i != 1; }); + using LCW = 
cudf::test::lists_column_wrapper; + LCW input( + {LCW{"", "Héllo", "thesé"}, LCW{}, LCW{"are", "some", "", "z"}, LCW{"tést", "String"}, LCW{""}}, + validity); /* same row layout as the numeric test: validity is false only at i == 1; empty strings inside sublists are expected to stay valid */ + + { + auto result = cudf::lists::extract_list_element(cudf::lists_column_view(input), 0); + cudf::test::strings_column_wrapper expected({"", "", "are", "tést", ""}, {1, 0, 1, 1, 1}); + cudf::test::expect_columns_equal(expected, *result); + } + { + auto result = cudf::lists::extract_list_element(cudf::lists_column_view(input), 1); + cudf::test::strings_column_wrapper expected({"Héllo", "", "some", "String", ""}, + {1, 0, 1, 1, 0}); + cudf::test::expect_columns_equal(expected, *result); + } + { + auto result = cudf::lists::extract_list_element(cudf::lists_column_view(input), 2); + cudf::test::strings_column_wrapper expected({"thesé", "", "", "", ""}, {1, 0, 1, 0, 0}); + cudf::test::expect_columns_equal(expected, *result); + } + { + auto result = cudf::lists::extract_list_element(cudf::lists_column_view(input), 3); + cudf::test::strings_column_wrapper expected({"", "", "z", "", ""}, {0, 0, 1, 0, 0}); + cudf::test::expect_columns_equal(expected, *result); + } + { + auto result = cudf::lists::extract_list_element(cudf::lists_column_view(input), 4); + cudf::test::strings_column_wrapper expected({"", "", "", "", ""}, {0, 0, 0, 0, 0}); + cudf::test::expect_columns_equal(expected, *result); + } + { + auto result = cudf::lists::extract_list_element(cudf::lists_column_view(input), -1); + cudf::test::strings_column_wrapper expected({"thesé", "", "z", "String", ""}, {1, 0, 1, 1, 1}); + cudf::test::expect_columns_equal(expected, *result); + } + { + auto result = cudf::lists::extract_list_element(cudf::lists_column_view(input), -2); + cudf::test::strings_column_wrapper expected({"Héllo", "", "", "tést", ""}, {1, 0, 1, 1, 0}); + cudf::test::expect_columns_equal(expected, *result); + } + { + auto result = cudf::lists::extract_list_element(cudf::lists_column_view(input), -3); + cudf::test::strings_column_wrapper 
expected({"", "", "some", "", ""}, {1, 0, 1, 0, 0}); + cudf::test::expect_columns_equal(expected, *result); + } + { + auto result = cudf::lists::extract_list_element(cudf::lists_column_view(input), -4); + cudf::test::strings_column_wrapper expected({"", "", "are", "", ""}, {0, 0, 1, 0, 0}); + cudf::test::expect_columns_equal(expected, *result); + } + { + auto result = cudf::lists::extract_list_element(cudf::lists_column_view(input), -5); + cudf::test::strings_column_wrapper expected({"", "", "", "", ""}, {0, 0, 0, 0, 0}); + cudf::test::expect_columns_equal(expected, *result); + } +} + +TYPED_TEST(ListsExtractNumericsTest, ExtractElementNestedLists) +{ + std::vector validity{1, 0, 1, 1}; /* row 1 is flagged invalid */ + using LCW = cudf::test::lists_column_wrapper; + LCW list({LCW{LCW{2, 3}, LCW{4, 5}}, + LCW{LCW{}}, + LCW{LCW{6, 7, 8}, LCW{9, 10, 11}, LCW{12, 13, 14}}, + LCW{LCW{15, 16}, LCW{17, 18}, LCW{19, 20}, LCW{21, 22}, LCW{23, 24}}}, + validity.begin()); /* list of lists: each extracted element is itself a list, so the expected columns are LCW too */ + + { + auto result = cudf::lists::extract_list_element(cudf::lists_column_view(list), 0); + LCW expected({LCW{2, 3}, LCW{}, LCW{6, 7, 8}, LCW{15, 16}}, validity.begin()); + cudf::test::expect_columns_equal(expected, *result); + } + { + auto result = cudf::lists::extract_list_element(cudf::lists_column_view(list), 1); + LCW expected({LCW{4, 5}, LCW{}, LCW{9, 10, 11}, LCW{17, 18}}, validity.begin()); + cudf::test::expect_columns_equal(expected, *result); + } + { + auto result = cudf::lists::extract_list_element(cudf::lists_column_view(list), 2); + std::vector expected_validity{0, 0, 1, 1}; + LCW expected({LCW{}, LCW{}, LCW{12, 13, 14}, LCW{19, 20}}, expected_validity.begin()); + cudf::test::expect_columns_equal(expected, *result); + } + { + auto result = cudf::lists::extract_list_element(cudf::lists_column_view(list), 3); + std::vector expected_validity{0, 0, 0, 1}; + LCW expected({LCW{}, LCW{}, LCW{}, LCW{21, 22}}, expected_validity.begin()); + cudf::test::expect_columns_equal(expected, *result); + } + { + auto result = 
cudf::lists::extract_list_element(cudf::lists_column_view(list), -1); + LCW expected({LCW{4, 5}, LCW{}, LCW{12, 13, 14}, LCW{23, 24}}, validity.begin()); + cudf::test::expect_columns_equal(expected, *result); + } + { + auto result = cudf::lists::extract_list_element(cudf::lists_column_view(list), -2); + LCW expected({LCW{2, 3}, LCW{}, LCW{9, 10, 11}, LCW{21, 22}}, validity.begin()); + cudf::test::expect_columns_equal(expected, *result); + } + { + auto result = cudf::lists::extract_list_element(cudf::lists_column_view(list), -3); + std::vector expected_validity{0, 0, 1, 1}; + LCW expected({LCW{}, LCW{}, LCW{6, 7, 8}, LCW{19, 20}}, expected_validity.begin()); + cudf::test::expect_columns_equal(expected, *result); + } +} + +TEST_F(ListsExtractTest, ExtractElementEmpty) +{ + auto empty = cudf::make_empty_column(cudf::data_type{cudf::type_id::LIST}); + auto result = cudf::lists::extract_list_element(cudf::lists_column_view(empty->view()), 1); + EXPECT_EQ(0, result->size()); /* zero-row input: only the result size is checked here, not its type */ + + using LCW = cudf::test::lists_column_wrapper; + LCW empty_strings({LCW{"", "", ""}}); + result = cudf::lists::extract_list_element(cudf::lists_column_view(empty_strings), 1); + cudf::test::strings_column_wrapper expected({""}); /* an empty string element is expected to come back valid, not null */ + cudf::test::expect_columns_equal(expected, *result); + + LCW null_strings({LCW{"", "", ""}}, thrust::make_constant_iterator(0)); /* constant-0 validity: the single row is expected to be null */ + result = cudf::lists::extract_list_element(cudf::lists_column_view(null_strings), 1); + cudf::test::strings_column_wrapper expected_null({""}, {0}); + cudf::test::expect_columns_equal(expected_null, *result); +} + +TEST_F(ListsExtractTest, ExtractElementWithNulls) +{ + auto validity = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), [](auto i) { return i != 1; }); + using LCW = cudf::test::lists_column_wrapper; + LCW input{ + {{"Héllo", "", "thesé"}, validity}, {"are"}, {{"some", ""}, validity}, {"tést", "strings"}}; /* validity is attached to individual sublists here; the expectations below treat element 1 of rows 0 and 2 as null */ + + { + auto result = cudf::lists::extract_list_element(cudf::lists_column_view(input), 0); + 
cudf::test::strings_column_wrapper expected({"Héllo", "are", "some", "tést"}); + cudf::test::expect_columns_equal(expected, *result); + } + { + auto result = cudf::lists::extract_list_element(cudf::lists_column_view(input), 1); /* rows 0 and 2 hit a null element, row 1 is out of range: only row 3 is expected valid */ + cudf::test::strings_column_wrapper expected({"", "", "", "strings"}, {0, 0, 0, 1}); + cudf::test::expect_columns_equal(expected, *result); + } + { + auto result = cudf::lists::extract_list_element(cudf::lists_column_view(input), -1); + cudf::test::strings_column_wrapper expected({"thesé", "are", "", "strings"}, {1, 1, 0, 1}); + cudf::test::expect_columns_equal(expected, *result); + } +} + +CUDF_TEST_PROGRAM_MAIN()