From 673cc81693cedd274aee140c91a3bebf7b0f83ce Mon Sep 17 00:00:00 2001 From: davidwendt Date: Tue, 19 Jan 2021 17:44:40 -0500 Subject: [PATCH 1/5] Add libcudf lists column count_elements API --- cpp/include/cudf/lists/count_elements.hpp | 56 ++++++++++++ cpp/src/lists/count_elements.cu | 88 +++++++++++++++++++ cpp/tests/CMakeLists.txt | 1 + cpp/tests/lists/count_elements_tests.cpp | 100 ++++++++++++++++++++++ 4 files changed, 245 insertions(+) create mode 100644 cpp/include/cudf/lists/count_elements.hpp create mode 100644 cpp/src/lists/count_elements.cu create mode 100644 cpp/tests/lists/count_elements_tests.cpp diff --git a/cpp/include/cudf/lists/count_elements.hpp b/cpp/include/cudf/lists/count_elements.hpp new file mode 100644 index 00000000000..6b802d2ad5e --- /dev/null +++ b/cpp/include/cudf/lists/count_elements.hpp @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +namespace cudf { +namespace lists { +/** + * @addtogroup lists_elements + * @{ + * @file + */ + +/** + * @brief Returns a numeric column containing the number of elements in + * each list row of the given lists column. + * + * The output column will have the same number of rows as the + * input lists column. Each `output[i]` will be `input[i].size()`. 
+ * + * @code{.pseudo} + * l = { {1, 2, 3}, {4}, {5, 6} } + * r = count_elements(l) + * r is now {3, 1, 2} + * @endcode + * + * Any null input element will result in a corresponding null entry + * in the output column. + * + * @param input Input lists column. + * @param mr Device memory resource used to allocate the returned column's device memory. + * @return New INT32 column with the number of elements for each row. + */ +std::unique_ptr count_elements( + lists_column_view const& input, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** @} */ // end of lists_elements group + +} // namespace lists +} // namespace cudf diff --git a/cpp/src/lists/count_elements.cu b/cpp/src/lists/count_elements.cu new file mode 100644 index 00000000000..54f0d9800c1 --- /dev/null +++ b/cpp/src/lists/count_elements.cu @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +namespace cudf { +namespace lists { +namespace detail { +/** + * @brief Returns a numeric column containing lengths of each element. + * + * @param input Input lists column. + * @param stream CUDA stream used for device memory operations and kernel launches. 
+ * @param mr Device memory resource used to allocate the returned column's device memory. + * @return New INT32 column with lengths. + */ +std::unique_ptr count_elements(lists_column_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto device_column = cudf::column_device_view::create(input.parent(), stream); + auto d_column = *device_column; + // create output column + auto output = make_fixed_width_column(data_type{type_to_id()}, + input.size(), + copy_bitmask(input.parent()), + input.null_count(), + stream, + mr); + auto d_output = output->mutable_view().data(); + + // fill in the lengths + thrust::transform(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(input.size()), + d_output, + [d_column] __device__(size_type idx) { + if (d_column.is_null(idx)) return size_type{0}; + auto d_offsets = + d_column.child(lists_column_view::offsets_column_index).data() + + d_column.offset(); + return d_offsets[idx + 1] - d_offsets[idx]; + }); + + output->set_null_count(input.null_count()); // reset null count + return output; +} + +} // namespace detail + +// external APIs + +std::unique_ptr count_elements(lists_column_view const& input, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::count_elements(input, rmm::cuda_stream_default, mr); +} + +} // namespace lists +} // namespace cudf diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 0d958f47b6b..106638a1a6f 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -653,6 +653,7 @@ ConfigureTest(AST_TEST "${AST_TEST_SRC}") # - lists tests ---------------------------------------------------------------------------------- set(LISTS_TEST_SRC + "${CMAKE_CURRENT_SOURCE_DIR}/lists/count_elements_tests.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/lists/extract_tests.cpp") ConfigureTest(LISTS_TEST "${LISTS_TEST_SRC}") diff --git a/cpp/tests/lists/count_elements_tests.cpp 
b/cpp/tests/lists/count_elements_tests.cpp new file mode 100644 index 00000000000..d74e73aa548 --- /dev/null +++ b/cpp/tests/lists/count_elements_tests.cpp @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include +#include + +struct ListsElementsTest : public cudf::test::BaseFixture { +}; + +using NumericTypesNotBool = + cudf::test::Concat; + +template +class ListsElementsNumericsTest : public ListsElementsTest { +}; + +TYPED_TEST_CASE(ListsElementsNumericsTest, NumericTypesNotBool); + +TYPED_TEST(ListsElementsNumericsTest, CountElements) +{ + auto validity = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), [](auto i) { return i != 1; }); + using LCW = cudf::test::lists_column_wrapper; + LCW input({LCW{3, 2, 1}, LCW{}, LCW{30, 20, 10, 50}, LCW{100, 120}, LCW{0}}, validity); + + auto result = cudf::lists::count_elements(cudf::lists_column_view(input)); + cudf::test::fixed_width_column_wrapper expected({3, 0, 4, 2, 1}, {1, 0, 1, 1, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); +} + +TEST_F(ListsElementsTest, CountElementsStrings) +{ + auto validity = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), [](auto i) { return i != 1; }); + using LCW = cudf::test::lists_column_wrapper; + LCW input( + {LCW{"", "Héllo", "thesé"}, LCW{}, LCW{"are", "some", "", "z"}, LCW{"tést", "String"}, LCW{""}}, + 
validity); + + auto result = cudf::lists::count_elements(cudf::lists_column_view(input)); + cudf::test::fixed_width_column_wrapper expected({3, 0, 4, 2, 1}, {1, 0, 1, 1, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); +} + +TEST_F(ListsElementsTest, CountElementsSliced) +{ + auto validity = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), [](auto i) { return i != 1; }); + using LCW = cudf::test::lists_column_wrapper; + LCW input( + {LCW{"", "Héllo", "thesé"}, LCW{}, LCW{"are", "some", "", "z"}, LCW{"tést", "String"}, LCW{""}}, + validity); + + auto sliced = cudf::slice(input, {1, 4}).front(); + auto result = cudf::lists::count_elements(cudf::lists_column_view(sliced)); + cudf::test::fixed_width_column_wrapper expected({0, 4, 2}, {0, 1, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); +} + +TYPED_TEST(ListsElementsNumericsTest, CountElementsNestedLists) +{ + std::vector validity{1, 0, 1, 1}; + using LCW = cudf::test::lists_column_wrapper; + LCW list({LCW{LCW{2, 3}, LCW{4, 5}}, + LCW{LCW{}}, + LCW{LCW{6, 7, 8}, LCW{9, 10, 11}, LCW{12, 13, 14}}, + LCW{LCW{15, 16}, LCW{17, 18}, LCW{19, 20}, LCW{21, 22}, LCW{23, 24}}}, + validity.begin()); + + auto result = cudf::lists::count_elements(cudf::lists_column_view(list)); + cudf::test::fixed_width_column_wrapper expected({2, 1, 3, 5}, {1, 0, 1, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); +} + +TEST_F(ListsElementsTest, CountElementsEmpty) +{ + using LCW = cudf::test::lists_column_wrapper; + + LCW empty{}; + auto result = cudf::lists::count_elements(cudf::lists_column_view(empty)); + EXPECT_EQ(0, result->size()); +} From be46759c48a2448a80a686c56313b34b6161132a Mon Sep 17 00:00:00 2001 From: davidwendt Date: Tue, 19 Jan 2021 18:40:08 -0500 Subject: [PATCH 2/5] update meta.yaml with new header file --- conda/recipes/libcudf/meta.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 
51b3e8afc05..c3e84447b3a 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -122,6 +122,7 @@ test: - test -f $PREFIX/include/cudf/join.hpp - test -f $PREFIX/include/cudf/lists/detail/concatenate.hpp - test -f $PREFIX/include/cudf/lists/detail/copying.hpp + - test -f $PREFIX/include/cudf/lists/count_elements.hpp - test -f $PREFIX/include/cudf/lists/extract.hpp - test -f $PREFIX/include/cudf/lists/gather.hpp - test -f $PREFIX/include/cudf/lists/lists_column_view.hpp From dd6761c3182c12adabfd5bb1d52894913b379c6b Mon Sep 17 00:00:00 2001 From: davidwendt Date: Wed, 20 Jan 2021 08:11:47 -0500 Subject: [PATCH 3/5] add lists_elements to doxygen defgroup --- cpp/include/doxygen_groups.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/include/doxygen_groups.h b/cpp/include/doxygen_groups.h index 03e00b881d8..1d796aca4b7 100644 --- a/cpp/include/doxygen_groups.h +++ b/cpp/include/doxygen_groups.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -143,6 +143,7 @@ * @defgroup lists_apis Lists * @{ * @defgroup lists_extract Extracting + * @defgroup lists_elements Counting * @} * @defgroup nvtext_apis NVText * @{ From 3c1e30845ad4bc559e6806cd16e1bfacac18b9c8 Mon Sep 17 00:00:00 2001 From: davidwendt Date: Wed, 20 Jan 2021 13:58:34 -0500 Subject: [PATCH 4/5] add nested list with null element to gtest --- cpp/tests/lists/count_elements_tests.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/tests/lists/count_elements_tests.cpp b/cpp/tests/lists/count_elements_tests.cpp index d74e73aa548..c5cb9d230c3 100644 --- a/cpp/tests/lists/count_elements_tests.cpp +++ b/cpp/tests/lists/count_elements_tests.cpp @@ -81,7 +81,7 @@ TYPED_TEST(ListsElementsNumericsTest, CountElementsNestedLists) using LCW = cudf::test::lists_column_wrapper; LCW list({LCW{LCW{2, 3}, LCW{4, 5}}, LCW{LCW{}}, - LCW{LCW{6, 7, 8}, LCW{9, 10, 11}, LCW{12, 13, 14}}, + LCW{LCW{6, 7, 8}, LCW{9, 10, 11}, LCW({12, 13, 14}, validity.begin())}, LCW{LCW{15, 16}, LCW{17, 18}, LCW{19, 20}, LCW{21, 22}, LCW{23, 24}}}, validity.begin()); From b0f97a5209cd2476797128be3a9a0c0c7e1b7251 Mon Sep 17 00:00:00 2001 From: davidwendt Date: Wed, 20 Jan 2021 17:06:03 -0500 Subject: [PATCH 5/5] change call to data() to begin() instead --- cpp/src/lists/count_elements.cu | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/cpp/src/lists/count_elements.cu b/cpp/src/lists/count_elements.cu index 54f0d9800c1..78549152770 100644 --- a/cpp/src/lists/count_elements.cu +++ b/cpp/src/lists/count_elements.cu @@ -48,19 +48,18 @@ std::unique_ptr count_elements(lists_column_view const& input, auto device_column = cudf::column_device_view::create(input.parent(), stream); auto d_column = *device_column; // create output column - auto output = make_fixed_width_column(data_type{type_to_id()}, + auto output = make_fixed_width_column(data_type{type_to_id()}, input.size(), copy_bitmask(input.parent()), input.null_count(), stream, mr); - auto d_output 
= output->mutable_view().data(); - // fill in the lengths + // fill in the sizes thrust::transform(rmm::exec_policy(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(input.size()), - d_output, + output->mutable_view().begin(), [d_column] __device__(size_type idx) { if (d_column.is_null(idx)) return size_type{0}; auto d_offsets =