Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[REVIEW] Add cudf::lists::extract_list_element API #5753

Merged
merged 21 commits into from
Aug 3, 2020
Merged
Show file tree
Hide file tree
Changes from 19 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@
- PR #5645 Enforce pd.NA and Pandas nullable dtype parity
- PR #5729 Create nvtext normalize_characters API from the subword_tokenize internal function
- PR #5572 Add `cudf::encode` API.
- PR #5753 Add `cudf::lists::extract_list_element` API
- PR #5568 Add support for `Series.keys()` and `DataFrame.keys()`

## Improvements
Expand Down
1 change: 1 addition & 0 deletions conda/recipes/libcudf/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ test:
- test -f $PREFIX/include/cudf/ipc.hpp
- test -f $PREFIX/include/cudf/join.hpp
- test -f $PREFIX/include/cudf/lists/detail/concatenate.hpp
- test -f $PREFIX/include/cudf/lists/extract.hpp
- test -f $PREFIX/include/cudf/lists/lists_column_view.hpp
- test -f $PREFIX/include/cudf/merge.hpp
- test -f $PREFIX/include/cudf/null_mask.hpp
Expand Down
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -558,6 +558,7 @@ add_library(cudf
src/strings/substring.cu
src/strings/translate.cu
src/strings/utilities.cu
src/lists/extract.cu
src/lists/lists_column_factories.cu
src/lists/lists_column_view.cu
src/lists/copying/concatenate.cu
Expand Down
69 changes: 69 additions & 0 deletions cpp/include/cudf/lists/extract.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
/*
* Copyright (c) 2020, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once

#include <cudf/column/column.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/lists/lists_column_view.hpp>

namespace cudf {
namespace lists {
/**
* @ingroup lists_extract
* @{
*/

/**
* @brief Create a column using values from row `index` from each
* sublist within the input `lists_column`.
*
* Output `column[i]` is set from element `lists_column[i][index]`.
* If `index` is out of bounds for the sublist at `lists_column[i]`
* (that is, `index >= size` or `index < -size` for a sublist holding
* `size` rows) then output `column[i] = null`.
*
* @code{.pseudo}
* l = { {1, 2, 3}, {4}, {5, 6} }
* r = extract_list_element(l, 1)
* r is now {2, null, 6}
* @endcode
*
* The `index` may also be negative in which case the row retrieved is offset
* from the end of each sublist.
*
* @code{.pseudo}
* l = { {"a"}, {"b", "c"}, {"d", "e", "f"} }
* r = extract_list_element(l, -1)
* r is now {"a", "c", "f"}
* @endcode
*
* Any input where `lists_column[i] == null` will produce
* output `column[i] = null`. Also, any element where
* `lists_column[i][index] == null` will produce
* output `column[i] = null`.
*
* @param lists_column Column to extract elements from.
* @param index The row within each sublist to retrieve.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return Column of extracted elements.
*/
std::unique_ptr<column> extract_list_element(
lists_column_view const& lists_column,
size_type index,
rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource());

/** @} */ // end of group
} // namespace lists
} // namespace cudf
4 changes: 4 additions & 0 deletions cpp/include/doxygen_groups.h
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,10 @@
* @defgroup io_readers Readers
* @defgroup io_writers Writers
* @}
* @defgroup lists_apis Lists
* @{
* @defgroup lists_extract Extracting
* @}
* @defgroup nvtext_apis NVText
* @{
* @defgroup nvtext_ngrams NGrams
Expand Down
121 changes: 121 additions & 0 deletions cpp/src/lists/extract.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
/*
* Copyright (c) 2020, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <cudf/column/column_device_view.cuh>
#include <cudf/column/column_factories.hpp>
#include <cudf/detail/gather.cuh>
#include <cudf/lists/extract.hpp>

#include <thrust/transform.h>

namespace cudf {
namespace lists {
namespace detail {

namespace {

/**
 * @brief Functor that translates a list row number into a gather index
 * for the child column of a lists column.
 *
 * The result is `out_of_bounds` when the row is null or when `index`
 * does not address an element inside that row's sublist.
 *
 * @tparam PositiveIndex True to treat `index` as a forward position from the
 *                       start of each sublist; false to treat it as a
 *                       (negative) position counted back from the end.
 */
template <bool PositiveIndex = true>
struct map_index_fn {
  column_device_view const d_offsets;  // sublist offsets; its validity mask marks null rows
  size_type const index;               // requested position within each sublist
  size_type const out_of_bounds;       // sentinel returned when no element can be selected

  __device__ int32_t operator()(size_type idx)
  {
    // Null rows never select an element.
    if (d_offsets.is_null(idx)) { return out_of_bounds; }

    // Consecutive offsets bound the sublist: [begin, begin + count).
    auto const begin = d_offsets.element<int32_t>(idx);
    auto const count = d_offsets.element<int32_t>(idx + 1) - begin;

    if (PositiveIndex) {
      if (index < count) { return index + begin; }
    } else {
      // `index` is negative here, so `count + index` counts back from the end.
      if (index >= -count) { return count + index + begin; }
    }
    return out_of_bounds;
  }
};

} // namespace

/**
 * @copydoc cudf::lists::extract_list_element
 *
 * Implementation outline: the sublist offsets are combined with the parent's
 * validity mask, converted into a gather map over the child column, and the
 * child is then gathered with out-of-bounds entries turned into nulls.
 *
 * @param stream CUDA stream used for device memory operations and kernel launches.
 */
std::unique_ptr<column> extract_list_element(lists_column_view lists_column,
                                             size_type index,
                                             cudaStream_t stream,
                                             rmm::mr::device_memory_resource* mr)
{
  // The output holds elements of the child column, so an empty input must yield an
  // empty column of the child's type (not an empty LIST column like the parent).
  // NOTE(review): assumes an empty lists column still carries its child column -- confirm
  // for inputs that may have been built without children.
  if (lists_column.size() == 0) { return empty_like(lists_column.child()); }
  auto const offsets_column = lists_column.offsets();

  // create a column_view with attributes of the parent and data from the offsets
  column_view annotated_offsets(data_type{type_id::INT32},
                                lists_column.size() + 1,
                                offsets_column.data<int32_t>(),
                                lists_column.null_mask(),
                                lists_column.null_count(),
                                lists_column.offset());

  // create a gather map for extracting elements from the child column
  auto gather_map = make_fixed_width_column(
    data_type{type_id::INT32}, annotated_offsets.size() - 1, mask_state::UNALLOCATED, stream);
  auto d_gather_map       = gather_map->mutable_view().data<int32_t>();
  auto const child_column = lists_column.child();

  // build the gather map using the offsets and the provided index;
  // child_column.size() is one past the last valid row, so it serves as the out-of-bounds marker
  auto const d_column = column_device_view::create(annotated_offsets, stream);
  if (index < 0) {
    thrust::transform(rmm::exec_policy(stream)->on(stream),
                      thrust::make_counting_iterator<size_type>(0),
                      thrust::make_counting_iterator<size_type>(gather_map->size()),
                      d_gather_map,
                      map_index_fn<false>{*d_column, index, child_column.size()});
  } else {
    thrust::transform(rmm::exec_policy(stream)->on(stream),
                      thrust::make_counting_iterator<size_type>(0),
                      thrust::make_counting_iterator<size_type>(gather_map->size()),
                      d_gather_map,
                      map_index_fn<true>{*d_column, index, child_column.size()});
  }

  // call gather on the child column
  auto result = cudf::detail::gather(table_view({child_column}),
                                     d_gather_map,
                                     d_gather_map + gather_map->size(),
                                     true,  // nullify-out-of-bounds
                                     mr,
                                     stream)
                  ->release();

  // replace the mask with an empty buffer when the gathered column has no nulls
  if (result.front()->null_count() == 0) {
    result.front()->set_null_mask(rmm::device_buffer{0, stream, mr}, 0);
  }
  return std::move(result.front());
}

} // namespace detail

/**
 * @copydoc cudf::lists::extract_list_element
 */
std::unique_ptr<column> extract_list_element(lists_column_view const& lists_column,
                                             size_type index,
                                             rmm::mr::device_memory_resource* mr)
{
  // Public entry point: forward to the detail implementation on stream 0.
  cudaStream_t const stream = 0;
  return detail::extract_list_element(lists_column, index, stream, mr);
}

} // namespace lists
} // namespace cudf
8 changes: 8 additions & 0 deletions cpp/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -583,6 +583,14 @@ set(ENCODE_TEST_SRC

ConfigureTest(ENCODE_TEST "${ENCODE_TEST_SRC}")

###################################################################################################
# - lists tests ----------------------------------------------------------------------------------

set(LISTS_TEST_SRC
"${CMAKE_CURRENT_SOURCE_DIR}/lists/extract_tests.cpp")

ConfigureTest(LISTS_TEST "${LISTS_TEST_SRC}")

###################################################################################################
### enable testing ################################################################################
###################################################################################################
Expand Down
Loading