Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use offsetalator in cudf::strings::copy_slice #14844

Merged
merged 3 commits into from
Jan 29, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 19 additions & 16 deletions cpp/src/strings/copying/copying.cu
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,10 @@

#include <cudf/column/column_factories.hpp>
#include <cudf/detail/copy.hpp>
#include <cudf/detail/get_value.cuh>
#include <cudf/detail/null_mask.hpp>
#include <cudf/detail/offsets_iterator_factory.cuh>
#include <cudf/strings/detail/copying.hpp>
#include <cudf/strings/detail/utilities.hpp>
#include <cudf/strings/strings_column_view.hpp>

#include <rmm/cuda_stream_view.hpp>
Expand All @@ -33,47 +34,49 @@ namespace cudf {
namespace strings {
namespace detail {

std::unique_ptr<cudf::column> copy_slice(strings_column_view const& strings,
std::unique_ptr<cudf::column> copy_slice(strings_column_view const& input,
size_type start,
size_type end,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
if (strings.is_empty()) return make_empty_column(type_id::STRING);
if (end < 0 || end > strings.size()) end = strings.size();
if (input.is_empty()) { return make_empty_column(type_id::STRING); }
CUDF_EXPECTS(((start >= 0) && (start < end)), "Invalid start parameter value.");
auto const strings_count = end - start;
auto const offsets_offset = start + strings.offset();
auto const offsets_offset = start + input.offset();

// slice the offsets child column
auto offsets_column = std::make_unique<cudf::column>(
cudf::detail::slice(
strings.offsets(), {offsets_offset, offsets_offset + strings_count + 1}, stream)
input.offsets(), {offsets_offset, offsets_offset + strings_count + 1}, stream)
.front(),
stream,
mr);
auto const chars_offset =
offsets_offset == 0 ? 0 : cudf::detail::get_value<int32_t>(offsets_column->view(), 0, stream);
offsets_offset == 0 ? 0L : get_offset_value(offsets_column->view(), 0, stream);
if (chars_offset > 0) {
// adjust the individual offset values only if needed
auto d_offsets = offsets_column->mutable_view();
auto d_offsets =
cudf::detail::offsetalator_factory::make_output_iterator(offsets_column->mutable_view());
auto input_offsets =
cudf::detail::offsetalator_factory::make_input_iterator(input.offsets(), offsets_offset);
thrust::transform(rmm::exec_policy(stream),
d_offsets.begin<int32_t>(),
d_offsets.end<int32_t>(),
d_offsets.begin<int32_t>(),
cuda::proclaim_return_type<int32_t>(
input_offsets,
input_offsets + offsets_column->size(),
d_offsets,
cuda::proclaim_return_type<int64_t>(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you think we should declare something like `size_type` that would be a proper type name for all the `int64_t` values that are being sprinkled around? The distinction is pretty subtle in some cases, like the `0L` literal written above. I would want to reduce the possibility of unwanted casting (such as accidentally writing `0` instead of `0L`) by having a type name with clear intent.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I do not wish to create a special type at this time and prefer not to obfuscate int64_t in this code.

[chars_offset] __device__(auto offset) { return offset - chars_offset; }));
}

// slice the chars child column
auto const data_size = static_cast<std::size_t>(
cudf::detail::get_value<int32_t>(offsets_column->view(), strings_count, stream));
auto const data_size =
static_cast<std::size_t>(get_offset_value(offsets_column->view(), strings_count, stream));
auto chars_buffer =
rmm::device_buffer{strings.chars_begin(stream) + chars_offset, data_size, stream, mr};
rmm::device_buffer{input.chars_begin(stream) + chars_offset, data_size, stream, mr};

// slice the null mask
auto null_mask = cudf::detail::copy_bitmask(
strings.null_mask(), offsets_offset, offsets_offset + strings_count, stream, mr);
input.null_mask(), offsets_offset, offsets_offset + strings_count, stream, mr);

auto null_count = cudf::detail::null_count(
static_cast<bitmask_type const*>(null_mask.data()), 0, strings_count, stream);
Expand Down
Loading