From 305c6093bfc4109a7340a0ec54e9c41aa9538c41 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 6 Jun 2024 14:48:30 -0400 Subject: [PATCH] Support large strings in cudf::io::text::multibyte_split --- cpp/src/io/text/multibyte_split.cu | 38 ++++++++++++++++-------------- 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index 976d735e010..9c406369068 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -30,6 +31,7 @@ #include #include #include +#include #include #include @@ -518,32 +520,37 @@ std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source bool const insert_end = not(last_row_offset.has_value() or (global_offsets.size() > 0 and global_offsets.back_element(stream) == chunk_offset)); - rmm::device_uvector offsets{ - global_offsets.size() + insert_begin + insert_end, stream, mr}; - if (insert_begin) { offsets.set_element_to_zero_async(0, stream); } - if (insert_end) { - offsets.set_element(offsets.size() - 1, chunk_offset - *first_row_offset, stream); - } + auto const chars_bytes = chunk_offset - *first_row_offset; + auto offsets = cudf::strings::detail::create_offsets_child_column( + chars_bytes, global_offsets.size() + insert_begin + insert_end, stream, mr); + auto offsets_itr = + cudf::detail::offsetalator_factory::make_output_iterator(offsets->mutable_view()); + auto set_offset_value = [offsets_itr, stream](size_type index, int64_t value) { + cudf::detail::device_single_thread( + [offsets_itr, index, value] __device__() mutable { offsets_itr[index] = value; }, stream); + }; + if (insert_begin) { set_offset_value(0, 0); } + if (insert_end) { set_offset_value(offsets->size() - 1, chars_bytes); } thrust::transform(rmm::exec_policy(stream), global_offsets.begin(), global_offsets.end(), - offsets.begin() + insert_begin, - 
cuda::proclaim_return_type<int32_t>( + offsets_itr + insert_begin, + cuda::proclaim_return_type<int64_t>( [baseline = *first_row_offset] __device__(byte_offset global_offset) { - return static_cast<int32_t>(global_offset - baseline); + return (global_offset - baseline); })); - auto string_count = offsets.size() - 1; + auto string_count = offsets->size() - 1; if (strip_delimiters) { auto it = cudf::detail::make_counting_transform_iterator( 0, cuda::proclaim_return_type<thrust::pair<char const*, int32_t>>( - [ofs = offsets.data(), + [ofs = cudf::detail::offsetalator_factory::make_input_iterator(offsets->view()), chars = chars.data(), delim_size = static_cast<size_type>(delimiter.size()), last_row = static_cast<size_type>(string_count) - 1, insert_end] __device__(size_type row) { auto const begin = ofs[row]; - auto const len = ofs[row + 1] - begin; + auto const len = static_cast<size_type>(ofs[row + 1] - begin); if (row == last_row && insert_end) { return thrust::make_pair(chars + begin, len); } else { @@ -552,12 +559,7 @@ std::unique_ptr<cudf::column> multibyte_split(cudf::io::text::data_chunk_source })); return cudf::strings::detail::make_strings_column(it, it + string_count, stream, mr); } else { - return cudf::make_strings_column( - string_count, - std::make_unique<column>(std::move(offsets), rmm::device_buffer{}, 0), - chars.release(), - 0, - {}); + return cudf::make_strings_column(string_count, std::move(offsets), chars.release(), 0, {}); } }