From a9a59592560e440d42215cecbb522fa51125454e Mon Sep 17 00:00:00 2001 From: David Wendt Date: Wed, 17 Jan 2024 15:16:59 -0500 Subject: [PATCH 1/3] Fix calls to deprecated strings factory API --- cpp/benchmarks/common/generate_input.cu | 4 +-- cpp/benchmarks/json/json.cu | 4 +-- .../cudf/strings/detail/copy_if_else.cuh | 4 +-- .../cudf/strings/detail/copy_range.cuh | 4 +-- cpp/include/cudf/strings/detail/gather.cuh | 2 +- cpp/include/cudf/strings/detail/merge.cuh | 14 +++----- .../detail/strings_column_factories.cuh | 9 +++-- cpp/include/cudf_test/column_wrapper.hpp | 30 ++++++++-------- cpp/src/hash/md5_hash.cu | 10 +++--- cpp/src/interop/from_arrow.cu | 4 +-- cpp/src/io/csv/durations.cu | 4 +-- cpp/src/io/csv/writer_impl.cu | 7 ++-- cpp/src/io/json/legacy/reader_impl.cu | 34 +++++++++---------- cpp/src/io/json/write_json.cu | 24 +++++-------- cpp/src/io/parquet/predicate_pushdown.cpp | 4 +-- cpp/src/io/text/multibyte_split.cu | 4 +-- cpp/src/io/utilities/column_buffer.cpp | 22 ++---------- cpp/src/io/utilities/data_casting.cu | 9 +++-- cpp/src/json/json_path.cu | 9 +++-- cpp/src/lists/interleave_columns.cu | 2 +- cpp/src/replace/clamp.cu | 4 +-- cpp/src/replace/nulls.cu | 11 +++--- cpp/src/replace/replace.cu | 20 +++++------ cpp/src/reshape/interleave_columns.cu | 9 +++-- cpp/src/strings/capitalize.cu | 9 ++--- cpp/src/strings/case.cu | 9 ++--- cpp/src/strings/char_types/char_types.cu | 9 ++--- cpp/src/strings/combine/concatenate.cu | 14 ++++---- cpp/src/strings/combine/join.cu | 7 ++-- cpp/src/strings/combine/join_list_elements.cu | 16 ++++++--- cpp/src/strings/convert/convert_booleans.cu | 4 +-- cpp/src/strings/convert/convert_datetime.cu | 4 +-- cpp/src/strings/convert/convert_durations.cu | 4 +-- .../strings/convert/convert_fixed_point.cu | 4 +-- cpp/src/strings/convert/convert_floats.cu | 4 +-- cpp/src/strings/convert/convert_hex.cu | 8 ++--- cpp/src/strings/convert/convert_integers.cu | 4 +-- cpp/src/strings/convert/convert_ipv4.cu | 10 +++--- cpp/src/strings/convert/convert_lists.cu | 11 +++--- cpp/src/strings/convert/convert_urls.cu | 14 ++++---- cpp/src/strings/copying/concatenate.cu | 7 ++-- cpp/src/strings/copying/shift.cu | 8 ++--- cpp/src/strings/filling/fill.cu | 2 +- cpp/src/strings/filter_chars.cu | 9 ++--- cpp/src/strings/padding.cu | 15 ++++---- cpp/src/strings/repeat_strings.cu | 10 +++--- cpp/src/strings/replace/backref_re.cu | 10 +++--- cpp/src/strings/replace/multi.cu | 16 ++++----- cpp/src/strings/replace/multi_re.cu | 8 ++--- cpp/src/strings/replace/replace.cu | 25 +++++++------- cpp/src/strings/replace/replace_re.cu | 8 ++--- cpp/src/strings/slice.cu | 4 +-- cpp/src/strings/translate.cu | 8 ++--- cpp/src/text/bpe/byte_pair_encoding.cu | 6 ++-- cpp/src/text/detokenize.cu | 11 +++--- cpp/src/text/generate_ngrams.cu | 16 ++++++--- cpp/src/text/ngrams_tokenize.cu | 11 +++--- cpp/src/text/normalize.cu | 12 +++---- cpp/src/text/replace.cu | 16 +++++---- cpp/src/text/tokenize.cu | 9 +++-- cpp/src/transform/row_conversion.cu | 3 +- cpp/tests/copying/concatenate_tests.cpp | 34 +++++++++---------- cpp/tests/strings/contains_tests.cpp | 11 +++--- cpp/tests/strings/factories_test.cu | 14 +++----- cpp/tests/transform/row_conversion.cpp | 30 ++++------------ 65 files changed, 316 insertions(+), 366 deletions(-) diff --git a/cpp/benchmarks/common/generate_input.cu b/cpp/benchmarks/common/generate_input.cu index bb7529bb37a..0ea13957868 100644 --- a/cpp/benchmarks/common/generate_input.cu +++ b/cpp/benchmarks/common/generate_input.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -540,7 +540,7 @@ std::unique_ptr create_random_utf8_string_column(data_profile cons return cudf::make_strings_column( num_rows, std::make_unique(std::move(offsets), rmm::device_buffer{}, 0), - std::make_unique(std::move(chars), rmm::device_buffer{}, 0), + chars.release(), null_count, profile.get_null_probability().has_value() ? std::move(result_bitmask) : rmm::device_buffer{}); } diff --git a/cpp/benchmarks/json/json.cu b/cpp/benchmarks/json/json.cu index c74701445f8..020c8e413b3 100644 --- a/cpp/benchmarks/json/json.cu +++ b/cpp/benchmarks/json/json.cu @@ -177,10 +177,10 @@ auto build_json_string_column(int desired_bytes, int num_rows) auto d_store_order = cudf::column_device_view::create(float_2bool_columns->get_column(2)); json_benchmark_row_builder jb{ desired_bytes, num_rows, {*d_books, *d_bicycles}, *d_book_pct, *d_misc_order, *d_store_order}; - auto children = cudf::strings::detail::make_strings_children( + auto [offsets, chars] = cudf::strings::detail::make_strings_children( jb, num_rows, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); return cudf::make_strings_column( - num_rows, std::move(children.first), std::move(children.second), 0, {}); + num_rows, std::move(offsets), std::move(chars->release().data.release()[0]), 0, {}); } void BM_case(benchmark::State& state, std::string query_arg) diff --git a/cpp/include/cudf/strings/detail/copy_if_else.cuh b/cpp/include/cudf/strings/detail/copy_if_else.cuh index 6f0b199ff12..64e14dcc549 100644 --- a/cpp/include/cudf/strings/detail/copy_if_else.cuh +++ b/cpp/include/cudf/strings/detail/copy_if_else.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -109,7 +109,7 @@ std::unique_ptr copy_if_else(StringIterLeft lhs_begin, return make_strings_column(strings_count, std::move(offsets_column), - std::move(chars_column), + std::move(chars_column->release().data.release()[0]), null_count, std::move(null_mask)); } diff --git a/cpp/include/cudf/strings/detail/copy_range.cuh b/cpp/include/cudf/strings/detail/copy_range.cuh index 5da3addd9a4..567452bac4e 100644 --- a/cpp/include/cudf/strings/detail/copy_range.cuh +++ b/cpp/include/cudf/strings/detail/copy_range.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -205,7 +205,7 @@ std::unique_ptr copy_range(SourceValueIterator source_value_begin, return make_strings_column(target.size(), std::move(p_offsets_column), - std::move(p_chars_column), + std::move(p_chars_column->release().data.release()[0]), null_count, std::move(null_mask)); } diff --git a/cpp/include/cudf/strings/detail/gather.cuh b/cpp/include/cudf/strings/detail/gather.cuh index e681373e6e0..401d53ae16b 100644 --- a/cpp/include/cudf/strings/detail/gather.cuh +++ b/cpp/include/cudf/strings/detail/gather.cuh @@ -321,7 +321,7 @@ std::unique_ptr gather(strings_column_view const& strings, return make_strings_column(output_count, std::move(out_offsets_column), - std::move(out_chars_column), + std::move(out_chars_column->release().data.release()[0]), 0, // caller sets these rmm::device_buffer{}); } diff --git a/cpp/include/cudf/strings/detail/merge.cuh b/cpp/include/cudf/strings/detail/merge.cuh index aef1fe93792..8049895c3c2 100644 --- a/cpp/include/cudf/strings/detail/merge.cuh +++ b/cpp/include/cudf/strings/detail/merge.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -89,9 +89,8 @@ std::unique_ptr merge(strings_column_view const& lhs, auto d_offsets = offsets_column->view().template data(); // create the chars column - auto chars_column = strings::detail::create_chars_child_column(bytes, stream, mr); - // merge the strings - auto d_chars = chars_column->mutable_view().template data(); + rmm::device_uvector chars(bytes, stream, mr); + auto d_chars = chars.data(); thrust::for_each_n(rmm::exec_policy(stream), thrust::make_counting_iterator(0), strings_count, @@ -103,11 +102,8 @@ std::unique_ptr merge(strings_column_view const& lhs, memcpy(d_chars + d_offsets[idx], d_str.data(), d_str.size_bytes()); }); - return make_strings_column(strings_count, - std::move(offsets_column), - std::move(chars_column), - null_count, - std::move(null_mask)); + return make_strings_column( + strings_count, std::move(offsets_column), chars.release(), null_count, std::move(null_mask)); } } // namespace detail diff --git a/cpp/include/cudf/strings/detail/strings_column_factories.cuh b/cpp/include/cudf/strings/detail/strings_column_factories.cuh index de7db4ce47b..fcbdfa619f4 100644 --- a/cpp/include/cudf/strings/detail/strings_column_factories.cuh +++ b/cpp/include/cudf/strings/detail/strings_column_factories.cuh @@ -137,7 +137,7 @@ std::unique_ptr make_strings_column(IndexPairIterator begin, return make_strings_column(strings_count, std::move(offsets_column), - std::move(chars_column), + std::move(chars_column->release().data.release()[0]), null_count, std::move(null_mask)); } @@ -187,13 +187,12 @@ std::unique_ptr make_strings_column(CharIterator chars_begin, [] __device__(auto offset) { return static_cast(offset); })); // build chars column - auto chars_column = strings::detail::create_chars_child_column(bytes, stream, mr); - auto chars_view = chars_column->mutable_view(); - thrust::copy(rmm::exec_policy(stream), chars_begin, chars_end, chars_view.data()); + rmm::device_uvector chars_data(bytes, stream, mr); + thrust::copy(rmm::exec_policy(stream), chars_begin, chars_end, chars_data.begin()); return make_strings_column(strings_count, std::move(offsets_column), - std::move(chars_column), + chars_data.release(), null_count, std::move(null_mask)); } diff --git a/cpp/include/cudf_test/column_wrapper.hpp b/cpp/include/cudf_test/column_wrapper.hpp index abcd89c3035..bfea2cbce76 100644 --- a/cpp/include/cudf_test/column_wrapper.hpp +++ b/cpp/include/cudf_test/column_wrapper.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -757,20 +757,21 @@ class strings_column_wrapper : public detail::column_wrapper { strings_column_wrapper(StringsIterator begin, StringsIterator end) : column_wrapper{} { size_type num_strings = std::distance(begin, end); + if (num_strings == 0) { + wrapped = cudf::make_empty_column(cudf::type_id::STRING); + return; + } auto all_valid = thrust::make_constant_iterator(true); auto [chars, offsets] = detail::make_chars_and_offsets(begin, end, all_valid); - auto d_chars = std::make_unique( - cudf::detail::make_device_uvector_sync( - chars, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()), - rmm::device_buffer{}, - 0); + auto d_chars = cudf::detail::make_device_uvector_sync( + chars, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()); auto d_offsets = std::make_unique( cudf::detail::make_device_uvector_sync( offsets, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()), rmm::device_buffer{}, 0); wrapped = - cudf::make_strings_column(num_strings, std::move(d_offsets), std::move(d_chars), 0, {}); + cudf::make_strings_column(num_strings, std::move(d_offsets), d_chars.release(), 0, {}); } /** @@ -805,14 +806,15 @@ class strings_column_wrapper : public detail::column_wrapper { strings_column_wrapper(StringsIterator begin, StringsIterator end, ValidityIterator v) : column_wrapper{} { - size_type num_strings = std::distance(begin, end); + size_type num_strings = std::distance(begin, end); + if (num_strings == 0) { + wrapped = cudf::make_empty_column(cudf::type_id::STRING); + return; + } auto [chars, offsets] = detail::make_chars_and_offsets(begin, end, v); auto [null_mask, null_count] = detail::make_null_mask_vector(v, v + num_strings); - auto d_chars = std::make_unique( - cudf::detail::make_device_uvector_sync( - chars, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()), - rmm::device_buffer{}, - 0); + auto d_chars = cudf::detail::make_device_uvector_sync( + chars, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()); auto d_offsets = std::make_unique( cudf::detail::make_device_uvector_sync( offsets, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()), @@ -821,7 +823,7 @@ class strings_column_wrapper : public detail::column_wrapper { auto d_bitmask = cudf::detail::make_device_uvector_sync( null_mask, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()); wrapped = cudf::make_strings_column( - num_strings, std::move(d_offsets), std::move(d_chars), null_count, d_bitmask.release()); + num_strings, std::move(d_offsets), d_chars.release(), null_count, d_bitmask.release()); } /** diff --git a/cpp/src/hash/md5_hash.cu b/cpp/src/hash/md5_hash.cu index 8fc3e63bc59..002c9a9137b 100644 --- a/cpp/src/hash/md5_hash.cu +++ b/cpp/src/hash/md5_hash.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -333,9 +333,8 @@ std::unique_ptr md5(table_view const& input, auto [offsets_column, bytes] = cudf::detail::make_offsets_child_column(begin, begin + input.num_rows(), stream, mr); - auto chars_column = strings::detail::create_chars_child_column(bytes, stream, mr); - auto chars_view = chars_column->mutable_view(); - auto d_chars = chars_view.data(); + rmm::device_uvector chars(bytes, stream, mr); + auto d_chars = chars.data(); auto const device_input = table_device_view::create(input, stream); @@ -366,8 +365,7 @@ std::unique_ptr md5(table_view const& input, } }); - return make_strings_column( - input.num_rows(), std::move(offsets_column), std::move(chars_column), 0, {}); + return make_strings_column(input.num_rows(), std::move(offsets_column), chars.release(), 0, {}); } } // namespace detail diff --git a/cpp/src/interop/from_arrow.cu b/cpp/src/interop/from_arrow.cu index e39625c92e7..7b44fb41288 100644 --- a/cpp/src/interop/from_arrow.cu +++ b/cpp/src/interop/from_arrow.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -290,7 +290,7 @@ std::unique_ptr dispatch_to_cudf_column::operator()( auto const num_rows = offsets_column->size() - 1; auto out_col = make_strings_column(num_rows, std::move(offsets_column), - std::move(chars_column), + std::move(chars_column->release().data.release()[0]), array.null_count(), std::move(*get_mask_buffer(array, stream, mr))); diff --git a/cpp/src/io/csv/durations.cu b/cpp/src/io/csv/durations.cu index 66143d3fdee..f4d32edac89 100644 --- a/cpp/src/io/csv/durations.cu +++ b/cpp/src/io/csv/durations.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -202,7 +202,7 @@ struct dispatch_from_durations_fn { // return make_strings_column(strings_count, std::move(offsets_column), - std::move(chars_column), + std::move(chars_column->release().data.release()[0]), durations.null_count(), std::move(null_mask)); } diff --git a/cpp/src/io/csv/writer_impl.cu b/cpp/src/io/csv/writer_impl.cu index 995d8d942c9..65473073e31 100644 --- a/cpp/src/io/csv/writer_impl.cu +++ b/cpp/src/io/csv/writer_impl.cu @@ -181,11 +181,12 @@ struct column_to_strings_fn { auto d_column = column_device_view::create(column_v, stream_); escape_strings_fn fn{*d_column, delimiter.value(stream_)}; - auto children = cudf::strings::detail::make_strings_children(fn, column_v.size(), stream_, mr_); + auto [offsets_column, chars_column] = + cudf::strings::detail::make_strings_children(fn, column_v.size(), stream_, mr_); return make_strings_column(column_v.size(), - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), column_v.null_count(), cudf::detail::copy_bitmask(column_v, stream_, mr_)); } diff --git a/cpp/src/io/json/legacy/reader_impl.cu b/cpp/src/io/json/legacy/reader_impl.cu index 5580628b0fe..d461f27c921 100644 --- a/cpp/src/io/json/legacy/reader_impl.cu +++ b/cpp/src/io/json/legacy/reader_impl.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -530,29 +530,27 @@ table_with_metadata convert_data_to_table(parse_options_view const& parse_opts, auto repl_chars = std::vector{'"', '\\', '\t', '\r', '\b'}; auto repl_offsets = std::vector{0, 1, 2, 3, 4, 5}; - auto target = make_strings_column( - static_cast(target_offsets.size() - 1), - std::make_unique( - cudf::detail::make_device_uvector_async( - target_offsets, stream, rmm::mr::get_current_device_resource()), - rmm::device_buffer{}, - 0), - std::make_unique(cudf::detail::make_device_uvector_async( - target_chars, stream, rmm::mr::get_current_device_resource()), - rmm::device_buffer{}, - 0), - 0, - {}); + auto target = + make_strings_column(static_cast(target_offsets.size() - 1), + std::make_unique( + cudf::detail::make_device_uvector_async( + target_offsets, stream, rmm::mr::get_current_device_resource()), + rmm::device_buffer{}, + 0), + cudf::detail::make_device_uvector_async( + target_chars, stream, rmm::mr::get_current_device_resource()) + .release(), + 0, + {}); auto repl = make_strings_column( static_cast(repl_offsets.size() - 1), std::make_unique(cudf::detail::make_device_uvector_async( repl_offsets, stream, rmm::mr::get_current_device_resource()), rmm::device_buffer{}, 0), - std::make_unique(cudf::detail::make_device_uvector_async( - repl_chars, stream, rmm::mr::get_current_device_resource()), - rmm::device_buffer{}, - 0), + cudf::detail::make_device_uvector_async( + repl_chars, stream, rmm::mr::get_current_device_resource()) + .release(), 0, {}); diff --git a/cpp/src/io/json/write_json.cu b/cpp/src/io/json/write_json.cu index c35f15049bd..84e0ac9e74d 100644 --- a/cpp/src/io/json/write_json.cu +++ b/cpp/src/io/json/write_json.cu @@ -170,12 +170,12 @@ struct escape_strings_fn { rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto children = + auto [offsets_column, chars_column] = cudf::strings::detail::make_strings_children(*this, column_v.size(), stream, mr); return make_strings_column(column_v.size(), - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), column_v.null_count(), cudf::detail::copy_bitmask(column_v, stream, mr)); } @@ -347,13 +347,11 @@ std::unique_ptr struct_to_strings(table_view const& strings_columns, d_strview_offsets + row_string_offsets.size(), old_offsets.begin(), row_string_offsets.begin()); - auto chars_data = joined_col->release().data; - auto const chars_size = chars_data->size(); + auto chars_data = joined_col->release().data; return make_strings_column( strings_columns.num_rows(), std::make_unique(std::move(row_string_offsets), rmm::device_buffer{}, 0), - std::make_unique( - data_type{type_id::INT8}, chars_size, std::move(*chars_data), rmm::device_buffer{}, 0), + std::move(chars_data.release()[0]), 0, {}); } @@ -472,13 +470,11 @@ std::unique_ptr join_list_of_strings(lists_column_view const& lists_stri d_strview_offsets.end(), old_offsets.begin(), row_string_offsets.begin()); - auto chars_data = joined_col->release().data; - auto const chars_size = chars_data->size(); + auto chars_data = joined_col->release().data; return make_strings_column( num_lists, std::make_unique(std::move(row_string_offsets), rmm::device_buffer{}, 0), - std::make_unique( - data_type{type_id::INT8}, chars_size, std::move(*chars_data), rmm::device_buffer{}, 0), + std::move(chars_data.release()[0]), lists_strings.null_count(), cudf::detail::copy_bitmask(lists_strings.parent(), stream, mr)); } @@ -780,11 +776,7 @@ std::unique_ptr make_strings_column_from_host(host_span(std::move(d_chars), rmm::device_buffer{}, 0), - 0, - {}); + host_strings.size(), std::move(d_offsets), d_chars.release(), 0, {}); } std::unique_ptr make_column_names_column(host_span column_names, diff --git a/cpp/src/io/parquet/predicate_pushdown.cpp b/cpp/src/io/parquet/predicate_pushdown.cpp index 9c8b03886b5..f43a8fd24c4 100644 --- a/cpp/src/io/parquet/predicate_pushdown.cpp +++ b/cpp/src/io/parquet/predicate_pushdown.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -189,7 +189,7 @@ struct stats_caster { return cudf::make_strings_column( val.size(), std::make_unique(std::move(d_offsets), rmm::device_buffer{}, 0), - std::make_unique(std::move(d_chars), rmm::device_buffer{}, 0), + d_chars.release(), null_count, rmm::device_buffer{ null_mask.data(), cudf::bitmask_allocation_size_bytes(val.size()), stream, mr}); diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index 443ca0f5fe7..06715f49fdc 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -556,7 +556,7 @@ std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source return cudf::make_strings_column( string_count, std::make_unique(std::move(offsets), rmm::device_buffer{}, 0), - std::make_unique(std::move(chars), rmm::device_buffer{}, 0), + chars.release(), 0, {}); } diff --git a/cpp/src/io/utilities/column_buffer.cpp b/cpp/src/io/utilities/column_buffer.cpp index 88617510394..36303a60aa9 100644 --- a/cpp/src/io/utilities/column_buffer.cpp +++ b/cpp/src/io/utilities/column_buffer.cpp @@ -68,26 +68,10 @@ std::unique_ptr cudf::io::detail::inline_column_buffer::make_string_colu rmm::cuda_stream_view stream) { // no need for copies, just transfer ownership of the data_buffers to the columns - auto const state = mask_state::UNALLOCATED; - auto str_col = - _string_data.is_empty() - ? make_empty_column(data_type{type_id::INT8}) - : std::make_unique(data_type{type_id::INT8}, - string_size(), - std::move(_string_data), - cudf::detail::create_null_mask(size, state, stream, _mr), - state_null_count(state, size), - std::vector>{}); - auto offsets_col = - std::make_unique(data_type{type_to_id()}, - size + 1, - std::move(_data), - cudf::detail::create_null_mask(size + 1, state, stream, _mr), - state_null_count(state, size + 1), - std::vector>{}); - + auto offsets_col = std::make_unique( + data_type{type_to_id()}, size + 1, std::move(_data), rmm::device_buffer{}, 0); return make_strings_column( - size, std::move(offsets_col), std::move(str_col), null_count(), std::move(_null_mask)); + size, std::move(offsets_col), std::move(_string_data), null_count(), std::move(_null_mask)); } namespace { diff --git a/cpp/src/io/utilities/data_casting.cu b/cpp/src/io/utilities/data_casting.cu index 9e5c5c76392..a50b990ac5b 100644 --- a/cpp/src/io/utilities/data_casting.cu +++ b/cpp/src/io/utilities/data_casting.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -861,9 +861,8 @@ static std::unique_ptr parse_string(string_view_pair_it str_tuples, std::overflow_error); // CHARS column - std::unique_ptr chars = - strings::detail::create_chars_child_column(static_cast(bytes), stream, mr); - auto d_chars = chars->mutable_view().data(); + rmm::device_uvector chars(bytes, stream, mr); + auto d_chars = chars.data(); single_thread_fn.d_chars = d_chars; thrust::for_each_n(rmm::exec_policy(stream), @@ -902,7 +901,7 @@ static std::unique_ptr parse_string(string_view_pair_it str_tuples, return make_strings_column(col_size, std::move(offsets), - std::move(chars), + chars.release(), d_null_count.value(stream), std::move(null_mask)); } diff --git a/cpp/src/json/json_path.cu b/cpp/src/json/json_path.cu index c01357c96ca..8446800d587 100644 --- a/cpp/src/json/json_path.cu +++ b/cpp/src/json/json_path.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -1010,7 +1010,7 @@ std::unique_ptr get_json_object(cudf::strings_column_view const& c cudf::detail::get_value(offsets_view, col.size(), stream); // allocate output string column - auto chars = cudf::strings::detail::create_chars_child_column(output_size, stream, mr); + rmm::device_uvector chars(output_size, stream, mr); // potential optimization : if we know that all outputs are valid, we could skip creating // the validity mask altogether @@ -1018,7 +1018,6 @@ std::unique_ptr get_json_object(cudf::strings_column_view const& c cudf::detail::create_null_mask(col.size(), mask_state::UNINITIALIZED, stream, mr); // compute results - cudf::mutable_column_view chars_view(*chars); rmm::device_scalar d_valid_count{0, stream}; get_json_object_kernel @@ -1026,14 +1025,14 @@ std::unique_ptr get_json_object(cudf::strings_column_view const& c *cdv, std::get<0>(preprocess).value().data(), offsets_view.head(), - chars_view.head(), + chars.data(), static_cast(validity.data()), d_valid_count.data(), options); auto result = make_strings_column(col.size(), std::move(offsets), - std::move(chars), + chars.release(), col.size() - d_valid_count.value(stream), std::move(validity)); // unmatched array query may result in unsanitized '[' value in the result diff --git a/cpp/src/lists/interleave_columns.cu b/cpp/src/lists/interleave_columns.cu index 7b37e2dc8f6..8f05b020a2e 100644 --- a/cpp/src/lists/interleave_columns.cu +++ b/cpp/src/lists/interleave_columns.cu @@ -237,7 +237,7 @@ struct interleave_list_entries_implrelease().data.release()[0]), null_count, std::move(null_mask)); } diff --git a/cpp/src/replace/clamp.cu b/cpp/src/replace/clamp.cu index 6852b19af44..23c792ddcae 100644 --- a/cpp/src/replace/clamp.cu +++ b/cpp/src/replace/clamp.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -106,7 +106,7 @@ std::unique_ptr clamp_string_column(strings_column_view const& inp return make_strings_column(input.size(), std::move(offsets_column), - std::move(chars_column), + std::move(chars_column->release().data.release()[0]), input.null_count(), std::move(cudf::detail::copy_bitmask(input.parent(), stream, mr))); } diff --git a/cpp/src/replace/nulls.cu b/cpp/src/replace/nulls.cu index 2eb624d3f05..f1f1254a6d2 100644 --- a/cpp/src/replace/nulls.cu +++ b/cpp/src/replace/nulls.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -255,22 +255,19 @@ std::unique_ptr replace_nulls_column_kernel_forwarder::operator()< auto offsets_view = offsets->mutable_view(); // Allocate chars array and output null mask - std::unique_ptr output_chars = - cudf::strings::detail::create_chars_child_column(bytes, stream, mr); - - auto output_chars_view = output_chars->mutable_view(); + rmm::device_uvector output_chars(bytes, stream, mr); replace_second<<>>( *device_in, *device_replacement, reinterpret_cast(valid_bits.data()), offsets_view.begin(), - output_chars_view.data(), + output_chars.data(), valid_count); return cudf::make_strings_column(input.size(), std::move(offsets), - std::move(output_chars), + output_chars.release(), input.size() - valid_counter.value(stream), std::move(valid_bits)); } diff --git a/cpp/src/replace/replace.cu b/cpp/src/replace/replace.cu index 9341929de44..e2418e984f8 100644 --- a/cpp/src/replace/replace.cu +++ b/cpp/src/replace/replace.cu @@ -17,7 +17,7 @@ * limitations under the License. */ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -187,7 +187,7 @@ template __global__ void replace_strings_second_pass(cudf::column_device_view input, cudf::column_device_view replacement, cudf::mutable_column_device_view offsets, - cudf::mutable_column_device_view strings, + char* strings, cudf::mutable_column_device_view indices) { cudf::size_type nrows = input.size(); @@ -211,9 +211,8 @@ __global__ void replace_strings_second_pass(cudf::column_device_view input, cudf::string_view output = (replace_idx == -1) ? input.element(idx) : replacement.element(replace_idx); - std::memcpy(strings.data() + offsets.data()[idx], - output.data(), - output.size_bytes()); + std::memcpy( + strings + offsets.data()[idx], output.data(), output.size_bytes()); } tid += stride; @@ -434,18 +433,15 @@ std::unique_ptr replace_kernel_forwarder::operator() output_chars = - cudf::strings::detail::create_chars_child_column(bytes, stream, mr); - - auto output_chars_view = output_chars->mutable_view(); - auto device_chars = cudf::mutable_column_device_view::create(output_chars_view, stream); + rmm::device_uvector output_chars(bytes, stream, mr); + auto d_chars = output_chars.data(); replace_second<<>>( - *device_in, *device_replacement, *device_offsets, *device_chars, *device_indices); + *device_in, *device_replacement, *device_offsets, d_chars, *device_indices); return cudf::make_strings_column(input_col.size(), std::move(offsets), - std::move(output_chars), + output_chars.release(), null_count, std::move(valid_bits)); } diff --git a/cpp/src/reshape/interleave_columns.cu b/cpp/src/reshape/interleave_columns.cu index deb0acb4742..22b45fe7a58 100644 --- a/cpp/src/reshape/interleave_columns.cu +++ b/cpp/src/reshape/interleave_columns.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -193,9 +193,8 @@ struct interleave_columns_implview().template data(); // Create the chars column - auto chars_column = strings::detail::create_chars_child_column(bytes, stream, mr); - // Fill the chars column - auto d_results_chars = chars_column->mutable_view().template data(); + rmm::device_uvector chars(bytes, stream, mr); + auto d_results_chars = chars.data(); thrust::for_each_n( rmm::exec_policy(stream), thrust::make_counting_iterator(0), @@ -215,7 +214,7 @@ struct interleave_columns_impl capitalizer(CapitalFn cfn, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto children = cudf::strings::detail::make_strings_children(cfn, input.size(), stream, mr); + auto [offsets_column, chars_column] = + cudf::strings::detail::make_strings_children(cfn, input.size(), stream, mr); return make_strings_column(input.size(), - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), input.null_count(), cudf::detail::copy_bitmask(input.parent(), stream, mr)); } diff --git a/cpp/src/strings/case.cu b/cpp/src/strings/case.cu index c2e8033b42d..0d81bd3399e 100644 --- a/cpp/src/strings/case.cu +++ b/cpp/src/strings/case.cu @@ -217,7 +217,7 @@ std::unique_ptr convert_case(strings_column_view const& input, cudf::strings::detail::make_strings_children(converter, input.size(), stream, mr); return make_strings_column(input.size(), std::move(offsets), - std::move(chars), + std::move(chars->release().data.release()[0]), input.null_count(), cudf::detail::copy_bitmask(input.parent(), stream, mr)); } @@ -264,15 +264,16 @@ std::unique_ptr convert_case(strings_column_view const& input, "Size of output exceeds the column size limit", std::overflow_error); - auto chars = create_chars_child_column(static_cast(bytes), stream, mr); + // auto chars = create_chars_child_column(static_cast(bytes), stream, mr); + rmm::device_uvector chars(bytes, stream, mr); // second pass, write output converter.d_offsets = d_offsets; - converter.d_chars = chars->mutable_view().data(); + converter.d_chars = chars.data(); // chars->mutable_view().data(); thrust::for_each_n(rmm::exec_policy(stream), count_itr, input.size(), converter); return make_strings_column(input.size(), std::move(offsets), - std::move(chars), + chars.release(), input.null_count(), cudf::detail::copy_bitmask(input.parent(), stream, mr)); } diff --git a/cpp/src/strings/char_types/char_types.cu b/cpp/src/strings/char_types/char_types.cu index 35b0c0a2690..9c2a2701227 100644 --- a/cpp/src/strings/char_types/char_types.cu +++ b/cpp/src/strings/char_types/char_types.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -200,12 +200,13 @@ std::unique_ptr filter_characters_of_type(strings_column_view const& str rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr); // this utility calls filterer to build the offsets and chars columns - auto children = cudf::strings::detail::make_strings_children(filterer, strings_count, stream, mr); + auto [offsets_column, chars_column] = + cudf::strings::detail::make_strings_children(filterer, strings_count, stream, mr); // return new strings column return make_strings_column(strings_count, - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), strings.null_count(), std::move(null_mask)); } diff --git a/cpp/src/strings/combine/concatenate.cu b/cpp/src/strings/combine/concatenate.cu index 0a11b6dc460..a48e84eac0c 100644 --- a/cpp/src/strings/combine/concatenate.cu +++ b/cpp/src/strings/combine/concatenate.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -142,7 +142,7 @@ std::unique_ptr concatenate(table_view const& strings_columns, // Create device views from the strings columns. auto d_table = table_device_view::create(strings_columns, stream); concat_strings_fn fn{*d_table, d_separator, d_narep, separate_nulls}; - auto children = make_strings_children(fn, strings_count, stream, mr); + auto [offsets_column, chars_column] = make_strings_children(fn, strings_count, stream, mr); // create resulting null mask auto [null_mask, null_count] = cudf::detail::valid_if( @@ -157,8 +157,8 @@ std::unique_ptr concatenate(table_view const& strings_columns, mr); return make_strings_column(strings_count, - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), null_count, std::move(null_mask)); } @@ -237,7 +237,7 @@ std::unique_ptr concatenate(table_view const& strings_columns, multi_separator_concat_fn mscf{ *d_table, separator_col_view, separator_rep, col_rep, separate_nulls}; - auto children = make_strings_children(mscf, strings_count, stream, mr); + auto [offsets_column, chars_column] = make_strings_children(mscf, strings_count, stream, mr); // Create resulting null mask auto [null_mask, null_count] = cudf::detail::valid_if( @@ -253,8 +253,8 @@ std::unique_ptr concatenate(table_view const& strings_columns, mr); return make_strings_column(strings_count, - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), null_count, std::move(null_mask)); } diff --git a/cpp/src/strings/combine/join.cu b/cpp/src/strings/combine/join.cu index 48304759f7a..0e0d6e437a7 100644 --- a/cpp/src/strings/combine/join.cu +++ b/cpp/src/strings/combine/join.cu @@ -173,8 +173,11 @@ std::unique_ptr join_strings(strings_column_view const& input, : rmm::device_buffer{0, stream, mr}; // perhaps this return a string_scalar instead of a single-row column - return make_strings_column( - 1, std::move(offsets_column), std::move(chars_column), null_count, std::move(null_mask)); + return make_strings_column(1, + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), + null_count, + std::move(null_mask)); } } // namespace detail diff --git a/cpp/src/strings/combine/join_list_elements.cu b/cpp/src/strings/combine/join_list_elements.cu index 372b49fb0ee..619f5feba15 100644 --- a/cpp/src/strings/combine/join_list_elements.cu +++ b/cpp/src/strings/combine/join_list_elements.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -215,8 +215,11 @@ std::unique_ptr join_list_elements(lists_column_view const& lists_string stream, mr); - return make_strings_column( - num_rows, std::move(offsets_column), std::move(chars_column), null_count, std::move(null_mask)); + return make_strings_column(num_rows, + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), + null_count, + std::move(null_mask)); } namespace { @@ -290,8 +293,11 @@ std::unique_ptr join_list_elements(lists_column_view const& lists_string stream, mr); - return make_strings_column( - num_rows, std::move(offsets_column), std::move(chars_column), null_count, std::move(null_mask)); + return make_strings_column(num_rows, + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), + null_count, + std::move(null_mask)); } } // namespace detail diff --git a/cpp/src/strings/convert/convert_booleans.cu b/cpp/src/strings/convert/convert_booleans.cu index e75f1a6fe0f..4fe0be7883f 100644 --- a/cpp/src/strings/convert/convert_booleans.cu +++ b/cpp/src/strings/convert/convert_booleans.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -147,7 +147,7 @@ std::unique_ptr from_booleans(column_view const& booleans, return make_strings_column(strings_count, std::move(offsets), - std::move(chars), + std::move(chars->release().data.release()[0]), booleans.null_count(), std::move(null_mask)); } diff --git a/cpp/src/strings/convert/convert_datetime.cu b/cpp/src/strings/convert/convert_datetime.cu index d2609441d72..b7a662b0b76 100644 --- a/cpp/src/strings/convert/convert_datetime.cu +++ b/cpp/src/strings/convert/convert_datetime.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -1158,7 +1158,7 @@ std::unique_ptr from_timestamps(column_view const& timestamps, return make_strings_column(timestamps.size(), std::move(offsets_column), - std::move(chars_column), + std::move(chars_column->release().data.release()[0]), timestamps.null_count(), cudf::detail::copy_bitmask(timestamps, stream, mr)); } diff --git a/cpp/src/strings/convert/convert_durations.cu b/cpp/src/strings/convert/convert_durations.cu index 987087042cb..9a58926539c 100644 --- a/cpp/src/strings/convert/convert_durations.cu +++ b/cpp/src/strings/convert/convert_durations.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -422,7 +422,7 @@ struct dispatch_from_durations_fn { return make_strings_column(strings_count, std::move(offsets), - std::move(chars), + std::move(chars->release().data.release()[0]), durations.null_count(), std::move(null_mask)); } diff --git a/cpp/src/strings/convert/convert_fixed_point.cu b/cpp/src/strings/convert/convert_fixed_point.cu index 2c59f6dcd29..975f03b37d6 100644 --- a/cpp/src/strings/convert/convert_fixed_point.cu +++ b/cpp/src/strings/convert/convert_fixed_point.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -249,7 +249,7 @@ struct dispatch_from_fixed_point_fn { return make_strings_column(input.size(), std::move(offsets), - std::move(chars), + std::move(chars->release().data.release()[0]), input.null_count(), cudf::detail::copy_bitmask(input, stream, mr)); } diff --git a/cpp/src/strings/convert/convert_floats.cu b/cpp/src/strings/convert/convert_floats.cu index 9b3ef8f452b..c56e723de8e 100644 --- a/cpp/src/strings/convert/convert_floats.cu +++ b/cpp/src/strings/convert/convert_floats.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -408,7 +408,7 @@ struct dispatch_from_floats_fn { return make_strings_column(strings_count, std::move(offsets), - std::move(chars), + std::move(chars->release().data.release()[0]), floats.null_count(), std::move(null_mask)); } diff --git a/cpp/src/strings/convert/convert_hex.cu b/cpp/src/strings/convert/convert_hex.cu index 8f656b149a5..68cff214507 100644 --- a/cpp/src/strings/convert/convert_hex.cu +++ b/cpp/src/strings/convert/convert_hex.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -180,12 +180,12 @@ struct dispatch_integers_to_hex_fn { { auto const d_column = column_device_view::create(input, stream); - auto children = cudf::strings::detail::make_strings_children( + auto [offsets_column, chars_column] = cudf::strings::detail::make_strings_children( integer_to_hex_fn{*d_column}, input.size(), stream, mr); return make_strings_column(input.size(), - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), input.null_count(), cudf::detail::copy_bitmask(input, stream, mr)); } diff --git a/cpp/src/strings/convert/convert_integers.cu b/cpp/src/strings/convert/convert_integers.cu index 56637e88e19..364cb534d2f 100644 --- a/cpp/src/strings/convert/convert_integers.cu +++ b/cpp/src/strings/convert/convert_integers.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -367,7 +367,7 @@ struct dispatch_from_integers_fn { return make_strings_column(strings_count, std::move(offsets), - std::move(chars), + std::move(chars->release().data.release()[0]), integers.null_count(), std::move(null_mask)); } diff --git a/cpp/src/strings/convert/convert_ipv4.cu b/cpp/src/strings/convert/convert_ipv4.cu index 75527e24e79..e07be26a23c 100644 --- a/cpp/src/strings/convert/convert_ipv4.cu +++ b/cpp/src/strings/convert/convert_ipv4.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -165,13 +165,13 @@ std::unique_ptr integers_to_ipv4(column_view const& integers, CUDF_EXPECTS(integers.type().id() == type_id::INT64, "Input column must be type_id::INT64 type"); - auto d_column = column_device_view::create(integers, stream); - auto children = cudf::strings::detail::make_strings_children( + auto d_column = column_device_view::create(integers, stream); + auto [offsets_column, chars_column] = cudf::strings::detail::make_strings_children( integers_to_ipv4_fn{*d_column}, integers.size(), stream, mr); return make_strings_column(integers.size(), - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), integers.null_count(), cudf::detail::copy_bitmask(integers, stream, mr)); } diff --git a/cpp/src/strings/convert/convert_lists.cu b/cpp/src/strings/convert/convert_lists.cu index f9f2b91eb12..1f22aea284b 100644 --- a/cpp/src/strings/convert/convert_lists.cu +++ b/cpp/src/strings/convert/convert_lists.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -216,14 +216,17 @@ std::unique_ptr format_list_column(lists_column_view const& input, auto const d_separators = column_device_view::create(separators.parent(), stream); auto const d_na_rep = na_rep.value(stream); - auto children = cudf::strings::detail::make_strings_children( + auto [offsets_column, chars_column] = cudf::strings::detail::make_strings_children( format_lists_fn{*d_input, *d_separators, d_na_rep, stack_buffer.data(), depth}, input.size(), stream, mr); - return make_strings_column( - input.size(), std::move(children.first), std::move(children.second), 0, rmm::device_buffer{}); + return make_strings_column(input.size(), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), + 0, + rmm::device_buffer{}); } } // namespace detail diff --git a/cpp/src/strings/convert/convert_urls.cu b/cpp/src/strings/convert/convert_urls.cu index 511acc38d75..e2a6f028a18 100644 --- a/cpp/src/strings/convert/convert_urls.cu +++ b/cpp/src/strings/convert/convert_urls.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -135,12 +135,12 @@ std::unique_ptr url_encode(strings_column_view const& input, auto d_column = column_device_view::create(input.parent(), stream); - auto children = cudf::strings::detail::make_strings_children( + auto [offsets_column, chars_column] = cudf::strings::detail::make_strings_children( url_encoder_fn{*d_column}, input.size(), stream, mr); return make_strings_column(input.size(), - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), input.null_count(), cudf::detail::copy_bitmask(input.parent(), stream, mr)); } @@ -409,8 +409,8 @@ std::unique_ptr url_decode(strings_column_view const& strings, auto out_chars_bytes = cudf::detail::get_value(offsets_view, offset_count - 1, stream); // create the chars column - auto chars_column = create_chars_child_column(out_chars_bytes, stream, mr); - auto d_out_chars = chars_column->mutable_view().data(); + rmm::device_uvector chars(out_chars_bytes, stream, mr); + auto d_out_chars = chars.data(); // decode and copy the characters from the input column to the output column url_decode_char_replacer @@ -422,7 +422,7 @@ std::unique_ptr url_decode(strings_column_view const& strings, return make_strings_column(strings_count, std::move(offsets_column), - std::move(chars_column), + chars.release(), strings.null_count(), std::move(null_mask)); } diff --git a/cpp/src/strings/copying/concatenate.cu b/cpp/src/strings/copying/concatenate.cu index 2d9b06183e2..3d2d86708b1 100644 --- a/cpp/src/strings/copying/concatenate.cu +++ b/cpp/src/strings/copying/concatenate.cu @@ -228,9 +228,8 @@ std::unique_ptr concatenate(host_span columns, std::any_of(columns.begin(), columns.end(), [](auto const& col) { return col.has_nulls(); }); // create output chars column - auto chars_column = create_chars_child_column(total_bytes, stream, mr); - auto d_new_chars = chars_column->mutable_view().data(); - chars_column->set_null_count(0); + rmm::device_uvector output_chars(total_bytes, stream, mr); + auto d_new_chars = output_chars.data(); // create output offsets column auto offsets_column = make_numeric_column( @@ -304,7 +303,7 @@ std::unique_ptr concatenate(host_span columns, return make_strings_column(strings_count, std::move(offsets_column), - std::move(chars_column), + output_chars.release(), null_count, std::move(null_mask)); } diff --git a/cpp/src/strings/copying/shift.cu b/cpp/src/strings/copying/shift.cu index 3b798a87d54..331cdecc36f 100644 --- a/cpp/src/strings/copying/shift.cu +++ b/cpp/src/strings/copying/shift.cu @@ -114,19 +114,19 @@ std::unique_ptr shift(strings_column_view const& input, }(); // create output chars child column - auto chars_column = create_chars_child_column(static_cast(total_bytes), stream, mr); - auto d_chars = mutable_column_device_view::create(chars_column->mutable_view(), stream); + rmm::device_uvector chars(total_bytes, stream, mr); + auto d_chars = chars.data(); // run kernel to shift all the characters thrust::transform(rmm::exec_policy(stream), thrust::counting_iterator(0), thrust::counting_iterator(total_bytes), - d_chars->data(), + d_chars, shift_chars_fn{*d_input, d_fill_str, shift_offset}); // caller sets the null-mask return make_strings_column( - input.size(), std::move(offsets_column), std::move(chars_column), 0, rmm::device_buffer{}); + input.size(), std::move(offsets_column), chars.release(), 0, rmm::device_buffer{}); } } // namespace cudf::strings::detail diff --git a/cpp/src/strings/filling/fill.cu b/cpp/src/strings/filling/fill.cu index 49e1b11c1db..d2e3b6f6af3 100644 --- a/cpp/src/strings/filling/fill.cu +++ b/cpp/src/strings/filling/fill.cu @@ -98,7 +98,7 @@ std::unique_ptr fill(strings_column_view const& input, return make_strings_column(strings_count, std::move(offsets_column), - std::move(chars_column), + std::move(chars_column->release().data.release()[0]), null_count, std::move(null_mask)); } diff --git a/cpp/src/strings/filter_chars.cu b/cpp/src/strings/filter_chars.cu index 9f95fedfe0b..7a26fc45dcb 100644 --- a/cpp/src/strings/filter_chars.cu +++ b/cpp/src/strings/filter_chars.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -139,11 +139,12 @@ std::unique_ptr filter_characters( // this utility calls the strip_fn to build the offsets and chars columns filter_fn ffn{*d_strings, keep_characters, table.begin(), table.end(), d_replacement}; - auto children = cudf::strings::detail::make_strings_children(ffn, strings.size(), stream, mr); + auto [offsets_column, chars_column] = + cudf::strings::detail::make_strings_children(ffn, strings.size(), stream, mr); return make_strings_column(strings_count, - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), strings.null_count(), cudf::detail::copy_bitmask(strings.parent(), stream, mr)); } diff --git a/cpp/src/strings/padding.cu b/cpp/src/strings/padding.cu index 850ccaa4535..ec77aea6338 100644 --- a/cpp/src/strings/padding.cu +++ b/cpp/src/strings/padding.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -112,7 +112,7 @@ std::unique_ptr pad(strings_column_view const& input, auto d_strings = column_device_view::create(input.parent(), stream); - auto children = [&] { + auto [offsets_column, chars_column] = [&] { if (side == side_type::LEFT) { auto fn = pad_fn{*d_strings, width, fill_char_size, d_fill_char}; return make_strings_children(fn, input.size(), stream, mr); @@ -125,8 +125,8 @@ std::unique_ptr pad(strings_column_view const& input, }(); return make_strings_column(input.size(), - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), input.null_count(), cudf::detail::copy_bitmask(input.parent(), stream, mr)); } @@ -151,11 +151,12 @@ std::unique_ptr zfill(strings_column_view const& input, if (input.is_empty()) return make_empty_column(type_id::STRING); auto d_strings = column_device_view::create(input.parent(), stream); - auto children = make_strings_children(zfill_fn{*d_strings, width}, input.size(), stream, mr); + auto [offsets_column, chars_column] = + make_strings_children(zfill_fn{*d_strings, width}, input.size(), stream, mr); return make_strings_column(input.size(), - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), input.null_count(), cudf::detail::copy_bitmask(input.parent(), stream, mr)); } diff --git a/cpp/src/strings/repeat_strings.cu b/cpp/src/strings/repeat_strings.cu index 847a64f5602..b4a770f72bd 100644 --- a/cpp/src/strings/repeat_strings.cu +++ b/cpp/src/strings/repeat_strings.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -81,8 +81,6 @@ auto generate_empty_output(strings_column_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto chars_column = create_chars_child_column(0, stream, mr); - auto offsets_column = make_numeric_column( data_type{type_to_id()}, strings_count + 1, mask_state::UNALLOCATED, stream, mr); CUDF_CUDA_TRY(cudaMemsetAsync(offsets_column->mutable_view().template data(), @@ -92,7 +90,7 @@ auto generate_empty_output(strings_column_view const& input, return make_strings_column(strings_count, std::move(offsets_column), - std::move(chars_column), + rmm::device_buffer{}, input.null_count(), cudf::detail::copy_bitmask(input.parent(), stream, mr)); } @@ -166,7 +164,7 @@ std::unique_ptr repeat_strings(strings_column_view const& input, make_strings_children(fn, strings_count * repeat_times, strings_count, stream, mr); return make_strings_column(strings_count, std::move(offsets_column), - std::move(chars_column), + std::move(chars_column->release().data.release()[0]), input.null_count(), cudf::detail::copy_bitmask(input.parent(), stream, mr)); } @@ -252,7 +250,7 @@ std::unique_ptr repeat_strings(strings_column_view const& input, return make_strings_column(strings_count, std::move(offsets_column), - std::move(chars_column), + std::move(chars_column->release().data.release()[0]), null_count, std::move(null_mask)); } diff --git a/cpp/src/strings/replace/backref_re.cu b/cpp/src/strings/replace/backref_re.cu index fc11b7d80b3..edec525a913 100644 --- a/cpp/src/strings/replace/backref_re.cu +++ b/cpp/src/strings/replace/backref_re.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -126,8 +126,8 @@ std::unique_ptr replace_with_backrefs(strings_column_view const& input, auto const d_strings = column_device_view::create(input.parent(), stream); - using BackRefIterator = decltype(backrefs.begin()); - auto children = make_strings_children( + using BackRefIterator = decltype(backrefs.begin()); + auto [offsets_column, chars_column] = make_strings_children( backrefs_fn{*d_strings, d_repl_template, backrefs.begin(), backrefs.end()}, *d_prog, input.size(), @@ -135,8 +135,8 @@ std::unique_ptr replace_with_backrefs(strings_column_view const& input, mr); return make_strings_column(input.size(), - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), input.null_count(), cudf::detail::copy_bitmask(input.parent(), stream, mr)); } diff --git a/cpp/src/strings/replace/multi.cu b/cpp/src/strings/replace/multi.cu index a0f9d1136f3..3d0210d61b0 100644 --- a/cpp/src/strings/replace/multi.cu +++ b/cpp/src/strings/replace/multi.cu @@ -374,12 +374,8 @@ std::unique_ptr replace_character_parallel(strings_column_view const& in }); // use this utility to gather the string parts into a contiguous chars column - auto chars = make_strings_column(indices.begin(), indices.end(), stream, mr); - // TODO ideally we can pass this chars_data as rmm buffer to make_strings_column - auto chars_data = chars->release().data; - auto const chars_size = chars_data->size(); - auto chars_col = std::make_unique( - data_type{type_id::INT8}, chars_size, std::move(*chars_data), rmm::device_buffer{}, 0); + auto chars = make_strings_column(indices.begin(), indices.end(), stream, mr); + auto chars_data = chars->release().data; // create offsets from the sizes offsets = @@ -388,7 +384,7 @@ std::unique_ptr replace_character_parallel(strings_column_view const& in // build the strings columns from the chars and offsets return make_strings_column(strings_count, std::move(offsets), - std::move(chars_col), + std::move(chars_data.release()[0]), input.null_count(), cudf::detail::copy_bitmask(input.parent(), stream, mr)); } @@ -458,12 +454,12 @@ std::unique_ptr replace_string_parallel(strings_column_view const& input auto d_targets = column_device_view::create(targets.parent(), stream); auto d_replacements = column_device_view::create(repls.parent(), stream); - auto children = cudf::strings::detail::make_strings_children( + auto [offsets_column, chars_column] = cudf::strings::detail::make_strings_children( replace_multi_fn{*d_strings, *d_targets, *d_replacements}, input.size(), stream, mr); return make_strings_column(input.size(), - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), input.null_count(), cudf::detail::copy_bitmask(input.parent(), stream, mr)); } diff --git a/cpp/src/strings/replace/multi_re.cu b/cpp/src/strings/replace/multi_re.cu index 3375cb7a789..c212d9f44ba 100644 --- a/cpp/src/strings/replace/multi_re.cu +++ b/cpp/src/strings/replace/multi_re.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -185,15 +185,15 @@ std::unique_ptr replace_re(strings_column_view const& input, auto found_ranges = rmm::device_uvector(d_progs.size() * input.size(), stream); - auto children = make_strings_children( + auto [offsets_column, chars_column] = make_strings_children( replace_multi_regex_fn{*d_strings, d_progs, found_ranges.data(), *d_repls}, input.size(), stream, mr); return make_strings_column(input.size(), - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), input.null_count(), cudf::detail::copy_bitmask(input.parent(), stream, mr)); } diff --git a/cpp/src/strings/replace/replace.cu b/cpp/src/strings/replace/replace.cu index 8c4bd4490b9..936127f254b 100644 --- a/cpp/src/strings/replace/replace.cu +++ b/cpp/src/strings/replace/replace.cu @@ -486,9 +486,8 @@ std::unique_ptr replace_char_parallel(strings_column_view const& strings offsets_update_fn); // build the characters column - auto chars_column = - create_chars_child_column(chars_bytes + (delta_per_target * target_count), stream, mr); - auto d_out_chars = chars_column->mutable_view().data(); + rmm::device_uvector chars(chars_bytes + (delta_per_target * target_count), stream, mr); + auto d_out_chars = chars.data(); thrust::for_each_n( rmm::exec_policy(stream), thrust::make_counting_iterator(chars_start), @@ -501,7 +500,7 @@ std::unique_ptr replace_char_parallel(strings_column_view const& strings return make_strings_column(strings_count, std::move(offsets_column), - std::move(chars_column), + chars.release(), strings.null_count(), cudf::detail::copy_bitmask(strings.parent(), stream, mr)); } @@ -532,12 +531,12 @@ std::unique_ptr replace_row_parallel(strings_column_view const& strings, auto d_strings = column_device_view::create(strings.parent(), stream); // this utility calls the given functor to build the offsets and chars columns - auto children = cudf::strings::detail::make_strings_children( + auto [offsets_column, chars_column] = cudf::strings::detail::make_strings_children( replace_row_parallel_fn{*d_strings, d_target, d_repl, maxrepl}, strings.size(), stream, mr); return make_strings_column(strings.size(), - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), strings.null_count(), cudf::detail::copy_bitmask(strings.parent(), stream, mr)); } @@ -697,12 +696,12 @@ std::unique_ptr replace_slice(strings_column_view const& strings, auto d_strings = column_device_view::create(strings.parent(), stream); // this utility calls the given functor to build the offsets and chars columns - auto children = cudf::strings::detail::make_strings_children( + auto [offsets_column, chars_column] = cudf::strings::detail::make_strings_children( replace_slice_fn{*d_strings, d_repl, start, stop}, strings.size(), stream, mr); return make_strings_column(strings.size(), - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), strings.null_count(), cudf::detail::copy_bitmask(strings.parent(), stream, mr)); } @@ -733,8 +732,8 @@ std::unique_ptr replace_nulls(strings_column_view const& strings, auto d_offsets = offsets_column->view().data(); // build chars column - auto chars_column = create_chars_child_column(bytes, stream, mr); - auto d_chars = chars_column->mutable_view().data(); + rmm::device_uvector chars(bytes, stream, mr); + auto d_chars = chars.data(); thrust::for_each_n(rmm::exec_policy(stream), thrust::make_counting_iterator(0), strings_count, @@ -745,7 +744,7 @@ std::unique_ptr replace_nulls(strings_column_view const& strings, }); return make_strings_column( - strings_count, std::move(offsets_column), std::move(chars_column), 0, rmm::device_buffer{}); + strings_count, std::move(offsets_column), chars.release(), 0, rmm::device_buffer{}); } } // namespace detail diff --git a/cpp/src/strings/replace/replace_re.cu b/cpp/src/strings/replace/replace_re.cu index 502d5f1a52e..10d83932928 100644 --- a/cpp/src/strings/replace/replace_re.cu +++ b/cpp/src/strings/replace/replace_re.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -116,12 +116,12 @@ std::unique_ptr replace_re(strings_column_view const& input, auto const d_strings = column_device_view::create(input.parent(), stream); - auto children = make_strings_children( + auto [offsets_column, chars_column] = make_strings_children( replace_regex_fn{*d_strings, d_repl, maxrepl}, *d_prog, input.size(), stream, mr); return make_strings_column(input.size(), - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), input.null_count(), cudf::detail::copy_bitmask(input.parent(), stream, mr)); } diff --git a/cpp/src/strings/slice.cu b/cpp/src/strings/slice.cu index 5a1fee92c7d..1e55986fdb8 100644 --- a/cpp/src/strings/slice.cu +++ b/cpp/src/strings/slice.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -209,7 +209,7 @@ std::unique_ptr slice_strings(strings_column_view const& strings, return make_strings_column(strings.size(), std::move(offsets), - std::move(chars), + std::move(chars->release().data.release()[0]), strings.null_count(), cudf::detail::copy_bitmask(strings.parent(), stream, mr)); } diff --git a/cpp/src/strings/translate.cu b/cpp/src/strings/translate.cu index 0ca5e103d3d..039a8ac8a62 100644 --- a/cpp/src/strings/translate.cu +++ b/cpp/src/strings/translate.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -110,12 +110,12 @@ std::unique_ptr translate(strings_column_view const& strings, auto d_strings = column_device_view::create(strings.parent(), stream); - auto children = make_strings_children( + auto [offsets_column, chars_column] = make_strings_children( translate_fn{*d_strings, table.begin(), table.end()}, strings.size(), stream, mr); return make_strings_column(strings.size(), - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), strings.null_count(), cudf::detail::copy_bitmask(strings.parent(), stream, mr)); } diff --git a/cpp/src/text/bpe/byte_pair_encoding.cu b/cpp/src/text/bpe/byte_pair_encoding.cu index a697df913d3..225cc3f371b 100644 --- a/cpp/src/text/bpe/byte_pair_encoding.cu +++ b/cpp/src/text/bpe/byte_pair_encoding.cu @@ -429,8 +429,8 @@ std::unique_ptr byte_pair_encoding(cudf::strings_column_view const std::overflow_error); // build the output: inserting separators to the input character data - auto chars = cudf::strings::detail::create_chars_child_column(bytes, stream, mr); - auto d_chars = chars->mutable_view().data(); + rmm::device_uvector chars(bytes, stream, mr); + auto d_chars = chars.data(); auto const d_inserts = d_working.data(); // stores the insert positions auto offsets_at_non_zero = [d_spaces = d_spaces.data()] __device__(auto idx) { @@ -453,7 +453,7 @@ std::unique_ptr byte_pair_encoding(cudf::strings_column_view const return cudf::make_strings_column(input.size(), std::move(offsets), - std::move(chars), + chars.release(), input.null_count(), cudf::detail::copy_bitmask(input.parent(), stream, mr)); } diff --git a/cpp/src/text/detokenize.cu b/cpp/src/text/detokenize.cu index 38cb7dd6753..60625d6383a 100644 --- a/cpp/src/text/detokenize.cu +++ b/cpp/src/text/detokenize.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -156,15 +156,18 @@ std::unique_ptr detokenize(cudf::strings_column_view const& string cudf::string_view const d_separator(separator.data(), separator.size()); - auto children = cudf::strings::detail::make_strings_children( + auto [offsets_column, chars_column] = cudf::strings::detail::make_strings_children( detokenizer_fn{*strings_column, d_row_map, tokens_offsets.data(), d_separator}, output_count, stream, mr); // make the output strings column from the offsets and chars column - return cudf::make_strings_column( - output_count, std::move(children.first), std::move(children.second), 0, rmm::device_buffer{}); + return cudf::make_strings_column(output_count, + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), + 0, + rmm::device_buffer{}); } } // namespace detail diff --git a/cpp/src/text/generate_ngrams.cu b/cpp/src/text/generate_ngrams.cu index 1d3e98a25ad..882d9a04501 100644 --- a/cpp/src/text/generate_ngrams.cu +++ b/cpp/src/text/generate_ngrams.cu @@ -139,12 +139,15 @@ std::unique_ptr generate_ngrams(cudf::strings_column_view const& s // compute the number of strings of ngrams auto const ngrams_count = strings_count - ngrams + 1; - auto children = cudf::strings::detail::make_strings_children( + auto [offsets_column, chars_column] = cudf::strings::detail::make_strings_children( ngram_generator_fn{d_strings, ngrams, d_separator}, ngrams_count, stream, mr); // make the output strings column from the offsets and chars column - return cudf::make_strings_column( - ngrams_count, std::move(children.first), std::move(children.second), 0, rmm::device_buffer{}); + return cudf::make_strings_column(ngrams_count, + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), + 0, + rmm::device_buffer{}); } } // namespace detail @@ -239,8 +242,11 @@ std::unique_ptr generate_character_ngrams(cudf::strings_column_vie auto [offsets_column, chars_column] = cudf::strings::detail::make_strings_children( generator, strings_count, total_ngrams, stream, mr); - return cudf::make_strings_column( - total_ngrams, std::move(offsets_column), std::move(chars_column), 0, rmm::device_buffer{}); + return cudf::make_strings_column(total_ngrams, + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), + 0, + rmm::device_buffer{}); } namespace { diff --git a/cpp/src/text/ngrams_tokenize.cu b/cpp/src/text/ngrams_tokenize.cu index bc5cd04eac6..8aafd60db6d 100644 --- a/cpp/src/text/ngrams_tokenize.cu +++ b/cpp/src/text/ngrams_tokenize.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -232,9 +232,8 @@ std::unique_ptr ngrams_tokenize(cudf::strings_column_view const& s rmm::device_uvector ngram_sizes(total_ngrams, stream); // build output chars column - auto chars_column = cudf::strings::detail::create_chars_child_column( - static_cast(output_chars_size), stream, mr); - auto d_chars = chars_column->mutable_view().data(); + rmm::device_uvector chars(output_chars_size, stream, mr); + auto d_chars = chars.data(); // Generate the ngrams into the chars column data buffer. // The ngram_builder_fn functor also fills the ngram_sizes vector with the // size of each ngram. @@ -253,11 +252,11 @@ std::unique_ptr ngrams_tokenize(cudf::strings_column_view const& s // build the offsets column -- converting the ngram sizes into offsets auto offsets_column = std::get<0>( cudf::detail::make_offsets_child_column(ngram_sizes.begin(), ngram_sizes.end(), stream, mr)); - chars_column->set_null_count(0); + // chars_column->set_null_count(0); offsets_column->set_null_count(0); // create the output strings column return make_strings_column( - total_ngrams, std::move(offsets_column), std::move(chars_column), 0, rmm::device_buffer{}); + total_ngrams, std::move(offsets_column), chars.release(), 0, rmm::device_buffer{}); } } // namespace detail diff --git a/cpp/src/text/normalize.cu b/cpp/src/text/normalize.cu index 5a0977d410f..d46ca25835f 100644 --- a/cpp/src/text/normalize.cu +++ b/cpp/src/text/normalize.cu @@ -182,12 +182,12 @@ std::unique_ptr normalize_spaces(cudf::strings_column_view const& auto d_strings = cudf::column_device_view::create(strings.parent(), stream); // build offsets and children using the normalize_space_fn - auto children = cudf::strings::detail::make_strings_children( + auto [offsets_column, chars_column] = cudf::strings::detail::make_strings_children( normalize_spaces_fn{*d_strings}, strings.size(), stream, mr); return cudf::make_strings_column(strings.size(), - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), strings.null_count(), cudf::detail::copy_bitmask(strings.parent(), stream, mr)); } @@ -228,12 +228,12 @@ std::unique_ptr normalize_characters(cudf::strings_column_view con auto d_strings = cudf::column_device_view::create(strings.parent(), stream); // build offsets and children using the codepoint_to_utf8_fn - auto children = cudf::strings::detail::make_strings_children( + auto [offsets_column, chars_column] = cudf::strings::detail::make_strings_children( codepoint_to_utf8_fn{*d_strings, cp_chars, cp_offsets}, strings.size(), stream, mr); return cudf::make_strings_column(strings.size(), - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), strings.null_count(), cudf::detail::copy_bitmask(strings.parent(), stream, mr)); } diff --git a/cpp/src/text/replace.cu b/cpp/src/text/replace.cu index a4b28fe2dab..50d7bbd077d 100644 --- a/cpp/src/text/replace.cu +++ b/cpp/src/text/replace.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -228,12 +228,13 @@ std::unique_ptr replace_tokens(cudf::strings_column_view const& st rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr); // this utility calls replacer to build the offsets and chars columns - auto children = cudf::strings::detail::make_strings_children(replacer, strings_count, stream, mr); + auto [offsets_column, chars_column] = + cudf::strings::detail::make_strings_children(replacer, strings_count, stream, mr); // return new strings column return cudf::make_strings_column(strings_count, - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), strings.null_count(), std::move(null_mask)); } @@ -260,12 +261,13 @@ std::unique_ptr filter_tokens(cudf::strings_column_view const& str rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr); // this utility calls filterer to build the offsets and chars columns - auto children = cudf::strings::detail::make_strings_children(filterer, strings_count, stream, mr); + auto [offsets_column, chars_column] = + cudf::strings::detail::make_strings_children(filterer, strings_count, stream, mr); // return new strings column return cudf::make_strings_column(strings_count, - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), strings.null_count(), std::move(null_mask)); } diff --git a/cpp/src/text/tokenize.cu b/cpp/src/text/tokenize.cu index c43b9dda56c..c256607fb23 100644 --- a/cpp/src/text/tokenize.cu +++ b/cpp/src/text/tokenize.cu @@ -219,14 +219,13 @@ std::unique_ptr character_tokenize(cudf::strings_column_view const return idx < chars_bytes ? cudf::strings::detail::is_begin_utf8_char(d_chars[idx]) : true; }); - // create the output chars column -- just a copy of the input's chars column - cudf::column_view chars_view( - cudf::data_type{cudf::type_id::INT8}, chars_bytes, d_chars, nullptr, 0); - auto chars_column = std::make_unique(chars_view, stream, mr); + // create the output chars buffer -- just a copy of the input's chars + rmm::device_uvector output_chars(chars_bytes, stream, mr); + thrust::copy(rmm::exec_policy(stream), d_chars, d_chars + chars_bytes, output_chars.data()); // return new strings column return cudf::make_strings_column( - num_characters, std::move(offsets_column), std::move(chars_column), 0, rmm::device_buffer{}); + num_characters, std::move(offsets_column), output_chars.release(), 0, rmm::device_buffer{}); } } // namespace detail diff --git a/cpp/src/transform/row_conversion.cu b/cpp/src/transform/row_conversion.cu index b797e495480..26d3724b39f 100644 --- a/cpp/src/transform/row_conversion.cu +++ b/cpp/src/transform/row_conversion.cu @@ -2509,8 +2509,7 @@ std::unique_ptr convert_from_rows(lists_column_view const& input, make_strings_column(num_rows, std::make_unique( std::move(string_col_offsets[string_idx]), rmm::device_buffer{}, 0), - std::make_unique( - std::move(string_data_cols[string_idx]), rmm::device_buffer{}, 0), + string_data_cols[string_idx].release(), 0, std::move(*string_data.null_mask.release())); // Null count set to 0, temporarily. Will be fixed up before return. diff --git a/cpp/tests/copying/concatenate_tests.cpp b/cpp/tests/copying/concatenate_tests.cpp index 06fb687ac2d..0f7c1053adf 100644 --- a/cpp/tests/copying/concatenate_tests.cpp +++ b/cpp/tests/copying/concatenate_tests.cpp @@ -22,6 +22,7 @@ #include #include +#include #include #include #include @@ -406,9 +407,9 @@ TEST_F(OverflowTest, OverflowTest) // try and concatenate 6 string columns of with 1 billion chars in each auto offsets = cudf::test::fixed_width_column_wrapper{0, size}; - auto many_chars = cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::INT8}, size); + auto many_chars = rmm::device_uvector(size, cudf::get_default_stream()); auto col = cudf::make_strings_column( - 1, offsets.release(), std::move(many_chars), 0, rmm::device_buffer{}); + 1, offsets.release(), many_chars.release(), 0, rmm::device_buffer{}); cudf::table_view tbl({*col}); EXPECT_THROW(cudf::concatenate(std::vector({tbl, tbl, tbl, tbl, tbl, tbl})), @@ -422,7 +423,7 @@ TEST_F(OverflowTest, OverflowTest) // try and concatenate 6 string columns 1 billion rows each auto many_offsets = cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::INT32}, size + 1); - auto chars = cudf::test::fixed_width_column_wrapper{0, 1, 2}; + auto chars = rmm::device_uvector(3, cudf::get_default_stream()); auto col = cudf::make_strings_column( size, std::move(many_offsets), chars.release(), 0, rmm::device_buffer{}); @@ -533,10 +534,9 @@ TEST_F(OverflowTest, Presliced) auto offset_gen = cudf::detail::make_counting_transform_iterator( 0, [string_size](cudf::size_type index) { return index * string_size; }); cudf::test::fixed_width_column_wrapper offsets(offset_gen, offset_gen + num_rows + 1); - auto many_chars = - cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::INT8}, total_chars_size); - auto col = cudf::make_strings_column( - num_rows, offsets.release(), std::move(many_chars), 0, rmm::device_buffer{}); + auto many_chars = rmm::device_uvector(total_chars_size, cudf::get_default_stream()); + auto col = cudf::make_strings_column( + num_rows, offsets.release(), many_chars.release(), 0, rmm::device_buffer{}); auto sliced = cudf::split(*col, {(num_rows / 2) - 1}); @@ -557,13 +557,12 @@ TEST_F(OverflowTest, Presliced) constexpr cudf::size_type num_rows = total_chars_size / string_size; // try and concatenate 4 string columns of with ~1/2 billion chars in each - auto offsets = cudf::sequence(num_rows + 1, + auto offsets = cudf::sequence(num_rows + 1, cudf::numeric_scalar(0), cudf::numeric_scalar(string_size)); - auto many_chars = - cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::INT8}, total_chars_size); - auto col = cudf::make_strings_column( - num_rows, std::move(offsets), std::move(many_chars), 0, rmm::device_buffer{}); + auto many_chars = rmm::device_uvector(total_chars_size, cudf::get_default_stream()); + auto col = cudf::make_strings_column( + num_rows, std::move(offsets), many_chars.release(), 0, rmm::device_buffer{}); // should pass (with 2 rows to spare) // leaving this disabled as it typically runs out of memory on a T4 @@ -636,7 +635,7 @@ TEST_F(OverflowTest, Presliced) cudf::numeric_scalar(0), cudf::numeric_scalar(list_size)); - auto col = cudf::make_strings_column( + auto col = cudf::make_lists_column( num_rows, std::move(offsets), std::move(struct_col), 0, rmm::device_buffer{}); // should pass (with 2 rows to spare) @@ -722,13 +721,12 @@ TEST_F(OverflowTest, BigColumnsSmallSlices) constexpr cudf::size_type num_rows = 1024; constexpr cudf::size_type string_size = inner_size / num_rows; - auto offsets = cudf::sequence(num_rows + 1, + auto offsets = cudf::sequence(num_rows + 1, cudf::numeric_scalar(0), cudf::numeric_scalar(string_size)); - auto many_chars = - cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::INT8}, inner_size); - auto col = cudf::make_strings_column( - num_rows, std::move(offsets), std::move(many_chars), 0, rmm::device_buffer{}); + auto many_chars = rmm::device_uvector(inner_size, cudf::get_default_stream()); + auto col = cudf::make_strings_column( + num_rows, std::move(offsets), many_chars.release(), 0, rmm::device_buffer{}); auto sliced = cudf::slice(*col, {16, 32}); diff --git a/cpp/tests/strings/contains_tests.cpp b/cpp/tests/strings/contains_tests.cpp index 13459197aa3..2d9e2035e5e 100644 --- a/cpp/tests/strings/contains_tests.cpp +++ b/cpp/tests/strings/contains_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -297,17 +297,14 @@ TEST_F(StringsContainsTests, HexTest) std::vector offsets( {thrust::make_counting_iterator(0), thrust::make_counting_iterator(0) + count + 1}); - auto d_chars = std::make_unique( - cudf::detail::make_device_uvector_sync( - ascii_chars, cudf::get_default_stream(), rmm::mr::get_current_device_resource()), - rmm::device_buffer{}, - 0); + auto d_chars = cudf::detail::make_device_uvector_sync( + ascii_chars, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto d_offsets = std::make_unique( cudf::detail::make_device_uvector_sync( offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()), rmm::device_buffer{}, 0); - auto input = cudf::make_strings_column(count, std::move(d_offsets), std::move(d_chars), 0, {}); + auto input = cudf::make_strings_column(count, std::move(d_offsets), d_chars.release(), 0, {}); auto strings_view = cudf::strings_column_view(input->view()); for (auto ch : ascii_chars) { diff --git a/cpp/tests/strings/factories_test.cu b/cpp/tests/strings/factories_test.cu index 5381ad63bc3..64123690aea 100644 --- a/cpp/tests/strings/factories_test.cu +++ b/cpp/tests/strings/factories_test.cu @@ -145,11 +145,8 @@ TEST_F(StringsFactoriesTest, CreateColumnFromOffsets) } std::vector h_nulls{h_null_mask}; - auto d_buffer = std::make_unique( - cudf::detail::make_device_uvector_sync( - h_buffer, cudf::get_default_stream(), rmm::mr::get_current_device_resource()), - rmm::device_buffer{}, - 0); + auto d_buffer = cudf::detail::make_device_uvector_sync( + h_buffer, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto d_offsets = std::make_unique( cudf::detail::make_device_uvector_sync( h_offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()), @@ -158,7 +155,7 @@ TEST_F(StringsFactoriesTest, CreateColumnFromOffsets) auto d_nulls = cudf::detail::make_device_uvector_sync( h_nulls, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto column = cudf::make_strings_column( - count, std::move(d_offsets), std::move(d_buffer), null_count, d_nulls.release()); + count, std::move(d_offsets), d_buffer.release(), null_count, d_nulls.release()); EXPECT_EQ(column->type(), cudf::data_type{cudf::type_id::STRING}); EXPECT_EQ(column->null_count(), null_count); EXPECT_EQ(1, column->num_children()); @@ -197,8 +194,7 @@ TEST_F(StringsFactoriesTest, CreateScalar) TEST_F(StringsFactoriesTest, EmptyStringsColumn) { - auto d_chars = std::make_unique( - rmm::device_uvector{0, cudf::get_default_stream()}, rmm::device_buffer{}, 0); + auto d_chars = rmm::device_uvector(0, cudf::get_default_stream()); auto d_offsets = std::make_unique( cudf::detail::make_zeroed_device_uvector_sync( 1, cudf::get_default_stream(), rmm::mr::get_current_device_resource()), @@ -207,7 +203,7 @@ TEST_F(StringsFactoriesTest, EmptyStringsColumn) rmm::device_uvector d_nulls{0, cudf::get_default_stream()}; auto results = - cudf::make_strings_column(0, std::move(d_offsets), std::move(d_chars), 0, d_nulls.release()); + cudf::make_strings_column(0, std::move(d_offsets), d_chars.release(), 0, d_nulls.release()); cudf::test::expect_column_empty(results->view()); rmm::device_uvector> d_strings{ diff --git a/cpp/tests/transform/row_conversion.cpp b/cpp/tests/transform/row_conversion.cpp index e54929f1651..542ccc5e2d5 100644 --- a/cpp/tests/transform/row_conversion.cpp +++ b/cpp/tests/transform/row_conversion.cpp @@ -14,26 +14,20 @@ * limitations under the License. */ +#include +#include +#include +#include + #include #include -#include -#include #include #include #include #include -#include -#include -#include -#include -#include -#include - -#include -#include -#include #include +#include struct ColumnToRowTests : public cudf::test::BaseFixture {}; struct RowToColumnTests : public cudf::test::BaseFixture {}; @@ -833,19 +827,7 @@ TEST_F(RowToColumnTests, SimpleString) EXPECT_EQ(new_rows.size(), 1); for (auto& row : new_rows) { auto new_cols = cudf::convert_from_rows(cudf::lists_column_view(*row), schema); - EXPECT_EQ(row->size(), 5); - auto const num_columns = new_cols->num_columns(); - - cudf::strings_column_view str_col = new_cols->get_column(1).view(); - std::vector> col_data; - std::vector> offset_data; - for (int i = 0; i < num_columns; ++i) { - offset_data.emplace_back( - std::get<0>(cudf::test::to_host(str_col.offsets()))); - col_data.emplace_back(std::get<0>(cudf::test::to_host(str_col.chars()))); - } - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(in, *new_cols); } } From c738c05bd6d452a3d11b621df534757a6d4c0a52 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Wed, 17 Jan 2024 17:53:09 -0500 Subject: [PATCH 2/3] remove commented out code --- cpp/src/strings/case.cu | 3 +-- cpp/src/text/ngrams_tokenize.cu | 1 - 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/cpp/src/strings/case.cu b/cpp/src/strings/case.cu index 0d81bd3399e..b3bf0e2a787 100644 --- a/cpp/src/strings/case.cu +++ b/cpp/src/strings/case.cu @@ -264,11 +264,10 @@ std::unique_ptr convert_case(strings_column_view const& input, "Size of output exceeds the column size limit", std::overflow_error); - // auto chars = create_chars_child_column(static_cast(bytes), stream, mr); rmm::device_uvector chars(bytes, stream, mr); // second pass, write output converter.d_offsets = d_offsets; - converter.d_chars = chars.data(); // chars->mutable_view().data(); + converter.d_chars = chars.data(); thrust::for_each_n(rmm::exec_policy(stream), count_itr, input.size(), converter); return make_strings_column(input.size(), diff --git a/cpp/src/text/ngrams_tokenize.cu b/cpp/src/text/ngrams_tokenize.cu index 8aafd60db6d..642dca5fc47 100644 --- a/cpp/src/text/ngrams_tokenize.cu +++ b/cpp/src/text/ngrams_tokenize.cu @@ -252,7 +252,6 @@ std::unique_ptr ngrams_tokenize(cudf::strings_column_view const& s // build the offsets column -- converting the ngram sizes into offsets auto offsets_column = std::get<0>( cudf::detail::make_offsets_child_column(ngram_sizes.begin(), ngram_sizes.end(), stream, mr)); - // chars_column->set_null_count(0); offsets_column->set_null_count(0); // create the output strings column return make_strings_column( From 20b7f106220891b1744d9238d1991fc9705f81c1 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 18 Jan 2024 12:46:54 -0500 Subject: [PATCH 3/3] use make_device_uvector_async when possible in column_wrapper.hpp --- cpp/include/cudf_test/column_wrapper.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/include/cudf_test/column_wrapper.hpp b/cpp/include/cudf_test/column_wrapper.hpp index bfea2cbce76..c4fa4be0f89 100644 --- a/cpp/include/cudf_test/column_wrapper.hpp +++ b/cpp/include/cudf_test/column_wrapper.hpp @@ -763,7 +763,7 @@ class strings_column_wrapper : public detail::column_wrapper { } auto all_valid = thrust::make_constant_iterator(true); auto [chars, offsets] = detail::make_chars_and_offsets(begin, end, all_valid); - auto d_chars = cudf::detail::make_device_uvector_sync( + auto d_chars = cudf::detail::make_device_uvector_async( chars, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()); auto d_offsets = std::make_unique( cudf::detail::make_device_uvector_sync( @@ -813,10 +813,10 @@ class strings_column_wrapper : public detail::column_wrapper { } auto [chars, offsets] = detail::make_chars_and_offsets(begin, end, v); auto [null_mask, null_count] = detail::make_null_mask_vector(v, v + num_strings); - auto d_chars = cudf::detail::make_device_uvector_sync( + auto d_chars = cudf::detail::make_device_uvector_async( chars, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()); auto d_offsets = std::make_unique( - cudf::detail::make_device_uvector_sync( + cudf::detail::make_device_uvector_async( offsets, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()), rmm::device_buffer{}, 0);