diff --git a/cpp/benchmarks/hashing/hash.cpp b/cpp/benchmarks/hashing/hash.cpp index e679b4b62d2..4930fc59ac3 100644 --- a/cpp/benchmarks/hashing/hash.cpp +++ b/cpp/benchmarks/hashing/hash.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -43,7 +43,7 @@ static void bench_hash(nvbench::state& state) // collect statistics cudf::strings_column_view input(data->get_column(1).view()); - auto const chars_size = input.chars_size(); + auto const chars_size = input.chars_size(stream); // add memory read from string column state.add_global_memory_reads(chars_size); // add memory read from int64_t column diff --git a/cpp/benchmarks/json/json.cu b/cpp/benchmarks/json/json.cu index 5dc30aebe38..c74701445f8 100644 --- a/cpp/benchmarks/json/json.cu +++ b/cpp/benchmarks/json/json.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -190,7 +190,7 @@ void BM_case(benchmark::State& state, std::string query_arg) int desired_bytes = state.range(1); auto input = build_json_string_column(desired_bytes, num_rows); cudf::strings_column_view scv(input->view()); - size_t num_chars = scv.chars().size(); + size_t num_chars = scv.chars_size(cudf::get_default_stream()); std::string json_path(query_arg); diff --git a/cpp/benchmarks/string/case.cpp b/cpp/benchmarks/string/case.cpp index 385bb7630f8..639a3dc1181 100644 --- a/cpp/benchmarks/string/case.cpp +++ b/cpp/benchmarks/string/case.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -43,18 +43,18 @@ void bench_case(nvbench::state& state) if (encoding == "ascii") { data_profile ascii_profile = data_profile_builder().no_validity().distribution( cudf::type_id::INT8, distribution_id::UNIFORM, 32, 126); // nice ASCII range - auto input = cudf::strings_column_view(col_view); - auto ascii_column = - create_random_column(cudf::type_id::INT8, row_count{input.chars_size()}, ascii_profile); + auto input = cudf::strings_column_view(col_view); + auto ascii_column = create_random_column( + cudf::type_id::INT8, row_count{input.chars_size(cudf::get_default_stream())}, ascii_profile); auto ascii_data = ascii_column->view(); col_view = cudf::column_view(col_view.type(), col_view.size(), - nullptr, + ascii_data.data(), col_view.null_mask(), col_view.null_count(), 0, - {input.offsets(), ascii_data}); + {input.offsets()}); ascii_contents = ascii_column->release(); } @@ -62,9 +62,9 @@ void bench_case(nvbench::state& state) state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); - state.add_element_count(input.chars_size(), "chars_size"); - state.add_global_memory_reads(input.chars_size()); - state.add_global_memory_writes(input.chars_size()); + state.add_element_count(input.chars_size(cudf::get_default_stream()), "chars_size"); + state.add_global_memory_reads(input.chars_size(cudf::get_default_stream())); + state.add_global_memory_writes(input.chars_size(cudf::get_default_stream())); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { auto result = cudf::strings::to_lower(input); }); diff --git a/cpp/benchmarks/string/char_types.cpp b/cpp/benchmarks/string/char_types.cpp index 59e6245fd41..eec9a5f54d7 100644 --- a/cpp/benchmarks/string/char_types.cpp +++ b/cpp/benchmarks/string/char_types.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. 
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -42,7 +42,7 @@ static void bench_char_types(nvbench::state& state) state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); // gather some throughput statistics as well - auto chars_size = input.chars_size(); + auto chars_size = input.chars_size(cudf::get_default_stream()); state.add_global_memory_reads(chars_size); // all bytes are read; if (api_type == "all") { state.add_global_memory_writes(num_rows); // output is a bool8 per row diff --git a/cpp/benchmarks/string/combine.cpp b/cpp/benchmarks/string/combine.cpp index 4ed54a38a48..7acfb1ffb0d 100644 --- a/cpp/benchmarks/string/combine.cpp +++ b/cpp/benchmarks/string/combine.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -44,7 +44,8 @@ static void BM_combine(benchmark::State& state) cudf::strings::concatenate(table->view(), separator); } - state.SetBytesProcessed(state.iterations() * (input1.chars_size() + input2.chars_size())); + state.SetBytesProcessed(state.iterations() * (input1.chars_size(cudf::get_default_stream()) + + input2.chars_size(cudf::get_default_stream()))); } static void generate_bench_args(benchmark::internal::Benchmark* b) diff --git a/cpp/benchmarks/string/contains.cpp b/cpp/benchmarks/string/contains.cpp index af45d5d8fee..6d839c1de64 100644 --- a/cpp/benchmarks/string/contains.cpp +++ b/cpp/benchmarks/string/contains.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -100,7 +100,7 @@ static void bench_contains(nvbench::state& state) auto pattern = patterns[pattern_index]; auto program = cudf::strings::regex_program::create(pattern); - auto chars_size = input.chars_size(); + auto chars_size = input.chars_size(cudf::get_default_stream()); state.add_element_count(chars_size, "chars_size"); state.add_global_memory_reads(chars_size); state.add_global_memory_writes(input.size()); diff --git a/cpp/benchmarks/string/convert_datetime.cpp b/cpp/benchmarks/string/convert_datetime.cpp index 5f332a3e1a0..5deca3664b7 100644 --- a/cpp/benchmarks/string/convert_datetime.cpp +++ b/cpp/benchmarks/string/convert_datetime.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -48,7 +48,8 @@ void BM_convert_datetime(benchmark::State& state, direction dir) cudf::strings::from_timestamps(input, "%Y-%m-%d %H:%M:%S"); } - auto const bytes = dir == direction::to ? source_string.chars_size() : n_rows * sizeof(TypeParam); + auto const bytes = dir == direction::to ? source_string.chars_size(cudf::get_default_stream()) + : n_rows * sizeof(TypeParam); state.SetBytesProcessed(state.iterations() * bytes); } diff --git a/cpp/benchmarks/string/convert_fixed_point.cpp b/cpp/benchmarks/string/convert_fixed_point.cpp index 0cc98ee146c..e5bd794e405 100644 --- a/cpp/benchmarks/string/convert_fixed_point.cpp +++ b/cpp/benchmarks/string/convert_fixed_point.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -49,8 +49,9 @@ void convert_to_fixed_point(benchmark::State& state) } // bytes_processed = bytes_input + bytes_output - state.SetBytesProcessed(state.iterations() * - (strings_view.chars_size() + rows * cudf::size_of(dtype))); + state.SetBytesProcessed( + state.iterations() * + (strings_view.chars_size(cudf::get_default_stream()) + rows * cudf::size_of(dtype))); } class StringsFromFixedPoint : public cudf::benchmark {}; @@ -74,7 +75,8 @@ void convert_from_fixed_point(benchmark::State& state) // bytes_processed = bytes_input + bytes_output state.SetBytesProcessed( state.iterations() * - (cudf::strings_column_view(results->view()).chars_size() + rows * cudf::size_of(dtype))); + (cudf::strings_column_view(results->view()).chars_size(cudf::get_default_stream()) + + rows * cudf::size_of(dtype))); } #define CONVERT_TO_FIXED_POINT_BMD(name, fixed_point_type) \ diff --git a/cpp/benchmarks/string/convert_numerics.cpp b/cpp/benchmarks/string/convert_numerics.cpp index cce5d0f6a4d..8f875c5c80f 100644 --- a/cpp/benchmarks/string/convert_numerics.cpp +++ b/cpp/benchmarks/string/convert_numerics.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -63,8 +63,9 @@ void convert_to_number(benchmark::State& state) } // bytes_processed = bytes_input + bytes_output - state.SetBytesProcessed(state.iterations() * - (strings_view.chars_size() + rows * sizeof(NumericType))); + state.SetBytesProcessed( + state.iterations() * + (strings_view.chars_size(cudf::get_default_stream()) + rows * sizeof(NumericType))); } class StringsFromNumeric : public cudf::benchmark {}; @@ -90,7 +91,8 @@ void convert_from_number(benchmark::State& state) // bytes_processed = bytes_input + bytes_output state.SetBytesProcessed( state.iterations() * - (cudf::strings_column_view(results->view()).chars_size() + rows * sizeof(NumericType))); + (cudf::strings_column_view(results->view()).chars_size(cudf::get_default_stream()) + + rows * sizeof(NumericType))); } #define CONVERT_TO_NUMERICS_BD(name, type) \ diff --git a/cpp/benchmarks/string/copy.cu b/cpp/benchmarks/string/copy.cu index 27438f80f92..6b2f6c3a0a7 100644 --- a/cpp/benchmarks/string/copy.cu +++ b/cpp/benchmarks/string/copy.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -64,8 +64,9 @@ static void BM_copy(benchmark::State& state, copy_type ct) } } - state.SetBytesProcessed(state.iterations() * - cudf::strings_column_view(source->view().column(0)).chars_size()); + state.SetBytesProcessed( + state.iterations() * + cudf::strings_column_view(source->view().column(0)).chars_size(cudf::get_default_stream())); } static void generate_bench_args(benchmark::internal::Benchmark* b) diff --git a/cpp/benchmarks/string/count.cpp b/cpp/benchmarks/string/count.cpp index 08406462632..a656010dca5 100644 --- a/cpp/benchmarks/string/count.cpp +++ b/cpp/benchmarks/string/count.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -47,7 +47,7 @@ static void bench_count(nvbench::state& state) state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); // gather some throughput statistics as well - auto chars_size = input.chars_size(); + auto chars_size = input.chars_size(cudf::get_default_stream()); state.add_element_count(chars_size, "chars_size"); state.add_global_memory_reads(chars_size); state.add_global_memory_writes(input.size()); diff --git a/cpp/benchmarks/string/extract.cpp b/cpp/benchmarks/string/extract.cpp index 135dadabbe4..af4fedb5799 100644 --- a/cpp/benchmarks/string/extract.cpp +++ b/cpp/benchmarks/string/extract.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -67,7 +67,7 @@ static void bench_extract(nvbench::state& state) state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); // gather some throughput statistics as well - auto chars_size = strings_view.chars_size(); + auto chars_size = strings_view.chars_size(cudf::get_default_stream()); state.add_element_count(chars_size, "chars_size"); // number of bytes; state.add_global_memory_reads(chars_size); // all bytes are read; state.add_global_memory_writes(chars_size); // all bytes are written diff --git a/cpp/benchmarks/string/factory.cu b/cpp/benchmarks/string/factory.cu index c73bcb0b0ad..c4e74c4d97e 100644 --- a/cpp/benchmarks/string/factory.cu +++ b/cpp/benchmarks/string/factory.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -67,7 +67,7 @@ static void BM_factory(benchmark::State& state) } cudf::strings_column_view input(column->view()); - state.SetBytesProcessed(state.iterations() * input.chars_size()); + state.SetBytesProcessed(state.iterations() * input.chars_size(cudf::get_default_stream())); } static void generate_bench_args(benchmark::internal::Benchmark* b) diff --git a/cpp/benchmarks/string/filter.cpp b/cpp/benchmarks/string/filter.cpp index b935fc4a11f..613834b1f3e 100644 --- a/cpp/benchmarks/string/filter.cpp +++ b/cpp/benchmarks/string/filter.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -57,7 +57,7 @@ static void BM_filter_chars(benchmark::State& state, FilterAPI api) } } - state.SetBytesProcessed(state.iterations() * input.chars_size()); + state.SetBytesProcessed(state.iterations() * input.chars_size(cudf::get_default_stream())); } static void generate_bench_args(benchmark::internal::Benchmark* b) diff --git a/cpp/benchmarks/string/find.cpp b/cpp/benchmarks/string/find.cpp index 5f2e6946b8b..e866092f3a3 100644 --- a/cpp/benchmarks/string/find.cpp +++ b/cpp/benchmarks/string/find.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -56,7 +56,7 @@ static void BM_find_scalar(benchmark::State& state, FindAPI find_api) } } - state.SetBytesProcessed(state.iterations() * input.chars_size()); + state.SetBytesProcessed(state.iterations() * input.chars_size(cudf::get_default_stream())); } static void generate_bench_args(benchmark::internal::Benchmark* b) diff --git a/cpp/benchmarks/string/gather.cpp b/cpp/benchmarks/string/gather.cpp index 530b09b7d6a..5b1c679be7d 100644 --- a/cpp/benchmarks/string/gather.cpp +++ b/cpp/benchmarks/string/gather.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -43,7 +43,8 @@ static void bench_gather(nvbench::state& state) create_random_table({cudf::type_id::INT32}, row_count{num_rows}, map_profile); state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); - auto chars_size = cudf::strings_column_view(input_table->view().column(0)).chars_size(); + auto chars_size = + cudf::strings_column_view(input_table->view().column(0)).chars_size(cudf::get_default_stream()); state.add_global_memory_reads(chars_size); // all bytes are read; state.add_global_memory_writes(chars_size); diff --git a/cpp/benchmarks/string/join_strings.cpp b/cpp/benchmarks/string/join_strings.cpp index a122c0022a9..6dcf731ad3c 100644 --- a/cpp/benchmarks/string/join_strings.cpp +++ b/cpp/benchmarks/string/join_strings.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -40,7 +40,7 @@ static void bench_join(nvbench::state& state) state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); // gather some throughput statistics as well - auto const chars_size = input.chars_size(); + auto const chars_size = input.chars_size(cudf::get_default_stream()); state.add_element_count(chars_size, "chars_size"); // number of bytes; state.add_global_memory_reads(chars_size); // all bytes are read; state.add_global_memory_writes(chars_size); // all bytes are written diff --git a/cpp/benchmarks/string/lengths.cpp b/cpp/benchmarks/string/lengths.cpp index 36c4bf64a00..a19060ead3b 100644 --- a/cpp/benchmarks/string/lengths.cpp +++ b/cpp/benchmarks/string/lengths.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -40,7 +40,7 @@ static void bench_lengths(nvbench::state& state) state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); // gather some throughput statistics as well - auto chars_size = input.chars_size(); + auto chars_size = input.chars_size(cudf::get_default_stream()); state.add_global_memory_reads(chars_size); // all bytes are read; state.add_global_memory_writes(num_rows); // output is an integer per row diff --git a/cpp/benchmarks/string/like.cpp b/cpp/benchmarks/string/like.cpp index 6ac832471a5..99cef640dc3 100644 --- a/cpp/benchmarks/string/like.cpp +++ b/cpp/benchmarks/string/like.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -99,7 +99,7 @@ static void bench_like(nvbench::state& state) state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); // gather some throughput statistics as well - auto chars_size = input.chars_size(); + auto chars_size = input.chars_size(cudf::get_default_stream()); state.add_element_count(chars_size, "chars_size"); // number of bytes; state.add_global_memory_reads(chars_size); // all bytes are read; state.add_global_memory_writes(n_rows); // writes are BOOL8 diff --git a/cpp/benchmarks/string/repeat_strings.cpp b/cpp/benchmarks/string/repeat_strings.cpp index 92645524efb..f1d1516f248 100644 --- a/cpp/benchmarks/string/repeat_strings.cpp +++ b/cpp/benchmarks/string/repeat_strings.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -59,7 +59,7 @@ static void BM_repeat_strings_scalar_times(benchmark::State& state) cudf::strings::repeat_strings(strings_col, default_repeat_times); } - state.SetBytesProcessed(state.iterations() * strings_col.chars_size()); + state.SetBytesProcessed(state.iterations() * strings_col.chars_size(cudf::get_default_stream())); } static void BM_repeat_strings_column_times(benchmark::State& state) @@ -75,8 +75,8 @@ static void BM_repeat_strings_column_times(benchmark::State& state) cudf::strings::repeat_strings(strings_col, repeat_times_col); } - state.SetBytesProcessed(state.iterations() * - (strings_col.chars_size() + repeat_times_col.size() * sizeof(int32_t))); + state.SetBytesProcessed(state.iterations() * (strings_col.chars_size(cudf::get_default_stream()) + + repeat_times_col.size() * sizeof(int32_t))); } static void generate_bench_args(benchmark::internal::Benchmark* b) diff --git a/cpp/benchmarks/string/replace.cpp b/cpp/benchmarks/string/replace.cpp index 5ddf09f5cec..c8f26142193 100644 --- 
a/cpp/benchmarks/string/replace.cpp +++ b/cpp/benchmarks/string/replace.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -58,7 +58,7 @@ static void BM_replace(benchmark::State& state, replace_type rt) } } - state.SetBytesProcessed(state.iterations() * input.chars_size()); + state.SetBytesProcessed(state.iterations() * input.chars_size(cudf::get_default_stream())); } static void generate_bench_args(benchmark::internal::Benchmark* b) diff --git a/cpp/benchmarks/string/replace_re.cpp b/cpp/benchmarks/string/replace_re.cpp index b8efd76ab41..4dcf1314f83 100644 --- a/cpp/benchmarks/string/replace_re.cpp +++ b/cpp/benchmarks/string/replace_re.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -42,7 +42,7 @@ static void bench_replace(nvbench::state& state) auto program = cudf::strings::regex_program::create("(\\d+)"); - auto chars_size = input.chars_size(); + auto chars_size = input.chars_size(cudf::get_default_stream()); state.add_element_count(chars_size, "chars_size"); state.add_global_memory_reads(chars_size); state.add_global_memory_writes(chars_size); diff --git a/cpp/benchmarks/string/reverse.cpp b/cpp/benchmarks/string/reverse.cpp index 31cd4639115..a2676609a40 100644 --- a/cpp/benchmarks/string/reverse.cpp +++ b/cpp/benchmarks/string/reverse.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -40,7 +40,7 @@ static void bench_reverse(nvbench::state& state) state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); // gather some throughput statistics as well - auto chars_size = input.chars_size(); + auto chars_size = input.chars_size(cudf::get_default_stream()); state.add_element_count(chars_size, "chars_size"); // number of bytes; state.add_global_memory_reads(chars_size); // all bytes are read; state.add_global_memory_writes(chars_size); // all bytes are written diff --git a/cpp/benchmarks/string/slice.cpp b/cpp/benchmarks/string/slice.cpp index 6c1d7d98d3a..0f973a7c8b5 100644 --- a/cpp/benchmarks/string/slice.cpp +++ b/cpp/benchmarks/string/slice.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -58,7 +58,7 @@ static void BM_slice(benchmark::State& state, slice_type rt) } } - state.SetBytesProcessed(state.iterations() * input.chars_size()); + state.SetBytesProcessed(state.iterations() * input.chars_size(cudf::get_default_stream())); } static void generate_bench_args(benchmark::internal::Benchmark* b) diff --git a/cpp/benchmarks/string/split.cpp b/cpp/benchmarks/string/split.cpp index eb724fabfd1..9ef58daf0fc 100644 --- a/cpp/benchmarks/string/split.cpp +++ b/cpp/benchmarks/string/split.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -44,7 +44,7 @@ static void bench_split(nvbench::state& state) state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); // gather some throughput statistics as well - auto chars_size = input.chars_size(); + auto chars_size = input.chars_size(cudf::get_default_stream()); state.add_element_count(chars_size, "chars_size"); // number of bytes; state.add_global_memory_reads(chars_size); // all bytes are read; state.add_global_memory_writes(chars_size); // all bytes are written diff --git a/cpp/benchmarks/string/split_re.cpp b/cpp/benchmarks/string/split_re.cpp index 67aa6f0e008..1fdb6e67109 100644 --- a/cpp/benchmarks/string/split_re.cpp +++ b/cpp/benchmarks/string/split_re.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -44,7 +44,7 @@ static void bench_split(nvbench::state& state) state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); // gather some throughput statistics as well - auto chars_size = input.chars_size(); + auto chars_size = input.chars_size(cudf::get_default_stream()); state.add_element_count(chars_size, "chars_size"); // number of bytes; state.add_global_memory_reads(chars_size); // all bytes are read; state.add_global_memory_writes(chars_size); // all bytes are written diff --git a/cpp/benchmarks/string/translate.cpp b/cpp/benchmarks/string/translate.cpp index 00ca7459964..dc3c8c71488 100644 --- a/cpp/benchmarks/string/translate.cpp +++ b/cpp/benchmarks/string/translate.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -56,7 +56,7 @@ static void BM_translate(benchmark::State& state, int entry_count) cudf::strings::translate(input, entries); } - state.SetBytesProcessed(state.iterations() * input.chars_size()); + state.SetBytesProcessed(state.iterations() * input.chars_size(cudf::get_default_stream())); } static void generate_bench_args(benchmark::internal::Benchmark* b) diff --git a/cpp/benchmarks/string/url_decode.cu b/cpp/benchmarks/string/url_decode.cu index 9ede89bee43..b3aeb69e5ea 100644 --- a/cpp/benchmarks/string/url_decode.cu +++ b/cpp/benchmarks/string/url_decode.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -67,7 +67,7 @@ auto generate_column(cudf::size_type num_rows, cudf::size_type chars_per_row, do auto col_1a = cudf::test::strings_column_wrapper(strings.begin(), strings.end()); auto table_a = cudf::repeat(cudf::table_view{{col_1a}}, num_rows); auto result_col = std::move(table_a->release()[0]); // string column with num_rows aaa... 
- auto chars_col = result_col->child(cudf::strings_column_view::chars_column_index).mutable_view(); + auto chars_data = static_cast(result_col->mutable_view().head()); auto offset_col = result_col->child(cudf::strings_column_view::offsets_column_index).view(); auto engine = thrust::default_random_engine{}; @@ -75,7 +75,7 @@ auto generate_column(cudf::size_type num_rows, cudf::size_type chars_per_row, do thrust::make_zip_iterator(offset_col.begin(), offset_col.begin() + 1), num_rows, - url_string_generator{chars_col.begin(), esc_seq_chance, engine}); + url_string_generator{chars_data, esc_seq_chance, engine}); return result_col; } diff --git a/cpp/benchmarks/text/edit_distance.cpp b/cpp/benchmarks/text/edit_distance.cpp index 8a8bd9ae586..0a1ea52c415 100644 --- a/cpp/benchmarks/text/edit_distance.cpp +++ b/cpp/benchmarks/text/edit_distance.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -43,7 +43,8 @@ static void bench_edit_distance(nvbench::state& state) state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); - auto chars_size = input1.chars_size() + input2.chars_size(); + auto chars_size = + input1.chars_size(cudf::get_default_stream()) + input2.chars_size(cudf::get_default_stream()); state.add_global_memory_reads(chars_size); // output are integers (one per row) state.add_global_memory_writes(num_rows); diff --git a/cpp/benchmarks/text/hash_ngrams.cpp b/cpp/benchmarks/text/hash_ngrams.cpp index 5bbd2fc6819..3df0c61fc31 100644 --- a/cpp/benchmarks/text/hash_ngrams.cpp +++ b/cpp/benchmarks/text/hash_ngrams.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -43,7 +43,7 @@ static void bench_hash_ngrams(nvbench::state& state) state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); - auto chars_size = input.chars_size(); + auto chars_size = input.chars_size(cudf::get_default_stream()); state.add_global_memory_reads(chars_size); // output are hashes: approximate total number of hashes state.add_global_memory_writes(num_rows * ngrams); diff --git a/cpp/benchmarks/text/jaccard.cpp b/cpp/benchmarks/text/jaccard.cpp index 70470b829bd..60251c96096 100644 --- a/cpp/benchmarks/text/jaccard.cpp +++ b/cpp/benchmarks/text/jaccard.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ #include #include +#include #include @@ -44,9 +45,10 @@ static void bench_jaccard(nvbench::state& state) cudf::strings_column_view input1(input_table->view().column(0)); cudf::strings_column_view input2(input_table->view().column(1)); - state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + auto stream = cudf::get_default_stream(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); - auto chars_size = input1.chars_size() + input2.chars_size(); + auto chars_size = input1.chars_size(stream) + input2.chars_size(stream); state.add_global_memory_reads(chars_size); state.add_global_memory_writes(num_rows); diff --git a/cpp/benchmarks/text/minhash.cpp b/cpp/benchmarks/text/minhash.cpp index 1b60caa24de..d10d0d307d7 100644 --- a/cpp/benchmarks/text/minhash.cpp +++ b/cpp/benchmarks/text/minhash.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -52,7 +52,7 @@ static void bench_minhash(nvbench::state& state) state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); - auto chars_size = input.chars_size(); + auto chars_size = input.chars_size(cudf::get_default_stream()); state.add_global_memory_reads(chars_size); state.add_global_memory_writes(num_rows); // output are hashes diff --git a/cpp/benchmarks/text/ngrams.cpp b/cpp/benchmarks/text/ngrams.cpp index f3fd5cc5729..8e48f8e9a05 100644 --- a/cpp/benchmarks/text/ngrams.cpp +++ b/cpp/benchmarks/text/ngrams.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -46,7 +46,7 @@ static void BM_ngrams(benchmark::State& state, ngrams_type nt) } } - state.SetBytesProcessed(state.iterations() * input.chars_size()); + state.SetBytesProcessed(state.iterations() * input.chars_size(cudf::get_default_stream())); } static void generate_bench_args(benchmark::internal::Benchmark* b) diff --git a/cpp/benchmarks/text/normalize.cpp b/cpp/benchmarks/text/normalize.cpp index 6878fa4f8b6..71bccd80d39 100644 --- a/cpp/benchmarks/text/normalize.cpp +++ b/cpp/benchmarks/text/normalize.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -43,7 +43,7 @@ static void bench_normalize(nvbench::state& state) state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); - auto chars_size = input.chars_size(); + auto chars_size = input.chars_size(cudf::get_default_stream()); state.add_global_memory_reads(chars_size); state.add_global_memory_writes(chars_size); diff --git a/cpp/benchmarks/text/replace.cpp b/cpp/benchmarks/text/replace.cpp index 257f62aa728..767ebab3eee 100644 --- a/cpp/benchmarks/text/replace.cpp +++ b/cpp/benchmarks/text/replace.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -59,7 +59,7 @@ static void bench_replace(nvbench::state& state) state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); - auto chars_size = view.chars_size(); + auto chars_size = view.chars_size(cudf::get_default_stream()); state.add_global_memory_reads(chars_size); state.add_global_memory_writes(chars_size); diff --git a/cpp/benchmarks/text/tokenize.cpp b/cpp/benchmarks/text/tokenize.cpp index b556a84c541..2151b28d637 100644 --- a/cpp/benchmarks/text/tokenize.cpp +++ b/cpp/benchmarks/text/tokenize.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -46,7 +46,7 @@ static void bench_tokenize(nvbench::state& state) state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); - auto chars_size = input.chars_size(); + auto chars_size = input.chars_size(cudf::get_default_stream()); state.add_global_memory_reads(chars_size); state.add_global_memory_writes(chars_size); diff --git a/cpp/benchmarks/text/vocab.cpp b/cpp/benchmarks/text/vocab.cpp index 80942e2697d..770519294ad 100644 --- a/cpp/benchmarks/text/vocab.cpp +++ b/cpp/benchmarks/text/vocab.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -31,6 +31,7 @@ static void bench_vocab_tokenize(nvbench::state& state) { + auto const stream = cudf::get_default_stream(); auto const num_rows = static_cast(state.get_int64("num_rows")); auto const row_width = static_cast(state.get_int64("row_width")); @@ -63,16 +64,16 @@ static void bench_vocab_tokenize(nvbench::state& state) }(); auto const vocab = nvtext::load_vocabulary(cudf::strings_column_view(vocab_col->view())); - auto token_count = [input] { + auto token_count = [input, stream] { auto const counts = nvtext::count_tokens(input); auto const agg = cudf::make_sum_aggregation(); auto const count = cudf::reduce(counts->view(), *agg, counts->type()); - return static_cast*>(count.get()) - ->value(cudf::get_default_stream()); + return static_cast*>(count.get())->value(stream); }(); - state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); - auto chars_size = input.chars_size() + cudf::strings_column_view(vocab_col->view()).chars_size(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + auto chars_size = + input.chars_size(stream) + cudf::strings_column_view(vocab_col->view()).chars_size(stream); 
state.add_global_memory_reads(chars_size); state.add_global_memory_writes(token_count); diff --git a/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md index fc2f72de33c..c38151d7518 100644 --- a/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md +++ b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md @@ -1197,17 +1197,15 @@ This is related to [Arrow's "Variable-Size List" memory layout](https://arrow.ap ## Strings columns -Strings are represented in much the same way as lists, except that the data child column is always -a non-nullable column of `INT8` data. The parent column's type is `STRING` and contains no data, +Strings are represented as a column with a data device buffer and a child offsets column. +The parent column's type is `STRING` and its data holds all the characters across all the strings packed together but its size represents the number of strings in the column, and its null mask represents the validity of each string. To summarize, the strings column children are: 1. A non-nullable column of [`size_type`](#cudfsize_type) elements that indicates the offset to the beginning of each - string in a dense column of all characters. -2. A non-nullable column of `INT8` elements of all the characters across all the strings packed - together. + string in a dense data buffer of all characters. -With this representation, `characters[offsets[i]]` is the first character of string `i`, and the +With this representation, `data[offsets[i]]` is the first character of string `i`, and the size of string `i` is given by `offsets[i+1] - offsets[i]`. The following image shows an example of this compound column representation of strings. 
diff --git a/cpp/doxygen/developer_guide/strings.png b/cpp/doxygen/developer_guide/strings.png index 85ffef283b6..1d18ea8a407 100644 Binary files a/cpp/doxygen/developer_guide/strings.png and b/cpp/doxygen/developer_guide/strings.png differ diff --git a/cpp/examples/strings/custom_prealloc.cu b/cpp/examples/strings/custom_prealloc.cu index 0af4c47e947..5088ebd6168 100644 --- a/cpp/examples/strings/custom_prealloc.cu +++ b/cpp/examples/strings/custom_prealloc.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -101,7 +101,7 @@ std::unique_ptr redact_strings(cudf::column_view const& names, auto const offsets = scv.offsets_begin(); // create working memory to hold the output of each string - auto working_memory = rmm::device_uvector(scv.chars_size(), stream); + auto working_memory = rmm::device_uvector(scv.chars_size(stream), stream); // create a vector for the output strings' pointers auto str_ptrs = rmm::device_uvector(names.size(), stream); diff --git a/cpp/include/cudf/column/column_device_view.cuh b/cpp/include/cudf/column/column_device_view.cuh index daee443a5f3..19722d127cb 100644 --- a/cpp/include/cudf/column/column_device_view.cuh +++ b/cpp/include/cudf/column/column_device_view.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -445,7 +445,7 @@ class alignas(16) column_device_view : public detail::column_device_view_base { __device__ T element(size_type element_index) const noexcept { size_type index = element_index + offset(); // account for this view's _offset - char const* d_strings = d_children[strings_column_view::chars_column_index].data(); + char const* d_strings = static_cast(_data); auto const offsets = d_children[strings_column_view::offsets_column_index]; auto const itr = cudf::detail::input_offsetalator(offsets.head(), offsets.type()); auto const offset = itr[index]; diff --git a/cpp/include/cudf/column/column_factories.hpp b/cpp/include/cudf/column/column_factories.hpp index ce5772dcf3c..a6167d983c5 100644 --- a/cpp/include/cudf/column/column_factories.hpp +++ b/cpp/include/cudf/column/column_factories.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -462,9 +462,31 @@ std::unique_ptr make_strings_column( * nulls is used for interpreting this bitmask. * @return Constructed strings column */ +[[deprecated]] std::unique_ptr make_strings_column(size_type num_strings, + std::unique_ptr offsets_column, + std::unique_ptr chars_column, + size_type null_count, + rmm::device_buffer&& null_mask); +/** + * @brief Construct a STRING type column given offsets column, chars buffer, and null mask and null + * count. + * + * The offsets column, chars buffer, and mask are moved into the resulting strings column. + * + * @param num_strings The number of strings the column represents. + * @param offsets_column The column of offset values for this column. The number of elements is + * one more than the total number of strings so the `offset[last] - offset[0]` is the total number + * of bytes in the strings vector. + * @param chars_buffer The buffer of char bytes for all the strings for this column.
Individual + * strings are identified by the offsets and the nullmask. + * @param null_count The number of null string entries. + * @param null_mask The bits specifying the null strings in device memory. Arrow format for + * nulls is used for interpreting this bitmask. + * @return Constructed strings column + */ std::unique_ptr make_strings_column(size_type num_strings, std::unique_ptr offsets_column, - std::unique_ptr chars_column, + rmm::device_buffer&& chars_buffer, size_type null_count, rmm::device_buffer&& null_mask); diff --git a/cpp/include/cudf/io/types.hpp b/cpp/include/cudf/io/types.hpp index 47a48f2175b..3208a81cd63 100644 --- a/cpp/include/cudf/io/types.hpp +++ b/cpp/include/cudf/io/types.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -693,6 +693,11 @@ class column_in_metadata { column_in_metadata& set_output_as_binary(bool binary) noexcept { _output_as_binary = binary; + if (_output_as_binary and children.size() == 1) { + children.emplace_back(); + } else if (!_output_as_binary and children.size() == 2) { + children.pop_back(); + } return *this; } diff --git a/cpp/include/cudf/strings/strings_column_view.hpp b/cpp/include/cudf/strings/strings_column_view.hpp index e27d32fceb9..36054f7c229 100644 --- a/cpp/include/cudf/strings/strings_column_view.hpp +++ b/cpp/include/cudf/strings/strings_column_view.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -16,6 +16,7 @@ #pragma once #include +#include /** * @file @@ -58,7 +59,6 @@ class strings_column_view : private column_view { strings_column_view& operator=(strings_column_view&&) = default; static constexpr size_type offsets_column_index{0}; ///< Child index of the offsets column - static constexpr size_type chars_column_index{1}; ///< Child index of the characters column using column_view::has_nulls; using column_view::is_empty; @@ -106,10 +106,12 @@ class strings_column_view : private column_view { /** * @brief Returns the internal column of chars * - * @throw cudf::logic_error if this is an empty column + * @throw cudf::logic_error if this is an empty column + * @param stream CUDA stream used for device memory operations and kernel launches * @return The chars column */ - [[nodiscard]] column_view chars() const; + [[deprecated]] [[nodiscard]] column_view chars( + rmm::cuda_stream_view stream = cudf::get_default_stream()) const; /** * @brief Returns the number of bytes in the chars child column. @@ -117,9 +119,10 @@ class strings_column_view : private column_view { * This accounts for empty columns but does not reflect a sliced parent column * view (i.e.: non-zero offset or reduced row count). * + * @param stream CUDA stream used for device memory operations and kernel launches * @return Number of bytes in the chars child column */ - [[nodiscard]] size_type chars_size() const noexcept; + [[nodiscard]] size_type chars_size(rmm::cuda_stream_view stream) const noexcept; /** * @brief Return an iterator for the chars child column. @@ -128,11 +131,11 @@ class strings_column_view : private column_view { * The offsets child must be used to properly address the char bytes. * * For example, to access the first character of string `i` (accounting for - * a sliced column offset) use: `chars_begin()[offsets_begin()[i]]`. + * a sliced column offset) use: `chars_begin(stream)[offsets_begin()[i]]`. * * @return Iterator pointing to the first char byte.
*/ - [[nodiscard]] chars_iterator chars_begin() const; + [[nodiscard]] chars_iterator chars_begin(rmm::cuda_stream_view) const; /** * @brief Return an end iterator for the offsets child column. @@ -140,9 +143,10 @@ class strings_column_view : private column_view { * This does not apply the offset of the parent. * The offsets child must be used to properly address the char bytes. * + * @param stream CUDA stream used for device memory operations and kernel launches * @return Iterator pointing 1 past the last char byte. */ - [[nodiscard]] chars_iterator chars_end() const; + [[nodiscard]] chars_iterator chars_end(rmm::cuda_stream_view stream) const; }; //! Strings column APIs. diff --git a/cpp/include/cudf_test/column_utilities.hpp b/cpp/include/cudf_test/column_utilities.hpp index 6231f8207f9..49d5098f823 100644 --- a/cpp/include/cudf_test/column_utilities.hpp +++ b/cpp/include/cudf_test/column_utilities.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -226,15 +226,15 @@ template <> inline std::pair, std::vector> to_host(column_view c) { thrust::host_vector host_data(c.size()); + auto stream = cudf::get_default_stream(); if (c.size() > c.null_count()) { auto const scv = strings_column_view(c); auto const h_chars = cudf::detail::make_std_vector_sync( - cudf::device_span(scv.chars().data(), scv.chars().size()), - cudf::get_default_stream()); + cudf::device_span(scv.chars_begin(stream), scv.chars_size(stream)), stream); auto const h_offsets = cudf::detail::make_std_vector_sync( cudf::device_span(scv.offsets().data() + scv.offset(), scv.size() + 1), - cudf::get_default_stream()); + stream); // build std::string vector from chars and offsets std::transform( diff --git a/cpp/src/binaryop/compiled/binary_ops.cu b/cpp/src/binaryop/compiled/binary_ops.cu index 464c15dac9d..73ba15e39f3 100644 --- a/cpp/src/binaryop/compiled/binary_ops.cu +++ b/cpp/src/binaryop/compiled/binary_ops.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -86,11 +86,11 @@ scalar_as_column_view::return_type scalar_as_column_view::operator()(s.validity_data()), static_cast(!s.is_valid(stream)), 0, - {offsets_column->view(), chars_column_v}); + {offsets_column->view()}); return std::pair{col_v, std::move(offsets_column)}; } diff --git a/cpp/src/column/column_view.cpp b/cpp/src/column/column_view.cpp index 75722ede9d2..4d16298c605 100644 --- a/cpp/src/column/column_view.cpp +++ b/cpp/src/column/column_view.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -51,7 +51,9 @@ column_view_base::column_view_base(data_type type, CUDF_EXPECTS(nullptr == data, "EMPTY column should have no data."); CUDF_EXPECTS(nullptr == null_mask, "EMPTY column should have no null mask."); } else if (is_compound(type)) { - CUDF_EXPECTS(nullptr == data, "Compound (parent) columns cannot have data"); + if (type.id() != type_id::STRING) { + CUDF_EXPECTS(nullptr == data, "Compound (parent) columns cannot have data"); + } } else if (size > 0) { CUDF_EXPECTS(nullptr != data, "Null data pointer."); } diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index dd4af236ecf..54d0aa10353 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -502,23 +502,34 @@ std::pair buf_info_functor::operator() 0; + + // string columns contain the underlying chars data. + *current = src_buf_info(type_id::STRING, + nullptr, + offset_stack_pos, + // if I have an offsets child, its index will be my parent_offset_index + has_offsets_child ? ((current + 1) - head) : parent_offset_index, + false, + col.offset()); + + // if I have offsets, I need to include that in the stack size + offset_stack_pos += has_offsets_child ? 
offset_depth + 1 : offset_depth; current++; - offset_stack_pos += offset_depth; - // string columns don't necessarily have children - if (col.num_children() > 0) { - CUDF_EXPECTS(col.num_children() == 2, "Encountered malformed string column"); + if (has_offsets_child) { + CUDF_EXPECTS(col.num_children() == 1, "Encountered malformed string column"); strings_column_view scv(col); // info for the offsets buffer @@ -539,15 +550,6 @@ std::pair buf_info_functor::operator() build_output_column_metadata( }(); // size/data pointer for the column - auto const col_size = static_cast(current_info->num_elements); - int64_t const data_offset = src.num_children() > 0 || col_size == 0 || src.head() == nullptr - ? -1 - : static_cast(current_info->dst_offset); + auto const col_size = [&]() { + // if I am a string column, I need to use the number of rows from my child offset column. the + // number of rows in my dst_buf_info struct will be equal to the number of chars, which is + // incorrect. this is a quirk of how cudf stores strings. + if (src.type().id() == type_id::STRING) { + // if I have no children (no offsets), then I must have a row count of 0 + if (src.num_children() == 0) { return 0; } + + // otherwise my actual number of rows will be the num_rows field of the next dst_buf_info + // struct (our child offsets column) + return (current_info + 1)->num_rows; + } + + // otherwise the number of rows is the number of elements + return static_cast(current_info->num_elements); + }(); + int64_t const data_offset = + col_size == 0 || src.head() == nullptr ? 
-1 : static_cast(current_info->dst_offset); mb.add_column_info_to_meta( src.type(), col_size, null_count, data_offset, bitmask_offset, src.num_children()); @@ -902,11 +918,19 @@ struct dst_valid_count_output_iterator { */ struct size_of_helper { template - constexpr std::enable_if_t(), int> __device__ operator()() const + constexpr std::enable_if_t() && !std::is_same_v, int> + __device__ operator()() const { return 0; } + template + constexpr std::enable_if_t() && std::is_same_v, int> + __device__ operator()() const + { + return sizeof(cudf::device_storage_type_t); + } + template constexpr std::enable_if_t(), int> __device__ operator()() const noexcept { @@ -1236,7 +1260,7 @@ std::unique_ptr compute_splits( } // final element indices and row count - int const out_element_index = src_info.is_validity ? row_start / 32 : row_start; + int const src_element_index = src_info.is_validity ? row_start / 32 : row_start; int const num_rows = row_end - row_start; // if I am an offsets column, all my values need to be shifted int const value_shift = src_info.offsets == nullptr ? 0 : src_info.offsets[row_start]; @@ -1259,7 +1283,7 @@ std::unique_ptr compute_splits( num_elements, element_size, num_rows, - out_element_index, + src_element_index, 0, value_shift, bit_shift, diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu index 3a9fe50d25b..04ca1250ed5 100644 --- a/cpp/src/interop/to_arrow.cu +++ b/cpp/src/interop/to_arrow.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -25,6 +25,8 @@ #include #include #include +#include +#include #include #include #include @@ -49,16 +51,16 @@ namespace { * @brief Create arrow data buffer from given cudf column */ template -std::shared_ptr fetch_data_buffer(column_view input_view, +std::shared_ptr fetch_data_buffer(device_span input, arrow::MemoryPool* ar_mr, rmm::cuda_stream_view stream) { - int64_t const data_size_in_bytes = sizeof(T) * input_view.size(); + int64_t const data_size_in_bytes = sizeof(T) * input.size(); auto data_buffer = allocate_arrow_buffer(data_size_in_bytes, ar_mr); CUDF_CUDA_TRY(cudaMemcpyAsync(data_buffer->mutable_data(), - input_view.data(), + input.data(), data_size_in_bytes, cudaMemcpyDefault, stream.value())); @@ -136,11 +138,13 @@ struct dispatch_to_arrow { arrow::MemoryPool* ar_mr, rmm::cuda_stream_view stream) { - return to_arrow_array(id, - static_cast(input_view.size()), - fetch_data_buffer(input_view, ar_mr, stream), - fetch_mask_buffer(input_view, ar_mr, stream), - static_cast(input_view.null_count())); + return to_arrow_array( + id, + static_cast(input_view.size()), + fetch_data_buffer( + device_span(input_view.data(), input_view.size()), ar_mr, stream), + fetch_mask_buffer(input_view, ar_mr, stream), + static_cast(input_view.null_count())); } }; @@ -280,7 +284,7 @@ std::shared_ptr dispatch_to_arrow::operator()( { std::unique_ptr tmp_column = ((input.offset() != 0) or - ((input.num_children() == 2) and (input.child(0).size() - 1 != input.size()))) + ((input.num_children() == 1) and (input.child(0).size() - 1 != input.size()))) ? 
std::make_unique(input, stream) : nullptr; @@ -295,8 +299,13 @@ std::shared_ptr dispatch_to_arrow::operator()( return std::make_shared( 0, std::move(tmp_offset_buffer), std::move(tmp_data_buffer)); } - auto offset_buffer = child_arrays[0]->data()->buffers[1]; - auto data_buffer = child_arrays[1]->data()->buffers[1]; + auto offset_buffer = child_arrays[strings_column_view::offsets_column_index]->data()->buffers[1]; + auto const sview = strings_column_view{input_view}; + auto data_buffer = fetch_data_buffer( + device_span{sview.chars_begin(stream), + static_cast(sview.chars_size(stream))}, + ar_mr, + stream); return std::make_shared(static_cast(input_view.size()), offset_buffer, data_buffer, diff --git a/cpp/src/io/csv/writer_impl.cu b/cpp/src/io/csv/writer_impl.cu index aad761acdba..995d8d942c9 100644 --- a/cpp/src/io/csv/writer_impl.cu +++ b/cpp/src/io/csv/writer_impl.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -377,8 +377,8 @@ void write_chunked(data_sink* out_sink, rmm::mr::get_current_device_resource()); strings_column_view strings_column{p_str_col_w_nl->view()}; - auto total_num_bytes = strings_column.chars_size(); - char const* ptr_all_bytes = strings_column.chars_begin(); + auto total_num_bytes = strings_column.chars_size(stream); + char const* ptr_all_bytes = strings_column.chars_begin(stream); if (out_sink->is_device_write_preferred(total_num_bytes)) { // Direct write from device memory diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index 056cce18a52..f1296daca26 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -363,7 +363,7 @@ std::vector copy_strings_to_host(device_span input, if (col.is_empty()) return std::vector{}; auto const scv = cudf::strings_column_view(col); auto const h_chars = cudf::detail::make_std_vector_sync( - cudf::device_span(scv.chars().data(), scv.chars().size()), stream); + cudf::device_span(scv.chars_begin(stream), scv.chars_size(stream)), stream); auto const h_offsets = cudf::detail::make_std_vector_sync( cudf::device_span(scv.offsets().data() + scv.offset(), scv.size() + 1), diff --git a/cpp/src/io/json/write_json.cu b/cpp/src/io/json/write_json.cu index b2017ee513f..c35f15049bd 100644 --- a/cpp/src/io/json/write_json.cu +++ b/cpp/src/io/json/write_json.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -347,10 +347,13 @@ std::unique_ptr struct_to_strings(table_view const& strings_columns, d_strview_offsets + row_string_offsets.size(), old_offsets.begin(), row_string_offsets.begin()); + auto chars_data = joined_col->release().data; + auto const chars_size = chars_data->size(); return make_strings_column( strings_columns.num_rows(), std::make_unique(std::move(row_string_offsets), rmm::device_buffer{}, 0), - std::move(joined_col->release().children[strings_column_view::chars_column_index]), + std::make_unique( + data_type{type_id::INT8}, chars_size, std::move(*chars_data), rmm::device_buffer{}, 0), 0, {}); } @@ -469,10 +472,13 @@ std::unique_ptr join_list_of_strings(lists_column_view const& lists_stri d_strview_offsets.end(), old_offsets.begin(), row_string_offsets.begin()); + auto chars_data = joined_col->release().data; + auto const chars_size = chars_data->size(); return make_strings_column( num_lists, std::make_unique(std::move(row_string_offsets), rmm::device_buffer{}, 0), - std::move(joined_col->release().children[strings_column_view::chars_column_index]), + std::make_unique( + data_type{type_id::INT8}, chars_size, std::move(*chars_data), rmm::device_buffer{}, 0), lists_strings.null_count(), cudf::detail::copy_bitmask(lists_strings.parent(), stream, mr)); } @@ -812,8 +818,8 @@ void write_chunked(data_sink* out_sink, CUDF_FUNC_RANGE(); CUDF_EXPECTS(str_column_view.size() > 0, "Unexpected empty strings column."); - auto const total_num_bytes = str_column_view.chars_size() - skip_last_chars; - char const* ptr_all_bytes = str_column_view.chars_begin(); + auto const total_num_bytes = str_column_view.chars_size(stream) - skip_last_chars; + char const* ptr_all_bytes = str_column_view.chars_begin(stream); if (out_sink->is_device_write_preferred(total_num_bytes)) { // Direct write from device memory diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 279a814a4e1..90f52c0ee70 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ 
b/cpp/src/io/parquet/writer_impl.cu @@ -609,10 +609,10 @@ std::vector construct_schema_tree( // column that isn't a single-depth list the code will throw. if (col_meta.is_enabled_output_as_binary() && is_last_list_child(col)) { CUDF_EXPECTS(col_meta.num_children() == 2 or col_meta.num_children() == 0, - "Binary column's corresponding metadata should have zero or two children!"); + "Binary column's corresponding metadata should have zero or two children"); if (col_meta.num_children() > 0) { CUDF_EXPECTS(col->children[lists_column_view::child_column_index]->children.empty(), - "Binary column must not be nested!"); + "Binary column must not be nested"); } schema_tree_node col_schema{}; @@ -734,8 +734,13 @@ std::vector construct_schema_tree( } else { // if leaf, add current if (col->type().id() == type_id::STRING) { - CUDF_EXPECTS(col_meta.num_children() == 2 or col_meta.num_children() == 0, - "String column's corresponding metadata should have zero or two children"); + if (col_meta.is_enabled_output_as_binary()) { + CUDF_EXPECTS(col_meta.num_children() == 2 or col_meta.num_children() == 0, + "Binary column's corresponding metadata should have zero or two children"); + } else { + CUDF_EXPECTS(col_meta.num_children() == 1 or col_meta.num_children() == 0, + "String column's corresponding metadata should have zero or one children"); + } } else { CUDF_EXPECTS(col_meta.num_children() == 0, "Leaf column's corresponding metadata cannot have children"); diff --git a/cpp/src/io/utilities/column_buffer.cpp b/cpp/src/io/utilities/column_buffer.cpp index 1cbd5929525..88617510394 100644 --- a/cpp/src/io/utilities/column_buffer.cpp +++ b/cpp/src/io/utilities/column_buffer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -161,7 +161,6 @@ std::unique_ptr make_column(column_buffer_base& buffer, if (schema.value_or(reader_column_schema{}).is_enabled_convert_binary_to_strings()) { if (schema_info != nullptr) { schema_info->children.push_back(column_name_info{"offsets"}); - schema_info->children.push_back(column_name_info{"chars"}); } // make_strings_column allocates new memory, it does not simply move @@ -177,12 +176,11 @@ std::unique_ptr make_column(column_buffer_base& buffer, auto col_content = string_col->release(); // convert to uint8 column, strings are currently stored as int8 - auto contents = - col_content.children[strings_column_view::chars_column_index].release()->release(); - auto data = contents.data.release(); + auto data = col_content.data.release(); + auto char_size = data->size(); auto uint8_col = std::make_unique( - data_type{type_id::UINT8}, data->size(), std::move(*data), rmm::device_buffer{}, 0); + data_type{type_id::UINT8}, char_size, std::move(*data), rmm::device_buffer{}, 0); if (schema_info != nullptr) { schema_info->children.push_back(column_name_info{"offsets"}); diff --git a/cpp/src/lists/interleave_columns.cu b/cpp/src/lists/interleave_columns.cu index cbc99fcdb83..7b37e2dc8f6 100644 --- a/cpp/src/lists/interleave_columns.cu +++ b/cpp/src/lists/interleave_columns.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -193,8 +193,7 @@ struct compute_string_sizes_and_interleave_lists_fn { auto const start_byte = str_offsets[start_str_idx]; auto const end_byte = str_offsets[end_str_idx]; if (start_byte < end_byte) { - auto const input_ptr = - str_col.child(strings_column_view::chars_column_index).template data() + start_byte; + auto const input_ptr = str_col.template head() + start_byte; auto const output_ptr = d_chars + d_offsets[write_idx]; thrust::copy(thrust::seq, input_ptr, input_ptr + end_byte - start_byte, output_ptr); } diff --git a/cpp/src/reshape/byte_cast.cu b/cpp/src/reshape/byte_cast.cu index 5f89b6d9b3b..6ed28e693fd 100644 --- a/cpp/src/reshape/byte_cast.cu +++ b/cpp/src/reshape/byte_cast.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -135,10 +135,9 @@ struct byte_list_conversion_fn(input, stream, mr)->release(); - auto chars_contents = col_content.children[strings_column_view::chars_column_index]->release(); - auto const num_chars = chars_contents.data->size(); + auto const num_chars = col_content.data->size(); auto uint8_col = std::make_unique( - output_type, num_chars, std::move(*(chars_contents.data)), rmm::device_buffer{}, 0); + output_type, num_chars, std::move(*(col_content.data)), rmm::device_buffer{}, 0); auto result = make_lists_column( input.size(), diff --git a/cpp/src/strings/attributes.cu b/cpp/src/strings/attributes.cu index de51356845c..00e49f9d97e 100644 --- a/cpp/src/strings/attributes.cu +++ b/cpp/src/strings/attributes.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -170,7 +170,8 @@ std::unique_ptr count_characters(strings_column_view const& input, rmm::mr::device_memory_resource* mr) { if ((input.size() == input.null_count()) || - ((input.chars_size() / (input.size() - input.null_count())) < AVG_CHAR_BYTES_THRESHOLD)) { + ((input.chars_size(stream) / (input.size() - input.null_count())) < + AVG_CHAR_BYTES_THRESHOLD)) { auto ufn = cuda::proclaim_return_type( [] __device__(string_view const& d_str) { return d_str.length(); }); return counts_fn(input, ufn, stream, mr); diff --git a/cpp/src/strings/case.cu b/cpp/src/strings/case.cu index 8f4c2ee574a..c2e8033b42d 100644 --- a/cpp/src/strings/case.cu +++ b/cpp/src/strings/case.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -33,6 +33,7 @@ #include #include +#include namespace cudf { namespace strings { @@ -211,7 +212,7 @@ std::unique_ptr convert_case(strings_column_view const& input, upper_lower_fn converter{ccfn, *d_strings}; // For smaller strings, use the regular string-parallel algorithm - if ((input.chars_size() / (input.size() - input.null_count())) < AVG_CHAR_BYTES_THRESHOLD) { + if ((input.chars_size(stream) / (input.size() - input.null_count())) < AVG_CHAR_BYTES_THRESHOLD) { auto [offsets, chars] = cudf::strings::detail::make_strings_children(converter, input.size(), stream, mr); return make_strings_column(input.size(), @@ -227,16 +228,16 @@ std::unique_ptr convert_case(strings_column_view const& input, // but results in a large performance gain when the input contains only single-byte characters. 
// The count_if is faster than any_of or all_of: https://github.com/NVIDIA/thrust/issues/1016 bool const multi_byte_chars = - thrust::count_if( - rmm::exec_policy(stream), input.chars_begin(), input.chars_end(), [] __device__(auto chr) { - return is_utf8_continuation_char(chr); - }) > 0; + thrust::count_if(rmm::exec_policy(stream), + input.chars_begin(stream), + input.chars_end(stream), + cuda::proclaim_return_type( + [] __device__(auto chr) { return is_utf8_continuation_char(chr); })) > 0; if (!multi_byte_chars) { // optimization for ASCII-only case: copy the input column and inplace replace each character - auto result = std::make_unique(input.parent(), stream, mr); - auto d_chars = - result->mutable_view().child(strings_column_view::chars_column_index).data(); - auto const chars_size = strings_column_view(result->view()).chars_size(); + auto result = std::make_unique(input.parent(), stream, mr); + auto d_chars = result->mutable_view().head(); + auto const chars_size = strings_column_view(result->view()).chars_size(stream); thrust::transform( rmm::exec_policy(stream), d_chars, d_chars + chars_size, d_chars, ascii_converter_fn{ccfn}); result->set_null_count(input.null_count()); diff --git a/cpp/src/strings/combine/join.cu b/cpp/src/strings/combine/join.cu index 9ab527feaf8..48304759f7a 100644 --- a/cpp/src/strings/combine/join.cu +++ b/cpp/src/strings/combine/join.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -145,15 +145,19 @@ std::unique_ptr join_strings(strings_column_view const& input, auto chars_column = [&] { // build the strings column and commandeer the chars column if ((input.size() == input.null_count()) || - ((input.chars_size() / (input.size() - input.null_count())) <= AVG_CHAR_BYTES_THRESHOLD)) { + ((input.chars_size(stream) / (input.size() - input.null_count())) <= + AVG_CHAR_BYTES_THRESHOLD)) { return std::get<1>( make_strings_children(join_fn{*d_strings, d_separator, d_narep}, input.size(), stream, mr)); } // dynamically feeds index pairs to build the output auto indices = cudf::detail::make_counting_transform_iterator( 0, join_gather_fn{*d_strings, d_separator, d_narep}); - auto joined_col = make_strings_column(indices, indices + (input.size() * 2), stream, mr); - return std::move(joined_col->release().children.back()); + auto joined_col = make_strings_column(indices, indices + (input.size() * 2), stream, mr); + auto chars_data = joined_col->release().data; + auto const chars_size = chars_data->size(); + return std::make_unique( + data_type{type_id::INT8}, chars_size, std::move(*chars_data), rmm::device_buffer{}, 0); }(); // build the offsets: single string output has offsets [0,chars-size] diff --git a/cpp/src/strings/copying/concatenate.cu b/cpp/src/strings/copying/concatenate.cu index 027466ef13c..2d9b06183e2 100644 --- a/cpp/src/strings/copying/concatenate.cu +++ b/cpp/src/strings/copying/concatenate.cu @@ -192,8 +192,7 @@ __global__ void fused_concatenate_string_chars_kernel(column_device_view const* auto const input_offsets_data = cudf::detail::input_offsetalator(offsets_child.head(), offsets_child.type()); - constexpr auto chars_child = strings_column_view::chars_column_index; - auto const* input_chars_data = input_view.child(chars_child).data(); + auto const* input_chars_data = input_view.head(); auto const first_char = input_offsets_data[input_view.offset()]; output_data[output_index] = input_chars_data[offset_index + first_char]; @@ -287,12 
+286,11 @@ std::unique_ptr concatenate(host_span columns, continue; // empty column may not have children size_type column_offset = column->offset(); column_view offsets_child = column->child(strings_column_view::offsets_column_index); - column_view chars_child = column->child(strings_column_view::chars_column_index); auto const bytes_offset = get_offset_value(offsets_child, column_offset, stream); auto const bytes_end = get_offset_value(offsets_child, column_size + column_offset, stream); // copy the chars column data - auto d_chars = chars_child.data() + bytes_offset; + auto d_chars = column->head() + bytes_offset; auto const bytes = bytes_end - bytes_offset; CUDF_CUDA_TRY( diff --git a/cpp/src/strings/copying/copying.cu b/cpp/src/strings/copying/copying.cu index 2295a80ff5b..4f37d3864ac 100644 --- a/cpp/src/strings/copying/copying.cu +++ b/cpp/src/strings/copying/copying.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -22,6 +22,7 @@ #include #include +#include #include #include @@ -65,12 +66,10 @@ std::unique_ptr copy_slice(strings_column_view const& strings, } // slice the chars child column - auto const data_size = - cudf::detail::get_value(offsets_column->view(), strings_count, stream); - auto chars_column = std::make_unique( - cudf::detail::slice(strings.chars(), {chars_offset, chars_offset + data_size}, stream).front(), - stream, - mr); + auto const data_size = static_cast( + cudf::detail::get_value(offsets_column->view(), strings_count, stream)); + auto chars_buffer = + rmm::device_buffer{strings.chars_begin(stream) + chars_offset, data_size, stream, mr}; // slice the null mask auto null_mask = cudf::detail::copy_bitmask( @@ -81,7 +80,7 @@ std::unique_ptr copy_slice(strings_column_view const& strings, return make_strings_column(strings_count, std::move(offsets_column), - std::move(chars_column), + std::move(chars_buffer), null_count, std::move(null_mask)); } diff --git a/cpp/src/strings/copying/shift.cu b/cpp/src/strings/copying/shift.cu index b54c433c23d..3b798a87d54 100644 --- a/cpp/src/strings/copying/shift.cu +++ b/cpp/src/strings/copying/shift.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -69,8 +69,7 @@ struct shift_chars_fn { auto const first_index = offset + d_column.child(strings_column_view::offsets_column_index) .element(d_column.offset() + d_column.size()); - return d_column.child(strings_column_view::chars_column_index) - .element(idx + first_index); + return d_column.head()[idx + first_index]; } else { auto const char_index = idx - last_index; return d_filler.data()[char_index % d_filler.size_bytes()]; @@ -79,10 +78,9 @@ struct shift_chars_fn { if (idx < offset) { return d_filler.data()[idx % d_filler.size_bytes()]; } else { - return d_column.child(strings_column_view::chars_column_index) - .element(idx - offset + - d_column.child(strings_column_view::offsets_column_index) - .element(d_column.offset())); + return d_column.head()[idx - offset + + d_column.child(strings_column_view::offsets_column_index) + .element(d_column.offset())]; } } } diff --git a/cpp/src/strings/replace/multi.cu b/cpp/src/strings/replace/multi.cu index 28736c2ca15..a0f9d1136f3 100644 --- a/cpp/src/strings/replace/multi.cu +++ b/cpp/src/strings/replace/multi.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -74,10 +74,7 @@ using target_pair = thrust::pair; * @brief Helper functions for performing character-parallel replace */ struct replace_multi_parallel_fn { - __device__ char const* get_base_ptr() const - { - return d_strings.child(strings_column_view::chars_column_index).data(); - } + __device__ char const* get_base_ptr() const { return d_strings.head(); } __device__ size_type const* get_offsets_ptr() const { @@ -378,6 +375,11 @@ std::unique_ptr replace_character_parallel(strings_column_view const& in // use this utility to gather the string parts into a contiguous chars column auto chars = make_strings_column(indices.begin(), indices.end(), stream, mr); + // TODO ideally we can pass this chars_data as rmm buffer to make_strings_column + auto chars_data = chars->release().data; + auto const chars_size = chars_data->size(); + auto chars_col = std::make_unique( + data_type{type_id::INT8}, chars_size, std::move(*chars_data), rmm::device_buffer{}, 0); // create offsets from the sizes offsets = @@ -386,7 +388,7 @@ std::unique_ptr replace_character_parallel(strings_column_view const& in // build the strings columns from the chars and offsets return make_strings_column(strings_count, std::move(offsets), - std::move(chars->release().children.back()), + std::move(chars_col), input.null_count(), cudf::detail::copy_bitmask(input.parent(), stream, mr)); } @@ -483,7 +485,8 @@ std::unique_ptr replace(strings_column_view const& input, CUDF_EXPECTS(repls.size() == targets.size(), "Sizes for targets and repls must match"); return (input.size() == input.null_count() || - ((input.chars_size() / (input.size() - input.null_count())) < AVG_CHAR_BYTES_THRESHOLD)) + ((input.chars_size(stream) / (input.size() - input.null_count())) < + AVG_CHAR_BYTES_THRESHOLD)) ? 
replace_string_parallel(input, targets, repls, stream, mr) : replace_character_parallel(input, targets, repls, stream, mr); } diff --git a/cpp/src/strings/replace/replace.cu b/cpp/src/strings/replace/replace.cu index aa955d3086e..8c4bd4490b9 100644 --- a/cpp/src/strings/replace/replace.cu +++ b/cpp/src/strings/replace/replace.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -415,7 +415,7 @@ std::unique_ptr replace_char_parallel(strings_column_view const& strings auto const strings_count = strings.size(); auto const offset_count = strings_count + 1; auto const d_offsets = strings.offsets_begin(); - auto const d_in_chars = strings.chars_begin(); + auto const d_in_chars = strings.chars_begin(stream); auto const chars_bytes = chars_end - chars_start; auto const target_size = d_target.size_bytes(); @@ -574,7 +574,7 @@ std::unique_ptr replace(strings_column_view con ? 0 : cudf::detail::get_value(strings.offsets(), strings.offset(), stream); size_type const chars_end = (offset_count == strings.offsets().size()) - ? strings.chars_size() + ? strings.chars_size(stream) : cudf::detail::get_value( strings.offsets(), strings.offset() + strings_count, stream); size_type const chars_bytes = chars_end - chars_start; @@ -612,7 +612,7 @@ std::unique_ptr replace( : cudf::detail::get_value( strings.offsets(), strings.offset(), stream); size_type chars_end = (offset_count == strings.offsets().size()) - ? strings.chars_size() + ? 
strings.chars_size(stream) : cudf::detail::get_value( strings.offsets(), strings.offset() + strings_count, stream); return replace_char_parallel( diff --git a/cpp/src/strings/reverse.cu b/cpp/src/strings/reverse.cu index 2855bdbb827..aecb029f25f 100644 --- a/cpp/src/strings/reverse.cu +++ b/cpp/src/strings/reverse.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -65,7 +65,7 @@ std::unique_ptr reverse(strings_column_view const& input, auto result = std::make_unique(input.parent(), stream, mr); auto const d_offsets = result->view().child(strings_column_view::offsets_column_index).data(); - auto d_chars = result->mutable_view().child(strings_column_view::chars_column_index).data(); + auto d_chars = result->mutable_view().head(); auto const d_column = column_device_view::create(input.parent(), stream); thrust::for_each_n(rmm::exec_policy(stream), diff --git a/cpp/src/strings/search/find.cu b/cpp/src/strings/search/find.cu index d35f512e0f7..4ba1359c469 100644 --- a/cpp/src/strings/search/find.cu +++ b/cpp/src/strings/search/find.cu @@ -186,7 +186,7 @@ void find_utility(strings_column_view const& input, { auto d_strings = column_device_view::create(input.parent(), stream); auto d_results = output.mutable_view().data(); - if ((input.chars_size() / (input.size() - input.null_count())) > AVG_CHAR_BYTES_THRESHOLD) { + if ((input.chars_size(stream) / (input.size() - input.null_count())) > AVG_CHAR_BYTES_THRESHOLD) { // warp-per-string runs faster for longer strings (but not shorter ones) constexpr int block_size = 256; cudf::detail::grid_1d grid{input.size() * cudf::detail::warp_size, block_size}; @@ -538,7 +538,7 @@ std::unique_ptr contains(strings_column_view const& input, { // use warp parallel when the average string width is greater than the threshold if 
((input.null_count() < input.size()) && - ((input.chars_size() / input.size()) > AVG_CHAR_BYTES_THRESHOLD)) { + ((input.chars_size(stream) / input.size()) > AVG_CHAR_BYTES_THRESHOLD)) { return contains_warp_parallel(input, target, stream, mr); } diff --git a/cpp/src/strings/split/split.cuh b/cpp/src/strings/split/split.cuh index dc0b04af388..c5fb44fc3dd 100644 --- a/cpp/src/strings/split/split.cuh +++ b/cpp/src/strings/split/split.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -49,10 +49,7 @@ namespace cudf::strings::detail { */ template struct base_split_tokenizer { - __device__ char const* get_base_ptr() const - { - return d_strings.child(strings_column_view::chars_column_index).data(); - } + __device__ char const* get_base_ptr() const { return d_strings.head(); } __device__ string_view const get_string(size_type idx) const { diff --git a/cpp/src/strings/strings_column_factories.cu b/cpp/src/strings/strings_column_factories.cu index 0b55e18b00a..5ba4d8d3132 100644 --- a/cpp/src/strings/strings_column_factories.cu +++ b/cpp/src/strings/strings_column_factories.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -131,10 +131,33 @@ std::unique_ptr make_strings_column(size_type num_strings, std::vector> children; children.emplace_back(std::move(offsets_column)); - children.emplace_back(std::move(chars_column)); return std::make_unique(data_type{type_id::STRING}, num_strings, - rmm::device_buffer{}, + std::move(*(chars_column->release().data.release())), + std::move(null_mask), + null_count, + std::move(children)); +} + +std::unique_ptr make_strings_column(size_type num_strings, + std::unique_ptr offsets_column, + rmm::device_buffer&& chars_buffer, + size_type null_count, + rmm::device_buffer&& null_mask) +{ + CUDF_FUNC_RANGE(); + + if (null_count > 0) { CUDF_EXPECTS(null_mask.size() > 0, "Column with nulls must be nullable."); } + CUDF_EXPECTS(num_strings == offsets_column->size() - 1, + "Invalid offsets column size for strings column."); + CUDF_EXPECTS(offsets_column->null_count() == 0, "Offsets column should not contain nulls"); + + std::vector> children; + children.emplace_back(std::move(offsets_column)); + + return std::make_unique(data_type{type_id::STRING}, + num_strings, + std::move(chars_buffer), std::move(null_mask), null_count, std::move(children)); @@ -151,7 +174,6 @@ std::unique_ptr make_strings_column(size_type num_strings, if (num_strings == 0) { return make_empty_column(type_id::STRING); } auto const offsets_size = static_cast(offsets.size()); - auto const chars_size = static_cast(chars.size()); if (null_count > 0) CUDF_EXPECTS(null_mask.size() > 0, "Column with nulls must be nullable."); @@ -164,21 +186,13 @@ std::unique_ptr make_strings_column(size_type num_strings, rmm::device_buffer(), 0); - auto chars_column = std::make_unique( // - data_type{type_id::INT8}, - chars_size, - chars.release(), - rmm::device_buffer(), - 0); - auto children = std::vector>(); children.emplace_back(std::move(offsets_column)); - children.emplace_back(std::move(chars_column)); return std::make_unique(data_type{type_id::STRING}, num_strings, - rmm::device_buffer{}, + 
chars.release(), std::move(null_mask), null_count, std::move(children)); diff --git a/cpp/src/strings/strings_column_view.cpp b/cpp/src/strings/strings_column_view.cpp index 4b206666d4b..27a8c6fb17f 100644 --- a/cpp/src/strings/strings_column_view.cpp +++ b/cpp/src/strings/strings_column_view.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,9 +14,12 @@ * limitations under the License. */ +#include #include #include +#include + namespace cudf { // strings_column_view::strings_column_view(column_view strings_column) : column_view(strings_column) @@ -42,26 +45,28 @@ strings_column_view::offset_iterator strings_column_view::offsets_end() const return offsets_begin() + size() + 1; } -column_view strings_column_view::chars() const +column_view strings_column_view::chars(rmm::cuda_stream_view stream) const { CUDF_EXPECTS(num_children() > 0, "strings column has no children"); - return child(chars_column_index); + return column_view( + data_type{type_id::INT8}, chars_size(stream), chars_begin(stream), nullptr, 0, 0); } -size_type strings_column_view::chars_size() const noexcept +size_type strings_column_view::chars_size(rmm::cuda_stream_view stream) const noexcept { if (size() == 0) return 0; - return chars().size(); + return detail::get_value(offsets(), offsets().size() - 1, stream); } -strings_column_view::chars_iterator strings_column_view::chars_begin() const +strings_column_view::chars_iterator strings_column_view::chars_begin(rmm::cuda_stream_view) const { - return chars().begin(); + return head(); } -strings_column_view::chars_iterator strings_column_view::chars_end() const +strings_column_view::chars_iterator strings_column_view::chars_end( + rmm::cuda_stream_view stream) const { - return chars_begin() + chars_size(); + return chars_begin(stream) + 
chars_size(stream); } } // namespace cudf diff --git a/cpp/src/strings/wrap.cu b/cpp/src/strings/wrap.cu index aa87a663964..19f1ac55bb0 100644 --- a/cpp/src/strings/wrap.cu +++ b/cpp/src/strings/wrap.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -26,6 +26,7 @@ #include #include +#include #include #include @@ -111,8 +112,11 @@ std::unique_ptr wrap(strings_column_view const& strings, auto offsets_column = std::make_unique(strings.offsets(), stream, mr); // makes a copy auto d_new_offsets = offsets_column->view().template data(); - auto chars_column = std::make_unique(strings.chars(), stream, mr); // makes a copy - auto d_chars = chars_column->mutable_view().data(); + auto chars_buffer = rmm::device_buffer{strings.chars_begin(stream), + static_cast(strings.chars_size(stream)), + stream, + mr}; // makes a copy + auto d_chars = static_cast(chars_buffer.data()); device_execute_functor d_execute_fctr{d_column, d_new_offsets, d_chars, width}; @@ -123,7 +127,7 @@ std::unique_ptr wrap(strings_column_view const& strings, return make_strings_column(strings_count, std::move(offsets_column), - std::move(chars_column), + std::move(chars_buffer), null_count, std::move(null_mask)); } diff --git a/cpp/src/text/bpe/byte_pair_encoding.cu b/cpp/src/text/bpe/byte_pair_encoding.cu index 2d53faf548e..a697df913d3 100644 --- a/cpp/src/text/bpe/byte_pair_encoding.cu +++ b/cpp/src/text/bpe/byte_pair_encoding.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -342,7 +342,7 @@ std::unique_ptr byte_pair_encoding(cudf::strings_column_view const rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - if (input.is_empty() || input.chars_size() == 0) { + if (input.is_empty() || input.chars_size(stream) == 0) { return cudf::make_empty_column(cudf::type_id::STRING); } @@ -356,11 +356,11 @@ std::unique_ptr byte_pair_encoding(cudf::strings_column_view const : cudf::detail::get_value( input.offsets(), input.offset(), stream); auto const last_offset = (input.offset() == 0 && input.size() == input.offsets().size() - 1) - ? input.chars().size() + ? input.chars_size(stream) : cudf::detail::get_value( input.offsets(), input.size() + input.offset(), stream); auto const chars_size = last_offset - first_offset; - auto const d_input_chars = input.chars().data() + first_offset; + auto const d_input_chars = input.chars_begin(stream) + first_offset; auto const offset_data_type = cudf::data_type{cudf::type_to_id()}; auto offsets = cudf::make_numeric_column( @@ -406,7 +406,7 @@ std::unique_ptr byte_pair_encoding(cudf::strings_column_view const cudf::column_view(cudf::device_span(tmp_offsets)); auto const tmp_size = offsets_total - 1; auto const tmp_input = cudf::column_view( - input.parent().type(), tmp_size, nullptr, nullptr, 0, 0, {col_offsets, input.chars()}); + input.parent().type(), tmp_size, input.chars_begin(stream), nullptr, 0, 0, {col_offsets}); auto const d_tmp_strings = cudf::column_device_view::create(tmp_input, stream); // launch the byte-pair-encoding kernel on the temp column diff --git a/cpp/src/text/generate_ngrams.cu b/cpp/src/text/generate_ngrams.cu index 31e2405ce88..1d3e98a25ad 100644 --- a/cpp/src/text/generate_ngrams.cu +++ b/cpp/src/text/generate_ngrams.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -128,11 +128,11 @@ std::unique_ptr generate_ngrams(cudf::strings_column_view const& s // create a temporary column view from the non-empty offsets and chars column views cudf::column_view strings_view(cudf::data_type{cudf::type_id::STRING}, strings_count, - nullptr, + strings.chars_begin(stream), nullptr, 0, 0, - {non_empty_offsets_column->view(), strings.chars()}); + {non_empty_offsets_column->view()}); strings_column = cudf::column_device_view::create(strings_view, stream); d_strings = *strings_column; diff --git a/cpp/src/text/normalize.cu b/cpp/src/text/normalize.cu index 0fc1d221b15..5a0977d410f 100644 --- a/cpp/src/text/normalize.cu +++ b/cpp/src/text/normalize.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -210,7 +210,7 @@ std::unique_ptr normalize_characters(cudf::strings_column_view con auto const offsets = strings.offsets(); auto const d_offsets = offsets.data() + strings.offset(); auto const offset = cudf::detail::get_value(offsets, strings.offset(), stream); - auto const d_chars = strings.chars().data() + offset; + auto const d_chars = strings.chars_begin(stream) + offset; return normalizer.normalize(d_chars, d_offsets, strings.size(), stream); }(); diff --git a/cpp/src/text/subword/subword_tokenize.cu b/cpp/src/text/subword/subword_tokenize.cu index 1a3084a257f..a35d69e2145 100644 --- a/cpp/src/text/subword/subword_tokenize.cu +++ b/cpp/src/text/subword/subword_tokenize.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -186,7 +186,7 @@ tokenizer_result subword_tokenize(cudf::strings_column_view const& strings, auto const offsets = strings.offsets(); auto const d_offsets = offsets.data() + strings.offset(); auto const offset = cudf::detail::get_value(offsets, strings.offset(), stream); - auto const d_chars = strings.chars().data() + offset; + auto const d_chars = strings.chars_begin(stream) + offset; // Create tokenizer wordpiece_tokenizer tokenizer( diff --git a/cpp/src/text/tokenize.cu b/cpp/src/text/tokenize.cu index 87f6a61a533..c43b9dda56c 100644 --- a/cpp/src/text/tokenize.cu +++ b/cpp/src/text/tokenize.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -182,7 +182,8 @@ std::unique_ptr character_tokenize(cudf::strings_column_view const auto chars_bytes = cudf::detail::get_value( offsets, strings_column.offset() + strings_count, stream) - offset; - auto d_chars = strings_column.chars().data(); // unsigned is necessary for checking bits + auto d_chars = + strings_column.parent().data(); // unsigned is necessary for checking bits d_chars += offset; // To minimize memory, count the number of characters so we can diff --git a/cpp/src/text/vocabulary_tokenize.cu b/cpp/src/text/vocabulary_tokenize.cu index 511f1995374..91f4c304590 100644 --- a/cpp/src/text/vocabulary_tokenize.cu +++ b/cpp/src/text/vocabulary_tokenize.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -240,10 +240,8 @@ __global__ void token_counts_fn(cudf::column_device_view const d_strings, auto const offsets = d_strings.child(cudf::strings_column_view::offsets_column_index).data(); - auto const offset = offsets[str_idx + d_strings.offset()] - offsets[d_strings.offset()]; - auto const chars_begin = - d_strings.child(cudf::strings_column_view::chars_column_index).data() + - offsets[d_strings.offset()]; + auto const offset = offsets[str_idx + d_strings.offset()] - offsets[d_strings.offset()]; + auto const chars_begin = d_strings.data() + offsets[d_strings.offset()]; auto const begin = d_str.data(); auto const end = begin + d_str.size_bytes(); @@ -372,7 +370,7 @@ std::unique_ptr tokenize_with_vocabulary(cudf::strings_column_view auto map_ref = vocabulary._impl->get_map_ref(); auto const zero_itr = thrust::make_counting_iterator(0); - if ((input.chars_size() / (input.size() - input.null_count())) < AVG_CHAR_BYTES_THRESHOLD) { + if ((input.chars_size(stream) / (input.size() - input.null_count())) < AVG_CHAR_BYTES_THRESHOLD) { auto const sizes_itr = cudf::detail::make_counting_transform_iterator(0, strings_tokenizer{*d_strings, d_delimiter}); auto [token_offsets, total_count] = @@ -401,11 +399,11 @@ std::unique_ptr tokenize_with_vocabulary(cudf::strings_column_view : cudf::detail::get_value( input.offsets(), input.offset(), stream); auto const last_offset = (input.offset() == 0 && input.size() == input.offsets().size() - 1) - ? input.chars().size() + ? 
input.chars_size(stream) : cudf::detail::get_value( input.offsets(), input.size() + input.offset(), stream); auto const chars_size = last_offset - first_offset; - auto const d_input_chars = input.chars().data() + first_offset; + auto const d_input_chars = input.chars_begin(stream) + first_offset; rmm::device_uvector d_token_counts(input.size(), stream); rmm::device_uvector d_marks(chars_size, stream); @@ -436,9 +434,8 @@ std::unique_ptr tokenize_with_vocabulary(cudf::strings_column_view auto tmp_offsets = std::make_unique(std::move(d_tmp_offsets), rmm::device_buffer{}, 0); - auto tmp_chars = cudf::column_view(input.chars().type(), chars_size, d_input_chars, nullptr, 0); auto const tmp_input = cudf::column_view( - input.parent().type(), total_count, nullptr, nullptr, 0, 0, {tmp_offsets->view(), tmp_chars}); + input.parent().type(), total_count, d_input_chars, nullptr, 0, 0, {tmp_offsets->view()}); auto const d_tmp_strings = cudf::column_device_view::create(tmp_input, stream); diff --git a/cpp/src/transform/row_conversion.cu b/cpp/src/transform/row_conversion.cu index a1c5827e5da..b797e495480 100644 --- a/cpp/src/transform/row_conversion.cu +++ b/cpp/src/transform/row_conversion.cu @@ -1994,11 +1994,9 @@ std::vector> convert_to_rows( CUDF_EXPECTS(!variable_width_table.is_empty(), "No variable-width columns when expected!"); CUDF_EXPECTS(variable_width_offsets.has_value(), "No variable width offset data!"); - auto const variable_data_begin = - thrust::make_transform_iterator(variable_width_table.begin(), [](auto const& c) { - strings_column_view const scv{c}; - return is_compound(c.type()) ? scv.chars().template data() : nullptr; - }); + auto const variable_data_begin = thrust::make_transform_iterator( + variable_width_table.begin(), + [](auto const& c) { return is_compound(c.type()) ? 
c.template data() : nullptr; }); std::vector variable_width_input_data( variable_data_begin, variable_data_begin + variable_width_table.num_columns()); diff --git a/cpp/tests/io/json_type_cast_test.cu b/cpp/tests/io/json_type_cast_test.cu index 036b9170250..6923b7be42d 100644 --- a/cpp/tests/io/json_type_cast_test.cu +++ b/cpp/tests/io/json_type_cast_test.cu @@ -95,7 +95,7 @@ TEST_F(JSONTypeCastTest, String) std::get<0>(cudf::test::detail::make_null_mask(null_mask_it, null_mask_it + column.size())); auto str_col = cudf::io::json::detail::parse_data( - column.chars().data(), + column.chars_begin(stream), thrust::make_zip_iterator(thrust::make_tuple(column.offsets_begin(), svs_length.begin())), column.size(), type, @@ -128,7 +128,7 @@ TEST_F(JSONTypeCastTest, Int) std::get<0>(cudf::test::detail::make_null_mask(null_mask_it, null_mask_it + column.size())); auto col = cudf::io::json::detail::parse_data( - column.chars().data(), + column.chars_begin(stream), thrust::make_zip_iterator(thrust::make_tuple(column.offsets_begin(), svs_length.begin())), column.size(), type, @@ -168,7 +168,7 @@ TEST_F(JSONTypeCastTest, StringEscapes) std::get<0>(cudf::test::detail::make_null_mask(null_mask_it, null_mask_it + column.size())); auto col = cudf::io::json::detail::parse_data( - column.chars().data(), + column.chars_begin(stream), thrust::make_zip_iterator(thrust::make_tuple(column.offsets_begin(), svs_length.begin())), column.size(), type, @@ -237,7 +237,7 @@ TEST_F(JSONTypeCastTest, ErrorNulls) std::get<0>(cudf::test::detail::make_null_mask(null_mask_it, null_mask_it + column.size())); auto str_col = cudf::io::json::detail::parse_data( - column.chars().data(), + column.chars_begin(stream), thrust::make_zip_iterator(thrust::make_tuple(column.offsets_begin(), svs_length.begin())), column.size(), type, diff --git a/cpp/tests/strings/array_tests.cpp b/cpp/tests/strings/array_tests.cpp index d1e0dfb1ff1..c6cc8e078bb 100644 --- a/cpp/tests/strings/array_tests.cpp +++ 
b/cpp/tests/strings/array_tests.cpp @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -154,9 +155,14 @@ TEST_F(StringsColumnTest, GatherTooBig) std::vector h_chars(3000000); cudf::test::fixed_width_column_wrapper chars(h_chars.begin(), h_chars.end()); cudf::test::fixed_width_column_wrapper offsets({0, 3000000}); - auto input = cudf::column_view( - cudf::data_type{cudf::type_id::STRING}, 1, nullptr, nullptr, 0, 0, {offsets, chars}); - auto map = thrust::constant_iterator(0); + auto input = cudf::column_view(cudf::data_type{cudf::type_id::STRING}, + 1, + cudf::column_view(chars).begin(), + nullptr, + 0, + 0, + {offsets}); + auto map = thrust::constant_iterator(0); cudf::test::fixed_width_column_wrapper gather_map(map, map + 1000); EXPECT_THROW(cudf::gather(cudf::table_view{{input}}, gather_map), std::overflow_error); } @@ -220,7 +226,6 @@ TEST_F(StringsColumnTest, OffsetsBeginEnd) scv = cudf::strings_column_view(cudf::slice(input, {1, 5}).front()); EXPECT_EQ(std::distance(scv.offsets_begin(), scv.offsets_end()), static_cast(scv.size() + 1)); - EXPECT_EQ(std::distance(scv.chars_begin(), scv.chars_end()), 16L); } CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/strings/factories_test.cu b/cpp/tests/strings/factories_test.cu index 1066738df72..5381ad63bc3 100644 --- a/cpp/tests/strings/factories_test.cu +++ b/cpp/tests/strings/factories_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -87,16 +87,18 @@ TEST_F(StringsFactoriesTest, CreateColumnFromPair) EXPECT_TRUE(column->nullable()); EXPECT_TRUE(column->has_nulls()); } - EXPECT_EQ(2, column->num_children()); + EXPECT_EQ(1, column->num_children()); + EXPECT_NE(nullptr, column->view().head()); cudf::strings_column_view strings_view(column->view()); EXPECT_EQ(strings_view.size(), count); EXPECT_EQ(strings_view.offsets().size(), count + 1); - EXPECT_EQ(strings_view.chars().size(), memsize); + EXPECT_EQ(strings_view.chars_size(cudf::get_default_stream()), memsize); // check string data auto h_chars_data = cudf::detail::make_std_vector_sync( - cudf::device_span(strings_view.chars().data(), strings_view.chars().size()), + cudf::device_span(strings_view.chars_begin(cudf::get_default_stream()), + strings_view.chars_size(cudf::get_default_stream())), cudf::get_default_stream()); auto h_offsets_data = cudf::detail::make_std_vector_sync( cudf::device_span( @@ -159,16 +161,18 @@ TEST_F(StringsFactoriesTest, CreateColumnFromOffsets) count, std::move(d_offsets), std::move(d_buffer), null_count, d_nulls.release()); EXPECT_EQ(column->type(), cudf::data_type{cudf::type_id::STRING}); EXPECT_EQ(column->null_count(), null_count); - EXPECT_EQ(2, column->num_children()); + EXPECT_EQ(1, column->num_children()); + EXPECT_NE(nullptr, column->view().head()); cudf::strings_column_view strings_view(column->view()); EXPECT_EQ(strings_view.size(), count); EXPECT_EQ(strings_view.offsets().size(), count + 1); - EXPECT_EQ(strings_view.chars().size(), memsize); + EXPECT_EQ(strings_view.chars_size(cudf::get_default_stream()), memsize); // check string data auto h_chars_data = cudf::detail::make_std_vector_sync( - cudf::device_span(strings_view.chars().data(), strings_view.chars().size()), + cudf::device_span(strings_view.chars_begin(cudf::get_default_stream()), + strings_view.chars_size(cudf::get_default_stream())), cudf::get_default_stream()); auto h_offsets_data = cudf::detail::make_std_vector_sync( cudf::device_span( diff 
--git a/cpp/tests/utilities_tests/column_wrapper_tests.cpp b/cpp/tests/utilities_tests/column_wrapper_tests.cpp index da17e33e11a..479c6687e75 100644 --- a/cpp/tests/utilities_tests/column_wrapper_tests.cpp +++ b/cpp/tests/utilities_tests/column_wrapper_tests.cpp @@ -255,7 +255,7 @@ TYPED_TEST(StringsColumnWrapperTest, NullablePairListConstructorAllNull) EXPECT_EQ(view.size(), count); EXPECT_EQ(view.offsets().size(), count + 1); // all null entries results in no data allocated to chars - EXPECT_EQ(nullptr, view.chars().head()); + EXPECT_EQ(nullptr, view.parent().head()); EXPECT_NE(nullptr, view.offsets().head()); EXPECT_TRUE(view.has_nulls()); EXPECT_EQ(view.null_count(), 5); diff --git a/java/src/main/java/ai/rapids/cudf/JCudfSerialization.java b/java/src/main/java/ai/rapids/cudf/JCudfSerialization.java index 7deb5bae541..666a8864003 100644 --- a/java/src/main/java/ai/rapids/cudf/JCudfSerialization.java +++ b/java/src/main/java/ai/rapids/cudf/JCudfSerialization.java @@ -1742,7 +1742,7 @@ private static long buildColumnView(SerializedColumnHeader column, } DType dtype = column.getType(); long bufferAddress = combinedBuffer.getAddress(); - long dataAddress = dtype.isNestedType() ? 0 : bufferAddress + offsetsInfo.data; + long dataAddress = offsetsInfo.dataLen == 0 ? 0 : bufferAddress + offsetsInfo.data; long validityAddress = needsValidityBuffer(column.getNullCount()) ? bufferAddress + offsetsInfo.validity : 0; long offsetsAddress = dtype.hasOffsets() ? bufferAddress + offsetsInfo.offsets : 0; diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index 675996df71c..47dc802cd49 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -111,6 +111,9 @@ std::size_t calc_device_memory_size(cudf::column_view const &view, bool const pa auto dtype = view.type(); if (cudf::is_fixed_width(dtype)) { total += pad_size(cudf::size_of(dtype) * view.size(), pad_for_cpu); + } else if (dtype.id() == cudf::type_id::STRING) { + auto scv = cudf::strings_column_view(view); + total += pad_size(scv.chars_size(cudf::get_default_stream()), pad_for_cpu); } return std::accumulate(view.child_begin(), view.child_end(), total, @@ -1974,18 +1977,11 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_makeCudfColumnView( new cudf::column_view(cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0)); } else { JNI_NULL_CHECK(env, j_offset, "offset is null", 0); - // This must be kept in sync with how string columns are created - // offsets are always the first child - // data is the second child - cudf::size_type *offsets = reinterpret_cast(j_offset); cudf::column_view offsets_column(cudf::data_type{cudf::type_id::INT32}, size + 1, offsets, nullptr, 0); - cudf::column_view data_column(cudf::data_type{cudf::type_id::INT8}, j_data_size, data, - nullptr, 0); return ptr_as_jlong(new cudf::column_view(cudf::data_type{cudf::type_id::STRING}, size, - nullptr, valid, j_null_count, 0, - {offsets_column, data_column})); + data, valid, j_null_count, 0, {offsets_column})); } } else if (n_type == cudf::type_id::LIST) { JNI_NULL_CHECK(env, j_children, "children of a list are null", 0); @@ -2082,8 +2078,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_getNativeDataAddress(JNIE if (column->type().id() == cudf::type_id::STRING) { if (column->size() > 0) { cudf::strings_column_view view = cudf::strings_column_view(*column); - cudf::column_view data_view = view.chars(); - result = reinterpret_cast(data_view.data()); + result = 
reinterpret_cast(view.chars_begin(cudf::get_default_stream())); } } else if (column->type().id() != cudf::type_id::LIST && column->type().id() != cudf::type_id::STRUCT) { @@ -2104,8 +2099,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_getNativeDataLength(JNIEn if (column->type().id() == cudf::type_id::STRING) { if (column->size() > 0) { cudf::strings_column_view view = cudf::strings_column_view(*column); - cudf::column_view data_view = view.chars(); - result = data_view.size(); + result = view.chars_size(cudf::get_default_stream()); } } else if (column->type().id() != cudf::type_id::LIST && column->type().id() != cudf::type_id::STRUCT) { diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index d7d0279174d..295574858da 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -905,12 +905,12 @@ cudf::column_view remove_validity_from_col(cudf::column_view column_view) { children.push_back(remove_validity_from_col(*it)); } if (!column_view.nullable() || column_view.null_count() != 0) { - return cudf::column_view(column_view.type(), column_view.size(), nullptr, + return cudf::column_view(column_view.type(), column_view.size(), column_view.head(), column_view.null_mask(), column_view.null_count(), column_view.offset(), children); } else { - return cudf::column_view(column_view.type(), column_view.size(), nullptr, nullptr, 0, - column_view.offset(), children); + return cudf::column_view(column_view.type(), column_view.size(), column_view.head(), nullptr, + 0, column_view.offset(), children); } } } diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index 0edf9f8aa95..acd0ba519dd 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. from typing import Literal @@ -39,6 +39,7 @@ from cudf._lib.types cimport ( from cudf._lib.null_mask import bitmask_allocation_size_bytes from cudf._lib.types import dtype_from_pylibcudf_column +cimport cudf._lib.cpp.copying as cpp_copying cimport cudf._lib.cpp.types as libcudf_types cimport cudf._lib.cpp.unary as libcudf_unary from cudf._lib.cpp.column.column cimport column, column_contents @@ -52,6 +53,19 @@ from cudf._lib.cpp.scalar.scalar cimport scalar from cudf._lib.scalar cimport DeviceScalar +cdef get_element(column_view col_view, size_type index): + + cdef unique_ptr[scalar] c_output + with nogil: + c_output = move( + cpp_copying.get_element(col_view, index) + ) + + return DeviceScalar.from_unique_ptr( + move(c_output), dtype=dtype_from_column_view(col_view) + ) + + cdef class Column: """ A Column stores columnar data in device memory. 
@@ -652,11 +666,29 @@ cdef class Column: mask_owner = mask_owner.base_mask base_size = owner.base_size base_nbytes = base_size * dtype_itemsize + # special case for string column + is_string_column = (cv.type().id() == libcudf_types.type_id.STRING) + if is_string_column: + # get the size from offset child column (device to host copy) + offsets_column_index = 0 + offset_child_column = cv.child(offsets_column_index) + if offset_child_column.size() == 0: + base_nbytes = 0 + else: + chars_size = get_element( + offset_child_column, offset_child_column.size()-1).value + base_nbytes = chars_size + if data_ptr: if data_owner is None: + buffer_size = ( + base_nbytes + if is_string_column + else ((size + offset) * dtype_itemsize) + ) data = as_buffer( rmm.DeviceBuffer(ptr=data_ptr, - size=(size+offset) * dtype_itemsize) + size=buffer_size) ) elif ( column_owner and diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index c13ec33c51c..705862c502a 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1468,17 +1468,9 @@ def column_empty( ), ) elif dtype.kind in "OU" and not isinstance(dtype, DecimalDtype): - data = None + data = as_buffer(rmm.DeviceBuffer(size=0)) children = ( full(row_count + 1, 0, dtype=libcudf.types.size_type_dtype), - build_column( - data=as_buffer( - rmm.DeviceBuffer( - size=row_count * cudf.dtype("int8").itemsize - ) - ), - dtype="int8", - ), ) else: data = as_buffer(rmm.DeviceBuffer(size=row_count * dtype.itemsize)) @@ -1583,6 +1575,7 @@ def build_column( ) elif dtype.type in (np.object_, np.str_): return cudf.core.column.StringColumn( + data=data, mask=mask, size=size, offset=offset, diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 84333fc205a..c47088caebc 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5465,6 +5465,7 @@ class StringColumn(column.ColumnBase): def 
__init__( self, + data: Optional[Buffer] = None, mask: Optional[Buffer] = None, size: Optional[int] = None, # TODO: make non-optional offset: int = 0, @@ -5491,11 +5492,10 @@ def __init__( # all nulls-column: offsets = column.full(size + 1, 0, dtype=size_type_dtype) - chars = cudf.core.column.column_empty(0, dtype="int8") - children = (offsets, chars) + children = (offsets,) super().__init__( - data=None, + data=data, size=size, dtype=dtype, mask=mask, @@ -5516,7 +5516,7 @@ def copy(self, deep: bool = True): def start_offset(self) -> int: if self._start_offset is None: if ( - len(self.base_children) == 2 + len(self.base_children) == 1 and self.offset < self.base_children[0].size ): self._start_offset = int( @@ -5531,7 +5531,7 @@ def start_offset(self) -> int: def end_offset(self) -> int: if self._end_offset is None: if ( - len(self.base_children) == 2 + len(self.base_children) == 1 and (self.offset + self.size) < self.base_children[0].size ): self._end_offset = int( @@ -5547,16 +5547,14 @@ def end_offset(self) -> int: @cached_property def memory_usage(self) -> int: n = 0 - if len(self.base_children) == 2: + if self.data is not None: + n += self.data.size + if len(self.base_children) == 1: child0_size = (self.size + 1) * self.base_children[ 0 ].dtype.itemsize - child1_size = ( - self.end_offset - self.start_offset - ) * self.base_children[1].dtype.itemsize - - n += child0_size + child1_size + n += child0_size if self.nullable: n += cudf._lib.null_mask.bitmask_allocation_size_bytes(self.size) return n @@ -5568,6 +5566,24 @@ def base_size(self) -> int: else: return self.base_children[0].size - 1 + # override for string column + @property + def data(self): + if self.base_data is None: + return None + if self._data is None: + if ( + self.offset == 0 + and len(self.base_children) > 0 + and self.size == self.base_children[0].size - 1 + ): + self._data = self.base_data + else: + self._data = self.base_data[ + self.start_offset : self.end_offset + ] + return self._data + 
def data_array_view( self, *, mode="write" ) -> cuda.devicearray.DeviceNDArray: @@ -5614,14 +5630,6 @@ def sum( else: return result_col - def set_base_data(self, value): - if value is not None: - raise RuntimeError( - "StringColumns do not use data attribute of Column, use " - "`set_base_children` instead" - ) - super().set_base_data(value) - def __contains__(self, item: ScalarLike) -> bool: if is_scalar(item): return True in libcudf.search.contains( @@ -5938,15 +5946,12 @@ def view(self, dtype) -> "cudf.core.column.ColumnBase": str_end_byte_offset = self.base_children[0].element_indexing( self.offset + self.size ) - char_dtype_size = self.base_children[1].dtype.itemsize - n_bytes_to_view = ( - str_end_byte_offset - str_byte_offset - ) * char_dtype_size + n_bytes_to_view = str_end_byte_offset - str_byte_offset to_view = column.build_column( - self.base_children[1].data, - dtype=self.base_children[1].dtype, + self.base_data, + dtype=cudf.api.types.dtype("int8"), offset=str_byte_offset, size=n_bytes_to_view, ) diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index 6e1c5f6fd00..c97d6dcdd2d 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. 
import enum from collections import abc @@ -482,7 +482,9 @@ def _get_data_buffer( dtype = self._dtype_from_cudfdtype(col_data.dtype) elif self.dtype[0] == _DtypeKind.STRING: - col_data = self._col.children[1] + col_data = build_column( + data=self._col.data, dtype=np.dtype("int8") + ) dtype = self._dtype_from_cudfdtype(col_data.dtype) else: diff --git a/python/cudf/cudf/tests/data/pkl/stringColumnWithRangeIndex_cudf_0.16.pkl b/python/cudf/cudf/tests/data/pkl/stringColumnWithRangeIndex_cudf_0.16.pkl deleted file mode 100644 index 97c745c1dd0..00000000000 Binary files a/python/cudf/cudf/tests/data/pkl/stringColumnWithRangeIndex_cudf_0.16.pkl and /dev/null differ diff --git a/python/cudf/cudf/tests/data/pkl/stringColumnWithRangeIndex_cudf_23.12.pkl b/python/cudf/cudf/tests/data/pkl/stringColumnWithRangeIndex_cudf_23.12.pkl new file mode 100644 index 00000000000..1ec077d10f7 Binary files /dev/null and b/python/cudf/cudf/tests/data/pkl/stringColumnWithRangeIndex_cudf_23.12.pkl differ diff --git a/python/cudf/cudf/tests/test_df_protocol.py b/python/cudf/cudf/tests/test_df_protocol.py index d6134c7bb01..bffbade14d8 100644 --- a/python/cudf/cudf/tests/test_df_protocol.py +++ b/python/cudf/cudf/tests/test_df_protocol.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. 
from typing import Any, Tuple @@ -112,7 +112,8 @@ def assert_column_equal(col: _CuDFColumn, cudfcol): assert col.get_buffers()["offsets"] is None elif col.dtype[0] == _DtypeKind.STRING: - assert_buffer_equal(col.get_buffers()["data"], cudfcol.children[1]) + chars_col = build_column(data=cudfcol.data, dtype="int8") + assert_buffer_equal(col.get_buffers()["data"], chars_col) assert_buffer_equal(col.get_buffers()["offsets"], cudfcol.children[0]) else: diff --git a/python/cudf/cudf/tests/test_serialize.py b/python/cudf/cudf/tests/test_serialize.py index cac170cce55..87efe6bbbcc 100644 --- a/python/cudf/cudf/tests/test_serialize.py +++ b/python/cudf/cudf/tests/test_serialize.py @@ -1,5 +1,6 @@ # Copyright (c) 2018-2024, NVIDIA CORPORATION. +import itertools import pickle import msgpack @@ -115,6 +116,7 @@ ] ), ], + ids=itertools.count(), ) @pytest.mark.parametrize("to_host", [True, False]) def test_serialize(df, to_host): @@ -368,8 +370,8 @@ def test_serialize_string_check_buffer_sizes(): assert expect == got -def test_deserialize_cudf_0_16(datadir): - fname = datadir / "pkl" / "stringColumnWithRangeIndex_cudf_0.16.pkl" +def test_deserialize_cudf_23_12(datadir): + fname = datadir / "pkl" / "stringColumnWithRangeIndex_cudf_23.12.pkl" expected = cudf.DataFrame({"a": ["hi", "hello", "world", None]}) with open(fname, "rb") as f: diff --git a/python/cudf/cudf/tests/test_testing.py b/python/cudf/cudf/tests/test_testing.py index e6658040663..3024c8e2e7b 100644 --- a/python/cudf/cudf/tests/test_testing.py +++ b/python/cudf/cudf/tests/test_testing.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
import numpy as np import pandas as pd @@ -431,8 +431,8 @@ def test_assert_column_memory_basic_same(arrow_arrays): data = cudf.core.column.ColumnBase.from_arrow(arrow_arrays) buf = cudf.core.buffer.as_buffer(data.base_data) - left = cudf.core.column.build_column(buf, dtype=np.int32) - right = cudf.core.column.build_column(buf, dtype=np.int32) + left = cudf.core.column.build_column(buf, dtype=np.int8) + right = cudf.core.column.build_column(buf, dtype=np.int8) assert_column_memory_eq(left, right) with pytest.raises(AssertionError):