Move chars column to parent data buffer in strings column (#14202)

Eliminates the chars child column and moves the chars data to the parent strings column's `_data` buffer.

Summary of changes:
- The chars child column is removed; the chars buffer is added to the parent column
- Adds a `stream` parameter to `chars_size()` and `chars_end()` in `strings_column_view` and updates their invocations
- Removes `chars_column_index` and deprecates `chars()` in `strings_column_view`
- Replaces `chars_col.begin<char>()` with `static_cast<char*>(parent.head())`
- Adds a strings column factory which accepts an `rmm::device_buffer` instead of a chars column
- Deprecates the strings column factory which accepts a chars column
- IO changes: contiguous split (from @nvdbaranec), to_arrow, parquet writer
- Fixes binary ops, column_view, interleave columns, byte cast, strings APIs, text APIs
- Fixes tests and benchmarks (mostly adding the `stream` parameter to `chars_size`)
- Java fixes (from @andygrove)
- Python changes:
  - `.data` special case for string column
  - get size from offsets column for `rmm.DeviceBuffer` in column
  - special condition for string slice
  - pickle file update for string column
  - a few unit test updates

Preparing for #13733

Authors:
- Karthikeyan (https://github.com/karthikeyann)

Approvers:
- Jason Lowe (https://github.com/jlowe)
- David Wendt (https://github.com/davidwendt)
- Nghia Truong (https://github.com/ttnghia)
- Lawrence Mitchell (https://github.com/wence-)
- Matthew Roeschke (https://github.com/mroeschke)
- Ashwin Srinath (https://github.com/shwina)

URL: #14202
rapidsai · Jan 17, 2024 · c7acdaa · c7acdaa
1 parent 8f5e64d
commit c7acdaa
Show file tree

Hide file tree

Showing 93 changed files with 519 additions and 378 deletions.
diff --git a/cpp/benchmarks/hashing/hash.cpp b/cpp/benchmarks/hashing/hash.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -43,7 +43,7 @@ static void bench_hash(nvbench::state& state)
 
   // collect statistics
   cudf::strings_column_view input(data->get_column(1).view());
-  auto const chars_size = input.chars_size();
+  auto const chars_size = input.chars_size(stream);
   // add memory read from string column
   state.add_global_memory_reads<nvbench::int8_t>(chars_size);
   // add memory read from int64_t column

diff --git a/cpp/benchmarks/json/json.cu b/cpp/benchmarks/json/json.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -190,7 +190,7 @@ void BM_case(benchmark::State& state, std::string query_arg)
   int desired_bytes = state.range(1);
   auto input        = build_json_string_column(desired_bytes, num_rows);
   cudf::strings_column_view scv(input->view());
-  size_t num_chars = scv.chars().size();
+  size_t num_chars = scv.chars_size(cudf::get_default_stream());
 
   std::string json_path(query_arg);
 

diff --git a/cpp/benchmarks/string/case.cpp b/cpp/benchmarks/string/case.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -43,28 +43,28 @@ void bench_case(nvbench::state& state)
   if (encoding == "ascii") {
     data_profile ascii_profile = data_profile_builder().no_validity().distribution(
       cudf::type_id::INT8, distribution_id::UNIFORM, 32, 126);  // nice ASCII range
-    auto input = cudf::strings_column_view(col_view);
-    auto ascii_column =
-      create_random_column(cudf::type_id::INT8, row_count{input.chars_size()}, ascii_profile);
+    auto input        = cudf::strings_column_view(col_view);
+    auto ascii_column = create_random_column(
+      cudf::type_id::INT8, row_count{input.chars_size(cudf::get_default_stream())}, ascii_profile);
     auto ascii_data = ascii_column->view();
 
     col_view = cudf::column_view(col_view.type(),
                                  col_view.size(),
-                                 nullptr,
+                                 ascii_data.data<char>(),
                                  col_view.null_mask(),
                                  col_view.null_count(),
                                  0,
-                                 {input.offsets(), ascii_data});
+                                 {input.offsets()});
 
     ascii_contents = ascii_column->release();
   }
   auto input = cudf::strings_column_view(col_view);
 
   state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
 
-  state.add_element_count(input.chars_size(), "chars_size");
-  state.add_global_memory_reads<nvbench::int8_t>(input.chars_size());
-  state.add_global_memory_writes<nvbench::int8_t>(input.chars_size());
+  state.add_element_count(input.chars_size(cudf::get_default_stream()), "chars_size");
+  state.add_global_memory_reads<nvbench::int8_t>(input.chars_size(cudf::get_default_stream()));
+  state.add_global_memory_writes<nvbench::int8_t>(input.chars_size(cudf::get_default_stream()));
 
   state.exec(nvbench::exec_tag::sync,
              [&](nvbench::launch& launch) { auto result = cudf::strings::to_lower(input); });

diff --git a/cpp/benchmarks/string/char_types.cpp b/cpp/benchmarks/string/char_types.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -42,7 +42,7 @@ static void bench_char_types(nvbench::state& state)
 
   state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
   // gather some throughput statistics as well
-  auto chars_size = input.chars_size();
+  auto chars_size = input.chars_size(cudf::get_default_stream());
   state.add_global_memory_reads<nvbench::int8_t>(chars_size);  // all bytes are read;
   if (api_type == "all") {
     state.add_global_memory_writes<nvbench::int8_t>(num_rows);  // output is a bool8 per row

diff --git a/cpp/benchmarks/string/combine.cpp b/cpp/benchmarks/string/combine.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -44,7 +44,8 @@ static void BM_combine(benchmark::State& state)
     cudf::strings::concatenate(table->view(), separator);
   }
 
-  state.SetBytesProcessed(state.iterations() * (input1.chars_size() + input2.chars_size()));
+  state.SetBytesProcessed(state.iterations() * (input1.chars_size(cudf::get_default_stream()) +
+                                                input2.chars_size(cudf::get_default_stream())));
 }
 
 static void generate_bench_args(benchmark::internal::Benchmark* b)

diff --git a/cpp/benchmarks/string/contains.cpp b/cpp/benchmarks/string/contains.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -100,7 +100,7 @@ static void bench_contains(nvbench::state& state)
   auto pattern = patterns[pattern_index];
   auto program = cudf::strings::regex_program::create(pattern);
 
-  auto chars_size = input.chars_size();
+  auto chars_size = input.chars_size(cudf::get_default_stream());
   state.add_element_count(chars_size, "chars_size");
   state.add_global_memory_reads<nvbench::int8_t>(chars_size);
   state.add_global_memory_writes<nvbench::int32_t>(input.size());

diff --git a/cpp/benchmarks/string/convert_datetime.cpp b/cpp/benchmarks/string/convert_datetime.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -48,7 +48,8 @@ void BM_convert_datetime(benchmark::State& state, direction dir)
       cudf::strings::from_timestamps(input, "%Y-%m-%d %H:%M:%S");
   }
 
-  auto const bytes = dir == direction::to ? source_string.chars_size() : n_rows * sizeof(TypeParam);
+  auto const bytes = dir == direction::to ? source_string.chars_size(cudf::get_default_stream())
+                                          : n_rows * sizeof(TypeParam);
   state.SetBytesProcessed(state.iterations() * bytes);
 }
 

diff --git a/cpp/benchmarks/string/convert_fixed_point.cpp b/cpp/benchmarks/string/convert_fixed_point.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -49,8 +49,9 @@ void convert_to_fixed_point(benchmark::State& state)
   }
 
   // bytes_processed = bytes_input + bytes_output
-  state.SetBytesProcessed(state.iterations() *
-                          (strings_view.chars_size() + rows * cudf::size_of(dtype)));
+  state.SetBytesProcessed(
+    state.iterations() *
+    (strings_view.chars_size(cudf::get_default_stream()) + rows * cudf::size_of(dtype)));
 }
 
 class StringsFromFixedPoint : public cudf::benchmark {};
@@ -74,7 +75,8 @@ void convert_from_fixed_point(benchmark::State& state)
   // bytes_processed = bytes_input + bytes_output
   state.SetBytesProcessed(
     state.iterations() *
-    (cudf::strings_column_view(results->view()).chars_size() + rows * cudf::size_of(dtype)));
+    (cudf::strings_column_view(results->view()).chars_size(cudf::get_default_stream()) +
+     rows * cudf::size_of(dtype)));
 }
 
 #define CONVERT_TO_FIXED_POINT_BMD(name, fixed_point_type)                  \

diff --git a/cpp/benchmarks/string/convert_numerics.cpp b/cpp/benchmarks/string/convert_numerics.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -63,8 +63,9 @@ void convert_to_number(benchmark::State& state)
   }
 
   // bytes_processed = bytes_input + bytes_output
-  state.SetBytesProcessed(state.iterations() *
-                          (strings_view.chars_size() + rows * sizeof(NumericType)));
+  state.SetBytesProcessed(
+    state.iterations() *
+    (strings_view.chars_size(cudf::get_default_stream()) + rows * sizeof(NumericType)));
 }
 
 class StringsFromNumeric : public cudf::benchmark {};
@@ -90,7 +91,8 @@ void convert_from_number(benchmark::State& state)
   // bytes_processed = bytes_input + bytes_output
   state.SetBytesProcessed(
     state.iterations() *
-    (cudf::strings_column_view(results->view()).chars_size() + rows * sizeof(NumericType)));
+    (cudf::strings_column_view(results->view()).chars_size(cudf::get_default_stream()) +
+     rows * sizeof(NumericType)));
 }
 
 #define CONVERT_TO_NUMERICS_BD(name, type)                               \

diff --git a/cpp/benchmarks/string/copy.cu b/cpp/benchmarks/string/copy.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -64,8 +64,9 @@ static void BM_copy(benchmark::State& state, copy_type ct)
     }
   }
 
-  state.SetBytesProcessed(state.iterations() *
-                          cudf::strings_column_view(source->view().column(0)).chars_size());
+  state.SetBytesProcessed(
+    state.iterations() *
+    cudf::strings_column_view(source->view().column(0)).chars_size(cudf::get_default_stream()));
 }
 
 static void generate_bench_args(benchmark::internal::Benchmark* b)

diff --git a/cpp/benchmarks/string/count.cpp b/cpp/benchmarks/string/count.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -47,7 +47,7 @@ static void bench_count(nvbench::state& state)
 
   state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
   // gather some throughput statistics as well
-  auto chars_size = input.chars_size();
+  auto chars_size = input.chars_size(cudf::get_default_stream());
   state.add_element_count(chars_size, "chars_size");
   state.add_global_memory_reads<nvbench::int8_t>(chars_size);
   state.add_global_memory_writes<nvbench::int32_t>(input.size());

diff --git a/cpp/benchmarks/string/extract.cpp b/cpp/benchmarks/string/extract.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -67,7 +67,7 @@ static void bench_extract(nvbench::state& state)
 
   state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
   // gather some throughput statistics as well
-  auto chars_size = strings_view.chars_size();
+  auto chars_size = strings_view.chars_size(cudf::get_default_stream());
   state.add_element_count(chars_size, "chars_size");            // number of bytes;
   state.add_global_memory_reads<nvbench::int8_t>(chars_size);   // all bytes are read;
   state.add_global_memory_writes<nvbench::int8_t>(chars_size);  // all bytes are written

diff --git a/cpp/benchmarks/string/factory.cu b/cpp/benchmarks/string/factory.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -67,7 +67,7 @@ static void BM_factory(benchmark::State& state)
   }
 
   cudf::strings_column_view input(column->view());
-  state.SetBytesProcessed(state.iterations() * input.chars_size());
+  state.SetBytesProcessed(state.iterations() * input.chars_size(cudf::get_default_stream()));
 }
 
 static void generate_bench_args(benchmark::internal::Benchmark* b)

diff --git a/cpp/benchmarks/string/filter.cpp b/cpp/benchmarks/string/filter.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -57,7 +57,7 @@ static void BM_filter_chars(benchmark::State& state, FilterAPI api)
     }
   }
 
-  state.SetBytesProcessed(state.iterations() * input.chars_size());
+  state.SetBytesProcessed(state.iterations() * input.chars_size(cudf::get_default_stream()));
 }
 
 static void generate_bench_args(benchmark::internal::Benchmark* b)

diff --git a/cpp/benchmarks/string/find.cpp b/cpp/benchmarks/string/find.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -56,7 +56,7 @@ static void BM_find_scalar(benchmark::State& state, FindAPI find_api)
     }
   }
 
-  state.SetBytesProcessed(state.iterations() * input.chars_size());
+  state.SetBytesProcessed(state.iterations() * input.chars_size(cudf::get_default_stream()));
 }
 
 static void generate_bench_args(benchmark::internal::Benchmark* b)

diff --git a/cpp/benchmarks/string/gather.cpp b/cpp/benchmarks/string/gather.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -43,7 +43,8 @@ static void bench_gather(nvbench::state& state)
     create_random_table({cudf::type_id::INT32}, row_count{num_rows}, map_profile);
 
   state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
-  auto chars_size = cudf::strings_column_view(input_table->view().column(0)).chars_size();
+  auto chars_size =
+    cudf::strings_column_view(input_table->view().column(0)).chars_size(cudf::get_default_stream());
   state.add_global_memory_reads<nvbench::int8_t>(chars_size);  // all bytes are read;
   state.add_global_memory_writes<nvbench::int8_t>(chars_size);
 

diff --git a/cpp/benchmarks/string/join_strings.cpp b/cpp/benchmarks/string/join_strings.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -40,7 +40,7 @@ static void bench_join(nvbench::state& state)
 
   state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
   // gather some throughput statistics as well
-  auto const chars_size = input.chars_size();
+  auto const chars_size = input.chars_size(cudf::get_default_stream());
   state.add_element_count(chars_size, "chars_size");            // number of bytes;
   state.add_global_memory_reads<nvbench::int8_t>(chars_size);   // all bytes are read;
   state.add_global_memory_writes<nvbench::int8_t>(chars_size);  // all bytes are written

diff --git a/cpp/benchmarks/string/lengths.cpp b/cpp/benchmarks/string/lengths.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -40,7 +40,7 @@ static void bench_lengths(nvbench::state& state)
 
   state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
   // gather some throughput statistics as well
-  auto chars_size = input.chars_size();
+  auto chars_size = input.chars_size(cudf::get_default_stream());
   state.add_global_memory_reads<nvbench::int8_t>(chars_size);  // all bytes are read;
   state.add_global_memory_writes<nvbench::int32_t>(num_rows);  // output is an integer per row
 

diff --git a/cpp/benchmarks/string/like.cpp b/cpp/benchmarks/string/like.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -99,7 +99,7 @@ static void bench_like(nvbench::state& state)
 
   state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
   // gather some throughput statistics as well
-  auto chars_size = input.chars_size();
+  auto chars_size = input.chars_size(cudf::get_default_stream());
   state.add_element_count(chars_size, "chars_size");           // number of bytes;
   state.add_global_memory_reads<nvbench::int8_t>(chars_size);  // all bytes are read;
   state.add_global_memory_writes<nvbench::int8_t>(n_rows);     // writes are BOOL8

diff --git a/cpp/benchmarks/string/repeat_strings.cpp b/cpp/benchmarks/string/repeat_strings.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -59,7 +59,7 @@ static void BM_repeat_strings_scalar_times(benchmark::State& state)
     cudf::strings::repeat_strings(strings_col, default_repeat_times);
   }
 
-  state.SetBytesProcessed(state.iterations() * strings_col.chars_size());
+  state.SetBytesProcessed(state.iterations() * strings_col.chars_size(cudf::get_default_stream()));
 }
 
 static void BM_repeat_strings_column_times(benchmark::State& state)
@@ -75,8 +75,8 @@ static void BM_repeat_strings_column_times(benchmark::State& state)
     cudf::strings::repeat_strings(strings_col, repeat_times_col);
   }
 
-  state.SetBytesProcessed(state.iterations() *
-                          (strings_col.chars_size() + repeat_times_col.size() * sizeof(int32_t)));
+  state.SetBytesProcessed(state.iterations() * (strings_col.chars_size(cudf::get_default_stream()) +
+                                                repeat_times_col.size() * sizeof(int32_t)));
 }
 
 static void generate_bench_args(benchmark::internal::Benchmark* b)