From 5f3a561e2d416973837134847329729122f45762 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Wed, 3 May 2023 16:57:32 -0400
Subject: [PATCH 1/5] Performance improvement in cudf::strings::join_strings
 for long strings

---
 cpp/benchmarks/CMakeLists.txt                 |   4 +-
 cpp/benchmarks/string/join_strings.cpp        |  59 ++++++
 cpp/src/strings/combine/join.cu               | 200 +++++++++++-------
 .../strings/combine/join_strings_tests.cpp    |  13 ++
 4 files changed, 204 insertions(+), 72 deletions(-)
 create mode 100644 cpp/benchmarks/string/join_strings.cpp

diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index 565a396d913..9cbf0f60f62 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -301,7 +301,9 @@ ConfigureBench(
   string/url_decode.cu
 )
 
-ConfigureNVBench(STRINGS_NVBENCH string/like.cpp string/reverse.cpp string/lengths.cpp)
+ConfigureNVBench(
+  STRINGS_NVBENCH string/like.cpp string/reverse.cpp string/lengths.cpp string/join_strings.cpp
+)
 
 # ##################################################################################################
 # * json benchmark -------------------------------------------------------------------
diff --git a/cpp/benchmarks/string/join_strings.cpp b/cpp/benchmarks/string/join_strings.cpp
new file mode 100644
index 00000000000..950a742e1d9
--- /dev/null
+++ b/cpp/benchmarks/string/join_strings.cpp
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <benchmarks/common/generate_input.hpp>
+#include <benchmarks/fixture/rmm_pool_raii.hpp>
+
+#include <cudf/strings/combine.hpp>
+#include <cudf/strings/strings_column_view.hpp>
+#include <cudf/utilities/default_stream.hpp>
+
+#include <nvbench/nvbench.cuh>
+
+static void bench_join(nvbench::state& state)
+{
+  auto const num_rows  = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width"));
+
+  if (static_cast<std::size_t>(num_rows) * static_cast<std::size_t>(row_width) >=
+      static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
+    state.skip("Skip benchmarks greater than size_type limit");
+  }
+
+  data_profile const table_profile = data_profile_builder().distribution(
+    cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width);
+  auto const table =
+    create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile);
+  cudf::strings_column_view input(table->view().column(0));
+
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
+  // gather some throughput statistics as well
+  auto chars_size = input.chars_size();
+  state.add_element_count(chars_size, "chars_size");            // number of bytes;
+  state.add_global_memory_reads<nvbench::int8_t>(chars_size);   // all bytes are read;
+  state.add_global_memory_writes<nvbench::int8_t>(chars_size);  // all bytes are written
+
+  std::string separator(":");
+  std::string narep("null");
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    auto result = cudf::strings::join_strings(input, separator, narep);
+  });
+}
+
+NVBENCH_BENCH(bench_join)
+  .set_name("strings_join")
+  .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024})
+  .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216});
diff --git a/cpp/src/strings/combine/join.cu b/cpp/src/strings/combine/join.cu
index 9519cf66664..0b9d14d7396 100644
--- a/cpp/src/strings/combine/join.cu
+++ b/cpp/src/strings/combine/join.cu
@@ -21,8 +21,8 @@
 #include <cudf/scalar/scalar_device_view.cuh>
 #include <cudf/strings/combine.hpp>
 #include <cudf/strings/detail/combine.hpp>
+#include <cudf/strings/detail/strings_column_factories.cuh>
 #include <cudf/strings/detail/utilities.cuh>
-#include <cudf/strings/detail/utilities.hpp>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
@@ -33,94 +33,152 @@
 #include <rmm/exec_policy.hpp>
 
 #include <thrust/for_each.h>
-#include <thrust/functional.h>
 #include <thrust/iterator/counting_iterator.h>
-#include <thrust/transform_scan.h>
 
 namespace cudf {
 namespace strings {
 namespace detail {
 
-std::unique_ptr<column> join_strings(strings_column_view const& strings,
+namespace {
+
+/**
+ * @brief Threshold to decide on using string-per-thread vs the string-gather
+ * approaches.
+ *
+ * If the average byte length of a string in a column exceeds this value then
+ * the string-gather function is used.
+ * Otherwise, a regular string-parallel function is used.
+ *
+ * This value was found using the strings_join benchmark results.
+ */
+constexpr size_type AVG_CHAR_BYTES_THRESHOLD = 128;
+
+struct join_base_fn {
+  column_device_view const d_strings;
+  string_view d_separator;
+  string_scalar_device_view d_narep;
+
+  __device__ std::pair<string_view, string_view> process_string(size_type idx) const
+  {
+    string_view d_str{};
+    string_view d_sep = (idx + 1 < d_strings.size()) ? d_separator : d_str;
+    if (d_strings.is_null(idx)) {
+      if (d_narep.is_valid()) {
+        d_str = d_narep.value();
+      } else {
+        // if null and no narep, don't output a separator either
+        d_sep = d_str;
+      }
+    } else {
+      d_str = d_strings.element<string_view>(idx);
+    }
+    return {d_str, d_sep};
+  }
+};
+
+/**
+ * @brief Compute output sizes and write output bytes
+ *
+ * This functor is suitable for make_strings_children
+ */
+struct join_fn : public join_base_fn {
+  size_type* d_offsets{};
+  char* d_chars{};
+
+  join_fn(column_device_view const d_strings,
+          string_view d_separator,
+          string_scalar_device_view d_narep)
+    : join_base_fn{d_strings, d_separator, d_narep}
+  {
+  }
+
+  __device__ void operator()(size_type idx) const
+  {
+    auto [d_str, d_sep] = process_string(idx);
+
+    char* d_buffer  = d_chars ? d_chars + d_offsets[idx] : nullptr;
+    size_type bytes = 0;
+    if (d_buffer) {
+      d_buffer = detail::copy_string(d_buffer, d_str);
+      d_buffer = detail::copy_string(d_buffer, d_sep);
+    } else {
+      bytes += d_str.size_bytes() + d_sep.size_bytes();
+    }
+    if (!d_chars) { d_offsets[idx] = bytes; }
+  }
+};
+
+/**
+ * @brief Recode string pointers for use with make_strings_column that
+ * employs an efficient gather algorithm
+ */
+struct join_gather_fn : public join_base_fn {
+  string_index_pair* d_results;
+
+  join_gather_fn(column_device_view const d_strings,
+                 string_view d_separator,
+                 string_scalar_device_view d_narep,
+                 string_index_pair* d_results)
+    : join_base_fn{d_strings, d_separator, d_narep}, d_results{d_results}
+  {
+  }
+
+  __device__ void operator()(size_type idx) const
+  {
+    auto [d_str, d_sep] = process_string(idx);
+
+    auto d_output = d_results + (idx * 2);
+    d_output[0]   = string_index_pair{d_str.data(), d_str.size_bytes()};
+    d_output[1]   = string_index_pair{d_sep.data(), d_sep.size_bytes()};
+  }
+};
+
+}  // namespace
+
+std::unique_ptr<column> join_strings(strings_column_view const& input,
                                      string_scalar const& separator,
                                      string_scalar const& narep,
                                      rmm::cuda_stream_view stream,
                                      rmm::mr::device_memory_resource* mr)
 {
-  auto strings_count = strings.size();
-  if (strings_count == 0) return make_empty_column(type_id::STRING);
+  if (input.is_empty()) { return make_empty_column(type_id::STRING); }
 
   CUDF_EXPECTS(separator.is_valid(stream), "Parameter separator must be a valid string_scalar");
 
   string_view d_separator(separator.data(), separator.size());
   auto d_narep = get_scalar_device_view(const_cast<string_scalar&>(narep));
 
-  auto strings_column = column_device_view::create(strings.parent(), stream);
-  auto d_strings      = *strings_column;
-
-  // create an offsets array for building the output memory layout
-  rmm::device_uvector<size_type> output_offsets(strings_count + 1, stream);
-  auto d_output_offsets = output_offsets.data();
-  // using inclusive-scan to compute last entry which is the total size
-  thrust::transform_inclusive_scan(
-    rmm::exec_policy(stream),
-    thrust::make_counting_iterator<size_type>(0),
-    thrust::make_counting_iterator<size_type>(strings_count),
-    d_output_offsets + 1,
-    [d_strings, d_separator, d_narep] __device__(size_type idx) {
-      size_type bytes = 0;
-      if (d_strings.is_null(idx)) {
-        if (!d_narep.is_valid()) return 0;  // skip nulls
-        bytes += d_narep.size();
-      } else
-        bytes += d_strings.element<string_view>(idx).size_bytes();
-      if ((idx + 1) < d_strings.size()) bytes += d_separator.size_bytes();
-      return bytes;
-    },
-    thrust::plus<size_type>());
-
-  output_offsets.set_element_to_zero_async(0, stream);
-  // total size is the last entry
-  size_type const bytes = output_offsets.back_element(stream);
-
-  // build offsets column (only 1 string so 2 offset entries)
-  auto offsets_column =
-    make_numeric_column(data_type{type_id::INT32}, 2, mask_state::UNALLOCATED, stream, mr);
-  auto offsets_view = offsets_column->mutable_view();
-  // set the first entry to 0 and the last entry to bytes
-  int32_t new_offsets[] = {0, static_cast<int32_t>(bytes)};
-  CUDF_CUDA_TRY(cudaMemcpyAsync(offsets_view.data<int32_t>(),
-                                new_offsets,
-                                sizeof(new_offsets),
-                                cudaMemcpyDefault,
-                                stream.value()));
-
-  // build null mask
-  // only one entry so it is either all valid or all null
+  auto d_strings = column_device_view::create(input.parent(), stream);
+
+  auto chars_column = [&] {
+    if ((input.size() == input.null_count()) ||
+        ((input.chars_size() / (input.size() - input.null_count())) < AVG_CHAR_BYTES_THRESHOLD)) {
+      return std::get<1>(
+        make_strings_children(join_fn{*d_strings, d_separator, d_narep}, input.size(), stream, mr));
+    }
+
+    rmm::device_uvector<string_index_pair> indices(2L * input.size(), stream);
+    thrust::for_each_n(rmm::exec_policy(stream),
+                       thrust::make_counting_iterator<size_type>(0),
+                       input.size(),
+                       join_gather_fn{*d_strings, d_separator, d_narep, indices.data()});
+    // build the strings column and commandeer the chars column
+    auto joined_col = make_strings_column(indices.begin(), indices.end(), stream, mr);
+    return std::move(joined_col->release().children.back());
+  }();
+
+  // build the offsets: single string has offsets [0,chars-size]
+  rmm::device_uvector<size_type> offsets(2, stream);
+  offsets.set_element_to_zero_async(0, stream);
+  offsets.set_element(1, chars_column->size(), stream);
+  auto offsets_column = std::make_unique<column>(std::move(offsets), rmm::device_buffer{}, 0);
+
+  // build the null mask: only one output row so it is either all-valid or all-null
   auto const null_count =
-    static_cast<size_type>(strings.null_count() == strings_count && !narep.is_valid(stream));
-  auto null_mask    = null_count
-                        ? cudf::detail::create_null_mask(1, cudf::mask_state::ALL_NULL, stream, mr)
-                        : rmm::device_buffer{0, stream, mr};
-  auto chars_column = create_chars_child_column(bytes, stream, mr);
-  auto d_chars      = chars_column->mutable_view().data<char>();
-  thrust::for_each_n(
-    rmm::exec_policy(stream),
-    thrust::make_counting_iterator<size_type>(0),
-    strings_count,
-    [d_strings, d_separator, d_narep, d_output_offsets, d_chars] __device__(size_type idx) {
-      size_type offset = d_output_offsets[idx];
-      char* d_buffer   = d_chars + offset;
-      if (d_strings.is_null(idx)) {
-        if (!d_narep.is_valid())
-          return;  // do not write to buffer if element is null (including separator)
-        d_buffer = detail::copy_string(d_buffer, d_narep.value());
-      } else {
-        string_view d_str = d_strings.element<string_view>(idx);
-        d_buffer          = detail::copy_string(d_buffer, d_str);
-      }
-      if ((idx + 1) < d_strings.size()) d_buffer = detail::copy_string(d_buffer, d_separator);
-    });
+    static_cast<size_type>(input.null_count() == input.size() && !narep.is_valid(stream));
+  auto null_mask = null_count
+                     ? cudf::detail::create_null_mask(1, cudf::mask_state::ALL_NULL, stream, mr)
+                     : rmm::device_buffer{0, stream, mr};
 
   return make_strings_column(
     1, std::move(offsets_column), std::move(chars_column), null_count, std::move(null_mask));
diff --git a/cpp/tests/strings/combine/join_strings_tests.cpp b/cpp/tests/strings/combine/join_strings_tests.cpp
index a265803929b..4abe45b663c 100644
--- a/cpp/tests/strings/combine/join_strings_tests.cpp
+++ b/cpp/tests/strings/combine/join_strings_tests.cpp
@@ -58,6 +58,19 @@ TEST_F(JoinStringsTest, Join)
   }
 }
 
+TEST_F(JoinStringsTest, JoinLongStrings)
+{
+  std::string data(200, '0');
+  cudf::test::strings_column_wrapper input({data, data, data, data});
+
+  auto results =
+    cudf::strings::join_strings(cudf::strings_column_view(input), cudf::string_scalar("+"));
+
+  auto expected_data = data + "+" + data + "+" + data + "+" + data;
+  cudf::test::strings_column_wrapper expected({expected_data});
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
+}
+
 TEST_F(JoinStringsTest, JoinZeroSizeStringsColumn)
 {
   cudf::column_view zero_size_strings_column(

From 4fa67bbc1b73f20647444addf6cac7c6f1073654 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Mon, 8 May 2023 10:27:33 -0400
Subject: [PATCH 2/5] use iterator instead of temporary buffer for pairs

---
 cpp/src/strings/combine/join.cu | 33 ++++++++++++++++++++++++++-------
 1 file changed, 26 insertions(+), 7 deletions(-)

diff --git a/cpp/src/strings/combine/join.cu b/cpp/src/strings/combine/join.cu
index 0b9d14d7396..7209a89a680 100644
--- a/cpp/src/strings/combine/join.cu
+++ b/cpp/src/strings/combine/join.cu
@@ -112,6 +112,7 @@ struct join_fn : public join_base_fn {
  * @brief Recode string pointers for use with make_strings_column that
  * employs an efficient gather algorithm
  */
+#if 0
 struct join_gather_fn : public join_base_fn {
   string_index_pair* d_results;
 
@@ -132,7 +133,23 @@ struct join_gather_fn : public join_base_fn {
     d_output[1]   = string_index_pair{d_sep.data(), d_sep.size_bytes()};
   }
 };
+#else
+struct join_gather_fn : public join_base_fn {
+  join_gather_fn(column_device_view const d_strings,
+                 string_view d_separator,
+                 string_scalar_device_view d_narep)
+    : join_base_fn{d_strings, d_separator, d_narep}
+  {
+  }
 
+  __device__ string_index_pair operator()(size_type idx) const
+  {
+    auto [d_str, d_sep] = process_string(idx / 2);
+    return idx % 2 ? string_index_pair{d_sep.data(), d_sep.size_bytes()}
+                   : string_index_pair{d_str.data(), d_str.size_bytes()};
+  }
+};
+#endif
 }  // namespace
 
 std::unique_ptr<column> join_strings(strings_column_view const& input,
@@ -151,19 +168,21 @@ std::unique_ptr<column> join_strings(strings_column_view const& input,
   auto d_strings = column_device_view::create(input.parent(), stream);
 
   auto chars_column = [&] {
-    if ((input.size() == input.null_count()) ||
+    if ((input.size() == input.null_count()) ||  //(input.size() > 0) ||
         ((input.chars_size() / (input.size() - input.null_count())) < AVG_CHAR_BYTES_THRESHOLD)) {
       return std::get<1>(
         make_strings_children(join_fn{*d_strings, d_separator, d_narep}, input.size(), stream, mr));
     }
 
-    rmm::device_uvector<string_index_pair> indices(2L * input.size(), stream);
-    thrust::for_each_n(rmm::exec_policy(stream),
-                       thrust::make_counting_iterator<size_type>(0),
-                       input.size(),
-                       join_gather_fn{*d_strings, d_separator, d_narep, indices.data()});
+    // rmm::device_uvector<string_index_pair> indices(2L * input.size(), stream);
+    // thrust::for_each_n(rmm::exec_policy(stream),
+    //                    thrust::make_counting_iterator<size_type>(0),
+    //                    input.size(),
+    //                    join_gather_fn{*d_strings, d_separator, d_narep, indices.data()});
+    auto indices = cudf::detail::make_counting_transform_iterator(
+      0, join_gather_fn{*d_strings, d_separator, d_narep});
     // build the strings column and commandeer the chars column
-    auto joined_col = make_strings_column(indices.begin(), indices.end(), stream, mr);
+    auto joined_col = make_strings_column(indices, indices + (input.size() * 2), stream, mr);
     return std::move(joined_col->release().children.back());
   }();
 

From defae23bfae00e960cddea7a700813e78c3e2dc4 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Mon, 8 May 2023 11:43:43 -0400
Subject: [PATCH 3/5] set threshold to 32

---
 cpp/src/strings/combine/join.cu | 46 ++++++---------------------------
 1 file changed, 8 insertions(+), 38 deletions(-)

diff --git a/cpp/src/strings/combine/join.cu b/cpp/src/strings/combine/join.cu
index 7209a89a680..91979e7aa90 100644
--- a/cpp/src/strings/combine/join.cu
+++ b/cpp/src/strings/combine/join.cu
@@ -51,7 +51,7 @@ namespace {
  *
  * This value was found using the strings_join benchmark results.
  */
-constexpr size_type AVG_CHAR_BYTES_THRESHOLD = 128;
+constexpr size_type AVG_CHAR_BYTES_THRESHOLD = 32;
 
 struct join_base_fn {
   column_device_view const d_strings;
@@ -108,32 +108,6 @@ struct join_fn : public join_base_fn {
   }
 };
 
-/**
- * @brief Recode string pointers for use with make_strings_column that
- * employs an efficient gather algorithm
- */
-#if 0
-struct join_gather_fn : public join_base_fn {
-  string_index_pair* d_results;
-
-  join_gather_fn(column_device_view const d_strings,
-                 string_view d_separator,
-                 string_scalar_device_view d_narep,
-                 string_index_pair* d_results)
-    : join_base_fn{d_strings, d_separator, d_narep}, d_results{d_results}
-  {
-  }
-
-  __device__ void operator()(size_type idx) const
-  {
-    auto [d_str, d_sep] = process_string(idx);
-
-    auto d_output = d_results + (idx * 2);
-    d_output[0]   = string_index_pair{d_str.data(), d_str.size_bytes()};
-    d_output[1]   = string_index_pair{d_sep.data(), d_sep.size_bytes()};
-  }
-};
-#else
 struct join_gather_fn : public join_base_fn {
   join_gather_fn(column_device_view const d_strings,
                  string_view d_separator,
@@ -145,11 +119,11 @@ struct join_gather_fn : public join_base_fn {
   __device__ string_index_pair operator()(size_type idx) const
   {
     auto [d_str, d_sep] = process_string(idx / 2);
+    // every other string is the separator
     return idx % 2 ? string_index_pair{d_sep.data(), d_sep.size_bytes()}
                    : string_index_pair{d_str.data(), d_str.size_bytes()};
   }
 };
-#endif
 }  // namespace
 
 std::unique_ptr<column> join_strings(strings_column_view const& input,
@@ -168,25 +142,20 @@ std::unique_ptr<column> join_strings(strings_column_view const& input,
   auto d_strings = column_device_view::create(input.parent(), stream);
 
   auto chars_column = [&] {
-    if ((input.size() == input.null_count()) ||  //(input.size() > 0) ||
-        ((input.chars_size() / (input.size() - input.null_count())) < AVG_CHAR_BYTES_THRESHOLD)) {
+    // build the strings column and commandeer the chars column
+    if ((input.size() == input.null_count()) ||
+        ((input.chars_size() / (input.size() - input.null_count())) <= AVG_CHAR_BYTES_THRESHOLD)) {
       return std::get<1>(
         make_strings_children(join_fn{*d_strings, d_separator, d_narep}, input.size(), stream, mr));
     }
-
-    // rmm::device_uvector<string_index_pair> indices(2L * input.size(), stream);
-    // thrust::for_each_n(rmm::exec_policy(stream),
-    //                    thrust::make_counting_iterator<size_type>(0),
-    //                    input.size(),
-    //                    join_gather_fn{*d_strings, d_separator, d_narep, indices.data()});
+    // dynamically feeds index pairs to build the output
     auto indices = cudf::detail::make_counting_transform_iterator(
       0, join_gather_fn{*d_strings, d_separator, d_narep});
-    // build the strings column and commandeer the chars column
     auto joined_col = make_strings_column(indices, indices + (input.size() * 2), stream, mr);
     return std::move(joined_col->release().children.back());
   }();
 
-  // build the offsets: single string has offsets [0,chars-size]
+  // build the offsets: single string output has offsets [0,chars-size]
   rmm::device_uvector<size_type> offsets(2, stream);
   offsets.set_element_to_zero_async(0, stream);
   offsets.set_element(1, chars_column->size(), stream);
@@ -199,6 +168,7 @@ std::unique_ptr<column> join_strings(strings_column_view const& input,
                      ? cudf::detail::create_null_mask(1, cudf::mask_state::ALL_NULL, stream, mr)
                      : rmm::device_buffer{0, stream, mr};
 
+  // perhaps this return a string_scalar instead of a single-row column
   return make_strings_column(
     1, std::move(offsets_column), std::move(chars_column), null_count, std::move(null_mask));
 }

From ee45b1e29368d601ee509c18befd971e54f4d3cb Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Thu, 11 May 2023 09:27:55 -0400
Subject: [PATCH 4/5] change std::pair to thrust::pair

---
 cpp/benchmarks/string/join_strings.cpp | 3 +--
 cpp/src/strings/combine/join.cu        | 2 +-
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/cpp/benchmarks/string/join_strings.cpp b/cpp/benchmarks/string/join_strings.cpp
index 950a742e1d9..a122c0022a9 100644
--- a/cpp/benchmarks/string/join_strings.cpp
+++ b/cpp/benchmarks/string/join_strings.cpp
@@ -15,7 +15,6 @@
  */
 
 #include <benchmarks/common/generate_input.hpp>
-#include <benchmarks/fixture/rmm_pool_raii.hpp>
 
 #include <cudf/strings/combine.hpp>
 #include <cudf/strings/strings_column_view.hpp>
@@ -41,7 +40,7 @@ static void bench_join(nvbench::state& state)
 
   state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
   // gather some throughput statistics as well
-  auto chars_size = input.chars_size();
+  auto const chars_size = input.chars_size();
   state.add_element_count(chars_size, "chars_size");            // number of bytes;
   state.add_global_memory_reads<nvbench::int8_t>(chars_size);   // all bytes are read;
   state.add_global_memory_writes<nvbench::int8_t>(chars_size);  // all bytes are written
diff --git a/cpp/src/strings/combine/join.cu b/cpp/src/strings/combine/join.cu
index 91979e7aa90..8641d9414a0 100644
--- a/cpp/src/strings/combine/join.cu
+++ b/cpp/src/strings/combine/join.cu
@@ -58,7 +58,7 @@ struct join_base_fn {
   string_view d_separator;
   string_scalar_device_view d_narep;
 
-  __device__ std::pair<string_view, string_view> process_string(size_type idx) const
+  __device__ thrust::pair<string_view, string_view> process_string(size_type idx) const
   {
     string_view d_str{};
     string_view d_sep = (idx + 1 < d_strings.size()) ? d_separator : d_str;

From dbea49b9c57cc599e4aaa22eb477eb20760ecd94 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Mon, 15 May 2023 10:12:50 -0400
Subject: [PATCH 5/5] use mr for offsets vector

---
 cpp/src/strings/combine/join.cu | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/cpp/src/strings/combine/join.cu b/cpp/src/strings/combine/join.cu
index e52987fa098..faf1be6a26f 100644
--- a/cpp/src/strings/combine/join.cu
+++ b/cpp/src/strings/combine/join.cu
@@ -157,10 +157,8 @@ std::unique_ptr<column> join_strings(strings_column_view const& input,
   }();
 
   // build the offsets: single string output has offsets [0,chars-size]
-  auto offsets =
-    cudf::detail::make_device_uvector_async(std::vector<size_type>({0, chars_column->size()}),
-                                            stream,
-                                            rmm::mr::get_current_device_resource());
+  auto offsets = cudf::detail::make_device_uvector_async(
+    std::vector<size_type>({0, chars_column->size()}), stream, mr);
   auto offsets_column = std::make_unique<column>(std::move(offsets), rmm::device_buffer{}, 0);
 
   // build the null mask: only one output row so it is either all-valid or all-null