From f4fe9037accfceaa7316ab3fa1c5a6e7f51e1702 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Wed, 13 Mar 2024 15:39:36 -0400
Subject: [PATCH 1/5] Use offsetalator in cudf::strings::detail::merge

---
 cpp/CMakeLists.txt                            |  1 +
 cpp/include/cudf/strings/detail/merge.hpp     | 47 +++++++++++++++++
 cpp/src/merge/merge.cu                        | 10 ++--
 .../merge.cuh => src/strings/merge/merge.cu}  | 51 +++++++------------
 4 files changed, 69 insertions(+), 40 deletions(-)
 create mode 100644 cpp/include/cudf/strings/detail/merge.hpp
 rename cpp/{include/cudf/strings/detail/merge.cuh => src/strings/merge/merge.cu} (66%)
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 4f64c094ead..695b24345ed 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -583,6 +583,7 @@ add_library(
   src/strings/filling/fill.cu
   src/strings/filter_chars.cu
   src/strings/like.cu
+  src/strings/merge/merge.cu
   src/strings/padding.cu
   src/strings/regex/regcomp.cpp
   src/strings/regex/regexec.cpp
diff --git a/cpp/include/cudf/strings/detail/merge.hpp b/cpp/include/cudf/strings/detail/merge.hpp
new file mode 100644
index 00000000000..054ade7f69c
--- /dev/null
+++ b/cpp/include/cudf/strings/detail/merge.hpp
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cudf/column/column.hpp>
+#include <cudf/detail/merge.hpp>
+#include <cudf/strings/strings_column_view.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+
+namespace cudf {
+namespace strings {
+namespace detail {
+/**
+ * @brief Merges two strings columns
+ *
+ * Caller must set the validity mask in the output column.
+ *
+ * @param lhs First column
+ * @param rhs Second column
+ * @param row_order (side, row index) pairs naming the source column and row of each output row
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return New strings column
+ */
+std::unique_ptr<column> merge(strings_column_view const& lhs,
+                              strings_column_view const& rhs,
+                              cudf::detail::index_vector const& row_order,
+                              rmm::cuda_stream_view stream,
+                              rmm::mr::device_memory_resource* mr);
+
+}  // namespace detail
+}  // namespace strings
+}  // namespace cudf
diff --git a/cpp/src/merge/merge.cu b/cpp/src/merge/merge.cu
index 8be503025bd..dedb2acb9da 100644
--- a/cpp/src/merge/merge.cu
+++ b/cpp/src/merge/merge.cu
@@ -27,7 +27,7 @@
 #include <cudf/dictionary/detail/update_keys.hpp>
 #include <cudf/lists/detail/concatenate.hpp>
 #include <cudf/lists/lists_column_view.hpp>
-#include <cudf/strings/detail/merge.cuh>
+#include <cudf/strings/detail/merge.hpp>
 #include <cudf/structs/structs_column_view.hpp>
 #include <cudf/table/experimental/row_operators.cuh>
 #include <cudf/table/table.hpp>
@@ -433,12 +433,8 @@ std::unique_ptr<column> column_merger::operator()<cudf::string_view>(
   rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource* mr) const
 {
-  auto column = strings::detail::merge<index_type>(strings_column_view(lcol),
-                                                   strings_column_view(rcol),
-                                                   row_order_.begin(),
-                                                   row_order_.end(),
-                                                   stream,
-                                                   mr);
+  auto column = strings::detail::merge(
+    strings_column_view(lcol), strings_column_view(rcol), row_order_, stream, mr);
   if (lcol.has_nulls() || rcol.has_nulls()) {
     auto merged_view = column->mutable_view();
     materialize_bitmask(
diff --git a/cpp/include/cudf/strings/detail/merge.cuh b/cpp/src/strings/merge/merge.cu
similarity index 66%
rename from cpp/include/cudf/strings/detail/merge.cuh
rename to cpp/src/strings/merge/merge.cu
index f05e957783f..9bc5d3d334f 100644
--- a/cpp/include/cudf/strings/detail/merge.cuh
+++ b/cpp/src/strings/merge/merge.cu
@@ -13,13 +13,13 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#pragma once
 
 #include <cudf/column/column.hpp>
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_factories.hpp>
-#include <cudf/detail/merge.hpp>
 #include <cudf/detail/null_mask.hpp>
+#include <cudf/detail/offsets_iterator_factory.cuh>
+#include <cudf/strings/detail/merge.hpp>
 #include <cudf/strings/detail/strings_children.cuh>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
@@ -36,62 +36,47 @@
 namespace cudf {
 namespace strings {
 namespace detail {
-/**
- * @brief Merges two strings columns.
- *
- * Caller must set the validity mask in the output column.
- *
- * @tparam row_order_iterator This must be an iterator for type thrust::tuple<side,size_type>.
- *
- * @param lhs First column.
- * @param rhs Second column.
- * @param row_order Indexes for each column.
- * @param stream CUDA stream used for device memory operations and kernel launches.
- * @param mr Device memory resource used to allocate the returned column's device memory.
- * @return New strings column.
- */
-template <typename index_type, typename row_order_iterator>
 std::unique_ptr<column> merge(strings_column_view const& lhs,
                               strings_column_view const& rhs,
-                              row_order_iterator begin,
-                              row_order_iterator end,
+                              cudf::detail::index_vector const& row_order,
                               rmm::cuda_stream_view stream,
                               rmm::mr::device_memory_resource* mr)
 {
   using cudf::detail::side;
-  size_type strings_count = static_cast<size_type>(std::distance(begin, end));
-  if (strings_count == 0) return make_empty_column(type_id::STRING);
+  if (row_order.is_empty()) { return make_empty_column(type_id::STRING); }
+  auto const strings_count = static_cast<cudf::size_type>(row_order.size());
 
-  auto lhs_column = column_device_view::create(lhs.parent(), stream);
-  auto d_lhs      = *lhs_column;
-  auto rhs_column = column_device_view::create(rhs.parent(), stream);
-  auto d_rhs      = *rhs_column;
+  auto const lhs_column = column_device_view::create(lhs.parent(), stream);
+  auto const d_lhs      = *lhs_column;
+  auto const rhs_column = column_device_view::create(rhs.parent(), stream);
+  auto const d_rhs      = *rhs_column;
 
   // caller will set the null mask
-  rmm::device_buffer null_mask{0, stream, mr};
-  size_type null_count = lhs.null_count() + rhs.null_count();
-  if (null_count > 0)
-    null_mask = cudf::detail::create_null_mask(strings_count, mask_state::ALL_VALID, stream, mr);
+  auto const null_count = lhs.null_count() + rhs.null_count();
+  auto null_mask        = (null_count > 0) ? cudf::detail::create_null_mask(
+                                        strings_count, mask_state::ALL_VALID, stream, mr)
+                                           : rmm::device_buffer{};
 
   // build offsets column
   auto offsets_transformer =
     cuda::proclaim_return_type<size_type>([d_lhs, d_rhs] __device__(auto index_pair) {
       auto const [side, index] = index_pair;
-      if (side == side::LEFT ? d_lhs.is_null(index) : d_rhs.is_null(index)) return 0;
+      if (side == side::LEFT ? d_lhs.is_null(index) : d_rhs.is_null(index)) { return 0; }
       auto d_str =
         side == side::LEFT ? d_lhs.element<string_view>(index) : d_rhs.element<string_view>(index);
       return d_str.size_bytes();
     });
+  auto const begin             = row_order.begin();
   auto offsets_transformer_itr = thrust::make_transform_iterator(begin, offsets_transformer);
-  auto [offsets_column, bytes] = cudf::detail::make_offsets_child_column(
+  auto [offsets_column, bytes] = cudf::strings::detail::make_offsets_child_column(
     offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr);
-  auto d_offsets = offsets_column->view().template data<int32_t>();
+  auto d_offsets = cudf::detail::offsetalator_factory::make_input_iterator(offsets_column->view());
 
   // create the chars column
   rmm::device_uvector<char> chars(bytes, stream, mr);
   auto d_chars = chars.data();
   thrust::for_each_n(rmm::exec_policy(stream),
-                     thrust::make_counting_iterator<size_type>(0),
+                     thrust::counting_iterator<size_type>(0),
                      strings_count,
                      [d_lhs, d_rhs, begin, d_offsets, d_chars] __device__(size_type idx) {
                        auto const [side, index] = begin[idx];

From 5ad6dda0a9a7e5366f3563c7d64f103a481c55dc Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Thu, 14 Mar 2024 12:58:56 -0400
Subject: [PATCH 2/5] add large strings test

---
 .../cudf/strings/detail/strings_children.cuh  | 36 +++++++++++--------
 cpp/tests/merge/merge_string_test.cpp         | 36 +++++++++++++++++++
 2 files changed, 57 insertions(+), 15 deletions(-)

diff --git a/cpp/include/cudf/strings/detail/strings_children.cuh b/cpp/include/cudf/strings/detail/strings_children.cuh
index 49c4be88ca5..d77bdbe6b27 100644
--- a/cpp/include/cudf/strings/detail/strings_children.cuh
+++ b/cpp/include/cudf/strings/detail/strings_children.cuh
@@ -153,32 +153,38 @@ std::pair<std::unique_ptr<column>, int64_t> make_offsets_child_column(
     data_type{type_id::INT32}, strings_count + 1, mask_state::UNALLOCATED, stream, mr);
   auto d_offsets = offsets_column->mutable_view().template data<int32_t>();
 
-  // The number of offsets is strings_count+1 so to build the offsets from the sizes
-  // using exclusive-scan technically requires strings_count+1 input values even though
-  // the final input value is never used.
-  // The input iterator is wrapped here to allow the 'last value' to be safely read.
-  auto map_fn = cuda::proclaim_return_type<size_type>(
-    [begin, strings_count] __device__(size_type idx) -> size_type {
-      return idx < strings_count ? static_cast<size_type>(begin[idx]) : size_type{0};
-    });
-  auto input_itr = cudf::detail::make_counting_transform_iterator(0, map_fn);
+  // First, store the sizes into a temporary vector.
+  // This helps minimize calls to the input iterators which may have expensive
+  // logic to compute each size. This can also speed up the scan-to-offsets
+  // since the scan uses aggressive inlining on the iterator calls.
+  // Also, if this requires int64 offsets, the sizes can be reused.
+  auto sizes = rmm::device_uvector<size_type>(strings_count + 1, stream);
+  // The number of offsets is strings_count+1 so the sizes occupy all except the
+  // last entry which is filled in by the scan and technically should never be read.
+  thrust::copy(rmm::exec_policy_nosync(stream), begin, end, sizes.begin());
   // Use the sizes-to-offsets iterator to compute the total number of elements
-  auto const total_elements =
-    cudf::detail::sizes_to_offsets(input_itr, input_itr + strings_count + 1, d_offsets, stream);
+  auto const total_bytes =
+    cudf::detail::sizes_to_offsets(sizes.begin(), sizes.end(), d_offsets, stream);
 
   // TODO: replace exception with if-statement when enabling creating INT64 offsets
-  CUDF_EXPECTS(total_elements <= size_type_max,
+  CUDF_EXPECTS(total_bytes <= size_type_max,
                "Size of output exceeds the character size limit",
                std::overflow_error);
-  // if (total_elements >= get_offset64_threshold()) {
+  // auto const threshold = get_offset64_threshold();
+  // if (!is_large_strings_enabled()) {
+  //   CUDF_EXPECTS(
+  //     total_bytes < threshold, "Size of output exceeds the column size limit",
+  //     std::overflow_error);
+  // }
+  // if (total_bytes >= get_offset64_threshold()) {
   //   // recompute as int64 offsets when above the threshold
   //   offsets_column = make_numeric_column(
   //     data_type{type_id::INT64}, strings_count + 1, mask_state::UNALLOCATED, stream, mr);
   //   auto d_offsets64 = offsets_column->mutable_view().template data<int64_t>();
-  //   sizes_to_offsets(input_itr, input_itr + strings_count + 1, d_offsets64, stream);
+  //   cudf::detail::sizes_to_offsets(sizes.begin(), sizes.end(), d_offsets64, stream);
   // }
 
-  return std::pair(std::move(offsets_column), total_elements);
+  return std::pair(std::move(offsets_column), total_bytes);
 }
 
 }  // namespace detail
diff --git a/cpp/tests/merge/merge_string_test.cpp b/cpp/tests/merge/merge_string_test.cpp
index 28179a7341c..e3c467405a9 100644
--- a/cpp/tests/merge/merge_string_test.cpp
+++ b/cpp/tests/merge/merge_string_test.cpp
@@ -411,3 +411,39 @@ TYPED_TEST(MergeStringTest, Merge2StringKeyNullColumns)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_column_view2, output_column_view2);
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_column_view3, output_column_view3);
 }
+
+class MergeLargeStringsTest : public cudf::test::BaseFixture {};
+
+TEST_F(MergeLargeStringsTest, DISABLED_MergeLargeStrings)
+{
+  // CUDF_TEST_ENABLE_LARGE_STRINGS(); waiting on PR 15195
+  auto itr = thrust::constant_iterator<std::string_view>(
+    "abcdefghijklmnopqrstuvwxyABCDEFGHIJKLMNOPQRSTUVWXY");                // 50 bytes
+  auto input = cudf::test::strings_column_wrapper(itr, itr + 5'000'000);  // 250MB
+  std::vector<cudf::table_view> input_views;
+  for (int i = 0; i < 10; ++i) {  // 10 x 250MB = 2500MB > 2GB
+    input_views.push_back(cudf::table_view({input}));
+  }
+  std::vector<cudf::order> column_order{cudf::order::ASCENDING};
+  std::vector<cudf::null_order> null_precedence{cudf::null_order::AFTER};
+
+  auto result = cudf::merge(input_views, {0}, column_order, null_precedence);
+  auto sv     = cudf::strings_column_view(result->view().column(0));
+  EXPECT_EQ(sv.size(), 50'000'000);  // 10 inputs x 5,000,000 rows
+  EXPECT_EQ(sv.offsets().type(), cudf::data_type{cudf::type_id::INT64});
+
+  // every row holds the same string, so each 5,000,000-row slice must match the input
+  auto splits = std::vector<cudf::size_type>({5'000'000,
+                                              10'000'000,
+                                              15'000'000,
+                                              20'000'000,
+                                              25'000'000,
+                                              30'000'000,
+                                              35'000'000,
+                                              40'000'000,
+                                              45'000'000});
+  auto sliced = cudf::split(sv.parent(), splits);
+  for (auto c : sliced) {
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(c, input);
+  }
+}

From 809700a40df541663bfb3d62beeae40a68e74045 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Wed, 20 Mar 2024 13:57:22 -0400
Subject: [PATCH 3/5] use gather factory for merge results

---
 cpp/benchmarks/CMakeLists.txt                 |  2 +-
 cpp/benchmarks/merge/merge_strings.cpp        | 64 +++++++++++++++++++
 .../cudf/strings/detail/strings_children.cuh  | 25 ++++----
 cpp/src/strings/merge/merge.cu                | 61 ++++++------------
 cpp/tests/merge/merge_string_test.cpp         |  2 +-
 5 files changed, 99 insertions(+), 55 deletions(-)
 create mode 100644 cpp/benchmarks/merge/merge_strings.cpp

diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index c82e475dece..0c0e5cf6abe 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -235,7 +235,7 @@ ConfigureNVBench(HASHING_NVBENCH hashing/hash.cpp)
 # ##################################################################################################
 # * merge benchmark -------------------------------------------------------------------------------
 ConfigureBench(MERGE_BENCH merge/merge.cpp)
-ConfigureNVBench(MERGE_NVBENCH merge/merge_structs.cpp merge/merge_lists.cpp)
+ConfigureNVBench(MERGE_NVBENCH merge/merge_lists.cpp merge/merge_structs.cpp merge/merge_strings.cpp)
 
 # ##################################################################################################
 # * null_mask benchmark ---------------------------------------------------------------------------
diff --git a/cpp/benchmarks/merge/merge_strings.cpp b/cpp/benchmarks/merge/merge_strings.cpp
new file mode 100644
index 00000000000..3d0f1865490
--- /dev/null
+++ b/cpp/benchmarks/merge/merge_strings.cpp
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <benchmarks/common/generate_input.hpp>
+
+#include <cudf/merge.hpp>
+#include <cudf/sorting.hpp>
+#include <cudf/strings/strings_column_view.hpp>
+#include <cudf/utilities/default_stream.hpp>
+
+#include <nvbench/nvbench.cuh>
+
+#include <cstddef>
+#include <limits>
+
+/**
+ * @brief Benchmarks cudf::merge on two sorted strings columns
+ *
+ * The `num_rows` axis sets the row count of each input column and `row_width`
+ * bounds the generated string lengths (NORMAL distribution over 0..row_width).
+ * Combinations whose worst-case total size reaches the cudf::size_type limit
+ * are skipped.
+ */
+void nvbench_merge_strings(nvbench::state& state)
+{
+  auto stream = cudf::get_default_stream();
+
+  auto const num_rows  = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width"));
+  // widen before multiplying so the size estimate itself cannot overflow
+  auto const max_bytes = std::size_t{2} * static_cast<std::size_t>(num_rows) *
+                         static_cast<std::size_t>(row_width);
+  if (max_bytes >= static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
+    state.skip("Skip benchmarks greater than size_type limit");
+    // leave now so a skipped configuration never generates the oversized input
+    return;
+  }
+
+  data_profile const table_profile =
+    data_profile_builder()
+      .distribution(cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width)
+      .no_validity();
+  auto const source_tables = create_random_table(
+    {cudf::type_id::STRING, cudf::type_id::STRING}, row_count{num_rows}, table_profile);
+
+  // sort each generated column independently so both merge inputs are pre-sorted
+  auto const sorted_lhs = cudf::sort(cudf::table_view({source_tables->view().column(0)}));
+  auto const sorted_rhs = cudf::sort(cudf::table_view({source_tables->view().column(1)}));
+  auto const lhs        = sorted_lhs->view().column(0);
+  auto const rhs        = sorted_rhs->view().column(0);
+
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
+  auto chars_size = cudf::strings_column_view(lhs).chars_size(stream) +
+                    cudf::strings_column_view(rhs).chars_size(stream);
+  state.add_global_memory_reads<nvbench::int8_t>(chars_size);   // all bytes are read
+  state.add_global_memory_writes<nvbench::int8_t>(chars_size);  // all bytes are written
+
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    [[maybe_unused]] auto result = cudf::merge(
+      {cudf::table_view({lhs}), cudf::table_view({rhs})}, {0}, {cudf::order::ASCENDING});
+  });
+}
+
+NVBENCH_BENCH(nvbench_merge_strings)
+  .set_name("merge_strings")
+  .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048, 4096})
+  .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216});
diff --git a/cpp/include/cudf/strings/detail/strings_children.cuh b/cpp/include/cudf/strings/detail/strings_children.cuh
index d77bdbe6b27..4ef157eb483 100644
--- a/cpp/include/cudf/strings/detail/strings_children.cuh
+++ b/cpp/include/cudf/strings/detail/strings_children.cuh
@@ -153,25 +153,25 @@ std::pair<std::unique_ptr<column>, int64_t> make_offsets_child_column(
     data_type{type_id::INT32}, strings_count + 1, mask_state::UNALLOCATED, stream, mr);
   auto d_offsets = offsets_column->mutable_view().template data<int32_t>();
 
-  // First, store the sizes into a temporary vector.
-  // This helps minimize calls to the input iterators which may have expensive
-  // logic to compute each size. This can also speed up the scan-to-offsets
-  // since the scan uses aggressive inlining on the iterator calls.
-  // Also, if this requires int64 offsets, the sizes can be reused.
-  auto sizes = rmm::device_uvector<size_type>(strings_count + 1, stream);
-  // The number of offsets is strings_count+1 so the sizes occupy all except the
-  // last entry which is filled in by the scan and technically should never be read.
-  thrust::copy(rmm::exec_policy_nosync(stream), begin, end, sizes.begin());
+  // The number of offsets is strings_count+1 so to build the offsets from the sizes
+  // using exclusive-scan technically requires strings_count+1 input values even though
+  // the final input value is never used.
+  // The input iterator is wrapped here to allow the 'last value' to be safely read.
+  auto map_fn = cuda::proclaim_return_type<size_type>(
+    [begin, strings_count] __device__(size_type idx) -> size_type {
+      return idx < strings_count ? static_cast<size_type>(begin[idx]) : size_type{0};
+    });
+  auto input_itr = cudf::detail::make_counting_transform_iterator(0, map_fn);
   // Use the sizes-to-offsets iterator to compute the total number of elements
   auto const total_bytes =
-    cudf::detail::sizes_to_offsets(sizes.begin(), sizes.end(), d_offsets, stream);
+    cudf::detail::sizes_to_offsets(input_itr, input_itr + strings_count + 1, d_offsets, stream);
 
   // TODO: replace exception with if-statement when enabling creating INT64 offsets
   CUDF_EXPECTS(total_bytes <= size_type_max,
                "Size of output exceeds the character size limit",
                std::overflow_error);
   // auto const threshold = get_offset64_threshold();
-  // if (!is_large_strings_enabled()) {
+  // if (!is_large_strings_enabled()) { implemented in PR 15195
   //   CUDF_EXPECTS(
   //     total_bytes < threshold, "Size of output exceeds the column size limit",
   //     std::overflow_error);
@@ -181,7 +181,8 @@ std::pair<std::unique_ptr<column>, int64_t> make_offsets_child_column(
   //   offsets_column = make_numeric_column(
   //     data_type{type_id::INT64}, strings_count + 1, mask_state::UNALLOCATED, stream, mr);
   //   auto d_offsets64 = offsets_column->mutable_view().template data<int64_t>();
-  //   cudf::detail::sizes_to_offsets(sizes.begin(), sizes.end(), d_offsets64, stream);
+  //   cudf::detail::sizes_to_offsets(input_itr, input_itr + strings_count + 1, d_offsets64,
+  //   stream);
   // }
 
   return std::pair(std::move(offsets_column), total_bytes);
diff --git a/cpp/src/strings/merge/merge.cu b/cpp/src/strings/merge/merge.cu
index 9bc5d3d334f..9c75ee0e0c3 100644
--- a/cpp/src/strings/merge/merge.cu
+++ b/cpp/src/strings/merge/merge.cu
@@ -16,11 +16,8 @@
 
 #include <cudf/column/column.hpp>
 #include <cudf/column/column_device_view.cuh>
-#include <cudf/column/column_factories.hpp>
-#include <cudf/detail/null_mask.hpp>
-#include <cudf/detail/offsets_iterator_factory.cuh>
 #include <cudf/strings/detail/merge.hpp>
-#include <cudf/strings/detail/strings_children.cuh>
+#include <cudf/strings/detail/strings_column_factories.cuh>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
 
@@ -28,9 +25,8 @@
 #include <rmm/exec_policy.hpp>
 
 #include <cuda/functional>
-#include <thrust/for_each.h>
 #include <thrust/iterator/counting_iterator.h>
-#include <thrust/iterator/transform_iterator.h>
+#include <thrust/transform.h>
 #include <thrust/tuple.h>
 
 namespace cudf {
@@ -51,43 +47,26 @@ std::unique_ptr<column> merge(strings_column_view const& lhs,
   auto const rhs_column = column_device_view::create(rhs.parent(), stream);
   auto const d_rhs      = *rhs_column;
 
-  // caller will set the null mask
-  auto const null_count = lhs.null_count() + rhs.null_count();
-  auto null_mask        = (null_count > 0) ? cudf::detail::create_null_mask(
-                                        strings_count, mask_state::ALL_VALID, stream, mr)
-                                           : rmm::device_buffer{};
+  auto const begin = row_order.begin();
 
-  // build offsets column
-  auto offsets_transformer =
-    cuda::proclaim_return_type<size_type>([d_lhs, d_rhs] __device__(auto index_pair) {
-      auto const [side, index] = index_pair;
-      if (side == side::LEFT ? d_lhs.is_null(index) : d_rhs.is_null(index)) { return 0; }
-      auto d_str =
-        side == side::LEFT ? d_lhs.element<string_view>(index) : d_rhs.element<string_view>(index);
-      return d_str.size_bytes();
-    });
-  auto const begin             = row_order.begin();
-  auto offsets_transformer_itr = thrust::make_transform_iterator(begin, offsets_transformer);
-  auto [offsets_column, bytes] = cudf::strings::detail::make_offsets_child_column(
-    offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr);
-  auto d_offsets = cudf::detail::offsetalator_factory::make_input_iterator(offsets_column->view());
+  // build vector of strings
+  rmm::device_uvector<string_index_pair> indices(strings_count, stream);
+  thrust::transform(rmm::exec_policy_nosync(stream),
+                    thrust::make_counting_iterator<size_type>(0),
+                    thrust::make_counting_iterator<size_type>(strings_count),
+                    indices.begin(),
+                    [d_lhs, d_rhs, begin] __device__(size_type idx) {
+                      auto const [side, index] = begin[idx];
+                      if (side == side::LEFT ? d_lhs.is_null(index) : d_rhs.is_null(index)) {
+                        return string_index_pair{nullptr, 0};
+                      }
+                      auto d_str = side == side::LEFT ? d_lhs.element<string_view>(index)
+                                                      : d_rhs.element<string_view>(index);
+                      return string_index_pair{d_str.data(), d_str.size_bytes()};
+                    });
 
-  // create the chars column
-  rmm::device_uvector<char> chars(bytes, stream, mr);
-  auto d_chars = chars.data();
-  thrust::for_each_n(rmm::exec_policy(stream),
-                     thrust::counting_iterator<size_type>(0),
-                     strings_count,
-                     [d_lhs, d_rhs, begin, d_offsets, d_chars] __device__(size_type idx) {
-                       auto const [side, index] = begin[idx];
-                       if (side == side::LEFT ? d_lhs.is_null(index) : d_rhs.is_null(index)) return;
-                       auto d_str = side == side::LEFT ? d_lhs.element<string_view>(index)
-                                                       : d_rhs.element<string_view>(index);
-                       memcpy(d_chars + d_offsets[idx], d_str.data(), d_str.size_bytes());
-                     });
-
-  return make_strings_column(
-    strings_count, std::move(offsets_column), chars.release(), null_count, std::move(null_mask));
+  // convert vector into strings column
+  return make_strings_column(indices.begin(), indices.end(), stream, mr);
 }
 
 }  // namespace detail
diff --git a/cpp/tests/merge/merge_string_test.cpp b/cpp/tests/merge/merge_string_test.cpp
index e3c467405a9..f5b8dfd96f4 100644
--- a/cpp/tests/merge/merge_string_test.cpp
+++ b/cpp/tests/merge/merge_string_test.cpp
@@ -416,7 +416,7 @@ class MergeLargeStringsTest : public cudf::test::BaseFixture {};
 
 TEST_F(MergeLargeStringsTest, DISABLED_MergeLargeStrings)
 {
-  // CUDF_TEST_ENABLE_LARGE_STRINGS(); waiting on PR 15195
+  // CUDF_TEST_ENABLE_LARGE_STRINGS(); implemented in PR 15195
   auto itr = thrust::constant_iterator<std::string_view>(
     "abcdefghijklmnopqrstuvwxyABCDEFGHIJKLMNOPQRSTUVWXY");                // 50 bytes
   auto input = cudf::test::strings_column_wrapper(itr, itr + 5'000'000);  // 250MB

From 81af79a07622c9ae52002ec6c1c268132bcc958d Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Wed, 20 Mar 2024 13:57:46 -0400
Subject: [PATCH 4/5] fix cmake format

---
 cpp/benchmarks/CMakeLists.txt | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index 0c0e5cf6abe..59387638299 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -235,7 +235,9 @@ ConfigureNVBench(HASHING_NVBENCH hashing/hash.cpp)
 # ##################################################################################################
 # * merge benchmark -------------------------------------------------------------------------------
 ConfigureBench(MERGE_BENCH merge/merge.cpp)
-ConfigureNVBench(MERGE_NVBENCH merge/merge_lists.cpp merge/merge_structs.cpp merge/merge_strings.cpp)
+ConfigureNVBench(
+  MERGE_NVBENCH merge/merge_lists.cpp merge/merge_structs.cpp merge/merge_strings.cpp
+)
 
 # ##################################################################################################
 # * null_mask benchmark ---------------------------------------------------------------------------

From a42e5dbffcafe0085405b8eff184e49c7b23938e Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Wed, 20 Mar 2024 14:49:09 -0400
Subject: [PATCH 5/5] fix example code (until 15346 is merged)

---
 cpp/examples/strings/custom_optimized.cu | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/cpp/examples/strings/custom_optimized.cu b/cpp/examples/strings/custom_optimized.cu
index cefa3346150..62ca19a5ca9 100644
--- a/cpp/examples/strings/custom_optimized.cu
+++ b/cpp/examples/strings/custom_optimized.cu
@@ -153,8 +153,12 @@ std::unique_ptr<cudf::column> redact_strings(cudf::column_view const& names,
   redact_kernel<<<blocks, block_size, 0, stream.value()>>>(
     *d_names, *d_visibilities, offsets.data(), chars.data());
 
-  // create column from offsets and chars vectors (no copy is performed)
-  auto result = cudf::make_strings_column(names.size(), std::move(offsets), chars.release(), {}, 0);
+  // create column from offsets vector (move only)
+  auto offsets_column = std::make_unique<cudf::column>(std::move(offsets), rmm::device_buffer{}, 0);
+
+  // create strings column from the offsets column and chars buffer (no copy is performed)
+  auto result = cudf::make_strings_column(
+    names.size(), std::move(offsets_column), chars.release(), 0, rmm::device_buffer{});
 
   // wait for all of the above to finish
   stream.synchronize();