diff --git a/cpp/src/strings/split/split.cu b/cpp/src/strings/split/split.cu
index 2c6a0b2cf22..bc01a46ca6d 100644
--- a/cpp/src/strings/split/split.cu
+++ b/cpp/src/strings/split/split.cu
@@ -34,6 +34,7 @@
 #include
 #include
+#include
 #include
 #include
 #include
diff --git a/cpp/src/strings/split/split.cuh b/cpp/src/strings/split/split.cuh
index 69a11aabfcd..ae3c0b3aa12 100644
--- a/cpp/src/strings/split/split.cuh
+++ b/cpp/src/strings/split/split.cuh
@@ -30,12 +30,9 @@
 
 #include
 #include
-#include
 #include
-#include
 #include
 #include
-#include
 #include
 
 namespace cudf::strings::detail {
@@ -297,6 +294,44 @@ std::unique_ptr<column> create_offsets_from_positions(strings_column_view const&
                                                       rmm::cuda_stream_view stream,
                                                       rmm::device_async_resource_ref mr);
 
+/**
+ * @brief Count the number of delimiters in a strings column
+ *
+ * @tparam Tokenizer Functor containing `is_delimiter` function
+ * @tparam block_size Number of threads per block
+ * @tparam bytes_per_thread Number of bytes processed per thread
+ *
+ * @param tokenizer For checking delimiters
+ * @param d_offsets Offsets for the strings column
+ * @param chars_bytes Number of bytes in the strings column
+ * @param d_output Result of the count
+ */
+template <typename Tokenizer, int64_t block_size, size_type bytes_per_thread>
+CUDF_KERNEL void count_delimiters_kernel(Tokenizer tokenizer,
+                                         cudf::detail::input_offsetalator d_offsets,
+                                         int64_t chars_bytes,
+                                         int64_t* d_output)
+{
+  auto const idx      = cudf::detail::grid_1d::global_thread_id();
+  auto const byte_idx = static_cast<int64_t>(idx) * bytes_per_thread;
+  auto const lane_idx = static_cast<cudf::size_type>(threadIdx.x);
+
+  using block_reduce = cub::BlockReduce<int64_t, block_size>;
+  __shared__ typename block_reduce::TempStorage temp_storage;
+
+  int64_t count = 0;
+  // each thread processes multiple bytes
+  for (auto i = byte_idx; (i < (byte_idx + bytes_per_thread)) && (i < chars_bytes); ++i) {
+    count += tokenizer.is_delimiter(i, d_offsets, chars_bytes);
+  }
+  auto const total = block_reduce(temp_storage).Reduce(count, cub::Sum());
+
+  if ((lane_idx == 0) && (total > 0)) {
+    cuda::atomic_ref<int64_t, cuda::thread_scope_device> ref{*d_output};
+    ref.fetch_add(total, cuda::std::memory_order_relaxed);
+  }
+}
+
 /**
  * @brief Helper function used by split/rsplit and split_record/rsplit_record
  *
@@ -326,17 +361,19 @@ std::pair<std::unique_ptr<column>, rmm::device_uvector<string_index_pair>> split
     cudf::detail::offsetalator_factory::make_input_iterator(input.offsets(), input.offset());
 
   // count the number of delimiters in the entire column
-  auto const delimiter_count =
-    thrust::count_if(rmm::exec_policy(stream),
-                     thrust::counting_iterator<int64_t>(0),
-                     thrust::counting_iterator<int64_t>(chars_bytes),
-                     [tokenizer, d_offsets, chars_bytes] __device__(int64_t idx) {
-                       return tokenizer.is_delimiter(idx, d_offsets, chars_bytes);
-                     });
+  rmm::device_scalar<int64_t> d_count(0, stream);
+  constexpr int64_t block_size         = 512;
+  constexpr size_type bytes_per_thread = 4;
+  auto const num_blocks = util::div_rounding_up_safe(
+    util::div_rounding_up_safe(chars_bytes, static_cast<int64_t>(bytes_per_thread)), block_size);
+  count_delimiters_kernel<Tokenizer, block_size, bytes_per_thread>
+    <<<num_blocks, block_size, 0, stream.value()>>>(
+      tokenizer, d_offsets, chars_bytes, d_count.data());
+
   // Create a vector of every delimiter position in the chars column.
   // These may include overlapping or otherwise out-of-bounds delimiters which
   // will be resolved during token processing.
-  auto delimiter_positions = rmm::device_uvector<int64_t>(delimiter_count, stream);
+  auto delimiter_positions = rmm::device_uvector<int64_t>(d_count.value(stream), stream);
   auto d_positions         = delimiter_positions.data();
   cudf::detail::copy_if_safe(
     thrust::counting_iterator<int64_t>(0),
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index 42b7f089d61..b5515cfcfa6 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -574,6 +574,7 @@ ConfigureTest(
   large_strings/merge_tests.cpp
   large_strings/parquet_tests.cpp
   large_strings/reshape_tests.cpp
+  large_strings/split_strings_tests.cpp
   GPUS 1
   PERCENT 100
 )
diff --git a/cpp/tests/large_strings/split_strings_tests.cpp b/cpp/tests/large_strings/split_strings_tests.cpp
new file mode 100644
index 00000000000..320fb222241
--- /dev/null
+++ b/cpp/tests/large_strings/split_strings_tests.cpp
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "large_strings_fixture.hpp"
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+struct StringsSplitTest : public cudf::test::StringsLargeTest {};
+
+TEST_F(StringsSplitTest, Split)
+{
+  auto const expected   = this->long_column();
+  auto const view       = cudf::column_view(expected);
+  auto const multiplier = 10;
+  auto const separator  = cudf::string_scalar("|");
+  auto const input      = cudf::strings::concatenate(
+    cudf::table_view(std::vector<cudf::column_view>(multiplier, view)), separator);
+
+  {
+    auto result = cudf::strings::split(cudf::strings_column_view(input->view()), separator);
+    for (auto c : result->view()) {
+      CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(c, expected);
+    }
+  }
+
+  auto lc = cudf::strings::split_record(cudf::strings_column_view(input->view()), separator);
+  auto lv = cudf::lists_column_view(lc->view());
+  auto sv = cudf::strings_column_view(lv.child());
+  EXPECT_EQ(sv.size(), view.size() * multiplier);
+  EXPECT_EQ(sv.offsets().type(), cudf::data_type{cudf::type_id::INT64});
+}
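
Note on the counting change above: the generic per-byte thrust::count_if is replaced by count_delimiters_kernel, in which each thread scans bytes_per_thread bytes, the per-thread counts are combined with a cub::BlockReduce, and each block issues at most one cuda::atomic_ref::fetch_add into the device-wide total. For anyone who wants to try that pattern outside libcudf, here is a minimal standalone CUDA sketch. The count_pipes_kernel name, the '|' predicate, and the host-side scaffolding are illustrative assumptions and not part of this change; only the block_size = 512 and bytes_per_thread = 4 values mirror the diff.

// Standalone sketch of the counting pattern used by count_delimiters_kernel:
// per-thread byte chunks -> cub::BlockReduce -> one atomic add per block.
// The '|' predicate and the host scaffolding are illustrative assumptions.
#include <cub/cub.cuh>
#include <cuda/atomic>

#include <cstdint>
#include <cstdio>
#include <vector>

constexpr int block_size       = 512;  // threads per block (same value as the diff)
constexpr int bytes_per_thread = 4;    // bytes examined per thread (same value as the diff)

__global__ void count_pipes_kernel(char const* d_chars, int64_t chars_bytes, int64_t* d_output)
{
  auto const idx      = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  auto const byte_idx = idx * bytes_per_thread;

  using block_reduce = cub::BlockReduce<int64_t, block_size>;
  __shared__ typename block_reduce::TempStorage temp_storage;

  // each thread examines a small contiguous range of bytes
  int64_t count = 0;
  for (auto i = byte_idx; (i < byte_idx + bytes_per_thread) && (i < chars_bytes); ++i) {
    count += (d_chars[i] == '|');
  }

  // reduce within the block, then a single atomic add per block into the global total
  auto const total = block_reduce(temp_storage).Reduce(count, cub::Sum());
  if (threadIdx.x == 0 && total > 0) {
    cuda::atomic_ref<int64_t, cuda::thread_scope_device> ref{*d_output};
    ref.fetch_add(total, cuda::std::memory_order_relaxed);
  }
}

int main()
{
  // build a 1 MiB buffer with a '|' every 100 bytes
  std::vector<char> h_chars(1 << 20, 'x');
  for (std::size_t i = 0; i < h_chars.size(); i += 100) { h_chars[i] = '|'; }

  char* d_chars{};
  int64_t* d_count{};
  cudaMalloc(&d_chars, h_chars.size());
  cudaMalloc(&d_count, sizeof(int64_t));
  cudaMemcpy(d_chars, h_chars.data(), h_chars.size(), cudaMemcpyHostToDevice);
  cudaMemset(d_count, 0, sizeof(int64_t));

  auto const chars_bytes     = static_cast<int64_t>(h_chars.size());
  auto const bytes_per_block = static_cast<int64_t>(block_size) * bytes_per_thread;
  auto const num_blocks      = (chars_bytes + bytes_per_block - 1) / bytes_per_block;
  count_pipes_kernel<<<static_cast<unsigned int>(num_blocks), block_size>>>(
    d_chars, chars_bytes, d_count);

  int64_t h_count = 0;
  cudaMemcpy(&h_count, d_count, sizeof(int64_t), cudaMemcpyDeviceToHost);
  std::printf("delimiters found: %lld\n", static_cast<long long>(h_count));

  cudaFree(d_chars);
  cudaFree(d_count);
  return 0;
}

Compiled with nvcc and run, the sketch should report roughly one delimiter per 100 bytes of the generated buffer, matching what the host loop inserts.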