diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml
index 275b8f9332f..a4e73c86e0e 100644
--- a/conda/recipes/libcudf/meta.yaml
+++ b/conda/recipes/libcudf/meta.yaml
@@ -107,6 +107,7 @@ outputs:
         - test -f $PREFIX/include/cudf/detail/binaryop.hpp
         - test -f $PREFIX/include/cudf/detail/calendrical_month_sequence.cuh
         - test -f $PREFIX/include/cudf/detail/concatenate.hpp
+        - test -f $PREFIX/include/cudf/detail/concatenate_masks.hpp
         - test -f $PREFIX/include/cudf/detail/contiguous_split.hpp
         - test -f $PREFIX/include/cudf/detail/copy.hpp
         - test -f $PREFIX/include/cudf/detail/datetime.hpp
diff --git a/cpp/include/cudf/detail/concatenate.cuh b/cpp/include/cudf/detail/concatenate_masks.hpp
similarity index 76%
rename from cpp/include/cudf/detail/concatenate.cuh
rename to cpp/include/cudf/detail/concatenate_masks.hpp
index 51bcb1afa1f..e7086ea17a5 100644
--- a/cpp/include/cudf/detail/concatenate.cuh
+++ b/cpp/include/cudf/detail/concatenate_masks.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,14 +17,11 @@
 
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_view.hpp>
-#include <cudf/concatenate.hpp>
-#include <cudf/detail/concatenate.hpp>
-#include <cudf/table/table_view.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-
-#include <vector>
+#include <rmm/device_buffer.hpp>
+#include <rmm/mr/device/device_memory_resource.hpp>
 
 namespace cudf {
 //! Inner interfaces and implementations
@@ -39,12 +36,13 @@ namespace detail {
  * @param dest_mask The output buffer to copy null masks into
  * @param output_size The total number of null masks bits that are being copied
  * @param stream CUDA stream used for device memory operations and kernel launches.
+ * @return The number of nulls
  */
-void concatenate_masks(device_span<column_device_view const> d_views,
-                       device_span<size_t const> d_offsets,
-                       bitmask_type* dest_mask,
-                       size_type output_size,
-                       rmm::cuda_stream_view stream);
+size_type concatenate_masks(device_span<column_device_view const> d_views,
+                            device_span<size_t const> d_offsets,
+                            bitmask_type* dest_mask,
+                            size_type output_size,
+                            rmm::cuda_stream_view stream);
 
 /**
  * @brief Concatenates `views[i]`'s bitmask from the bits
@@ -54,10 +52,11 @@ void concatenate_masks(device_span<column_device_view const> d_views,
  * @param views Column views whose bitmasks will be concatenated
  * @param dest_mask The output buffer to copy null masks into
  * @param stream CUDA stream used for device memory operations and kernel launches.
+ * @return The number of nulls
  */
-void concatenate_masks(host_span<column_view const> views,
-                       bitmask_type* dest_mask,
-                       rmm::cuda_stream_view stream);
+size_type concatenate_masks(host_span<column_view const> views,
+                            bitmask_type* dest_mask,
+                            rmm::cuda_stream_view stream);
 
 /**
  * @copydoc cudf::concatenate_masks(host_span<column_view const>, rmm::mr::device_memory_resource*)
diff --git a/cpp/src/copying/concatenate.cu b/cpp/src/copying/concatenate.cu
index c42cc5c69f9..b17475cb877 100644
--- a/cpp/src/copying/concatenate.cu
+++ b/cpp/src/copying/concatenate.cu
@@ -13,10 +13,10 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#include <cudf/detail/concatenate.cuh>
 
 #include <cudf/column/column.hpp>
 #include <cudf/column/column_device_view.cuh>
+#include <cudf/detail/concatenate_masks.hpp>
 #include <cudf/detail/copy.hpp>
 #include <cudf/detail/get_value.cuh>
 #include <cudf/detail/null_mask.hpp>
@@ -49,6 +49,7 @@
 
 namespace cudf {
 namespace detail {
+namespace {
 
 // From benchmark data, the fused kernel optimization appears to perform better
 // when there are more than a trivial number of columns, or when the null mask
@@ -100,23 +101,29 @@ auto create_device_views(host_span<column_view const> views, rmm::cuda_stream_vi
  * @brief Concatenates the null mask bits of all the column device views in the
  * `views` array to the destination bitmask.
  *
+ * @tparam block_size Block size for using with single_lane_block_sum_reduce
+ *
  * @param views Array of column_device_view
  * @param output_offsets Prefix sum of sizes of elements of `views`
  * @param number_of_views Size of `views` array
  * @param dest_mask The output buffer to copy null masks into
- * @param number_of_mask_bits The total number of null masks bits that are being
- * copied
+ * @param number_of_mask_bits The total number of null masks bits that are being copied
+ * @param out_valid_count To hold the total number of valid bits set
  */
+template <size_type block_size>
 __global__ void concatenate_masks_kernel(column_device_view const* views,
                                          size_t const* output_offsets,
                                          size_type number_of_views,
                                          bitmask_type* dest_mask,
-                                         size_type number_of_mask_bits)
+                                         size_type number_of_mask_bits,
+                                         size_type* out_valid_count)
 {
   size_type mask_index = threadIdx.x + blockIdx.x * blockDim.x;
 
   auto active_mask = __ballot_sync(0xFFFF'FFFFu, mask_index < number_of_mask_bits);
 
+  size_type warp_valid_count = 0;
+
   while (mask_index < number_of_mask_bits) {
     size_type const source_view_index =
       thrust::upper_bound(
@@ -129,32 +136,44 @@ __global__ void concatenate_masks_kernel(column_device_view const* views,
     }
     bitmask_type const new_word = __ballot_sync(active_mask, bit_is_set);
 
-    if (threadIdx.x % detail::warp_size == 0) { dest_mask[word_index(mask_index)] = new_word; }
+    if (threadIdx.x % detail::warp_size == 0) {
+      dest_mask[word_index(mask_index)] = new_word;
+      warp_valid_count += __popc(new_word);
+    }
 
     mask_index += blockDim.x * gridDim.x;
     active_mask = __ballot_sync(active_mask, mask_index < number_of_mask_bits);
   }
+
+  using detail::single_lane_block_sum_reduce;
+  auto const block_valid_count = single_lane_block_sum_reduce<block_size, 0>(warp_valid_count);
+  if (threadIdx.x == 0) { atomicAdd(out_valid_count, block_valid_count); }
 }
+}  // namespace
 
-void concatenate_masks(device_span<column_device_view const> d_views,
-                       device_span<size_t const> d_offsets,
-                       bitmask_type* dest_mask,
-                       size_type output_size,
-                       rmm::cuda_stream_view stream)
+size_type concatenate_masks(device_span<column_device_view const> d_views,
+                            device_span<size_t const> d_offsets,
+                            bitmask_type* dest_mask,
+                            size_type output_size,
+                            rmm::cuda_stream_view stream)
 {
+  rmm::device_scalar<size_type> d_valid_count(0, stream);
   constexpr size_type block_size{256};
   cudf::detail::grid_1d config(output_size, block_size);
-  concatenate_masks_kernel<<<config.num_blocks, config.num_threads_per_block, 0, stream.value()>>>(
-    d_views.data(),
-    d_offsets.data(),
-    static_cast<size_type>(d_views.size()),
-    dest_mask,
-    output_size);
+  concatenate_masks_kernel<block_size>
+    <<<config.num_blocks, config.num_threads_per_block, 0, stream.value()>>>(
+      d_views.data(),
+      d_offsets.data(),
+      static_cast<size_type>(d_views.size()),
+      dest_mask,
+      output_size,
+      d_valid_count.data());
+  return output_size - d_valid_count.value(stream);
 }
 
-void concatenate_masks(host_span<column_view const> views,
-                       bitmask_type* dest_mask,
-                       rmm::cuda_stream_view stream)
+size_type concatenate_masks(host_span<column_view const> views,
+                            bitmask_type* dest_mask,
+                            rmm::cuda_stream_view stream)
 {
   // Preprocess and upload inputs to device memory
   auto const device_views = create_device_views(views, stream);
@@ -162,9 +181,10 @@ void concatenate_masks(host_span<column_view const> views,
   auto const& d_offsets   = std::get<2>(device_views);
   auto const output_size  = std::get<3>(device_views);
 
-  concatenate_masks(d_views, d_offsets, dest_mask, output_size, stream);
+  return concatenate_masks(d_views, d_offsets, dest_mask, output_size, stream);
 }
 
+namespace {
 template <typename T, size_type block_size, bool Nullable>
 __global__ void fused_concatenate_kernel(column_device_view const* input_views,
                                          size_t const* input_offsets,
@@ -287,7 +307,8 @@ std::unique_ptr<column> for_each_concatenate(host_span<column_view const> views,
 
   // If concatenated column is nullable, proceed to calculate it
   if (has_nulls) {
-    cudf::detail::concatenate_masks(views, (col->mutable_view()).null_mask(), stream);
+    col->set_null_count(
+      cudf::detail::concatenate_masks(views, (col->mutable_view()).null_mask(), stream));
   } else {
     col->set_null_count(0);  // prevent null count from being materialized
   }
@@ -340,8 +361,6 @@ std::unique_ptr<column> concatenate_dispatch::operator()<cudf::struct_view>()
   return cudf::structs::detail::concatenate(views, stream, mr);
 }
 
-namespace {
-
 void bounds_and_type_check(host_span<column_view const> cols, rmm::cuda_stream_view stream);
 
 /**
diff --git a/cpp/src/dictionary/add_keys.cu b/cpp/src/dictionary/add_keys.cu
index d543225d3eb..ab22c07e4d5 100644
--- a/cpp/src/dictionary/add_keys.cu
+++ b/cpp/src/dictionary/add_keys.cu
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include <cudf/detail/concatenate.cuh>
+#include <cudf/detail/concatenate.hpp>
 #include <cudf/detail/gather.hpp>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
diff --git a/cpp/src/dictionary/detail/concatenate.cu b/cpp/src/dictionary/detail/concatenate.cu
index 98ad108655f..70b6563b1bc 100644
--- a/cpp/src/dictionary/detail/concatenate.cu
+++ b/cpp/src/dictionary/detail/concatenate.cu
@@ -15,7 +15,7 @@
  */
 
 #include <cudf/column/column_factories.hpp>
-#include <cudf/detail/concatenate.cuh>
+#include <cudf/detail/concatenate.hpp>
 #include <cudf/detail/indexalator.cuh>
 #include <cudf/detail/iterator.cuh>
 #include <cudf/detail/sorting.hpp>
diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu
index 33f44dbf8f5..15c1d21e74c 100644
--- a/cpp/src/join/hash_join.cu
+++ b/cpp/src/join/hash_join.cu
@@ -16,7 +16,6 @@
 #include "join_common_utils.cuh"
 
 #include <cudf/copying.hpp>
-#include <cudf/detail/concatenate.cuh>
 #include <cudf/detail/iterator.cuh>
 #include <cudf/detail/join.hpp>
 #include <cudf/detail/structs/utilities.hpp>
diff --git a/cpp/src/lists/copying/concatenate.cu b/cpp/src/lists/copying/concatenate.cu
index 69d6949ad4a..485903deaec 100644
--- a/cpp/src/lists/copying/concatenate.cu
+++ b/cpp/src/lists/copying/concatenate.cu
@@ -19,8 +19,10 @@
 #include <cudf/column/column_factories.hpp>
 #include <cudf/concatenate.hpp>
 #include <cudf/copying.hpp>
-#include <cudf/detail/concatenate.cuh>
+#include <cudf/detail/concatenate.hpp>
+#include <cudf/detail/concatenate_masks.hpp>
 #include <cudf/detail/get_value.cuh>
+#include <cudf/detail/null_mask.cuh>
 #include <cudf/lists/lists_column_view.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -124,14 +126,9 @@ std::unique_ptr<column> concatenate(host_span<column_view const> columns,
     std::any_of(columns.begin(), columns.end(), [](auto const& col) { return col.has_nulls(); });
   rmm::device_buffer null_mask = create_null_mask(
     total_list_count, has_nulls ? mask_state::UNINITIALIZED : mask_state::UNALLOCATED);
-  cudf::size_type null_count{0};
-  if (has_nulls) {
-    cudf::detail::concatenate_masks(columns, static_cast<bitmask_type*>(null_mask.data()), stream);
-    null_count =
-      std::transform_reduce(columns.begin(), columns.end(), 0, std::plus{}, [](auto const& col) {
-        return col.null_count();
-      });
-  }
+  auto null_mask_data = static_cast<bitmask_type*>(null_mask.data());
+  auto const null_count =
+    has_nulls ? cudf::detail::concatenate_masks(columns, null_mask_data, stream) : size_type{0};
 
   // assemble into outgoing list column
   return make_lists_column(total_list_count,
diff --git a/cpp/src/structs/copying/concatenate.cu b/cpp/src/structs/copying/concatenate.cu
index 19552b2dc03..b5ec1c0812a 100644
--- a/cpp/src/structs/copying/concatenate.cu
+++ b/cpp/src/structs/copying/concatenate.cu
@@ -19,7 +19,8 @@
 #include <cudf/column/column_factories.hpp>
 #include <cudf/concatenate.hpp>
 #include <cudf/copying.hpp>
-#include <cudf/detail/concatenate.cuh>
+#include <cudf/detail/concatenate.hpp>
+#include <cudf/detail/concatenate_masks.hpp>
 #include <cudf/detail/get_value.cuh>
 #include <cudf/detail/structs/utilities.hpp>
 #include <cudf/structs/structs_column_view.hpp>
@@ -65,14 +66,9 @@ std::unique_ptr<column> concatenate(host_span<column_view const> columns,
     std::any_of(columns.begin(), columns.end(), [](auto const& col) { return col.has_nulls(); });
   rmm::device_buffer null_mask =
     create_null_mask(total_length, has_nulls ? mask_state::UNINITIALIZED : mask_state::UNALLOCATED);
-  cudf::size_type null_count{0};
-  if (has_nulls) {
-    cudf::detail::concatenate_masks(columns, static_cast<bitmask_type*>(null_mask.data()), stream);
-    null_count =
-      std::transform_reduce(columns.begin(), columns.end(), 0, std::plus{}, [](auto const& col) {
-        return col.null_count();
-      });
-  }
+  auto null_mask_data = static_cast<bitmask_type*>(null_mask.data());
+  auto const null_count =
+    has_nulls ? cudf::detail::concatenate_masks(columns, null_mask_data, stream) : size_type{0};
 
   // assemble into outgoing list column
   return make_structs_column(