From 97660665d05fcf409c1cb881690fe773a9e368d1 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Mon, 3 Oct 2022 13:25:34 -0400
Subject: [PATCH 1/7] Add libcudf strings example

---
 ci/release/update-version.sh               |   1 +
 cpp/examples/build.sh                      |   9 +-
 cpp/examples/strings/CMakeLists.txt        |  50 +++++++
 cpp/examples/strings/common.hpp            | 121 ++++++++++++++++
 cpp/examples/strings/custom_fast.cu        | 155 +++++++++++++++++++++
 cpp/examples/strings/custom_gather.cu      | 119 ++++++++++++++++
 cpp/examples/strings/custom_with_malloc.cu | 151 ++++++++++++++++++++
 cpp/examples/strings/libcudf_apis.cpp      |  62 +++++++++
 cpp/examples/strings/names.csv             |  20 +++
 9 files changed, 687 insertions(+), 1 deletion(-)
 create mode 100644 cpp/examples/strings/CMakeLists.txt
 create mode 100644 cpp/examples/strings/common.hpp
 create mode 100644 cpp/examples/strings/custom_fast.cu
 create mode 100644 cpp/examples/strings/custom_gather.cu
 create mode 100644 cpp/examples/strings/custom_with_malloc.cu
 create mode 100644 cpp/examples/strings/libcudf_apis.cpp
 create mode 100644 cpp/examples/strings/names.csv

diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh
index 8fad4e08c56..c23f558f071 100755
--- a/ci/release/update-version.sh
+++ b/ci/release/update-version.sh
@@ -74,6 +74,7 @@ sed_runner "s/cudf=${CURRENT_SHORT_TAG}/cudf=${NEXT_SHORT_TAG}/g" README.md
 
 # Libcudf examples update
 sed_runner "s/CUDF_TAG branch-${CURRENT_SHORT_TAG}/CUDF_TAG branch-${NEXT_SHORT_TAG}/" cpp/examples/basic/CMakeLists.txt
+sed_runner "s/CUDF_TAG branch-${CURRENT_SHORT_TAG}/CUDF_TAG branch-${NEXT_SHORT_TAG}/" cpp/examples/strings/CMakeLists.txt
 
 # ucx-py version update
 sed_runner "s/export UCX_PY_VERSION=.*/export UCX_PY_VERSION='${NEXT_UCX_PY_VERSION}'/g" ci/gpu/build.sh
diff --git a/cpp/examples/build.sh b/cpp/examples/build.sh
index 079f7358872..7d389cd318d 100755
--- a/cpp/examples/build.sh
+++ b/cpp/examples/build.sh
@@ -17,8 +17,15 @@ LIB_BUILD_DIR=${LIB_BUILD_DIR:-$(readlink -f "${EXAMPLES_DIR}/../build")}
 # Basic example
 BASIC_EXAMPLE_DIR=${EXAMPLES_DIR}/basic
 BASIC_EXAMPLE_BUILD_DIR=${BASIC_EXAMPLE_DIR}/build
-
 # Configure
 cmake -S ${BASIC_EXAMPLE_DIR} -B ${BASIC_EXAMPLE_BUILD_DIR} -Dcudf_ROOT="${LIB_BUILD_DIR}"
 # Build
 cmake --build ${BASIC_EXAMPLE_BUILD_DIR} -j${PARALLEL_LEVEL}
+
+# Strings example
+STRINGS_EXAMPLE_DIR=${EXAMPLES_DIR}/strings
+STRINGS_EXAMPLE_BUILD_DIR=${STRINGS_EXAMPLE_DIR}/build
+# Configure
+cmake -S ${STRINGS_EXAMPLE_DIR} -B ${STRINGS_EXAMPLE_BUILD_DIR} -Dcudf_ROOT="${LIB_BUILD_DIR}"
+# Build
+cmake --build ${STRINGS_EXAMPLE_BUILD_DIR} -j${PARALLEL_LEVEL}
diff --git a/cpp/examples/strings/CMakeLists.txt b/cpp/examples/strings/CMakeLists.txt
new file mode 100644
index 00000000000..732a050902e
--- /dev/null
+++ b/cpp/examples/strings/CMakeLists.txt
@@ -0,0 +1,50 @@
+# Copyright (c) 2022, NVIDIA CORPORATION.
+
+cmake_minimum_required(VERSION 3.18)
+
+project(
+  strings_examples
+  VERSION 0.0.1
+  LANGUAGES CXX CUDA
+)
+
+set(CPM_DOWNLOAD_VERSION v0.35.3)
+file(
+  DOWNLOAD
+  https://github.com/cpm-cmake/CPM.cmake/releases/download/${CPM_DOWNLOAD_VERSION}/get_cpm.cmake
+  ${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake
+)
+include(${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake)
+
+set(CUDF_TAG branch-22.12)
+CPMFindPackage(
+  NAME cudf GIT_REPOSITORY https://github.com/rapidsai/cudf
+  GIT_TAG ${CUDF_TAG}
+  GIT_SHALLOW
+    TRUE
+    SOURCE_SUBDIR
+    cpp
+)
+
+list(APPEND CUDF_CUDA_FLAGS --expt-extended-lambda --expt-relaxed-constexpr)
+
+#
+add_executable(libcudf_apis libcudf_apis.cpp)
+target_compile_features(libcudf_apis PRIVATE cxx_std_17)
+target_link_libraries(libcudf_apis PRIVATE cudf::cudf nvToolsExt)
+
+add_executable(custom_with_malloc custom_with_malloc.cu)
+target_compile_features(custom_with_malloc PRIVATE cxx_std_17)
+target_compile_options(custom_with_malloc PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:${CUDF_CUDA_FLAGS}>")
+target_link_libraries(custom_with_malloc PRIVATE cudf::cudf nvToolsExt)
+
+add_executable(custom_gather custom_gather.cu)
+target_compile_features(custom_gather PRIVATE cxx_std_17)
+target_compile_options(custom_gather PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:${CUDF_CUDA_FLAGS}>")
+target_link_libraries(custom_gather PRIVATE cudf::cudf nvToolsExt)
+
+add_executable(custom_fast custom_fast.cu)
+target_compile_features(custom_fast PRIVATE cxx_std_17)
+target_compile_options(custom_fast PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:${CUDF_CUDA_FLAGS}>")
+target_link_libraries(custom_fast PRIVATE cudf::cudf nvToolsExt)
+
diff --git a/cpp/examples/strings/common.hpp b/cpp/examples/strings/common.hpp
new file mode 100644
index 00000000000..efdd211e9e3
--- /dev/null
+++ b/cpp/examples/strings/common.hpp
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cudf/column/column.hpp>
+#include <cudf/column/column_view.hpp>
+#include <cudf/io/csv.hpp>
+#include <cudf/io/datasource.hpp>
+#include <cudf/table/table.hpp>
+#include <cudf/table/table_view.hpp>
+
+#include <rmm/mr/device/cuda_memory_resource.hpp>
+#include <rmm/mr/device/device_memory_resource.hpp>
+#include <rmm/mr/device/owning_wrapper.hpp>
+#include <rmm/mr/device/pool_memory_resource.hpp>
+
+#include <cstdlib>
+#include <iostream>
+#include <memory>
+#include <string>
+#include <sys/time.h>
+
+/**
+ * @brief Main example function returns redacted strings column.
+ *
+ * This function returns a redacted version of the input `names` column
+ * using the the `visibilities` column as in the following example
+ * ```
+ * names        visibility  --> redacted
+ * John Doe     public          D John
+ * Bobby Joe    private         X X
+ * ```
+ *
+ * @param names First and last names separated with a single space
+ * @param visibilities String values `public` or `private` only
+ * @return Redacted strings column
+ */
+std::unique_ptr<cudf::column> redact_strings(cudf::column_view const& names,
+                                             cudf::column_view const& visibilities);
+
+/**
+ * @brief Create CUDA memory resource
+ */
+auto make_cuda_mr() { return std::make_shared<rmm::mr::cuda_memory_resource>(); }
+
+/**
+ * @brief Create a pool device memory resource
+ */
+auto make_pool_mr()
+{
+  return rmm::mr::make_owning_wrapper<rmm::mr::pool_memory_resource>(make_cuda_mr());
+}
+
+/**
+ * @brief Create memory resource for libcudf functions
+ */
+std::shared_ptr<rmm::mr::device_memory_resource> create_memory_resource(std::string const& name)
+{
+  if (name == "pool") { return make_pool_mr(); }
+  return make_cuda_mr();
+}
+
+/**
+ * @brief Main for strings examples
+ *
+ * Command line parameters:
+ * 1. CSV file name/path
+ * 2. Memory resource (optional): 'pool' or 'cuda'
+ *
+ * The stdout includes the number of rows in the input and the output size in bytes.
+ */
+int main(int argc, char const** argv)
+{
+  if (argc < 2) {
+    std::cout << "required parameter: csv-file-path\n";
+    return 1;
+  }
+
+  auto const mr_name = std::string{argc > 2 ? std::string(argv[2]) : std::string("cuda")};
+  auto resource      = create_memory_resource(mr_name);
+  rmm::mr::set_current_device_resource(resource.get());
+
+  auto const csv_file   = std::string{argv[1]};
+  auto const csv_result = [csv_file] {
+    cudf::io::csv_reader_options in_opts =
+      cudf::io::csv_reader_options::builder(cudf::io::source_info{csv_file}).header(-1);
+    return std::move(cudf::io::read_csv(in_opts).tbl);
+  }();
+  auto const csv_table = csv_result->view();
+
+  std::cout << "table: " << csv_table.num_rows() << " rows " << csv_table.num_columns()
+            << " columns\n";
+
+  // utility function that captures current time in microseconds
+  auto get_time = []() {
+    timeval tv;
+    gettimeofday(&tv, nullptr);
+    return static_cast<double>(tv.tv_sec * 1000000 + tv.tv_usec) / 1000000.0;
+  };
+
+  auto st      = get_time();
+  auto result  = redact_strings(csv_table.column(0), csv_table.column(1));
+  auto elapsed = get_time() - st;
+  std::cout << "Wall time: " << elapsed << " seconds\n";
+  std::cout << "Output size " << result->view().child(1).size() << " bytes\n";
+
+  return 0;
+}
diff --git a/cpp/examples/strings/custom_fast.cu b/cpp/examples/strings/custom_fast.cu
new file mode 100644
index 00000000000..41176199e8e
--- /dev/null
+++ b/cpp/examples/strings/custom_fast.cu
@@ -0,0 +1,155 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "common.hpp"
+
+#include <cudf/column/column_device_view.cuh>
+#include <cudf/column/column_factories.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_uvector.hpp>
+#include <rmm/exec_policy.hpp>
+
+#include <thrust/scan.h>
+
+#include <cuda_runtime.h>
+#include <nvToolsExt.h>
+
+/**
+ * @brief Computes the size of each output row
+ *
+ * @param d_names Column of names
+ * @param d_visibilities Column of visibilities
+ * @param d_sizes Output sizes for each row
+ */
+__global__ void sizes_kernel(cudf::column_device_view const d_names,
+                             cudf::column_device_view const d_visibilities,
+                             cudf::size_type* d_sizes)
+{
+  auto index = threadIdx.x + blockIdx.x * blockDim.x;
+  if (index >= d_names.size()) return;
+
+  auto const visible   = cudf::string_view("public", 6);
+  auto const redaction = cudf::string_view("X X", 3);
+
+  auto const name = d_names.element<cudf::string_view>(index);
+  auto const vis  = d_visibilities.element<cudf::string_view>(index);
+
+  cudf::size_type result = redaction.size_bytes();  // init to redaction size
+  if (vis == visible) {
+    auto const space_idx    = name.find(' ');
+    auto const first        = name.substr(0, space_idx);
+    auto const last_initial = name.substr(space_idx + 1, 1);
+
+    result = first.size_bytes() + last_initial.size_bytes() + 1;
+  }
+
+  d_sizes[index] = result;
+}
+
+/**
+ * @brief Builds the output for each row
+ *
+ * @param d_names Column of names
+ * @param d_visibilities Column of visibilities
+ * @param d_offsets Byte offset in `d_chars` for each row
+ * @param d_chars Output memory for all rows
+ */
+__global__ void redact_kernel(cudf::column_device_view const d_names,
+                              cudf::column_device_view const d_visibilities,
+                              cudf::size_type const* d_offsets,
+                              char* d_chars)
+{
+  auto index = threadIdx.x + blockIdx.x * blockDim.x;
+  if (index >= d_names.size()) return;
+
+  auto const visible   = cudf::string_view("public", 6);
+  auto const redaction = cudf::string_view("X X", 3);
+
+  // resolve output_ptr using the offsets vector
+  char* output_ptr = d_chars + d_offsets[index];
+
+  auto const name = d_names.element<cudf::string_view>(index);
+  auto const vis  = d_visibilities.element<cudf::string_view>(index);
+
+  if (vis == visible) {
+    auto const space_idx    = name.find(' ');
+    auto const first        = name.substr(0, space_idx);
+    auto const last_initial = name.substr(space_idx + 1, 1);
+    auto const output_size  = first.size_bytes() + last_initial.size_bytes() + 1;
+
+    // build output string
+    memcpy(output_ptr, last_initial.data(), last_initial.size_bytes());
+    output_ptr += last_initial.size_bytes();
+    memcpy(output_ptr, " ", 1);
+    output_ptr++;
+    memcpy(output_ptr, first.data(), first.size_bytes());
+  } else {
+    memcpy(output_ptr, redaction.data(), redaction.size_bytes());
+  }
+}
+
+/**
+ * @brief Redacts each name per the corresponding visibility entry
+ *
+ * This implementation builds the strings column children (offsets and chars)
+ * directly into device memory for libcudf.
+ *
+ * @param names Column of names
+ * @param visibilities Column of visibilities
+ * @return Redacted column of names
+ */
+std::unique_ptr<cudf::column> redact_strings(cudf::column_view const& names,
+                                             cudf::column_view const& visibilities)
+{
+  auto stream = rmm::cuda_stream_default;
+
+  auto const d_names        = cudf::column_device_view::create(names, stream);
+  auto const d_visibilities = cudf::column_device_view::create(visibilities, stream);
+
+  constexpr int block_size = 128;
+  int const blocks         = (names.size() + block_size - 1) / block_size;
+
+  nvtxRangePushA("redact_strings");
+
+  // create offsets vector
+  auto offsets = rmm::device_uvector<cudf::size_type>(names.size() + 1, stream);
+
+  // compute output sizes
+  sizes_kernel<<<blocks, block_size, 0, stream.value()>>>(
+    *d_names, *d_visibilities, offsets.data());
+
+  // convert sizes to offsets (in place)
+  thrust::exclusive_scan(rmm::exec_policy(stream), offsets.begin(), offsets.end(), offsets.begin());
+
+  // last element is the total output size
+  // (device-to-host copy of 1 integer)
+  cudf::size_type output_size = offsets.back_element(stream);
+
+  //  create chars vector
+  auto chars = rmm::device_uvector<char>(output_size, stream);
+
+  // build chars output
+  redact_kernel<<<blocks, block_size, 0, stream.value()>>>(
+    *d_names, *d_visibilities, offsets.data(), chars.data());
+
+  // create column from offsets and chars vectors (no copy is performed)
+  auto result = cudf::make_strings_column(names.size(), std::move(offsets), std::move(chars));
+  cudaStreamSynchronize(stream.value());
+
+  nvtxRangePop();
+  return result;
+}
diff --git a/cpp/examples/strings/custom_gather.cu b/cpp/examples/strings/custom_gather.cu
new file mode 100644
index 00000000000..6725a643fdf
--- /dev/null
+++ b/cpp/examples/strings/custom_gather.cu
@@ -0,0 +1,119 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "common.hpp"
+
+#include <cudf/column/column_device_view.cuh>
+#include <cudf/column/column_factories.hpp>
+#include <cudf/scalar/scalar.hpp>
+
+#include <rmm/device_uvector.hpp>
+
+#include <cuda_runtime.h>
+#include <nvToolsExt.h>
+
+/**
+ * @brief Builds the output for each row
+ *
+ * @param d_names Column of names
+ * @param d_visibilities Column of visibilities
+ * @param redaction Redacted string replacement
+ * @param working_memory Output memory for all rows
+ * @param d_offsets Byte offset in `d_chars` for each row
+ * @param d_output Output array of string_view objects
+ */
+__global__ void redact_kernel(cudf::column_device_view const d_names,
+                              cudf::column_device_view const d_visibilities,
+                              cudf::string_view redaction,
+                              char* working_memory,
+                              cudf::offset_type const* d_offsets,
+                              cudf::string_view* d_output)
+{
+  auto index = threadIdx.x + blockIdx.x * blockDim.x;
+  if (index >= d_names.size()) return;
+
+  auto const visible = cudf::string_view("public", 6);
+
+  auto const name = d_names.element<cudf::string_view>(index);
+  auto const vis  = d_visibilities.element<cudf::string_view>(index);
+  if (vis == visible) {
+    auto const space_idx    = name.find(' ');
+    auto const first        = name.substr(0, space_idx);
+    auto const last_initial = name.substr(space_idx + 1, 1);
+    auto const output_size  = first.size_bytes() + last_initial.size_bytes() + 1;
+
+    char* output_ptr = working_memory + d_offsets[index];
+    d_output[index]  = cudf::string_view{output_ptr, output_size};
+
+    // build output string
+    memcpy(output_ptr, last_initial.data(), last_initial.size_bytes());
+    output_ptr += last_initial.size_bytes();
+    memcpy(output_ptr, " ", 1);
+    output_ptr++;
+    memcpy(output_ptr, first.data(), first.size_bytes());
+  } else {
+    d_output[index] = cudf::string_view{redaction.data(), redaction.size_bytes()};
+  }
+}
+
+/**
+ * @brief Redacts each name per the corresponding visibility entry
+ *
+ * This implementation builds the individual strings into a fixed memory buffer
+ * and then calls a factory function to gather them into a strings column.
+ *
+ * @param names Column of names
+ * @param visibilities Column of visibilities
+ * @return Redacted column of names
+ */
+std::unique_ptr<cudf::column> redact_strings(cudf::column_view const& names,
+                                             cudf::column_view const& visibilities)
+{
+  auto stream = rmm::cuda_stream_default;
+
+  auto const d_names        = cudf::column_device_view::create(names, stream);
+  auto const d_visibilities = cudf::column_device_view::create(visibilities, stream);
+  auto const d_redaction    = cudf::string_scalar(std::string("X X"), true, stream);
+
+  constexpr auto block_size = 128;
+  auto const blocks         = (names.size() + block_size - 1) / block_size;
+
+  nvtxRangePushA("redact_strings");
+
+  auto const scv     = cudf::strings_column_view(names);
+  auto const offsets = scv.offsets_begin();
+
+  // create working memory to hold the output of each string
+  auto working_memory = rmm::device_uvector<char>(scv.chars_size(), stream);
+  // create a vector for the output strings' pointers
+  auto str_ptrs = rmm::device_uvector<cudf::string_view>(names.size(), stream);
+
+  // build the output strings
+  redact_kernel<<<blocks, block_size, 0, stream.value()>>>(*d_names,
+                                                           *d_visibilities,
+                                                           d_redaction.value(),
+                                                           working_memory.data(),
+                                                           offsets,
+                                                           str_ptrs.data());
+
+  // create strings column from the string_pairs;
+  // this copies all the individual strings into a single output column
+  auto result = cudf::make_strings_column(str_ptrs, cudf::string_view{nullptr, 0}, stream);
+  // temporary memory cleanup cost here for str_ptrs and working_memory
+
+  nvtxRangePop();
+  return result;
+}
diff --git a/cpp/examples/strings/custom_with_malloc.cu b/cpp/examples/strings/custom_with_malloc.cu
new file mode 100644
index 00000000000..ebcc70b34c7
--- /dev/null
+++ b/cpp/examples/strings/custom_with_malloc.cu
@@ -0,0 +1,151 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "common.hpp"
+
+#include <cudf/column/column_device_view.cuh>
+#include <cudf/column/column_factories.hpp>
+#include <cudf/scalar/scalar.hpp>
+
+#include <rmm/device_uvector.hpp>
+
+#include <cuda_runtime.h>
+#include <nvToolsExt.h>
+
+/**
+ * @brief Reserve CUDA malloc heap size
+ *
+ * Call this function to change the CUDA malloc heap size limit.
+ * This value depends on the total size of all the malloc()
+ * calls needed for redact_kernel.
+ *
+ * @param heap_size Number of bytes to reserve
+ *                  Default is 1GB
+ */
+void set_malloc_heap_size(size_t heap_size = 1073741824)  // 1GB
+{
+  size_t max_malloc_heap_size = 0;
+  cudaDeviceGetLimit(&max_malloc_heap_size, cudaLimitMallocHeapSize);
+  if (max_malloc_heap_size < heap_size) {
+    max_malloc_heap_size = heap_size;
+    if (cudaDeviceSetLimit(cudaLimitMallocHeapSize, max_malloc_heap_size) != cudaSuccess) {
+      fprintf(stderr, "could not set malloc heap size to %ldMB\n", (heap_size / (1024 * 1024)));
+      throw std::runtime_error("");
+    }
+  }
+}
+
+/**
+ * @brief Builds the output for each row
+ *
+ * Note: This uses malloc() in a device kernel which works great
+ * but is not very efficient. This can be useful for prototyping
+ * on functions where performance is not yet important.
+ * All calls to malloc() must have a corresponding free() call.
+ * The separate free_kernel is launched for this purpose.
+ *
+ * @param d_names Column of names
+ * @param d_visibilities Column of visibilities
+ * @param redaction Redacted string replacement
+ * @param d_output Output array of string_view objects
+ */
+__global__ void redact_kernel(cudf::column_device_view const d_names,
+                              cudf::column_device_view const d_visibilities,
+                              cudf::string_view redaction,
+                              cudf::string_view* d_output)
+{
+  // get index for this thread
+  auto index = threadIdx.x + blockIdx.x * blockDim.x;
+  if (index >= d_names.size()) return;
+
+  auto const visible = cudf::string_view("public", 6);
+
+  auto const name = d_names.element<cudf::string_view>(index);
+  auto const vis  = d_visibilities.element<cudf::string_view>(index);
+  if (vis == visible) {
+    auto const space_idx    = name.find(' ');
+    auto const first        = name.substr(0, space_idx);
+    auto const last_initial = name.substr(space_idx + 1, 1);
+    auto const output_size  = first.size_bytes() + last_initial.size_bytes() + 1;
+
+    char* output_ptr = static_cast<char*>(malloc(output_size));
+    d_output[index]  = cudf::string_view{output_ptr, output_size};
+
+    // build output string
+    memcpy(output_ptr, last_initial.data(), last_initial.size_bytes());
+    output_ptr += last_initial.size_bytes();
+    memcpy(output_ptr, " ", 1);
+    output_ptr++;
+    memcpy(output_ptr, first.data(), first.size_bytes());
+  } else {
+    d_output[index] = cudf::string_view{redaction.data(), redaction.size_bytes()};
+  }
+}
+
+/**
+ * @brief Frees the temporary individual string objects created in the
+ * redact_kernel
+ *
+ * Like malloc(), free() is not very efficient but must be called for
+ * each malloc() to return the memory to the CUDA malloc heap.
+ *
+ * @param redaction Redacted string replacement (not to be freed)
+ * @param d_output Output array of string_view objects to free
+ */
+__global__ void free_kernel(cudf::string_view redaction, cudf::string_view* d_output, int count)
+{
+  auto index = threadIdx.x + blockIdx.x * blockDim.x;
+  if (index >= count) return;
+
+  auto ptr = const_cast<char*>(d_output[index].data());
+  if (ptr != redaction.data()) free(ptr);
+}
+
+std::unique_ptr<cudf::column> redact_strings(cudf::column_view const& names,
+                                             cudf::column_view const& visibilities)
+{
+  auto stream = rmm::cuda_stream_default;
+  set_malloc_heap_size();
+
+  auto const d_names        = cudf::column_device_view::create(names, stream);
+  auto const d_visibilities = cudf::column_device_view::create(visibilities, stream);
+  auto const d_redaction    = cudf::string_scalar(std::string("X X"), true, stream);
+
+  constexpr auto block_size = 128;
+  auto const blocks         = (names.size() + block_size - 1) / block_size;
+
+  nvtxRangePushA("redact_strings");
+
+  // create a vector for the output strings' pointers
+  auto str_ptrs = new rmm::device_uvector<cudf::string_view>(names.size(), stream);
+
+  auto result = [&] {
+    // build the output strings
+    redact_kernel<<<blocks, block_size, 0, stream.value()>>>(
+      *d_names, *d_visibilities, d_redaction.value(), str_ptrs->data());
+    // create strings column from the string_view vector
+    // this copies all the individual strings into a single output column
+    return cudf::make_strings_column(*str_ptrs, cudf::string_view{nullptr, 0}, stream);
+  }();
+
+  // free the individual temporary memory pointers
+  free_kernel<<<blocks, block_size, 0, stream.value()>>>(
+    d_redaction.value(), str_ptrs->data(), names.size());
+  delete str_ptrs;
+
+  nvtxRangePop();
+  return result;
+}
diff --git a/cpp/examples/strings/libcudf_apis.cpp b/cpp/examples/strings/libcudf_apis.cpp
new file mode 100644
index 00000000000..009e92d8a0d
--- /dev/null
+++ b/cpp/examples/strings/libcudf_apis.cpp
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "common.hpp"
+
+#include <cudf/copying.hpp>
+#include <cudf/scalar/scalar.hpp>
+#include <cudf/strings/combine.hpp>
+#include <cudf/strings/find.hpp>
+#include <cudf/strings/split/split.hpp>
+#include <cudf/strings/strings_column_view.hpp>
+#include <cudf/strings/substring.hpp>
+
+#include <cuda_runtime.h>
+#include <nvToolsExt.h>
+
+/**
+ * @brief Redacts each name per the corresponding visibility entry
+ *
+ * This implementation uses libcudf APIs to create the output result.
+ *
+ * @param names Column of names
+ * @param visibilities Column of visibilities
+ * @return Redacted column of names
+ */
+std::unique_ptr<cudf::column> redact_strings(cudf::column_view const& names,
+                                             cudf::column_view const& visibilities)
+{
+  auto const visible   = cudf::string_scalar(std::string("public"));
+  auto const redaction = cudf::string_scalar(std::string("X X"));
+
+  nvtxRangePushA("redact_strings");
+
+  auto const allowed      = cudf::strings::contains(visibilities, visible);
+  auto const redacted     = cudf::copy_if_else(names, redaction, allowed->view());
+  auto const first_last   = cudf::strings::split(redacted->view());
+  auto const first        = first_last->view().column(0);
+  auto const last         = first_last->view().column(1);
+  auto const last_initial = cudf::strings::slice_strings(last, 0, 1);
+
+  auto const last_initial_first = cudf::table_view({last_initial->view(), first});
+
+  auto result = cudf::strings::concatenate(last_initial_first, std::string(" "));
+
+  cudaStreamSynchronize(0);
+
+  nvtxRangePop();
+  return result;
+}
diff --git a/cpp/examples/strings/names.csv b/cpp/examples/strings/names.csv
new file mode 100644
index 00000000000..dec1c2a7a7b
--- /dev/null
+++ b/cpp/examples/strings/names.csv
@@ -0,0 +1,20 @@
+John Doe,public
+Jane Doe,private
+Billy Joe,private
+James James,public
+Michael Frederick,public
+Christopher Cheryl,public
+Jessica Autumn,public
+Matthew Tyrone,public
+Ashley Martha,public
+Jennifer Omar,public
+Joshua Lydia,public
+Amanda Jerome,public
+Daniel Theodore,public
+David Abby,public
+James Neil,public
+Robert Shawna,private
+John Sierra,private
+Joseph Nina,private
+Andrew Tammy,private
+Ryan Nikki,public
\ No newline at end of file

From 93c813f12687aa88954a065b24ab436eded7e017 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Mon, 3 Oct 2022 17:25:56 -0400
Subject: [PATCH 2/7] add/remove extra blank lines

---
 cpp/examples/strings/CMakeLists.txt | 1 -
 cpp/examples/strings/names.csv      | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/cpp/examples/strings/CMakeLists.txt b/cpp/examples/strings/CMakeLists.txt
index 732a050902e..92322418087 100644
--- a/cpp/examples/strings/CMakeLists.txt
+++ b/cpp/examples/strings/CMakeLists.txt
@@ -47,4 +47,3 @@ add_executable(custom_fast custom_fast.cu)
 target_compile_features(custom_fast PRIVATE cxx_std_17)
 target_compile_options(custom_fast PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:${CUDF_CUDA_FLAGS}>")
 target_link_libraries(custom_fast PRIVATE cudf::cudf nvToolsExt)
-
diff --git a/cpp/examples/strings/names.csv b/cpp/examples/strings/names.csv
index dec1c2a7a7b..77dca3e02af 100644
--- a/cpp/examples/strings/names.csv
+++ b/cpp/examples/strings/names.csv
@@ -17,4 +17,4 @@ Robert Shawna,private
 John Sierra,private
 Joseph Nina,private
 Andrew Tammy,private
-Ryan Nikki,public
\ No newline at end of file
+Ryan Nikki,public

From 0ecc23c95e3a459a4fa15e9177ba9c31aabd2288 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Wed, 5 Oct 2022 08:33:05 -0400
Subject: [PATCH 3/7] rename source files to match better with the blog names

---
 cpp/examples/strings/CMakeLists.txt            | 18 +++++++++---------
 .../{custom_fast.cu => custom_optimized.cu}    |  0
 .../{custom_gather.cu => custom_prealloc.cu}   |  0
 3 files changed, 9 insertions(+), 9 deletions(-)
 rename cpp/examples/strings/{custom_fast.cu => custom_optimized.cu} (100%)
 rename cpp/examples/strings/{custom_gather.cu => custom_prealloc.cu} (100%)

diff --git a/cpp/examples/strings/CMakeLists.txt b/cpp/examples/strings/CMakeLists.txt
index 92322418087..41bc5e00224 100644
--- a/cpp/examples/strings/CMakeLists.txt
+++ b/cpp/examples/strings/CMakeLists.txt
@@ -38,12 +38,12 @@ target_compile_features(custom_with_malloc PRIVATE cxx_std_17)
 target_compile_options(custom_with_malloc PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:${CUDF_CUDA_FLAGS}>")
 target_link_libraries(custom_with_malloc PRIVATE cudf::cudf nvToolsExt)
 
-add_executable(custom_gather custom_gather.cu)
-target_compile_features(custom_gather PRIVATE cxx_std_17)
-target_compile_options(custom_gather PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:${CUDF_CUDA_FLAGS}>")
-target_link_libraries(custom_gather PRIVATE cudf::cudf nvToolsExt)
-
-add_executable(custom_fast custom_fast.cu)
-target_compile_features(custom_fast PRIVATE cxx_std_17)
-target_compile_options(custom_fast PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:${CUDF_CUDA_FLAGS}>")
-target_link_libraries(custom_fast PRIVATE cudf::cudf nvToolsExt)
+add_executable(custom_prealloc custom_prealloc.cu)
+target_compile_features(custom_prealloc PRIVATE cxx_std_17)
+target_compile_options(custom_prealloc PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:${CUDF_CUDA_FLAGS}>")
+target_link_libraries(custom_prealloc PRIVATE cudf::cudf nvToolsExt)
+
+add_executable(custom_optimized custom_optimized.cu)
+target_compile_features(custom_optimized PRIVATE cxx_std_17)
+target_compile_options(custom_optimized PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:${CUDF_CUDA_FLAGS}>")
+target_link_libraries(custom_optimized PRIVATE cudf::cudf nvToolsExt)
diff --git a/cpp/examples/strings/custom_fast.cu b/cpp/examples/strings/custom_optimized.cu
similarity index 100%
rename from cpp/examples/strings/custom_fast.cu
rename to cpp/examples/strings/custom_optimized.cu
diff --git a/cpp/examples/strings/custom_gather.cu b/cpp/examples/strings/custom_prealloc.cu
similarity index 100%
rename from cpp/examples/strings/custom_gather.cu
rename to cpp/examples/strings/custom_prealloc.cu

From e2cc8116e3ae467f803120483135c8e437a631d6 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Wed, 5 Oct 2022 20:30:56 -0400
Subject: [PATCH 4/7] update memcpy(1) to *output_ptr++

---
 cpp/examples/strings/custom_optimized.cu   | 3 +--
 cpp/examples/strings/custom_prealloc.cu    | 3 +--
 cpp/examples/strings/custom_with_malloc.cu | 3 +--
 3 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/cpp/examples/strings/custom_optimized.cu b/cpp/examples/strings/custom_optimized.cu
index 41176199e8e..0164e1bca06 100644
--- a/cpp/examples/strings/custom_optimized.cu
+++ b/cpp/examples/strings/custom_optimized.cu
@@ -94,8 +94,7 @@ __global__ void redact_kernel(cudf::column_device_view const d_names,
     // build output string
     memcpy(output_ptr, last_initial.data(), last_initial.size_bytes());
     output_ptr += last_initial.size_bytes();
-    memcpy(output_ptr, " ", 1);
-    output_ptr++;
+    *output_ptr++ = ' ';
     memcpy(output_ptr, first.data(), first.size_bytes());
   } else {
     memcpy(output_ptr, redaction.data(), redaction.size_bytes());
diff --git a/cpp/examples/strings/custom_prealloc.cu b/cpp/examples/strings/custom_prealloc.cu
index 6725a643fdf..d2ebecad6ff 100644
--- a/cpp/examples/strings/custom_prealloc.cu
+++ b/cpp/examples/strings/custom_prealloc.cu
@@ -61,8 +61,7 @@ __global__ void redact_kernel(cudf::column_device_view const d_names,
     // build output string
     memcpy(output_ptr, last_initial.data(), last_initial.size_bytes());
     output_ptr += last_initial.size_bytes();
-    memcpy(output_ptr, " ", 1);
-    output_ptr++;
+    *output_ptr++ = ' ';
     memcpy(output_ptr, first.data(), first.size_bytes());
   } else {
     d_output[index] = cudf::string_view{redaction.data(), redaction.size_bytes()};
diff --git a/cpp/examples/strings/custom_with_malloc.cu b/cpp/examples/strings/custom_with_malloc.cu
index ebcc70b34c7..5db457e4c8f 100644
--- a/cpp/examples/strings/custom_with_malloc.cu
+++ b/cpp/examples/strings/custom_with_malloc.cu
@@ -87,8 +87,7 @@ __global__ void redact_kernel(cudf::column_device_view const d_names,
     // build output string
     memcpy(output_ptr, last_initial.data(), last_initial.size_bytes());
     output_ptr += last_initial.size_bytes();
-    memcpy(output_ptr, " ", 1);
-    output_ptr++;
+    *output_ptr++ = ' ';
     memcpy(output_ptr, first.data(), first.size_bytes());
   } else {
     d_output[index] = cudf::string_view{redaction.data(), redaction.size_bytes()};

From 6c706f67c941d0b63741e69684c6590aa68eb7b9 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Tue, 11 Oct 2022 10:19:44 -0400
Subject: [PATCH 5/7] use chrono for wall time

---
 cpp/examples/strings/common.hpp | 19 ++++++-------------
 1 file changed, 6 insertions(+), 13 deletions(-)

diff --git a/cpp/examples/strings/common.hpp b/cpp/examples/strings/common.hpp
index efdd211e9e3..dbd3c4dbd1b 100644
--- a/cpp/examples/strings/common.hpp
+++ b/cpp/examples/strings/common.hpp
@@ -27,11 +27,10 @@
 #include <rmm/mr/device/owning_wrapper.hpp>
 #include <rmm/mr/device/pool_memory_resource.hpp>
 
-#include <cstdlib>
+#include <chrono>
 #include <iostream>
 #include <memory>
 #include <string>
-#include <sys/time.h>
 
 /**
  * @brief Main example function returns redacted strings column.
@@ -97,24 +96,18 @@ int main(int argc, char const** argv)
   auto const csv_result = [csv_file] {
     cudf::io::csv_reader_options in_opts =
       cudf::io::csv_reader_options::builder(cudf::io::source_info{csv_file}).header(-1);
-    return std::move(cudf::io::read_csv(in_opts).tbl);
+    return cudf::io::read_csv(in_opts).tbl;
   }();
   auto const csv_table = csv_result->view();
 
   std::cout << "table: " << csv_table.num_rows() << " rows " << csv_table.num_columns()
             << " columns\n";
 
-  // utility function that captures current time in microseconds
-  auto get_time = []() {
-    timeval tv;
-    gettimeofday(&tv, nullptr);
-    return static_cast<double>(tv.tv_sec * 1000000 + tv.tv_usec) / 1000000.0;
-  };
+  auto st     = std::chrono::steady_clock::now();
+  auto result = redact_strings(csv_table.column(0), csv_table.column(1));
 
-  auto st      = get_time();
-  auto result  = redact_strings(csv_table.column(0), csv_table.column(1));
-  auto elapsed = get_time() - st;
-  std::cout << "Wall time: " << elapsed << " seconds\n";
+  std::chrono::duration<double> elapsed = std::chrono::steady_clock::now() - st;
+  std::cout << "Wall time: " << elapsed.count() << " seconds\n";
   std::cout << "Output size " << result->view().child(1).size() << " bytes\n";
 
   return 0;

From 83055cc24c9690f50afc5bc85830fc862bd09c8a Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Wed, 12 Oct 2022 15:36:28 -0400
Subject: [PATCH 6/7] add more comments

---
 cpp/examples/strings/custom_optimized.cu   | 17 ++++++++++++++---
 cpp/examples/strings/custom_prealloc.cu    | 12 ++++++++++--
 cpp/examples/strings/custom_with_malloc.cu | 18 +++++++++++++-----
 3 files changed, 37 insertions(+), 10 deletions(-)

diff --git a/cpp/examples/strings/custom_optimized.cu b/cpp/examples/strings/custom_optimized.cu
index 0164e1bca06..bfe650daa93 100644
--- a/cpp/examples/strings/custom_optimized.cu
+++ b/cpp/examples/strings/custom_optimized.cu
@@ -31,6 +31,8 @@
 /**
  * @brief Computes the size of each output row
  *
+ * This thread is called once per row in d_names.
+ *
  * @param d_names Column of names
  * @param d_visibilities Column of visibilities
  * @param d_sizes Output sizes for each row
@@ -39,7 +41,9 @@ __global__ void sizes_kernel(cudf::column_device_view const d_names,
                              cudf::column_device_view const d_visibilities,
                              cudf::size_type* d_sizes)
 {
+  // The row index is resolved from the CUDA thread/block objects
   auto index = threadIdx.x + blockIdx.x * blockDim.x;
+  // There may be more threads than actual rows
   if (index >= d_names.size()) return;
 
   auto const visible   = cudf::string_view("public", 6);
@@ -63,6 +67,8 @@ __global__ void sizes_kernel(cudf::column_device_view const d_names,
 /**
  * @brief Builds the output for each row
  *
+ * This thread is called once per row in d_names.
+ *
  * @param d_names Column of names
  * @param d_visibilities Column of visibilities
  * @param d_offsets Byte offset in `d_chars` for each row
@@ -73,7 +79,9 @@ __global__ void redact_kernel(cudf::column_device_view const d_names,
                               cudf::size_type const* d_offsets,
                               char* d_chars)
 {
+  // The row index is resolved from the CUDA thread/block objects
   auto index = threadIdx.x + blockIdx.x * blockDim.x;
+  // There may be more threads than actual rows
   if (index >= d_names.size()) return;
 
   auto const visible   = cudf::string_view("public", 6);
@@ -114,12 +122,13 @@ __global__ void redact_kernel(cudf::column_device_view const d_names,
 std::unique_ptr<cudf::column> redact_strings(cudf::column_view const& names,
                                              cudf::column_view const& visibilities)
 {
+  // all device memory operations and kernel functions will run on this stream
   auto stream = rmm::cuda_stream_default;
 
   auto const d_names        = cudf::column_device_view::create(names, stream);
   auto const d_visibilities = cudf::column_device_view::create(visibilities, stream);
 
-  constexpr int block_size = 128;
+  constexpr int block_size = 128;  // this arbitrary size should be a power of 2
   int const blocks         = (names.size() + block_size - 1) / block_size;
 
   nvtxRangePushA("redact_strings");
@@ -135,7 +144,7 @@ std::unique_ptr<cudf::column> redact_strings(cudf::column_view const& names,
   thrust::exclusive_scan(rmm::exec_policy(stream), offsets.begin(), offsets.end(), offsets.begin());
 
   // last element is the total output size
-  // (device-to-host copy of 1 integer)
+  // (device-to-host copy of 1 integer -- includes synching the stream)
   cudf::size_type output_size = offsets.back_element(stream);
 
   //  create chars vector
@@ -147,7 +156,9 @@ std::unique_ptr<cudf::column> redact_strings(cudf::column_view const& names,
 
   // create column from offsets and chars vectors (no copy is performed)
   auto result = cudf::make_strings_column(names.size(), std::move(offsets), std::move(chars));
-  cudaStreamSynchronize(stream.value());
+
+  // wait for all of the above to finish
+  stream.synchronize();
 
   nvtxRangePop();
   return result;
diff --git a/cpp/examples/strings/custom_prealloc.cu b/cpp/examples/strings/custom_prealloc.cu
index d2ebecad6ff..c0bae03af5c 100644
--- a/cpp/examples/strings/custom_prealloc.cu
+++ b/cpp/examples/strings/custom_prealloc.cu
@@ -28,6 +28,8 @@
 /**
  * @brief Builds the output for each row
  *
+ * This thread is called once per row in d_names.
+ *
  * @param d_names Column of names
  * @param d_visibilities Column of visibilities
  * @param redaction Redacted string replacement
@@ -42,7 +44,9 @@ __global__ void redact_kernel(cudf::column_device_view const d_names,
                               cudf::offset_type const* d_offsets,
                               cudf::string_view* d_output)
 {
+  // The row index is resolved from the CUDA thread/block objects
   auto index = threadIdx.x + blockIdx.x * blockDim.x;
+  // There may be more threads than actual rows
   if (index >= d_names.size()) return;
 
   auto const visible = cudf::string_view("public", 6);
@@ -81,14 +85,15 @@ __global__ void redact_kernel(cudf::column_device_view const d_names,
 std::unique_ptr<cudf::column> redact_strings(cudf::column_view const& names,
                                              cudf::column_view const& visibilities)
 {
+  // all device memory operations and kernel functions will run on this stream
   auto stream = rmm::cuda_stream_default;
 
   auto const d_names        = cudf::column_device_view::create(names, stream);
   auto const d_visibilities = cudf::column_device_view::create(visibilities, stream);
   auto const d_redaction    = cudf::string_scalar(std::string("X X"), true, stream);
 
-  constexpr auto block_size = 128;
-  auto const blocks         = (names.size() + block_size - 1) / block_size;
+  constexpr int block_size = 128;  // this arbitrary size should be a power of 2
+  auto const blocks        = (names.size() + block_size - 1) / block_size;
 
   nvtxRangePushA("redact_strings");
 
@@ -113,6 +118,9 @@ std::unique_ptr<cudf::column> redact_strings(cudf::column_view const& names,
   auto result = cudf::make_strings_column(str_ptrs, cudf::string_view{nullptr, 0}, stream);
   // temporary memory cleanup cost here for str_ptrs and working_memory
 
+  // wait for all of the above to finish
+  stream.synchronize();
+
   nvtxRangePop();
   return result;
 }
diff --git a/cpp/examples/strings/custom_with_malloc.cu b/cpp/examples/strings/custom_with_malloc.cu
index 5db457e4c8f..f1d397ef007 100644
--- a/cpp/examples/strings/custom_with_malloc.cu
+++ b/cpp/examples/strings/custom_with_malloc.cu
@@ -51,6 +51,8 @@ void set_malloc_heap_size(size_t heap_size = 1073741824)  // 1GB
 /**
  * @brief Builds the output for each row
  *
+ * This thread is called once per row in d_names.
+ *
  * Note: This uses malloc() in a device kernel which works great
  * but is not very efficient. This can be useful for prototyping
  * on functions where performance is not yet important.
@@ -67,8 +69,9 @@ __global__ void redact_kernel(cudf::column_device_view const d_names,
                               cudf::string_view redaction,
                               cudf::string_view* d_output)
 {
-  // get index for this thread
+  // The row index is resolved from the CUDA thread/block objects
   auto index = threadIdx.x + blockIdx.x * blockDim.x;
+  // There may be more threads than actual rows
   if (index >= d_names.size()) return;
 
   auto const visible = cudf::string_view("public", 6);
@@ -110,21 +113,23 @@ __global__ void free_kernel(cudf::string_view redaction, cudf::string_view* d_ou
   if (index >= count) return;
 
   auto ptr = const_cast<char*>(d_output[index].data());
-  if (ptr != redaction.data()) free(ptr);
+  if (ptr != redaction.data()) { free(ptr); }
 }
 
 std::unique_ptr<cudf::column> redact_strings(cudf::column_view const& names,
                                              cudf::column_view const& visibilities)
 {
+  // all device memory operations and kernel functions will run on this stream
   auto stream = rmm::cuda_stream_default;
-  set_malloc_heap_size();
+
+  set_malloc_heap_size();  // to illustrate adjusting the malloc heap
 
   auto const d_names        = cudf::column_device_view::create(names, stream);
   auto const d_visibilities = cudf::column_device_view::create(visibilities, stream);
   auto const d_redaction    = cudf::string_scalar(std::string("X X"), true, stream);
 
-  constexpr auto block_size = 128;
-  auto const blocks         = (names.size() + block_size - 1) / block_size;
+  constexpr int block_size = 128;  // this arbitrary size should be a power of 2
+  auto const blocks        = (names.size() + block_size - 1) / block_size;
 
   nvtxRangePushA("redact_strings");
 
@@ -145,6 +150,9 @@ std::unique_ptr<cudf::column> redact_strings(cudf::column_view const& names,
     d_redaction.value(), str_ptrs->data(), names.size());
   delete str_ptrs;
 
+  // wait for all of the above to finish
+  stream.synchronize();
+
   nvtxRangePop();
   return result;
 }

From d95708ea720fb419025c2ff87f4590cba53a7d0d Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Fri, 14 Oct 2022 10:25:17 -0400
Subject: [PATCH 7/7] update cmake_minimum_required version to 3.23.1

---
 cpp/examples/basic/CMakeLists.txt   | 2 +-
 cpp/examples/strings/CMakeLists.txt | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/examples/basic/CMakeLists.txt b/cpp/examples/basic/CMakeLists.txt
index b182cb08774..7e7c6b191b5 100644
--- a/cpp/examples/basic/CMakeLists.txt
+++ b/cpp/examples/basic/CMakeLists.txt
@@ -1,6 +1,6 @@
 # Copyright (c) 2020-2022, NVIDIA CORPORATION.
 
-cmake_minimum_required(VERSION 3.18)
+cmake_minimum_required(VERSION 3.23.1)
 
 project(
   basic_example
diff --git a/cpp/examples/strings/CMakeLists.txt b/cpp/examples/strings/CMakeLists.txt
index 41bc5e00224..1a16b2bc8fd 100644
--- a/cpp/examples/strings/CMakeLists.txt
+++ b/cpp/examples/strings/CMakeLists.txt
@@ -1,6 +1,6 @@
 # Copyright (c) 2022, NVIDIA CORPORATION.
 
-cmake_minimum_required(VERSION 3.18)
+cmake_minimum_required(VERSION 3.23.1)
 
 project(
   strings_examples