Skip to content

Commit

Permalink
add more comments
Browse files Browse the repository at this point in the history
  • Loading branch information
davidwendt committed Oct 12, 2022
1 parent c893ed0 commit 83055cc
Show file tree
Hide file tree
Showing 3 changed files with 37 additions and 10 deletions.
17 changes: 14 additions & 3 deletions cpp/examples/strings/custom_optimized.cu
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@
/**
* @brief Computes the size of each output row
*
 * This kernel is launched with one thread per row in d_names.
*
* @param d_names Column of names
* @param d_visibilities Column of visibilities
* @param d_sizes Output sizes for each row
Expand All @@ -39,7 +41,9 @@ __global__ void sizes_kernel(cudf::column_device_view const d_names,
cudf::column_device_view const d_visibilities,
cudf::size_type* d_sizes)
{
// The row index is resolved from the CUDA thread/block objects
auto index = threadIdx.x + blockIdx.x * blockDim.x;
// There may be more threads than actual rows
if (index >= d_names.size()) return;

auto const visible = cudf::string_view("public", 6);
Expand All @@ -63,6 +67,8 @@ __global__ void sizes_kernel(cudf::column_device_view const d_names,
/**
* @brief Builds the output for each row
*
 * This kernel is launched with one thread per row in d_names.
*
* @param d_names Column of names
* @param d_visibilities Column of visibilities
* @param d_offsets Byte offset in `d_chars` for each row
Expand All @@ -73,7 +79,9 @@ __global__ void redact_kernel(cudf::column_device_view const d_names,
cudf::size_type const* d_offsets,
char* d_chars)
{
// The row index is resolved from the CUDA thread/block objects
auto index = threadIdx.x + blockIdx.x * blockDim.x;
// There may be more threads than actual rows
if (index >= d_names.size()) return;

auto const visible = cudf::string_view("public", 6);
Expand Down Expand Up @@ -114,12 +122,13 @@ __global__ void redact_kernel(cudf::column_device_view const d_names,
std::unique_ptr<cudf::column> redact_strings(cudf::column_view const& names,
cudf::column_view const& visibilities)
{
// all device memory operations and kernel functions will run on this stream
auto stream = rmm::cuda_stream_default;

auto const d_names = cudf::column_device_view::create(names, stream);
auto const d_visibilities = cudf::column_device_view::create(visibilities, stream);

constexpr int block_size = 128;
constexpr int block_size = 128; // this arbitrary size should be a power of 2
int const blocks = (names.size() + block_size - 1) / block_size;

nvtxRangePushA("redact_strings");
Expand All @@ -135,7 +144,7 @@ std::unique_ptr<cudf::column> redact_strings(cudf::column_view const& names,
thrust::exclusive_scan(rmm::exec_policy(stream), offsets.begin(), offsets.end(), offsets.begin());

// last element is the total output size
// (device-to-host copy of 1 integer)
// (device-to-host copy of 1 integer -- includes syncing the stream)
cudf::size_type output_size = offsets.back_element(stream);

// create chars vector
Expand All @@ -147,7 +156,9 @@ std::unique_ptr<cudf::column> redact_strings(cudf::column_view const& names,

// create column from offsets and chars vectors (no copy is performed)
auto result = cudf::make_strings_column(names.size(), std::move(offsets), std::move(chars));
cudaStreamSynchronize(stream.value());

// wait for all of the above to finish
stream.synchronize();

nvtxRangePop();
return result;
Expand Down
12 changes: 10 additions & 2 deletions cpp/examples/strings/custom_prealloc.cu
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@
/**
* @brief Builds the output for each row
*
 * This kernel is launched with one thread per row in d_names.
*
* @param d_names Column of names
* @param d_visibilities Column of visibilities
* @param redaction Redacted string replacement
Expand All @@ -42,7 +44,9 @@ __global__ void redact_kernel(cudf::column_device_view const d_names,
cudf::offset_type const* d_offsets,
cudf::string_view* d_output)
{
// The row index is resolved from the CUDA thread/block objects
auto index = threadIdx.x + blockIdx.x * blockDim.x;
// There may be more threads than actual rows
if (index >= d_names.size()) return;

auto const visible = cudf::string_view("public", 6);
Expand Down Expand Up @@ -81,14 +85,15 @@ __global__ void redact_kernel(cudf::column_device_view const d_names,
std::unique_ptr<cudf::column> redact_strings(cudf::column_view const& names,
cudf::column_view const& visibilities)
{
// all device memory operations and kernel functions will run on this stream
auto stream = rmm::cuda_stream_default;

auto const d_names = cudf::column_device_view::create(names, stream);
auto const d_visibilities = cudf::column_device_view::create(visibilities, stream);
auto const d_redaction = cudf::string_scalar(std::string("X X"), true, stream);

constexpr auto block_size = 128;
auto const blocks = (names.size() + block_size - 1) / block_size;
constexpr int block_size = 128; // this arbitrary size should be a power of 2
auto const blocks = (names.size() + block_size - 1) / block_size;

nvtxRangePushA("redact_strings");

Expand All @@ -113,6 +118,9 @@ std::unique_ptr<cudf::column> redact_strings(cudf::column_view const& names,
auto result = cudf::make_strings_column(str_ptrs, cudf::string_view{nullptr, 0}, stream);
// temporary memory cleanup cost here for str_ptrs and working_memory

// wait for all of the above to finish
stream.synchronize();

nvtxRangePop();
return result;
}
18 changes: 13 additions & 5 deletions cpp/examples/strings/custom_with_malloc.cu
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ void set_malloc_heap_size(size_t heap_size = 1073741824) // 1GB
/**
* @brief Builds the output for each row
*
 * This kernel is launched with one thread per row in d_names.
*
* Note: This uses malloc() in a device kernel which works great
* but is not very efficient. This can be useful for prototyping
* on functions where performance is not yet important.
Expand All @@ -67,8 +69,9 @@ __global__ void redact_kernel(cudf::column_device_view const d_names,
cudf::string_view redaction,
cudf::string_view* d_output)
{
// get index for this thread
// The row index is resolved from the CUDA thread/block objects
auto index = threadIdx.x + blockIdx.x * blockDim.x;
// There may be more threads than actual rows
if (index >= d_names.size()) return;

auto const visible = cudf::string_view("public", 6);
Expand Down Expand Up @@ -110,21 +113,23 @@ __global__ void free_kernel(cudf::string_view redaction, cudf::string_view* d_ou
if (index >= count) return;

auto ptr = const_cast<char*>(d_output[index].data());
if (ptr != redaction.data()) free(ptr);
if (ptr != redaction.data()) { free(ptr); }
}

std::unique_ptr<cudf::column> redact_strings(cudf::column_view const& names,
cudf::column_view const& visibilities)
{
// all device memory operations and kernel functions will run on this stream
auto stream = rmm::cuda_stream_default;
set_malloc_heap_size();

set_malloc_heap_size(); // to illustrate adjusting the malloc heap

auto const d_names = cudf::column_device_view::create(names, stream);
auto const d_visibilities = cudf::column_device_view::create(visibilities, stream);
auto const d_redaction = cudf::string_scalar(std::string("X X"), true, stream);

constexpr auto block_size = 128;
auto const blocks = (names.size() + block_size - 1) / block_size;
constexpr int block_size = 128; // this arbitrary size should be a power of 2
auto const blocks = (names.size() + block_size - 1) / block_size;

nvtxRangePushA("redact_strings");

Expand All @@ -145,6 +150,9 @@ std::unique_ptr<cudf::column> redact_strings(cudf::column_view const& names,
d_redaction.value(), str_ptrs->data(), names.size());
delete str_ptrs;

// wait for all of the above to finish
stream.synchronize();

nvtxRangePop();
return result;
}

0 comments on commit 83055cc

Please sign in to comment.