Skip to content

Commit

Permalink
add more comments
Browse files Browse the repository at this point in the history
  • Loading branch information
davidwendt committed Oct 12, 2022
1 parent c893ed0 commit 83055cc
Show file tree
Hide file tree
Showing 3 changed files with 37 additions and 10 deletions.
17 changes: 14 additions & 3 deletions cpp/examples/strings/custom_optimized.cu
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@
/**
* @brief Computes the size of each output row
*
 * This kernel is launched with one thread per row in d_names.
*
* @param d_names Column of names
* @param d_visibilities Column of visibilities
* @param d_sizes Output sizes for each row
Expand All @@ -39,7 +41,9 @@ __global__ void sizes_kernel(cudf::column_device_view const d_names,
cudf::column_device_view const d_visibilities,
cudf::size_type* d_sizes)
{
// The row index is resolved from the CUDA thread/block objects
auto index = threadIdx.x + blockIdx.x * blockDim.x;
// There may be more threads than actual rows
if (index >= d_names.size()) return;

auto const visible = cudf::string_view("public", 6);
Expand All @@ -63,6 +67,8 @@ __global__ void sizes_kernel(cudf::column_device_view const d_names,
/**
* @brief Builds the output for each row
*
 * This kernel is launched with one thread per row in d_names.
*
* @param d_names Column of names
* @param d_visibilities Column of visibilities
* @param d_offsets Byte offset in `d_chars` for each row
Expand All @@ -73,7 +79,9 @@ __global__ void redact_kernel(cudf::column_device_view const d_names,
cudf::size_type const* d_offsets,
char* d_chars)
{
// The row index is resolved from the CUDA thread/block objects
auto index = threadIdx.x + blockIdx.x * blockDim.x;
// There may be more threads than actual rows
if (index >= d_names.size()) return;

auto const visible = cudf::string_view("public", 6);
Expand Down Expand Up @@ -114,12 +122,13 @@ __global__ void redact_kernel(cudf::column_device_view const d_names,
std::unique_ptr<cudf::column> redact_strings(cudf::column_view const& names,
cudf::column_view const& visibilities)
{
// all device memory operations and kernel functions will run on this stream
auto stream = rmm::cuda_stream_default;

auto const d_names = cudf::column_device_view::create(names, stream);
auto const d_visibilities = cudf::column_device_view::create(visibilities, stream);

constexpr int block_size = 128;
constexpr int block_size = 128; // this arbitrary size should be a power of 2
int const blocks = (names.size() + block_size - 1) / block_size;

nvtxRangePushA("redact_strings");
Expand All @@ -135,7 +144,7 @@ std::unique_ptr<cudf::column> redact_strings(cudf::column_view const& names,
thrust::exclusive_scan(rmm::exec_policy(stream), offsets.begin(), offsets.end(), offsets.begin());

// last element is the total output size
// (device-to-host copy of 1 integer)
// (device-to-host copy of 1 integer -- includes syncing the stream)
cudf::size_type output_size = offsets.back_element(stream);

// create chars vector
Expand All @@ -147,7 +156,9 @@ std::unique_ptr<cudf::column> redact_strings(cudf::column_view const& names,

// create column from offsets and chars vectors (no copy is performed)
auto result = cudf::make_strings_column(names.size(), std::move(offsets), std::move(chars));
cudaStreamSynchronize(stream.value());

// wait for all of the above to finish
stream.synchronize();

nvtxRangePop();
return result;
Expand Down
12 changes: 10 additions & 2 deletions cpp/examples/strings/custom_prealloc.cu
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@
/**
* @brief Builds the output for each row
*
 * This kernel is launched with one thread per row in d_names.
*
* @param d_names Column of names
* @param d_visibilities Column of visibilities
* @param redaction Redacted string replacement
Expand All @@ -42,7 +44,9 @@ __global__ void redact_kernel(cudf::column_device_view const d_names,
cudf::offset_type const* d_offsets,
cudf::string_view* d_output)
{
// The row index is resolved from the CUDA thread/block objects
auto index = threadIdx.x + blockIdx.x * blockDim.x;
// There may be more threads than actual rows
if (index >= d_names.size()) return;

auto const visible = cudf::string_view("public", 6);
Expand Down Expand Up @@ -81,14 +85,15 @@ __global__ void redact_kernel(cudf::column_device_view const d_names,
std::unique_ptr<cudf::column> redact_strings(cudf::column_view const& names,
cudf::column_view const& visibilities)
{
// all device memory operations and kernel functions will run on this stream
auto stream = rmm::cuda_stream_default;

auto const d_names = cudf::column_device_view::create(names, stream);
auto const d_visibilities = cudf::column_device_view::create(visibilities, stream);
auto const d_redaction = cudf::string_scalar(std::string("X X"), true, stream);

constexpr auto block_size = 128;
auto const blocks = (names.size() + block_size - 1) / block_size;
constexpr int block_size = 128; // this arbitrary size should be a power of 2
auto const blocks = (names.size() + block_size - 1) / block_size;

nvtxRangePushA("redact_strings");

Expand All @@ -113,6 +118,9 @@ std::unique_ptr<cudf::column> redact_strings(cudf::column_view const& names,
auto result = cudf::make_strings_column(str_ptrs, cudf::string_view{nullptr, 0}, stream);
// temporary memory cleanup cost here for str_ptrs and working_memory

// wait for all of the above to finish
stream.synchronize();

nvtxRangePop();
return result;
}
18 changes: 13 additions & 5 deletions cpp/examples/strings/custom_with_malloc.cu
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ void set_malloc_heap_size(size_t heap_size = 1073741824) // 1GB
/**
* @brief Builds the output for each row
*
 * This kernel is launched with one thread per row in d_names.
*
* Note: This uses malloc() in a device kernel which works great
* but is not very efficient. This can be useful for prototyping
* on functions where performance is not yet important.
Expand All @@ -67,8 +69,9 @@ __global__ void redact_kernel(cudf::column_device_view const d_names,
cudf::string_view redaction,
cudf::string_view* d_output)
{
// get index for this thread
// The row index is resolved from the CUDA thread/block objects
auto index = threadIdx.x + blockIdx.x * blockDim.x;
// There may be more threads than actual rows
if (index >= d_names.size()) return;

auto const visible = cudf::string_view("public", 6);
Expand Down Expand Up @@ -110,21 +113,23 @@ __global__ void free_kernel(cudf::string_view redaction, cudf::string_view* d_ou
if (index >= count) return;

auto ptr = const_cast<char*>(d_output[index].data());
if (ptr != redaction.data()) free(ptr);
if (ptr != redaction.data()) { free(ptr); }
}

std::unique_ptr<cudf::column> redact_strings(cudf::column_view const& names,
cudf::column_view const& visibilities)
{
// all device memory operations and kernel functions will run on this stream
auto stream = rmm::cuda_stream_default;
set_malloc_heap_size();

set_malloc_heap_size(); // to illustrate adjusting the malloc heap

auto const d_names = cudf::column_device_view::create(names, stream);
auto const d_visibilities = cudf::column_device_view::create(visibilities, stream);
auto const d_redaction = cudf::string_scalar(std::string("X X"), true, stream);

constexpr auto block_size = 128;
auto const blocks = (names.size() + block_size - 1) / block_size;
constexpr int block_size = 128; // this arbitrary size should be a power of 2
auto const blocks = (names.size() + block_size - 1) / block_size;

nvtxRangePushA("redact_strings");

Expand All @@ -145,6 +150,9 @@ std::unique_ptr<cudf::column> redact_strings(cudf::column_view const& names,
d_redaction.value(), str_ptrs->data(), names.size());
delete str_ptrs;

// wait for all of the above to finish
stream.synchronize();

nvtxRangePop();
return result;
}

0 comments on commit 83055cc

Please sign in to comment.