From 97660665d05fcf409c1cb881690fe773a9e368d1 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 3 Oct 2022 13:25:34 -0400 Subject: [PATCH 1/7] Add libcudf strings example --- ci/release/update-version.sh | 1 + cpp/examples/build.sh | 9 +- cpp/examples/strings/CMakeLists.txt | 50 +++++++ cpp/examples/strings/common.hpp | 121 ++++++++++++++++ cpp/examples/strings/custom_fast.cu | 155 +++++++++++++++++++++ cpp/examples/strings/custom_gather.cu | 119 ++++++++++++++++ cpp/examples/strings/custom_with_malloc.cu | 151 ++++++++++++++++++++ cpp/examples/strings/libcudf_apis.cpp | 62 +++++++++ cpp/examples/strings/names.csv | 20 +++ 9 files changed, 687 insertions(+), 1 deletion(-) create mode 100644 cpp/examples/strings/CMakeLists.txt create mode 100644 cpp/examples/strings/common.hpp create mode 100644 cpp/examples/strings/custom_fast.cu create mode 100644 cpp/examples/strings/custom_gather.cu create mode 100644 cpp/examples/strings/custom_with_malloc.cu create mode 100644 cpp/examples/strings/libcudf_apis.cpp create mode 100644 cpp/examples/strings/names.csv diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index 8fad4e08c56..c23f558f071 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -74,6 +74,7 @@ sed_runner "s/cudf=${CURRENT_SHORT_TAG}/cudf=${NEXT_SHORT_TAG}/g" README.md # Libcudf examples update sed_runner "s/CUDF_TAG branch-${CURRENT_SHORT_TAG}/CUDF_TAG branch-${NEXT_SHORT_TAG}/" cpp/examples/basic/CMakeLists.txt +sed_runner "s/CUDF_TAG branch-${CURRENT_SHORT_TAG}/CUDF_TAG branch-${NEXT_SHORT_TAG}/" cpp/examples/strings/CMakeLists.txt # ucx-py version update sed_runner "s/export UCX_PY_VERSION=.*/export UCX_PY_VERSION='${NEXT_UCX_PY_VERSION}'/g" ci/gpu/build.sh diff --git a/cpp/examples/build.sh b/cpp/examples/build.sh index 079f7358872..7d389cd318d 100755 --- a/cpp/examples/build.sh +++ b/cpp/examples/build.sh @@ -17,8 +17,15 @@ LIB_BUILD_DIR=${LIB_BUILD_DIR:-$(readlink -f "${EXAMPLES_DIR}/../build")} # Basic example BASIC_EXAMPLE_DIR=${EXAMPLES_DIR}/basic BASIC_EXAMPLE_BUILD_DIR=${BASIC_EXAMPLE_DIR}/build - # Configure cmake -S ${BASIC_EXAMPLE_DIR} -B ${BASIC_EXAMPLE_BUILD_DIR} -Dcudf_ROOT="${LIB_BUILD_DIR}" # Build cmake --build ${BASIC_EXAMPLE_BUILD_DIR} -j${PARALLEL_LEVEL} + +# Strings example +STRINGS_EXAMPLE_DIR=${EXAMPLES_DIR}/strings +STRINGS_EXAMPLE_BUILD_DIR=${STRINGS_EXAMPLE_DIR}/build +# Configure +cmake -S ${STRINGS_EXAMPLE_DIR} -B ${STRINGS_EXAMPLE_BUILD_DIR} -Dcudf_ROOT="${LIB_BUILD_DIR}" +# Build +cmake --build ${STRINGS_EXAMPLE_BUILD_DIR} -j${PARALLEL_LEVEL} diff --git a/cpp/examples/strings/CMakeLists.txt b/cpp/examples/strings/CMakeLists.txt new file mode 100644 index 00000000000..732a050902e --- /dev/null +++ b/cpp/examples/strings/CMakeLists.txt @@ -0,0 +1,50 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. + +cmake_minimum_required(VERSION 3.18) + +project( + strings_examples + VERSION 0.0.1 + LANGUAGES CXX CUDA +) + +set(CPM_DOWNLOAD_VERSION v0.35.3) +file( + DOWNLOAD + https://github.com/cpm-cmake/CPM.cmake/releases/download/${CPM_DOWNLOAD_VERSION}/get_cpm.cmake + ${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake +) +include(${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake) + +set(CUDF_TAG branch-22.12) +CPMFindPackage( + NAME cudf GIT_REPOSITORY https://github.com/rapidsai/cudf + GIT_TAG ${CUDF_TAG} + GIT_SHALLOW + TRUE + SOURCE_SUBDIR + cpp +) + +list(APPEND CUDF_CUDA_FLAGS --expt-extended-lambda --expt-relaxed-constexpr) + +# +add_executable(libcudf_apis libcudf_apis.cpp) +target_compile_features(libcudf_apis PRIVATE cxx_std_17) +target_link_libraries(libcudf_apis PRIVATE cudf::cudf nvToolsExt) + +add_executable(custom_with_malloc custom_with_malloc.cu) +target_compile_features(custom_with_malloc PRIVATE cxx_std_17) +target_compile_options(custom_with_malloc PRIVATE "$<$:${CUDF_CUDA_FLAGS}>") +target_link_libraries(custom_with_malloc PRIVATE cudf::cudf nvToolsExt) + +add_executable(custom_gather custom_gather.cu) +target_compile_features(custom_gather PRIVATE cxx_std_17) +target_compile_options(custom_gather PRIVATE "$<$:${CUDF_CUDA_FLAGS}>") +target_link_libraries(custom_gather PRIVATE cudf::cudf nvToolsExt) + +add_executable(custom_fast custom_fast.cu) +target_compile_features(custom_fast PRIVATE cxx_std_17) +target_compile_options(custom_fast PRIVATE "$<$:${CUDF_CUDA_FLAGS}>") +target_link_libraries(custom_fast PRIVATE cudf::cudf nvToolsExt) + diff --git a/cpp/examples/strings/common.hpp b/cpp/examples/strings/common.hpp new file mode 100644 index 00000000000..efdd211e9e3 --- /dev/null +++ b/cpp/examples/strings/common.hpp @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +/** + * @brief Main example function returns redacted strings column. + * + * This function returns a redacted version of the input `names` column + * using the the `visibilities` column as in the following example + * ``` + * names visibility --> redacted + * John Doe public D John + * Bobby Joe private X X + * ``` + * + * @param names First and last names separated with a single space + * @param visibilities String values `public` or `private` only + * @return Redacted strings column + */ +std::unique_ptr redact_strings(cudf::column_view const& names, + cudf::column_view const& visibilities); + +/** + * @brief Create CUDA memory resource + */ +auto make_cuda_mr() { return std::make_shared(); } + +/** + * @brief Create a pool device memory resource + */ +auto make_pool_mr() +{ + return rmm::mr::make_owning_wrapper(make_cuda_mr()); +} + +/** + * @brief Create memory resource for libcudf functions + */ +std::shared_ptr create_memory_resource(std::string const& name) +{ + if (name == "pool") { return make_pool_mr(); } + return make_cuda_mr(); +} + +/** + * @brief Main for strings examples + * + * Command line parameters: + * 1. CSV file name/path + * 2. Memory resource (optional): 'pool' or 'cuda' + * + * The stdout includes the number of rows in the input and the output size in bytes. + */ +int main(int argc, char const** argv) +{ + if (argc < 2) { + std::cout << "required parameter: csv-file-path\n"; + return 1; + } + + auto const mr_name = std::string{argc > 2 ? std::string(argv[2]) : std::string("cuda")}; + auto resource = create_memory_resource(mr_name); + rmm::mr::set_current_device_resource(resource.get()); + + auto const csv_file = std::string{argv[1]}; + auto const csv_result = [csv_file] { + cudf::io::csv_reader_options in_opts = + cudf::io::csv_reader_options::builder(cudf::io::source_info{csv_file}).header(-1); + return std::move(cudf::io::read_csv(in_opts).tbl); + }(); + auto const csv_table = csv_result->view(); + + std::cout << "table: " << csv_table.num_rows() << " rows " << csv_table.num_columns() + << " columns\n"; + + // utility function that captures current time in microseconds + auto get_time = []() { + timeval tv; + gettimeofday(&tv, nullptr); + return static_cast(tv.tv_sec * 1000000 + tv.tv_usec) / 1000000.0; + }; + + auto st = get_time(); + auto result = redact_strings(csv_table.column(0), csv_table.column(1)); + auto elapsed = get_time() - st; + std::cout << "Wall time: " << elapsed << " seconds\n"; + std::cout << "Output size " << result->view().child(1).size() << " bytes\n"; + + return 0; +} diff --git a/cpp/examples/strings/custom_fast.cu b/cpp/examples/strings/custom_fast.cu new file mode 100644 index 00000000000..41176199e8e --- /dev/null +++ b/cpp/examples/strings/custom_fast.cu @@ -0,0 +1,155 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "common.hpp" + +#include +#include + +#include +#include +#include + +#include + +#include +#include + +/** + * @brief Computes the size of each output row + * + * @param d_names Column of names + * @param d_visibilities Column of visibilities + * @param d_sizes Output sizes for each row + */ +__global__ void sizes_kernel(cudf::column_device_view const d_names, + cudf::column_device_view const d_visibilities, + cudf::size_type* d_sizes) +{ + auto index = threadIdx.x + blockIdx.x * blockDim.x; + if (index >= d_names.size()) return; + + auto const visible = cudf::string_view("public", 6); + auto const redaction = cudf::string_view("X X", 3); + + auto const name = d_names.element(index); + auto const vis = d_visibilities.element(index); + + cudf::size_type result = redaction.size_bytes(); // init to redaction size + if (vis == visible) { + auto const space_idx = name.find(' '); + auto const first = name.substr(0, space_idx); + auto const last_initial = name.substr(space_idx + 1, 1); + + result = first.size_bytes() + last_initial.size_bytes() + 1; + } + + d_sizes[index] = result; +} + +/** + * @brief Builds the output for each row + * + * @param d_names Column of names + * @param d_visibilities Column of visibilities + * @param d_offsets Byte offset in `d_chars` for each row + * @param d_chars Output memory for all rows + */ +__global__ void redact_kernel(cudf::column_device_view const d_names, + cudf::column_device_view const d_visibilities, + cudf::size_type const* d_offsets, + char* d_chars) +{ + auto index = threadIdx.x + blockIdx.x * blockDim.x; + if (index >= d_names.size()) return; + + auto const visible = cudf::string_view("public", 6); + auto const redaction = cudf::string_view("X X", 3); + + // resolve output_ptr using the offsets vector + char* output_ptr = d_chars + d_offsets[index]; + + auto const name = d_names.element(index); + auto const vis = d_visibilities.element(index); + + if (vis == visible) { + auto const space_idx = name.find(' '); + auto const first = name.substr(0, space_idx); + auto const last_initial = name.substr(space_idx + 1, 1); + auto const output_size = first.size_bytes() + last_initial.size_bytes() + 1; + + // build output string + memcpy(output_ptr, last_initial.data(), last_initial.size_bytes()); + output_ptr += last_initial.size_bytes(); + memcpy(output_ptr, " ", 1); + output_ptr++; + memcpy(output_ptr, first.data(), first.size_bytes()); + } else { + memcpy(output_ptr, redaction.data(), redaction.size_bytes()); + } +} + +/** + * @brief Redacts each name per the corresponding visibility entry + * + * This implementation builds the strings column children (offsets and chars) + * directly into device memory for libcudf. + * + * @param names Column of names + * @param visibilities Column of visibilities + * @return Redacted column of names + */ +std::unique_ptr redact_strings(cudf::column_view const& names, + cudf::column_view const& visibilities) +{ + auto stream = rmm::cuda_stream_default; + + auto const d_names = cudf::column_device_view::create(names, stream); + auto const d_visibilities = cudf::column_device_view::create(visibilities, stream); + + constexpr int block_size = 128; + int const blocks = (names.size() + block_size - 1) / block_size; + + nvtxRangePushA("redact_strings"); + + // create offsets vector + auto offsets = rmm::device_uvector(names.size() + 1, stream); + + // compute output sizes + sizes_kernel<<>>( + *d_names, *d_visibilities, offsets.data()); + + // convert sizes to offsets (in place) + thrust::exclusive_scan(rmm::exec_policy(stream), offsets.begin(), offsets.end(), offsets.begin()); + + // last element is the total output size + // (device-to-host copy of 1 integer) + cudf::size_type output_size = offsets.back_element(stream); + + // create chars vector + auto chars = rmm::device_uvector(output_size, stream); + + // build chars output + redact_kernel<<>>( + *d_names, *d_visibilities, offsets.data(), chars.data()); + + // create column from offsets and chars vectors (no copy is performed) + auto result = cudf::make_strings_column(names.size(), std::move(offsets), std::move(chars)); + cudaStreamSynchronize(stream.value()); + + nvtxRangePop(); + return result; +} diff --git a/cpp/examples/strings/custom_gather.cu b/cpp/examples/strings/custom_gather.cu new file mode 100644 index 00000000000..6725a643fdf --- /dev/null +++ b/cpp/examples/strings/custom_gather.cu @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "common.hpp" + +#include +#include +#include + +#include + +#include +#include + +/** + * @brief Builds the output for each row + * + * @param d_names Column of names + * @param d_visibilities Column of visibilities + * @param redaction Redacted string replacement + * @param working_memory Output memory for all rows + * @param d_offsets Byte offset in `d_chars` for each row + * @param d_output Output array of string_view objects + */ +__global__ void redact_kernel(cudf::column_device_view const d_names, + cudf::column_device_view const d_visibilities, + cudf::string_view redaction, + char* working_memory, + cudf::offset_type const* d_offsets, + cudf::string_view* d_output) +{ + auto index = threadIdx.x + blockIdx.x * blockDim.x; + if (index >= d_names.size()) return; + + auto const visible = cudf::string_view("public", 6); + + auto const name = d_names.element(index); + auto const vis = d_visibilities.element(index); + if (vis == visible) { + auto const space_idx = name.find(' '); + auto const first = name.substr(0, space_idx); + auto const last_initial = name.substr(space_idx + 1, 1); + auto const output_size = first.size_bytes() + last_initial.size_bytes() + 1; + + char* output_ptr = working_memory + d_offsets[index]; + d_output[index] = cudf::string_view{output_ptr, output_size}; + + // build output string + memcpy(output_ptr, last_initial.data(), last_initial.size_bytes()); + output_ptr += last_initial.size_bytes(); + memcpy(output_ptr, " ", 1); + output_ptr++; + memcpy(output_ptr, first.data(), first.size_bytes()); + } else { + d_output[index] = cudf::string_view{redaction.data(), redaction.size_bytes()}; + } +} + +/** + * @brief Redacts each name per the corresponding visibility entry + * + * This implementation builds the individual strings into a fixed memory buffer + * and then calls a factory function to gather them into a strings column. + * + * @param names Column of names + * @param visibilities Column of visibilities + * @return Redacted column of names + */ +std::unique_ptr redact_strings(cudf::column_view const& names, + cudf::column_view const& visibilities) +{ + auto stream = rmm::cuda_stream_default; + + auto const d_names = cudf::column_device_view::create(names, stream); + auto const d_visibilities = cudf::column_device_view::create(visibilities, stream); + auto const d_redaction = cudf::string_scalar(std::string("X X"), true, stream); + + constexpr auto block_size = 128; + auto const blocks = (names.size() + block_size - 1) / block_size; + + nvtxRangePushA("redact_strings"); + + auto const scv = cudf::strings_column_view(names); + auto const offsets = scv.offsets_begin(); + + // create working memory to hold the output of each string + auto working_memory = rmm::device_uvector(scv.chars_size(), stream); + // create a vector for the output strings' pointers + auto str_ptrs = rmm::device_uvector(names.size(), stream); + + // build the output strings + redact_kernel<<>>(*d_names, + *d_visibilities, + d_redaction.value(), + working_memory.data(), + offsets, + str_ptrs.data()); + + // create strings column from the string_pairs; + // this copies all the individual strings into a single output column + auto result = cudf::make_strings_column(str_ptrs, cudf::string_view{nullptr, 0}, stream); + // temporary memory cleanup cost here for str_ptrs and working_memory + + nvtxRangePop(); + return result; +} diff --git a/cpp/examples/strings/custom_with_malloc.cu b/cpp/examples/strings/custom_with_malloc.cu new file mode 100644 index 00000000000..ebcc70b34c7 --- /dev/null +++ b/cpp/examples/strings/custom_with_malloc.cu @@ -0,0 +1,151 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "common.hpp" + +#include +#include +#include + +#include + +#include +#include + +/** + * @brief Reserve CUDA malloc heap size + * + * Call this function to change the CUDA malloc heap size limit. + * This value depends on the total size of all the malloc() + * calls needed for redact_kernel. + * + * @param heap_size Number of bytes to reserve + * Default is 1GB + */ +void set_malloc_heap_size(size_t heap_size = 1073741824) // 1GB +{ + size_t max_malloc_heap_size = 0; + cudaDeviceGetLimit(&max_malloc_heap_size, cudaLimitMallocHeapSize); + if (max_malloc_heap_size < heap_size) { + max_malloc_heap_size = heap_size; + if (cudaDeviceSetLimit(cudaLimitMallocHeapSize, max_malloc_heap_size) != cudaSuccess) { + fprintf(stderr, "could not set malloc heap size to %ldMB\n", (heap_size / (1024 * 1024))); + throw std::runtime_error(""); + } + } +} + +/** + * @brief Builds the output for each row + * + * Note: This uses malloc() in a device kernel which works great + * but is not very efficient. This can be useful for prototyping + * on functions where performance is not yet important. + * All calls to malloc() must have a corresponding free() call. + * The separate free_kernel is launched for this purpose. + * + * @param d_names Column of names + * @param d_visibilities Column of visibilities + * @param redaction Redacted string replacement + * @param d_output Output array of string_view objects + */ +__global__ void redact_kernel(cudf::column_device_view const d_names, + cudf::column_device_view const d_visibilities, + cudf::string_view redaction, + cudf::string_view* d_output) +{ + // get index for this thread + auto index = threadIdx.x + blockIdx.x * blockDim.x; + if (index >= d_names.size()) return; + + auto const visible = cudf::string_view("public", 6); + + auto const name = d_names.element(index); + auto const vis = d_visibilities.element(index); + if (vis == visible) { + auto const space_idx = name.find(' '); + auto const first = name.substr(0, space_idx); + auto const last_initial = name.substr(space_idx + 1, 1); + auto const output_size = first.size_bytes() + last_initial.size_bytes() + 1; + + char* output_ptr = static_cast(malloc(output_size)); + d_output[index] = cudf::string_view{output_ptr, output_size}; + + // build output string + memcpy(output_ptr, last_initial.data(), last_initial.size_bytes()); + output_ptr += last_initial.size_bytes(); + memcpy(output_ptr, " ", 1); + output_ptr++; + memcpy(output_ptr, first.data(), first.size_bytes()); + } else { + d_output[index] = cudf::string_view{redaction.data(), redaction.size_bytes()}; + } +} + +/** + * @brief Frees the temporary individual string objects created in the + * redact_kernel + * + * Like malloc(), free() is not very efficient but must be called for + * each malloc() to return the memory to the CUDA malloc heap. + * + * @param redaction Redacted string replacement (not to be freed) + * @param d_output Output array of string_view objects to free + */ +__global__ void free_kernel(cudf::string_view redaction, cudf::string_view* d_output, int count) +{ + auto index = threadIdx.x + blockIdx.x * blockDim.x; + if (index >= count) return; + + auto ptr = const_cast(d_output[index].data()); + if (ptr != redaction.data()) free(ptr); +} + +std::unique_ptr redact_strings(cudf::column_view const& names, + cudf::column_view const& visibilities) +{ + auto stream = rmm::cuda_stream_default; + set_malloc_heap_size(); + + auto const d_names = cudf::column_device_view::create(names, stream); + auto const d_visibilities = cudf::column_device_view::create(visibilities, stream); + auto const d_redaction = cudf::string_scalar(std::string("X X"), true, stream); + + constexpr auto block_size = 128; + auto const blocks = (names.size() + block_size - 1) / block_size; + + nvtxRangePushA("redact_strings"); + + // create a vector for the output strings' pointers + auto str_ptrs = new rmm::device_uvector(names.size(), stream); + + auto result = [&] { + // build the output strings + redact_kernel<<>>( + *d_names, *d_visibilities, d_redaction.value(), str_ptrs->data()); + // create strings column from the string_view vector + // this copies all the individual strings into a single output column + return cudf::make_strings_column(*str_ptrs, cudf::string_view{nullptr, 0}, stream); + }(); + + // free the individual temporary memory pointers + free_kernel<<>>( + d_redaction.value(), str_ptrs->data(), names.size()); + delete str_ptrs; + + nvtxRangePop(); + return result; +} diff --git a/cpp/examples/strings/libcudf_apis.cpp b/cpp/examples/strings/libcudf_apis.cpp new file mode 100644 index 00000000000..009e92d8a0d --- /dev/null +++ b/cpp/examples/strings/libcudf_apis.cpp @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "common.hpp" + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +/** + * @brief Redacts each name per the corresponding visibility entry + * + * This implementation uses libcudf APIs to create the output result. + * + * @param names Column of names + * @param visibilities Column of visibilities + * @return Redacted column of names + */ +std::unique_ptr redact_strings(cudf::column_view const& names, + cudf::column_view const& visibilities) +{ + auto const visible = cudf::string_scalar(std::string("public")); + auto const redaction = cudf::string_scalar(std::string("X X")); + + nvtxRangePushA("redact_strings"); + + auto const allowed = cudf::strings::contains(visibilities, visible); + auto const redacted = cudf::copy_if_else(names, redaction, allowed->view()); + auto const first_last = cudf::strings::split(redacted->view()); + auto const first = first_last->view().column(0); + auto const last = first_last->view().column(1); + auto const last_initial = cudf::strings::slice_strings(last, 0, 1); + + auto const last_initial_first = cudf::table_view({last_initial->view(), first}); + + auto result = cudf::strings::concatenate(last_initial_first, std::string(" ")); + + cudaStreamSynchronize(0); + + nvtxRangePop(); + return result; +} diff --git a/cpp/examples/strings/names.csv b/cpp/examples/strings/names.csv new file mode 100644 index 00000000000..dec1c2a7a7b --- /dev/null +++ b/cpp/examples/strings/names.csv @@ -0,0 +1,20 @@ +John Doe,public +Jane Doe,private +Billy Joe,private +James James,public +Michael Frederick,public +Christopher Cheryl,public +Jessica Autumn,public +Matthew Tyrone,public +Ashley Martha,public +Jennifer Omar,public +Joshua Lydia,public +Amanda Jerome,public +Daniel Theodore,public +David Abby,public +James Neil,public +Robert Shawna,private +John Sierra,private +Joseph Nina,private +Andrew Tammy,private +Ryan Nikki,public \ No newline at end of file From 93c813f12687aa88954a065b24ab436eded7e017 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 3 Oct 2022 17:25:56 -0400 Subject: [PATCH 2/7] add/remove extra blank lines --- cpp/examples/strings/CMakeLists.txt | 1 - cpp/examples/strings/names.csv | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/examples/strings/CMakeLists.txt b/cpp/examples/strings/CMakeLists.txt index 732a050902e..92322418087 100644 --- a/cpp/examples/strings/CMakeLists.txt +++ b/cpp/examples/strings/CMakeLists.txt @@ -47,4 +47,3 @@ add_executable(custom_fast custom_fast.cu) target_compile_features(custom_fast PRIVATE cxx_std_17) target_compile_options(custom_fast PRIVATE "$<$:${CUDF_CUDA_FLAGS}>") target_link_libraries(custom_fast PRIVATE cudf::cudf nvToolsExt) - diff --git a/cpp/examples/strings/names.csv b/cpp/examples/strings/names.csv index dec1c2a7a7b..77dca3e02af 100644 --- a/cpp/examples/strings/names.csv +++ b/cpp/examples/strings/names.csv @@ -17,4 +17,4 @@ Robert Shawna,private John Sierra,private Joseph Nina,private Andrew Tammy,private -Ryan Nikki,public \ No newline at end of file +Ryan Nikki,public From 0ecc23c95e3a459a4fa15e9177ba9c31aabd2288 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Wed, 5 Oct 2022 08:33:05 -0400 Subject: [PATCH 3/7] rename source files to match better with the blog names --- cpp/examples/strings/CMakeLists.txt | 18 +++++++++--------- .../{custom_fast.cu => custom_optimized.cu} | 0 .../{custom_gather.cu => custom_prealloc.cu} | 0 3 files changed, 9 insertions(+), 9 deletions(-) rename cpp/examples/strings/{custom_fast.cu => custom_optimized.cu} (100%) rename cpp/examples/strings/{custom_gather.cu => custom_prealloc.cu} (100%) diff --git a/cpp/examples/strings/CMakeLists.txt b/cpp/examples/strings/CMakeLists.txt index 92322418087..41bc5e00224 100644 --- a/cpp/examples/strings/CMakeLists.txt +++ b/cpp/examples/strings/CMakeLists.txt @@ -38,12 +38,12 @@ target_compile_features(custom_with_malloc PRIVATE cxx_std_17) target_compile_options(custom_with_malloc PRIVATE "$<$:${CUDF_CUDA_FLAGS}>") target_link_libraries(custom_with_malloc PRIVATE cudf::cudf nvToolsExt) -add_executable(custom_gather custom_gather.cu) -target_compile_features(custom_gather PRIVATE cxx_std_17) -target_compile_options(custom_gather PRIVATE "$<$:${CUDF_CUDA_FLAGS}>") -target_link_libraries(custom_gather PRIVATE cudf::cudf nvToolsExt) - -add_executable(custom_fast custom_fast.cu) -target_compile_features(custom_fast PRIVATE cxx_std_17) -target_compile_options(custom_fast PRIVATE "$<$:${CUDF_CUDA_FLAGS}>") -target_link_libraries(custom_fast PRIVATE cudf::cudf nvToolsExt) +add_executable(custom_prealloc custom_prealloc.cu) +target_compile_features(custom_prealloc PRIVATE cxx_std_17) +target_compile_options(custom_prealloc PRIVATE "$<$:${CUDF_CUDA_FLAGS}>") +target_link_libraries(custom_prealloc PRIVATE cudf::cudf nvToolsExt) + +add_executable(custom_optimized custom_optimized.cu) +target_compile_features(custom_optimized PRIVATE cxx_std_17) +target_compile_options(custom_optimized PRIVATE "$<$:${CUDF_CUDA_FLAGS}>") +target_link_libraries(custom_optimized PRIVATE cudf::cudf nvToolsExt) diff --git a/cpp/examples/strings/custom_fast.cu b/cpp/examples/strings/custom_optimized.cu similarity index 100% rename from cpp/examples/strings/custom_fast.cu rename to cpp/examples/strings/custom_optimized.cu diff --git a/cpp/examples/strings/custom_gather.cu b/cpp/examples/strings/custom_prealloc.cu similarity index 100% rename from cpp/examples/strings/custom_gather.cu rename to cpp/examples/strings/custom_prealloc.cu From e2cc8116e3ae467f803120483135c8e437a631d6 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Wed, 5 Oct 2022 20:30:56 -0400 Subject: [PATCH 4/7] update memcpy(1) to *output_ptr++ --- cpp/examples/strings/custom_optimized.cu | 3 +-- cpp/examples/strings/custom_prealloc.cu | 3 +-- cpp/examples/strings/custom_with_malloc.cu | 3 +-- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/cpp/examples/strings/custom_optimized.cu b/cpp/examples/strings/custom_optimized.cu index 41176199e8e..0164e1bca06 100644 --- a/cpp/examples/strings/custom_optimized.cu +++ b/cpp/examples/strings/custom_optimized.cu @@ -94,8 +94,7 @@ __global__ void redact_kernel(cudf::column_device_view const d_names, // build output string memcpy(output_ptr, last_initial.data(), last_initial.size_bytes()); output_ptr += last_initial.size_bytes(); - memcpy(output_ptr, " ", 1); - output_ptr++; + *output_ptr++ = ' '; memcpy(output_ptr, first.data(), first.size_bytes()); } else { memcpy(output_ptr, redaction.data(), redaction.size_bytes()); diff --git a/cpp/examples/strings/custom_prealloc.cu b/cpp/examples/strings/custom_prealloc.cu index 6725a643fdf..d2ebecad6ff 100644 --- a/cpp/examples/strings/custom_prealloc.cu +++ b/cpp/examples/strings/custom_prealloc.cu @@ -61,8 +61,7 @@ __global__ void redact_kernel(cudf::column_device_view const d_names, // build output string memcpy(output_ptr, last_initial.data(), last_initial.size_bytes()); output_ptr += last_initial.size_bytes(); - memcpy(output_ptr, " ", 1); - output_ptr++; + *output_ptr++ = ' '; memcpy(output_ptr, first.data(), first.size_bytes()); } else { d_output[index] = cudf::string_view{redaction.data(), redaction.size_bytes()}; diff --git a/cpp/examples/strings/custom_with_malloc.cu b/cpp/examples/strings/custom_with_malloc.cu index ebcc70b34c7..5db457e4c8f 100644 --- a/cpp/examples/strings/custom_with_malloc.cu +++ b/cpp/examples/strings/custom_with_malloc.cu @@ -87,8 +87,7 @@ __global__ void redact_kernel(cudf::column_device_view const d_names, // build output string memcpy(output_ptr, last_initial.data(), last_initial.size_bytes()); output_ptr += last_initial.size_bytes(); - memcpy(output_ptr, " ", 1); - output_ptr++; + *output_ptr++ = ' '; memcpy(output_ptr, first.data(), first.size_bytes()); } else { d_output[index] = cudf::string_view{redaction.data(), redaction.size_bytes()}; From 6c706f67c941d0b63741e69684c6590aa68eb7b9 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 11 Oct 2022 10:19:44 -0400 Subject: [PATCH 5/7] use chrono for wall time --- cpp/examples/strings/common.hpp | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/cpp/examples/strings/common.hpp b/cpp/examples/strings/common.hpp index efdd211e9e3..dbd3c4dbd1b 100644 --- a/cpp/examples/strings/common.hpp +++ b/cpp/examples/strings/common.hpp @@ -27,11 +27,10 @@ #include #include -#include +#include #include #include #include -#include /** * @brief Main example function returns redacted strings column. @@ -97,24 +96,18 @@ int main(int argc, char const** argv) auto const csv_result = [csv_file] { cudf::io::csv_reader_options in_opts = cudf::io::csv_reader_options::builder(cudf::io::source_info{csv_file}).header(-1); - return std::move(cudf::io::read_csv(in_opts).tbl); + return cudf::io::read_csv(in_opts).tbl; }(); auto const csv_table = csv_result->view(); std::cout << "table: " << csv_table.num_rows() << " rows " << csv_table.num_columns() << " columns\n"; - // utility function that captures current time in microseconds - auto get_time = []() { - timeval tv; - gettimeofday(&tv, nullptr); - return static_cast(tv.tv_sec * 1000000 + tv.tv_usec) / 1000000.0; - }; + auto st = std::chrono::steady_clock::now(); + auto result = redact_strings(csv_table.column(0), csv_table.column(1)); - auto st = get_time(); - auto result = redact_strings(csv_table.column(0), csv_table.column(1)); - auto elapsed = get_time() - st; - std::cout << "Wall time: " << elapsed << " seconds\n"; + std::chrono::duration elapsed = std::chrono::steady_clock::now() - st; + std::cout << "Wall time: " << elapsed.count() << " seconds\n"; std::cout << "Output size " << result->view().child(1).size() << " bytes\n"; return 0; From 83055cc24c9690f50afc5bc85830fc862bd09c8a Mon Sep 17 00:00:00 2001 From: David Wendt Date: Wed, 12 Oct 2022 15:36:28 -0400 Subject: [PATCH 6/7] add more comments --- cpp/examples/strings/custom_optimized.cu | 17 ++++++++++++++--- cpp/examples/strings/custom_prealloc.cu | 12 ++++++++++-- cpp/examples/strings/custom_with_malloc.cu | 18 +++++++++++++----- 3 files changed, 37 insertions(+), 10 deletions(-) diff --git a/cpp/examples/strings/custom_optimized.cu b/cpp/examples/strings/custom_optimized.cu index 0164e1bca06..bfe650daa93 100644 --- a/cpp/examples/strings/custom_optimized.cu +++ b/cpp/examples/strings/custom_optimized.cu @@ -31,6 +31,8 @@ /** * @brief Computes the size of each output row * + * This thread is called once per row in d_names. + * * @param d_names Column of names * @param d_visibilities Column of visibilities * @param d_sizes Output sizes for each row @@ -39,7 +41,9 @@ __global__ void sizes_kernel(cudf::column_device_view const d_names, cudf::column_device_view const d_visibilities, cudf::size_type* d_sizes) { + // The row index is resolved from the CUDA thread/block objects auto index = threadIdx.x + blockIdx.x * blockDim.x; + // There may be more threads than actual rows if (index >= d_names.size()) return; auto const visible = cudf::string_view("public", 6); @@ -63,6 +67,8 @@ __global__ void sizes_kernel(cudf::column_device_view const d_names, /** * @brief Builds the output for each row * + * This thread is called once per row in d_names. + * * @param d_names Column of names * @param d_visibilities Column of visibilities * @param d_offsets Byte offset in `d_chars` for each row @@ -73,7 +79,9 @@ __global__ void redact_kernel(cudf::column_device_view const d_names, cudf::size_type const* d_offsets, char* d_chars) { + // The row index is resolved from the CUDA thread/block objects auto index = threadIdx.x + blockIdx.x * blockDim.x; + // There may be more threads than actual rows if (index >= d_names.size()) return; auto const visible = cudf::string_view("public", 6); @@ -114,12 +122,13 @@ __global__ void redact_kernel(cudf::column_device_view const d_names, std::unique_ptr redact_strings(cudf::column_view const& names, cudf::column_view const& visibilities) { + // all device memory operations and kernel functions will run on this stream auto stream = rmm::cuda_stream_default; auto const d_names = cudf::column_device_view::create(names, stream); auto const d_visibilities = cudf::column_device_view::create(visibilities, stream); - constexpr int block_size = 128; + constexpr int block_size = 128; // this arbitrary size should be a power of 2 int const blocks = (names.size() + block_size - 1) / block_size; nvtxRangePushA("redact_strings"); @@ -135,7 +144,7 @@ std::unique_ptr redact_strings(cudf::column_view const& names, thrust::exclusive_scan(rmm::exec_policy(stream), offsets.begin(), offsets.end(), offsets.begin()); // last element is the total output size - // (device-to-host copy of 1 integer) + // (device-to-host copy of 1 integer -- includes synching the stream) cudf::size_type output_size = offsets.back_element(stream); // create chars vector @@ -147,7 +156,9 @@ std::unique_ptr redact_strings(cudf::column_view const& names, // create column from offsets and chars vectors (no copy is performed) auto result = cudf::make_strings_column(names.size(), std::move(offsets), std::move(chars)); - cudaStreamSynchronize(stream.value()); + + // wait for all of the above to finish + stream.synchronize(); nvtxRangePop(); return result; diff --git a/cpp/examples/strings/custom_prealloc.cu b/cpp/examples/strings/custom_prealloc.cu index d2ebecad6ff..c0bae03af5c 100644 --- a/cpp/examples/strings/custom_prealloc.cu +++ b/cpp/examples/strings/custom_prealloc.cu @@ -28,6 +28,8 @@ /** * @brief Builds the output for each row * + * This thread is called once per row in d_names. + * * @param d_names Column of names * @param d_visibilities Column of visibilities * @param redaction Redacted string replacement @@ -42,7 +44,9 @@ __global__ void redact_kernel(cudf::column_device_view const d_names, cudf::offset_type const* d_offsets, cudf::string_view* d_output) { + // The row index is resolved from the CUDA thread/block objects auto index = threadIdx.x + blockIdx.x * blockDim.x; + // There may be more threads than actual rows if (index >= d_names.size()) return; auto const visible = cudf::string_view("public", 6); @@ -81,14 +85,15 @@ __global__ void redact_kernel(cudf::column_device_view const d_names, std::unique_ptr redact_strings(cudf::column_view const& names, cudf::column_view const& visibilities) { + // all device memory operations and kernel functions will run on this stream auto stream = rmm::cuda_stream_default; auto const d_names = cudf::column_device_view::create(names, stream); auto const d_visibilities = cudf::column_device_view::create(visibilities, stream); auto const d_redaction = cudf::string_scalar(std::string("X X"), true, stream); - constexpr auto block_size = 128; - auto const blocks = (names.size() + block_size - 1) / block_size; + constexpr int block_size = 128; // this arbitrary size should be a power of 2 + auto const blocks = (names.size() + block_size - 1) / block_size; nvtxRangePushA("redact_strings"); @@ -113,6 +118,9 @@ std::unique_ptr redact_strings(cudf::column_view const& names, auto result = cudf::make_strings_column(str_ptrs, cudf::string_view{nullptr, 0}, stream); // temporary memory cleanup cost here for str_ptrs and working_memory + // wait for all of the above to finish + stream.synchronize(); + nvtxRangePop(); return result; } diff --git a/cpp/examples/strings/custom_with_malloc.cu b/cpp/examples/strings/custom_with_malloc.cu index 5db457e4c8f..f1d397ef007 100644 --- a/cpp/examples/strings/custom_with_malloc.cu +++ b/cpp/examples/strings/custom_with_malloc.cu @@ -51,6 +51,8 @@ void set_malloc_heap_size(size_t heap_size = 1073741824) // 1GB /** * @brief Builds the output for each row * + * This thread is called once per row in d_names. + * * Note: This uses malloc() in a device kernel which works great * but is not very efficient. This can be useful for prototyping * on functions where performance is not yet important. @@ -67,8 +69,9 @@ __global__ void redact_kernel(cudf::column_device_view const d_names, cudf::string_view redaction, cudf::string_view* d_output) { - // get index for this thread + // The row index is resolved from the CUDA thread/block objects auto index = threadIdx.x + blockIdx.x * blockDim.x; + // There may be more threads than actual rows if (index >= d_names.size()) return; auto const visible = cudf::string_view("public", 6); @@ -110,21 +113,23 @@ __global__ void free_kernel(cudf::string_view redaction, cudf::string_view* d_ou if (index >= count) return; auto ptr = const_cast(d_output[index].data()); - if (ptr != redaction.data()) free(ptr); + if (ptr != redaction.data()) { free(ptr); } } std::unique_ptr redact_strings(cudf::column_view const& names, cudf::column_view const& visibilities) { + // all device memory operations and kernel functions will run on this stream auto stream = rmm::cuda_stream_default; - set_malloc_heap_size(); + + set_malloc_heap_size(); // to illustrate adjusting the malloc heap auto const d_names = cudf::column_device_view::create(names, stream); auto const d_visibilities = cudf::column_device_view::create(visibilities, stream); auto const d_redaction = cudf::string_scalar(std::string("X X"), true, stream); - constexpr auto block_size = 128; - auto const blocks = (names.size() + block_size - 1) / block_size; + constexpr int block_size = 128; // this arbitrary size should be a power of 2 + auto const blocks = (names.size() + block_size - 1) / block_size; nvtxRangePushA("redact_strings"); @@ -145,6 +150,9 @@ std::unique_ptr redact_strings(cudf::column_view const& names, d_redaction.value(), str_ptrs->data(), names.size()); delete str_ptrs; + // wait for all of the above to finish + stream.synchronize(); + nvtxRangePop(); return result; } From d95708ea720fb419025c2ff87f4590cba53a7d0d Mon Sep 17 00:00:00 2001 From: David Wendt Date: Fri, 14 Oct 2022 10:25:17 -0400 Subject: [PATCH 7/7] update cmake_minimum_required version to 3.23.1 --- cpp/examples/basic/CMakeLists.txt | 2 +- cpp/examples/strings/CMakeLists.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/examples/basic/CMakeLists.txt b/cpp/examples/basic/CMakeLists.txt index b182cb08774..7e7c6b191b5 100644 --- a/cpp/examples/basic/CMakeLists.txt +++ b/cpp/examples/basic/CMakeLists.txt @@ -1,6 +1,6 @@ # Copyright (c) 2020-2022, NVIDIA CORPORATION. -cmake_minimum_required(VERSION 3.18) +cmake_minimum_required(VERSION 3.23.1) project( basic_example diff --git a/cpp/examples/strings/CMakeLists.txt b/cpp/examples/strings/CMakeLists.txt index 41bc5e00224..1a16b2bc8fd 100644 --- a/cpp/examples/strings/CMakeLists.txt +++ b/cpp/examples/strings/CMakeLists.txt @@ -1,6 +1,6 @@ # Copyright (c) 2022, NVIDIA CORPORATION. -cmake_minimum_required(VERSION 3.18) +cmake_minimum_required(VERSION 3.23.1) project( strings_examples