diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index a6f72ed6b75..4080c5d02da 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -666,6 +666,7 @@ add_library( src/unary/math_ops.cu src/unary/nan_ops.cu src/unary/null_ops.cu + src/utilities/cuda.cpp src/utilities/cuda_memcpy.cu src/utilities/default_stream.cpp src/utilities/host_memory.cpp diff --git a/cpp/benchmarks/join/generate_input_tables.cuh b/cpp/benchmarks/join/generate_input_tables.cuh index f7984b29d6b..75bbe8174d3 100644 --- a/cpp/benchmarks/join/generate_input_tables.cuh +++ b/cpp/benchmarks/join/generate_input_tables.cuh @@ -17,6 +17,7 @@ #pragma once #include +#include #include #include #include @@ -150,13 +151,8 @@ void generate_input_tables(key_type* const build_tbl, CUDF_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor( &num_blocks_init_probe_tbl, init_probe_tbl, block_size, 0)); - int dev_id{-1}; - CUDF_CUDA_TRY(cudaGetDevice(&dev_id)); - - int num_sms{-1}; - CUDF_CUDA_TRY(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, dev_id)); - - int const num_states = + auto const num_sms = cudf::detail::num_multiprocessors(); + auto const num_states = num_sms * std::max(num_blocks_init_build_tbl, num_blocks_init_probe_tbl) * block_size; rmm::device_uvector devStates(num_states, cudf::get_default_stream()); diff --git a/cpp/include/cudf/detail/copy_if.cuh b/cpp/include/cudf/detail/copy_if.cuh index b6310e6cd2f..4071fa01fb2 100644 --- a/cpp/include/cudf/detail/copy_if.cuh +++ b/cpp/include/cudf/detail/copy_if.cuh @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/include/cudf/detail/utilities/cuda.cuh b/cpp/include/cudf/detail/utilities/cuda.cuh index 5007af7f9f1..d31ca3d92d1 100644 --- a/cpp/include/cudf/detail/utilities/cuda.cuh +++ b/cpp/include/cudf/detail/utilities/cuda.cuh @@ -189,35 +189,6 @@ __device__ T single_lane_block_sum_reduce(T lane_value) return result; } -/** - * @brief Get the number of elements that can be 
processed per thread. - * - * @param[in] kernel The kernel for which the elements per thread needs to be assessed - * @param[in] total_size Number of elements - * @param[in] block_size Expected block size - * - * @return cudf::size_type Elements per thread that can be processed for given specification. - */ -template -cudf::size_type elements_per_thread(Kernel kernel, - cudf::size_type total_size, - cudf::size_type block_size, - cudf::size_type max_per_thread = 32) -{ - CUDF_FUNC_RANGE(); - - // calculate theoretical occupancy - int max_blocks = 0; - CUDF_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks, kernel, block_size, 0)); - - int device = 0; - CUDF_CUDA_TRY(cudaGetDevice(&device)); - int num_sms = 0; - CUDF_CUDA_TRY(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, device)); - int per_thread = total_size / (max_blocks * num_sms * block_size); - return std::clamp(per_thread, 1, max_per_thread); -} - /** * @brief Finds the smallest value not less than `number_to_round` and modulo `modulus` is * zero. Expects modulus to be a power of 2. diff --git a/cpp/include/cudf/detail/utilities/cuda.hpp b/cpp/include/cudf/detail/utilities/cuda.hpp new file mode 100644 index 00000000000..58c7ae8ed6a --- /dev/null +++ b/cpp/include/cudf/detail/utilities/cuda.hpp @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#pragma once
+
+#include <cudf/detail/nvtx/ranges.hpp>
+#include <cudf/types.hpp>
+#include <cudf/utilities/error.hpp>
+
+#include <algorithm>
+
+namespace CUDF_EXPORT cudf {
+namespace detail {
+
+/**
+ * @brief Get the number of multiprocessors on the device
+ */
+cudf::size_type num_multiprocessors();
+
+/**
+ * @brief Get the number of elements that can be processed per thread.
+ *
+ * @param[in] kernel The kernel for which the elements per thread needs to be assessed
+ * @param[in] total_size Number of elements
+ * @param[in] block_size Expected block size
+ *
+ * @return cudf::size_type Elements per thread that can be processed for given specification.
+ */
+template <typename Kernel>
+cudf::size_type elements_per_thread(Kernel kernel,
+                                    cudf::size_type total_size,
+                                    cudf::size_type block_size,
+                                    cudf::size_type max_per_thread = 32)
+{
+  CUDF_FUNC_RANGE();
+
+  // calculate theoretical occupancy
+  int max_blocks = 0;
+  CUDF_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks, kernel, block_size, 0));
+
+  int per_thread = total_size / (max_blocks * num_multiprocessors() * block_size);
+  return std::clamp(per_thread, 1, max_per_thread);
+}
+
+}  // namespace detail
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/src/io/comp/debrotli.cu b/cpp/src/io/comp/debrotli.cu
index 861820f47e7..72649dbe427 100644
--- a/cpp/src/io/comp/debrotli.cu
+++ b/cpp/src/io/comp/debrotli.cu
@@ -58,6 +58,7 @@ THE SOFTWARE.
#include "gpuinflate.hpp" #include "io/utilities/block_utils.cuh" +#include #include #include @@ -2047,19 +2048,14 @@ CUDF_KERNEL void __launch_bounds__(block_size, 2) */ size_t __host__ get_gpu_debrotli_scratch_size(int max_num_inputs) { - int sm_count = 0; - int dev = 0; uint32_t max_fb_size, min_fb_size, fb_size; - CUDF_CUDA_TRY(cudaGetDevice(&dev)); - if (cudaSuccess == cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, dev)) { - // printf("%d SMs on device %d\n", sm_count, dev); - max_num_inputs = - min(max_num_inputs, sm_count * 3); // no more than 3 blocks/sm at most due to 32KB smem use - if (max_num_inputs <= 0) { - max_num_inputs = sm_count * 2; // Target 2 blocks/SM by default for scratch mem computation - } + auto const sm_count = cudf::detail::num_multiprocessors(); + // no more than 3 blocks/sm at most due to 32KB smem use + max_num_inputs = std::min(max_num_inputs, sm_count * 3); + if (max_num_inputs <= 0) { + max_num_inputs = sm_count * 2; // Target 2 blocks/SM by default for scratch mem computation } - max_num_inputs = min(max(max_num_inputs, 1), 512); + max_num_inputs = std::min(std::max(max_num_inputs, 1), 512); // Max fb size per block occurs if all huffman tables for all 3 group types fail local_alloc() // with num_htrees=256 (See HuffmanTreeGroupAlloc) max_fb_size = 256 * (630 + 1080 + 920) * 2; // 1.3MB diff --git a/cpp/src/utilities/cuda.cpp b/cpp/src/utilities/cuda.cpp new file mode 100644 index 00000000000..53ca0608170 --- /dev/null +++ b/cpp/src/utilities/cuda.cpp @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf/detail/utilities/cuda.hpp>
+#include <cudf/types.hpp>
+#include <cudf/utilities/error.hpp>
+
+#include <cuda_runtime.h>
+
+namespace cudf::detail {
+
+cudf::size_type num_multiprocessors()
+{
+  int device = 0;
+  CUDF_CUDA_TRY(cudaGetDevice(&device));
+  int num_sms = 0;
+  CUDF_CUDA_TRY(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, device));
+  return num_sms;
+}
+
+}  // namespace cudf::detail