From 4727200be88c2dc967c9bf76655cd661b41f4abc Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet"
Date: Mon, 15 Nov 2021 18:17:18 -0500
Subject: [PATCH 1/2] Removing conflict w/ CUDA_CHECK and other projects by
 conditionally defining macro. Also adding new raft-scoped macros

---
 cpp/include/raft/cudart_utils.h | 445 ++++++++++++++++----------------
 1 file changed, 229 insertions(+), 216 deletions(-)

diff --git a/cpp/include/raft/cudart_utils.h b/cpp/include/raft/cudart_utils.h
index 85ca310530..73a3264c8a 100644
--- a/cpp/include/raft/cudart_utils.h
+++ b/cpp/include/raft/cudart_utils.h
@@ -38,10 +38,10 @@ namespace raft {
 /**
  * @brief Exception thrown when a CUDA error is encountered.
  */
-struct cuda_error : public raft::exception {
-  explicit cuda_error(char const* const message) : raft::exception(message) {}
-  explicit cuda_error(std::string const& message) : raft::exception(message) {}
-};
+ struct cuda_error : public raft::exception {
+   explicit cuda_error(char const* const message) : raft::exception(message) {}
+   explicit cuda_error(std::string const& message) : raft::exception(message) {}
+ };
 
 }  // namespace raft
 
@@ -53,6 +53,7 @@ struct cuda_error : public raft::exception {
  * exception detailing the CUDA error that occurred
  *
  */
+#ifndef CUDA_TRY
 #define CUDA_TRY(call)                   \
   do {                                   \
     cudaError_t const status = call;     \
@@ -65,7 +66,7 @@ struct cuda_error : public raft::exception {
       throw raft::cuda_error(msg);       \
     }                                    \
   } while (0)
-
+#endif
 /**
  * @brief Debug macro to check for CUDA errors
  *
@@ -86,13 +87,16 @@ struct cuda_error : public raft::exception {
 #endif
 
 /** FIXME: temporary alias for cuML compatibility */
+#ifndef CUDA_CHECK
 #define CUDA_CHECK(call) CUDA_TRY(call)
+#endif
 
 ///@todo: enable this only after we have added logging support in raft
 // /**
 //  * @brief check for cuda runtime API errors but log error instead of raising
 //  *        exception.
 //  */
+#ifndef CUDA_CHECK_NO_THROW
 #define CUDA_CHECK_NO_THROW(call)                                        \
   do {                                                                   \
     cudaError_t const status = call;                                     \
@@ -101,99 +105,108 @@
     if (cudaSuccess != status) {                                         \
       printf("CUDA call='%s' at file=%s line=%d failed with %s ", #call, \
              __FILE__, __LINE__, cudaGetErrorString(status));            \
     }                                                                    \
   } while (0)
+#endif
+
+/**
+ * Alias to raft scope for now.
+ * TODO: Rename original implementations in 22.04 to fix
+ * https://github.com/rapidsai/raft/issues/128
+ */
+#define RAFT_CUDA_CHECK(call) CUDA_CHECK(call)
+#define RAFT_CUDA_CHECK_NO_THROW(call) CUDA_CHECK_NO_THROW(call)
 
 namespace raft {
 
 /** Helper method to get the warp size in device code */
-__host__ __device__ constexpr inline int warp_size() { return 32; }
+ __host__ __device__ constexpr inline int warp_size() { return 32; }
 
-__host__ __device__ constexpr inline unsigned int warp_full_mask() {
-  return 0xffffffff;
-}
+ __host__ __device__ constexpr inline unsigned int warp_full_mask() {
+   return 0xffffffff;
+ }
 
 /**
  * @brief A kernel grid configuration construction gadget for simple one-dimensional mapping
  * elements to threads.
*/ -class grid_1d_thread_t { - public: - int const block_size{0}; - int const num_blocks{0}; - - /** - * @param overall_num_elements The number of elements the kernel needs to handle/process - * @param num_threads_per_block The grid block size, determined according to the kernel's - * specific features (amount of shared memory necessary, SM functional units use pattern etc.); - * this can't be determined generically/automatically (as opposed to the number of blocks) - * @param elements_per_thread Typically, a single kernel thread processes more than a single - * element; this affects the number of threads the grid must contain - */ - grid_1d_thread_t(size_t overall_num_elements, size_t num_threads_per_block, - size_t max_num_blocks_1d, size_t elements_per_thread = 1) - : block_size(num_threads_per_block), - num_blocks(std::min((overall_num_elements + - (elements_per_thread * num_threads_per_block) - 1) / - (elements_per_thread * num_threads_per_block), - max_num_blocks_1d)) { - RAFT_EXPECTS(overall_num_elements > 0, "overall_num_elements must be > 0"); - RAFT_EXPECTS(num_threads_per_block / warp_size() > 0, - "num_threads_per_block / warp_size() must be > 0"); - RAFT_EXPECTS(elements_per_thread > 0, "elements_per_thread must be > 0"); - } -}; + class grid_1d_thread_t { + public: + int const block_size{0}; + int const num_blocks{0}; + + /** + * @param overall_num_elements The number of elements the kernel needs to handle/process + * @param num_threads_per_block The grid block size, determined according to the kernel's + * specific features (amount of shared memory necessary, SM functional units use pattern etc.); + * this can't be determined generically/automatically (as opposed to the number of blocks) + * @param elements_per_thread Typically, a single kernel thread processes more than a single + * element; this affects the number of threads the grid must contain + */ + grid_1d_thread_t(size_t overall_num_elements, size_t num_threads_per_block, + size_t max_num_blocks_1d, size_t elements_per_thread = 1) + : block_size(num_threads_per_block), + num_blocks(std::min((overall_num_elements + + (elements_per_thread * num_threads_per_block) - 1) / + (elements_per_thread * num_threads_per_block), + max_num_blocks_1d)) { + RAFT_EXPECTS(overall_num_elements > 0, "overall_num_elements must be > 0"); + RAFT_EXPECTS(num_threads_per_block / warp_size() > 0, + "num_threads_per_block / warp_size() must be > 0"); + RAFT_EXPECTS(elements_per_thread > 0, "elements_per_thread must be > 0"); + } + }; /** * @brief A kernel grid configuration construction gadget for simple one-dimensional mapping * elements to warps. 
*/ -class grid_1d_warp_t { - public: - int const block_size{0}; - int const num_blocks{0}; - - /** - * @param overall_num_elements The number of elements the kernel needs to handle/process - * @param num_threads_per_block The grid block size, determined according to the kernel's - * specific features (amount of shared memory necessary, SM functional units use pattern etc.); - * this can't be determined generically/automatically (as opposed to the number of blocks) - */ - grid_1d_warp_t(size_t overall_num_elements, size_t num_threads_per_block, - size_t max_num_blocks_1d) - : block_size(num_threads_per_block), - num_blocks(std::min( - (overall_num_elements + (num_threads_per_block / warp_size()) - 1) / - (num_threads_per_block / warp_size()), - max_num_blocks_1d)) { - RAFT_EXPECTS(overall_num_elements > 0, "overall_num_elements must be > 0"); - RAFT_EXPECTS(num_threads_per_block / warp_size() > 0, - "num_threads_per_block / warp_size() must be > 0"); - } -}; + class grid_1d_warp_t { + public: + int const block_size{0}; + int const num_blocks{0}; + + /** + * @param overall_num_elements The number of elements the kernel needs to handle/process + * @param num_threads_per_block The grid block size, determined according to the kernel's + * specific features (amount of shared memory necessary, SM functional units use pattern etc.); + * this can't be determined generically/automatically (as opposed to the number of blocks) + */ + grid_1d_warp_t(size_t overall_num_elements, size_t num_threads_per_block, + size_t max_num_blocks_1d) + : block_size(num_threads_per_block), + num_blocks(std::min( + (overall_num_elements + (num_threads_per_block / warp_size()) - 1) / + (num_threads_per_block / warp_size()), + max_num_blocks_1d)) { + RAFT_EXPECTS(overall_num_elements > 0, "overall_num_elements must be > 0"); + RAFT_EXPECTS(num_threads_per_block / warp_size() > 0, + "num_threads_per_block / warp_size() must be > 0"); + } + }; /** * @brief A kernel grid configuration construction gadget for simple one-dimensional mapping * elements to blocks. 
 */
-class grid_1d_block_t {
- public:
-  int const block_size{0};
-  int const num_blocks{0};
-
-  /**
-   * @param overall_num_elements The number of elements the kernel needs to handle/process
-   * @param num_threads_per_block The grid block size, determined according to the kernel's
-   * specific features (amount of shared memory necessary, SM functional units use pattern etc.);
-   * this can't be determined generically/automatically (as opposed to the number of blocks)
-   */
-  grid_1d_block_t(size_t overall_num_elements, size_t num_threads_per_block,
-                  size_t max_num_blocks_1d)
-    : block_size(num_threads_per_block),
-      num_blocks(std::min(overall_num_elements, max_num_blocks_1d)) {
-    RAFT_EXPECTS(overall_num_elements > 0, "overall_num_elements must be > 0");
-    RAFT_EXPECTS(num_threads_per_block / warp_size() > 0,
-                 "num_threads_per_block / warp_size() must be > 0");
-  }
-};
+ class grid_1d_block_t {
+  public:
+   int const block_size{0};
+   int const num_blocks{0};
+
+   /**
+    * @param overall_num_elements The number of elements the kernel needs to handle/process
+    * @param num_threads_per_block The grid block size, determined according to the kernel's
+    * specific features (amount of shared memory necessary, SM functional units use pattern etc.);
+    * this can't be determined generically/automatically (as opposed to the number of blocks)
+    */
+   grid_1d_block_t(size_t overall_num_elements, size_t num_threads_per_block,
+                   size_t max_num_blocks_1d)
+     : block_size(num_threads_per_block),
+       num_blocks(std::min(overall_num_elements, max_num_blocks_1d)) {
+     RAFT_EXPECTS(overall_num_elements > 0, "overall_num_elements must be > 0");
+     RAFT_EXPECTS(num_threads_per_block / warp_size() > 0,
+                  "num_threads_per_block / warp_size() must be > 0");
+   }
+ };
 
 /**
  * @brief Generic copy method for all kinds of transfers
@@ -203,12 +216,12 @@ class grid_1d_block_t {
  * @param dst destination pointer
  * @param src source pointer
  * @param len length of the src/dst buffers in terms of number of elements
  * @param stream cuda stream
  */
-template <typename Type>
-void copy(Type* dst, const Type* src, size_t len,
-          rmm::cuda_stream_view stream) {
-  CUDA_CHECK(
-    cudaMemcpyAsync(dst, src, len * sizeof(Type), cudaMemcpyDefault, stream));
-}
+ template <typename Type>
+ void copy(Type* dst, const Type* src, size_t len,
+           rmm::cuda_stream_view stream) {
+   CUDA_CHECK(
+     cudaMemcpyAsync(dst, src, len * sizeof(Type), cudaMemcpyDefault, stream));
+ }
 
 /**
  * @defgroup Copy Copy methods
  * @{
  */
 /** performs a host to device copy */
-template <typename Type>
-void update_device(Type* d_ptr, const Type* h_ptr, size_t len,
-                   rmm::cuda_stream_view stream) {
-  copy(d_ptr, h_ptr, len, stream);
-}
+ template <typename Type>
+ void update_device(Type* d_ptr, const Type* h_ptr, size_t len,
+                    rmm::cuda_stream_view stream) {
+   copy(d_ptr, h_ptr, len, stream);
+ }
 
 /** performs a device to host copy */
-template <typename Type>
-void update_host(Type* h_ptr, const Type* d_ptr, size_t len,
-                 rmm::cuda_stream_view stream) {
-  copy(h_ptr, d_ptr, len, stream);
-}
-
-template <typename Type>
-void copy_async(Type* d_ptr1, const Type* d_ptr2, size_t len,
-                rmm::cuda_stream_view stream) {
-  CUDA_CHECK(cudaMemcpyAsync(d_ptr1, d_ptr2, len * sizeof(Type),
-                             cudaMemcpyDeviceToDevice, stream));
-}
+ template <typename Type>
+ void update_host(Type* h_ptr, const Type* d_ptr, size_t len,
+                  rmm::cuda_stream_view stream) {
+   copy(h_ptr, d_ptr, len, stream);
+ }
+
+ template <typename Type>
+ void copy_async(Type* d_ptr1, const Type* d_ptr2, size_t len,
+                 rmm::cuda_stream_view stream) {
+   CUDA_CHECK(cudaMemcpyAsync(d_ptr1, d_ptr2, len * sizeof(Type),
+                              cudaMemcpyDeviceToDevice, stream));
+ }
 
 /** @} */
 
 /**
  * @defgroup Debug Utils for debugging host/device buffers
  * @{
  */
-template <class T, class OutStream>
-void print_host_vector(const char* variable_name, const T* host_mem,
-                       size_t componentsCount, OutStream& out) {
-  out << variable_name << "=[";
-  for (size_t i = 0; i < componentsCount; ++i) {
-    if (i != 0) out << ",";
-    out << host_mem[i];
-  }
-  out << "];\n";
-}
-
-template <class T, class OutStream>
-void print_device_vector(const char* variable_name, const T* devMem,
-                         size_t componentsCount, OutStream& out) {
-  T* host_mem = new T[componentsCount];
-  CUDA_CHECK(cudaMemcpy(host_mem, devMem, componentsCount * sizeof(T),
-                        cudaMemcpyDeviceToHost));
-  print_host_vector(variable_name, host_mem, componentsCount, out);
-  delete[] host_mem;
-}
+ template <class T, class OutStream>
+ void print_host_vector(const char* variable_name, const T* host_mem,
+                        size_t componentsCount, OutStream& out) {
+   out << variable_name << "=[";
+   for (size_t i = 0; i < componentsCount; ++i) {
+     if (i != 0) out << ",";
+     out << host_mem[i];
+   }
+   out << "];\n";
+ }
+
+ template <class T, class OutStream>
+ void print_device_vector(const char* variable_name, const T* devMem,
+                          size_t componentsCount, OutStream& out) {
+   T* host_mem = new T[componentsCount];
+   CUDA_CHECK(cudaMemcpy(host_mem, devMem, componentsCount * sizeof(T),
+                         cudaMemcpyDeviceToHost));
+   print_host_vector(variable_name, host_mem, componentsCount, out);
+   delete[] host_mem;
+ }
 /** @} */
 
-static std::mutex mutex_;
-static std::unordered_map<void*, size_t> allocations;
-
-template <typename Type>
-void allocate(Type*& ptr, size_t len, rmm::cuda_stream_view stream,
-              bool setZero = false) {
-  size_t size = len * sizeof(Type);
-  ptr = (Type*)rmm::mr::get_current_device_resource()->allocate(size, stream);
-  if (setZero) CUDA_CHECK(cudaMemsetAsync((void*)ptr, 0, size, stream));
-
-  std::lock_guard<std::mutex> _(mutex_);
-  allocations[ptr] = size;
-}
-
-template <typename Type>
-void deallocate(Type*& ptr, rmm::cuda_stream_view stream) {
-  std::lock_guard<std::mutex> _(mutex_);
-  size_t size = allocations[ptr];
-  allocations.erase(ptr);
-  rmm::mr::get_current_device_resource()->deallocate((void*)ptr, size, stream);
-}
-
-inline void deallocate_all(rmm::cuda_stream_view stream) {
-  std::lock_guard<std::mutex> _(mutex_);
-  for (auto& alloc : allocations) {
-    void* ptr = alloc.first;
-    size_t size = alloc.second;
-    rmm::mr::get_current_device_resource()->deallocate(ptr, size, stream);
-  }
-  allocations.clear();
-}
+ static std::mutex mutex_;
+ static std::unordered_map<void*, size_t> allocations;
+
+ template <typename Type>
+ void allocate(Type*& ptr, size_t len, rmm::cuda_stream_view stream,
+               bool setZero = false) {
+   size_t size = len * sizeof(Type);
+   ptr = (Type*)rmm::mr::get_current_device_resource()->allocate(size, stream);
+   if (setZero) CUDA_CHECK(cudaMemsetAsync((void*)ptr, 0, size, stream));
+
+   std::lock_guard<std::mutex> _(mutex_);
+   allocations[ptr] = size;
+ }
+
+ template <typename Type>
+ void deallocate(Type*& ptr, rmm::cuda_stream_view stream) {
+   std::lock_guard<std::mutex> _(mutex_);
+   size_t size = allocations[ptr];
+   allocations.erase(ptr);
+   rmm::mr::get_current_device_resource()->deallocate((void*)ptr, size, stream);
+ }
+
+ inline void deallocate_all(rmm::cuda_stream_view stream) {
+   std::lock_guard<std::mutex> _(mutex_);
+   for (auto& alloc : allocations) {
+     void* ptr = alloc.first;
+     size_t size = alloc.second;
+     rmm::mr::get_current_device_resource()->deallocate(ptr, size, stream);
+   }
+   allocations.clear();
+ }
 
 /** helper method to get max usable shared mem per block parameter */
-inline int getSharedMemPerBlock() {
-  int devId;
-  CUDA_CHECK(cudaGetDevice(&devId));
-  int smemPerBlk;
-  CUDA_CHECK(cudaDeviceGetAttribute(&smemPerBlk,
-                                    cudaDevAttrMaxSharedMemoryPerBlock, devId));
-  return smemPerBlk;
-}
+ inline int getSharedMemPerBlock() {
+   int devId;
+   CUDA_CHECK(cudaGetDevice(&devId));
+   int smemPerBlk;
+   CUDA_CHECK(cudaDeviceGetAttribute(&smemPerBlk,
+                                     cudaDevAttrMaxSharedMemoryPerBlock, devId));
+   return smemPerBlk;
+ }
 
 /** helper method to get multi-processor count parameter */
-inline int getMultiProcessorCount() {
-  int devId;
-  CUDA_CHECK(cudaGetDevice(&devId));
-  int mpCount;
-  CUDA_CHECK(
-    cudaDeviceGetAttribute(&mpCount, cudaDevAttrMultiProcessorCount, devId));
-  return mpCount;
-}
+ inline int getMultiProcessorCount() {
+   int devId;
+   CUDA_CHECK(cudaGetDevice(&devId));
+   int mpCount;
+   CUDA_CHECK(
+     cudaDeviceGetAttribute(&mpCount, cudaDevAttrMultiProcessorCount, devId));
+   return mpCount;
+ }
 
 /** helper method to convert an array on device to a string on host */
-template <typename T>
-std::string arr2Str(const T* arr, int size, std::string name,
-                    cudaStream_t stream, int width = 4) {
-  std::stringstream ss;
-
-  T* arr_h = (T*)malloc(size * sizeof(T));
-  update_host(arr_h, arr, size, stream);
-  CUDA_CHECK(cudaStreamSynchronize(stream));
-
-  ss << name << " = [ ";
-  for (int i = 0; i < size; i++) {
-    ss << std::setw(width) << arr_h[i];
-
-    if (i < size - 1) ss << ", ";
-  }
-  ss << " ]" << std::endl;
-
-  free(arr_h);
-
-  return ss.str();
-}
+ template <typename T>
+ std::string arr2Str(const T* arr, int size, std::string name,
+                     cudaStream_t stream, int width = 4) {
+   std::stringstream ss;
+
+   T* arr_h = (T*)malloc(size * sizeof(T));
+   update_host(arr_h, arr, size, stream);
+   CUDA_CHECK(cudaStreamSynchronize(stream));
+
+   ss << name << " = [ ";
+   for (int i = 0; i < size; i++) {
+     ss << std::setw(width) << arr_h[i];
+
+     if (i < size - 1) ss << ", ";
+   }
+   ss << " ]" << std::endl;
+
+   free(arr_h);
+
+   return ss.str();
+ }
 
 /** this seems to be unused, but may be useful in the future */
-template <typename T>
-void ASSERT_DEVICE_MEM(T* ptr, std::string name) {
-  cudaPointerAttributes s_att;
-  cudaError_t s_err = cudaPointerGetAttributes(&s_att, ptr);
-
-  if (s_err != 0 || s_att.device == -1)
-    std::cout << "Invalid device pointer encountered in " << name
-              << ". device=" << s_att.device << ", err=" << s_err << std::endl;
-}
-
-inline uint32_t curTimeMillis() {
-  auto now = std::chrono::high_resolution_clock::now();
-  auto duration = now.time_since_epoch();
-  return std::chrono::duration_cast<std::chrono::milliseconds>(duration)
-    .count();
-}
+ template <typename T>
+ void ASSERT_DEVICE_MEM(T* ptr, std::string name) {
+   cudaPointerAttributes s_att;
+   cudaError_t s_err = cudaPointerGetAttributes(&s_att, ptr);
+
+   if (s_err != 0 || s_att.device == -1)
+     std::cout << "Invalid device pointer encountered in " << name
+               << ". device=" << s_att.device << ", err=" << s_err << std::endl;
+ }
+
+ inline uint32_t curTimeMillis() {
+   auto now = std::chrono::high_resolution_clock::now();
+   auto duration = now.time_since_epoch();
+   return std::chrono::duration_cast<std::chrono::milliseconds>(duration)
+     .count();
+ }
 
 /** Helper function to calculate the memory needed to store a dense matrix.
  * @param rows number of rows in matrix
  * @param columns number of columns in matrix
  * @return number of items to allocate via allocate()
  * @sa allocate()
  */
-inline size_t allocLengthForMatrix(size_t rows, size_t columns) {
-  return rows * columns;
-}
+ inline size_t allocLengthForMatrix(size_t rows, size_t columns) {
+   return rows * columns;
+ }
 
 /** Helper function to check alignment of pointer.
  * @param ptr the pointer to check
 * @param alignment to be checked for
 * @return true if address in bytes is a multiple of alignment
 */
-template <typename Type>
-bool is_aligned(Type* ptr, size_t alignment) {
-  return reinterpret_cast<uintptr_t>(ptr) % alignment == 0;
-}
+ template <typename Type>
+ bool is_aligned(Type* ptr, size_t alignment) {
+   return reinterpret_cast<uintptr_t>(ptr) % alignment == 0;
+ }
 
 /** calculate greatest common divisor of two numbers
 * @param a integer
 * @param b integer
 * @return gcd of a and b
 */
-template <typename IntType>
-IntType gcd(IntType a, IntType b) {
-  while (b != 0) {
-    IntType tmp = b;
-    b = a % b;
-    a = tmp;
-  }
-  return a;
-}
+ template <typename IntType>
+ IntType gcd(IntType a, IntType b) {
+   while (b != 0) {
+     IntType tmp = b;
+     b = a % b;
+     a = tmp;
+   }
+   return a;
+ }
 
 }  // namespace raft
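For context on this first patch: the #ifndef guards turn raft's unprefixed macros into fallback definitions instead of hard redefinitions, so a project that already defines CUDA_CHECK can include this header cleanly. A minimal sketch of that include-order scenario (the downstream translation unit and its macro body below are hypothetical, for illustration only):

    // Hypothetical downstream translation unit that defines CUDA_CHECK first.
    #include <cstdio>
    #include <cuda_runtime.h>

    #define CUDA_CHECK(call)                                                \
      do {                                                                  \
        cudaError_t const st = (call);                                      \
        if (st != cudaSuccess) {                                            \
          std::fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(st)); \
        }                                                                   \
      } while (0)

    #include <raft/cudart_utils.h>
    // Before this patch, raft would redefine CUDA_CHECK here and the two
    // macro bodies would clash. With the guard, the downstream definition
    // stays in effect, while RAFT_CUDA_CHECK(...) forwards to whichever
    // CUDA_CHECK is in scope.

One caveat: because RAFT_CUDA_CHECK is only an alias for CUDA_CHECK, a consumer-supplied CUDA_CHECK also changes raft's own error handling in that translation unit; the TODO referencing issue #128 above tracks renaming the underlying implementations in 22.04 so the RAFT_-prefixed macros stand on their own.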
From 7d44aba8a825c77ca5a80042d5bfc59834ad81fc Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet"
Date: Mon, 15 Nov 2021 18:20:15 -0500
Subject: [PATCH 2/2] Updating style

---
 cpp/include/raft/cudart_utils.h | 390 ++++++++++++++++----------------
 1 file changed, 195 insertions(+), 195 deletions(-)

diff --git a/cpp/include/raft/cudart_utils.h b/cpp/include/raft/cudart_utils.h
index 73a3264c8a..486103dedb 100644
--- a/cpp/include/raft/cudart_utils.h
+++ b/cpp/include/raft/cudart_utils.h
@@ -38,10 +38,10 @@ namespace raft {
 /**
  * @brief Exception thrown when a CUDA error is encountered.
  */
- struct cuda_error : public raft::exception {
-   explicit cuda_error(char const* const message) : raft::exception(message) {}
-   explicit cuda_error(std::string const& message) : raft::exception(message) {}
- };
+struct cuda_error : public raft::exception {
+  explicit cuda_error(char const* const message) : raft::exception(message) {}
+  explicit cuda_error(std::string const& message) : raft::exception(message) {}
+};
 
 }  // namespace raft
 
@@ -118,22 +118,22 @@ namespace raft {
 /** Helper method to get the warp size in device code */
- __host__ __device__ constexpr inline int warp_size() { return 32; }
+__host__ __device__ constexpr inline int warp_size() { return 32; }
 
- __host__ __device__ constexpr inline unsigned int warp_full_mask() {
-   return 0xffffffff;
- }
+__host__ __device__ constexpr inline unsigned int warp_full_mask() {
+  return 0xffffffff;
+}
 
 /**
  * @brief A kernel grid configuration construction gadget for simple one-dimensional mapping
  * elements to threads.
*/ - class grid_1d_thread_t { - public: - int const block_size{0}; - int const num_blocks{0}; +class grid_1d_thread_t { + public: + int const block_size{0}; + int const num_blocks{0}; - /** + /** * @param overall_num_elements The number of elements the kernel needs to handle/process * @param num_threads_per_block The grid block size, determined according to the kernel's * specific features (amount of shared memory necessary, SM functional units use pattern etc.); @@ -141,72 +141,72 @@ namespace raft { * @param elements_per_thread Typically, a single kernel thread processes more than a single * element; this affects the number of threads the grid must contain */ - grid_1d_thread_t(size_t overall_num_elements, size_t num_threads_per_block, - size_t max_num_blocks_1d, size_t elements_per_thread = 1) - : block_size(num_threads_per_block), - num_blocks(std::min((overall_num_elements + - (elements_per_thread * num_threads_per_block) - 1) / - (elements_per_thread * num_threads_per_block), - max_num_blocks_1d)) { - RAFT_EXPECTS(overall_num_elements > 0, "overall_num_elements must be > 0"); - RAFT_EXPECTS(num_threads_per_block / warp_size() > 0, - "num_threads_per_block / warp_size() must be > 0"); - RAFT_EXPECTS(elements_per_thread > 0, "elements_per_thread must be > 0"); - } - }; + grid_1d_thread_t(size_t overall_num_elements, size_t num_threads_per_block, + size_t max_num_blocks_1d, size_t elements_per_thread = 1) + : block_size(num_threads_per_block), + num_blocks(std::min((overall_num_elements + + (elements_per_thread * num_threads_per_block) - 1) / + (elements_per_thread * num_threads_per_block), + max_num_blocks_1d)) { + RAFT_EXPECTS(overall_num_elements > 0, "overall_num_elements must be > 0"); + RAFT_EXPECTS(num_threads_per_block / warp_size() > 0, + "num_threads_per_block / warp_size() must be > 0"); + RAFT_EXPECTS(elements_per_thread > 0, "elements_per_thread must be > 0"); + } +}; /** * @brief A kernel grid configuration construction gadget for simple one-dimensional mapping * elements to warps. 
 */
- class grid_1d_warp_t {
-  public:
-   int const block_size{0};
-   int const num_blocks{0};
+class grid_1d_warp_t {
+ public:
+  int const block_size{0};
+  int const num_blocks{0};
 
-   /**
+  /**
   * @param overall_num_elements The number of elements the kernel needs to handle/process
   * @param num_threads_per_block The grid block size, determined according to the kernel's
   * specific features (amount of shared memory necessary, SM functional units use pattern etc.);
   * this can't be determined generically/automatically (as opposed to the number of blocks)
   */
-   grid_1d_warp_t(size_t overall_num_elements, size_t num_threads_per_block,
-                  size_t max_num_blocks_1d)
-     : block_size(num_threads_per_block),
-       num_blocks(std::min(
-         (overall_num_elements + (num_threads_per_block / warp_size()) - 1) /
-           (num_threads_per_block / warp_size()),
-         max_num_blocks_1d)) {
-     RAFT_EXPECTS(overall_num_elements > 0, "overall_num_elements must be > 0");
-     RAFT_EXPECTS(num_threads_per_block / warp_size() > 0,
-                  "num_threads_per_block / warp_size() must be > 0");
-   }
- };
+  grid_1d_warp_t(size_t overall_num_elements, size_t num_threads_per_block,
+                 size_t max_num_blocks_1d)
+    : block_size(num_threads_per_block),
+      num_blocks(std::min(
+        (overall_num_elements + (num_threads_per_block / warp_size()) - 1) /
+          (num_threads_per_block / warp_size()),
+        max_num_blocks_1d)) {
+    RAFT_EXPECTS(overall_num_elements > 0, "overall_num_elements must be > 0");
+    RAFT_EXPECTS(num_threads_per_block / warp_size() > 0,
+                 "num_threads_per_block / warp_size() must be > 0");
+  }
+};
 
 /**
  * @brief A kernel grid configuration construction gadget for simple one-dimensional mapping
  * elements to blocks.
  */
- class grid_1d_block_t {
-  public:
-   int const block_size{0};
-   int const num_blocks{0};
+class grid_1d_block_t {
+ public:
+  int const block_size{0};
+  int const num_blocks{0};
 
-   /**
+  /**
   * @param overall_num_elements The number of elements the kernel needs to handle/process
   * @param num_threads_per_block The grid block size, determined according to the kernel's
   * specific features (amount of shared memory necessary, SM functional units use pattern etc.);
   * this can't be determined generically/automatically (as opposed to the number of blocks)
   */
-   grid_1d_block_t(size_t overall_num_elements, size_t num_threads_per_block,
-                   size_t max_num_blocks_1d)
-     : block_size(num_threads_per_block),
-       num_blocks(std::min(overall_num_elements, max_num_blocks_1d)) {
-     RAFT_EXPECTS(overall_num_elements > 0, "overall_num_elements must be > 0");
-     RAFT_EXPECTS(num_threads_per_block / warp_size() > 0,
-                  "num_threads_per_block / warp_size() must be > 0");
-   }
- };
+  grid_1d_block_t(size_t overall_num_elements, size_t num_threads_per_block,
+                  size_t max_num_blocks_1d)
+    : block_size(num_threads_per_block),
+      num_blocks(std::min(overall_num_elements, max_num_blocks_1d)) {
+    RAFT_EXPECTS(overall_num_elements > 0, "overall_num_elements must be > 0");
+    RAFT_EXPECTS(num_threads_per_block / warp_size() > 0,
+                 "num_threads_per_block / warp_size() must be > 0");
+  }
+};
 
 /**
  * @brief Generic copy method for all kinds of transfers
@@ -216,12 +216,12 @@ namespace raft {
  * @param dst destination pointer
  * @param src source pointer
  * @param len length of the src/dst buffers in terms of number of elements
  * @param stream cuda stream
  */
- template <typename Type>
- void copy(Type* dst, const Type* src, size_t len,
-           rmm::cuda_stream_view stream) {
-   CUDA_CHECK(
-     cudaMemcpyAsync(dst, src, len * sizeof(Type), cudaMemcpyDefault, stream));
- }
+template <typename Type>
+void copy(Type* dst, const Type* src, size_t len,
+          rmm::cuda_stream_view stream) {
+  CUDA_CHECK(
+    cudaMemcpyAsync(dst, src, len * sizeof(Type), cudaMemcpyDefault, stream));
+}
 
 /**
  * @defgroup Copy Copy methods
  * @{
  */
 /** performs a host to device copy */
- template <typename Type>
- void update_device(Type* d_ptr, const Type* h_ptr, size_t len,
-                    rmm::cuda_stream_view stream) {
-   copy(d_ptr, h_ptr, len, stream);
- }
+template <typename Type>
+void update_device(Type* d_ptr, const Type* h_ptr, size_t len,
+                   rmm::cuda_stream_view stream) {
+  copy(d_ptr, h_ptr, len, stream);
+}
 
 /** performs a device to host copy */
- template <typename Type>
- void update_host(Type* h_ptr, const Type* d_ptr, size_t len,
-                  rmm::cuda_stream_view stream) {
-   copy(h_ptr, d_ptr, len, stream);
- }
-
- template <typename Type>
- void copy_async(Type* d_ptr1, const Type* d_ptr2, size_t len,
-                 rmm::cuda_stream_view stream) {
-   CUDA_CHECK(cudaMemcpyAsync(d_ptr1, d_ptr2, len * sizeof(Type),
-                              cudaMemcpyDeviceToDevice, stream));
- }
+template <typename Type>
+void update_host(Type* h_ptr, const Type* d_ptr, size_t len,
+                 rmm::cuda_stream_view stream) {
+  copy(h_ptr, d_ptr, len, stream);
+}
+
+template <typename Type>
+void copy_async(Type* d_ptr1, const Type* d_ptr2, size_t len,
+                rmm::cuda_stream_view stream) {
+  CUDA_CHECK(cudaMemcpyAsync(d_ptr1, d_ptr2, len * sizeof(Type),
+                             cudaMemcpyDeviceToDevice, stream));
+}
 /** @} */
 
 /**
  * @defgroup Debug Utils for debugging host/device buffers
  * @{
  */
- template <class T, class OutStream>
- void print_host_vector(const char* variable_name, const T* host_mem,
-                        size_t componentsCount, OutStream& out) {
-   out << variable_name << "=[";
-   for (size_t i = 0; i < componentsCount; ++i) {
-     if (i != 0) out << ",";
-     out << host_mem[i];
-   }
-   out << "];\n";
- }
-
- template <class T, class OutStream>
- void print_device_vector(const char* variable_name, const T* devMem,
-                          size_t componentsCount, OutStream& out) {
-   T* host_mem = new T[componentsCount];
-   CUDA_CHECK(cudaMemcpy(host_mem, devMem, componentsCount * sizeof(T),
-                         cudaMemcpyDeviceToHost));
-   print_host_vector(variable_name, host_mem, componentsCount, out);
-   delete[] host_mem;
- }
+template <class T, class OutStream>
+void print_host_vector(const char* variable_name, const T* host_mem,
+                       size_t componentsCount, OutStream& out) {
+  out << variable_name << "=[";
+  for (size_t i = 0; i < componentsCount; ++i) {
+    if (i != 0) out << ",";
+    out << host_mem[i];
+  }
+  out << "];\n";
+}
+
+template <class T, class OutStream>
+void print_device_vector(const char* variable_name, const T* devMem,
+                         size_t componentsCount, OutStream& out) {
+  T* host_mem = new T[componentsCount];
+  CUDA_CHECK(cudaMemcpy(host_mem, devMem, componentsCount * sizeof(T),
+                        cudaMemcpyDeviceToHost));
+  print_host_vector(variable_name, host_mem, componentsCount, out);
+  delete[] host_mem;
+}
 /** @} */
 
- static std::mutex mutex_;
- static std::unordered_map<void*, size_t> allocations;
-
- template <typename Type>
- void allocate(Type*& ptr, size_t len, rmm::cuda_stream_view stream,
-               bool setZero = false) {
-   size_t size = len * sizeof(Type);
-   ptr = (Type*)rmm::mr::get_current_device_resource()->allocate(size, stream);
-   if (setZero) CUDA_CHECK(cudaMemsetAsync((void*)ptr, 0, size, stream));
-
-   std::lock_guard<std::mutex> _(mutex_);
-   allocations[ptr] = size;
- }
-
- template <typename Type>
- void deallocate(Type*& ptr, rmm::cuda_stream_view stream) {
-   std::lock_guard<std::mutex> _(mutex_);
-   size_t size = allocations[ptr];
-   allocations.erase(ptr);
-   rmm::mr::get_current_device_resource()->deallocate((void*)ptr, size, stream);
- }
-
- inline void deallocate_all(rmm::cuda_stream_view stream) {
-   std::lock_guard<std::mutex> _(mutex_);
-   for (auto& alloc : allocations) {
-     void* ptr = alloc.first;
-     size_t size = alloc.second;
-     rmm::mr::get_current_device_resource()->deallocate(ptr, size, stream);
-   }
-   allocations.clear();
- }
+static std::mutex mutex_;
+static std::unordered_map<void*, size_t> allocations;
+
+template <typename Type>
+void allocate(Type*& ptr, size_t len, rmm::cuda_stream_view stream,
+              bool setZero = false) {
+  size_t size = len * sizeof(Type);
+  ptr = (Type*)rmm::mr::get_current_device_resource()->allocate(size, stream);
+  if (setZero) CUDA_CHECK(cudaMemsetAsync((void*)ptr, 0, size, stream));
+
+  std::lock_guard<std::mutex> _(mutex_);
+  allocations[ptr] = size;
+}
+
+template <typename Type>
+void deallocate(Type*& ptr, rmm::cuda_stream_view stream) {
+  std::lock_guard<std::mutex> _(mutex_);
+  size_t size = allocations[ptr];
+  allocations.erase(ptr);
+  rmm::mr::get_current_device_resource()->deallocate((void*)ptr, size, stream);
+}
+
+inline void deallocate_all(rmm::cuda_stream_view stream) {
+  std::lock_guard<std::mutex> _(mutex_);
+  for (auto& alloc : allocations) {
+    void* ptr = alloc.first;
+    size_t size = alloc.second;
+    rmm::mr::get_current_device_resource()->deallocate(ptr, size, stream);
+  }
+  allocations.clear();
+}
 
 /** helper method to get max usable shared mem per block parameter */
- inline int getSharedMemPerBlock() {
-   int devId;
-   CUDA_CHECK(cudaGetDevice(&devId));
-   int smemPerBlk;
-   CUDA_CHECK(cudaDeviceGetAttribute(&smemPerBlk,
-                                     cudaDevAttrMaxSharedMemoryPerBlock, devId));
-   return smemPerBlk;
- }
+inline int getSharedMemPerBlock() {
+  int devId;
+  CUDA_CHECK(cudaGetDevice(&devId));
+  int smemPerBlk;
+  CUDA_CHECK(cudaDeviceGetAttribute(&smemPerBlk,
+                                    cudaDevAttrMaxSharedMemoryPerBlock, devId));
+  return smemPerBlk;
+}
 
 /** helper method to get multi-processor count parameter */
- inline int getMultiProcessorCount() {
-   int devId;
-   CUDA_CHECK(cudaGetDevice(&devId));
-   int mpCount;
-   CUDA_CHECK(
-     cudaDeviceGetAttribute(&mpCount, cudaDevAttrMultiProcessorCount, devId));
-   return mpCount;
- }
+inline int getMultiProcessorCount() {
+  int devId;
+  CUDA_CHECK(cudaGetDevice(&devId));
+  int mpCount;
+  CUDA_CHECK(
+    cudaDeviceGetAttribute(&mpCount, cudaDevAttrMultiProcessorCount, devId));
+  return mpCount;
+}
 
 /** helper method to convert an array on device to a string on host */
- template <typename T>
- std::string arr2Str(const T* arr, int size, std::string name,
-                     cudaStream_t stream, int width = 4) {
-   std::stringstream ss;
-
-   T* arr_h = (T*)malloc(size * sizeof(T));
-   update_host(arr_h, arr, size, stream);
-   CUDA_CHECK(cudaStreamSynchronize(stream));
-
-   ss << name << " = [ ";
-   for (int i = 0; i < size; i++) {
-     ss << std::setw(width) << arr_h[i];
-
-     if (i < size - 1) ss << ", ";
-   }
-   ss << " ]" << std::endl;
-
-   free(arr_h);
-
-   return ss.str();
- }
+template <typename T>
+std::string arr2Str(const T* arr, int size, std::string name,
+                    cudaStream_t stream, int width = 4) {
+  std::stringstream ss;
+
+  T* arr_h = (T*)malloc(size * sizeof(T));
+  update_host(arr_h, arr, size, stream);
+  CUDA_CHECK(cudaStreamSynchronize(stream));
+
+  ss << name << " = [ ";
+  for (int i = 0; i < size; i++) {
+    ss << std::setw(width) << arr_h[i];
+
+    if (i < size - 1) ss << ", ";
+  }
+  ss << " ]" << std::endl;
+
+  free(arr_h);
+
+  return ss.str();
+}
 
 /** this seems to be unused, but may be useful in the future */
- template <typename T>
- void ASSERT_DEVICE_MEM(T* ptr, std::string name) {
-   cudaPointerAttributes s_att;
-   cudaError_t s_err = cudaPointerGetAttributes(&s_att, ptr);
-
-   if (s_err != 0 || s_att.device == -1)
-     std::cout << "Invalid device pointer encountered in " << name
device=" << s_att.device << ", err=" << s_err << std::endl; - } - - inline uint32_t curTimeMillis() { - auto now = std::chrono::high_resolution_clock::now(); - auto duration = now.time_since_epoch(); - return std::chrono::duration_cast(duration) - .count(); - } +template +void ASSERT_DEVICE_MEM(T* ptr, std::string name) { + cudaPointerAttributes s_att; + cudaError_t s_err = cudaPointerGetAttributes(&s_att, ptr); + + if (s_err != 0 || s_att.device == -1) + std::cout << "Invalid device pointer encountered in " << name + << ". device=" << s_att.device << ", err=" << s_err << std::endl; +} + +inline uint32_t curTimeMillis() { + auto now = std::chrono::high_resolution_clock::now(); + auto duration = now.time_since_epoch(); + return std::chrono::duration_cast(duration) + .count(); +} /** Helper function to calculate need memory for allocate to store dense matrix. * @param rows number of rows in matrix @@ -376,33 +376,33 @@ namespace raft { * @return need number of items to allocate via allocate() * @sa allocate() */ - inline size_t allocLengthForMatrix(size_t rows, size_t columns) { - return rows * columns; - } +inline size_t allocLengthForMatrix(size_t rows, size_t columns) { + return rows * columns; +} /** Helper function to check alignment of pointer. * @param ptr the pointer to check * @param alignment to be checked for * @return true if address in bytes is a multiple of alignment */ - template - bool is_aligned(Type* ptr, size_t alignment) { - return reinterpret_cast(ptr) % alignment == 0; - } +template +bool is_aligned(Type* ptr, size_t alignment) { + return reinterpret_cast(ptr) % alignment == 0; +} /** calculate greatest common divisor of two numbers * @a integer * @b integer * @ return gcd of a and b */ - template - IntType gcd(IntType a, IntType b) { - while (b != 0) { - IntType tmp = b; - b = a % b; - a = tmp; - } - return a; - } +template +IntType gcd(IntType a, IntType b) { + while (b != 0) { + IntType tmp = b; + b = a % b; + a = tmp; + } + return a; +} } // namespace raft