From d630156a99c2fb58c20267ab6641ad49d2ce6686 Mon Sep 17 00:00:00 2001 From: "Artem M. Chirkin" <9253178+achirkin@users.noreply.github.com> Date: Fri, 17 Dec 2021 19:52:35 +0100 Subject: [PATCH] Move NVTX range helpers to raft (#4445) Move NVTX range helpers to raft and extend them a little bit. Corresponding raft PR: https://github.com/rapidsai/raft/pull/416 . Authors: - Artem M. Chirkin (https://github.com/achirkin) Approvers: - Corey J. Nolet (https://github.com/cjnolet) URL: https://github.com/rapidsai/cuml/pull/4445 --- cpp/CMakeLists.txt | 5 +- cpp/cmake/thirdparty/get_raft.cmake | 7 +- cpp/src/arima/batched_arima.cu | 28 +-- cpp/src/arima/batched_kalman.cu | 14 +- cpp/src/common/nvtx.cu | 175 ------------------ cpp/src/common/nvtx.hpp | 26 ++- cpp/src/dbscan/dbscan.cuh | 5 +- cpp/src/dbscan/mergelabels/tree_reduction.cuh | 5 +- cpp/src/dbscan/runner.cuh | 36 ++-- .../batched-levelalgo/builder.cuh | 28 ++- .../batched-levelalgo/quantiles.cuh | 2 +- cpp/src/decisiontree/decisiontree.cuh | 2 +- cpp/src/glm/ols.cuh | 4 +- cpp/src/randomforest/randomforest.cu | 12 +- cpp/src/randomforest/randomforest.cuh | 8 +- cpp/src/svm/linear.cu | 5 +- cpp/src/umap/runner.cuh | 67 ++++--- cpp/src_prims/linalg/lstsq.cuh | 6 +- cpp/test/CMakeLists.txt | 1 - cpp/test/sg/nvtx_test.cpp | 48 ----- python/cuml/common/cuda.pyx | 33 +--- .../cuml/ensemble/randomforestclassifier.pyx | 17 +- .../cuml/ensemble/randomforestregressor.pyx | 17 +- python/cuml/tsa/batched_lbfgs.py | 98 +++++----- 24 files changed, 192 insertions(+), 457 deletions(-) delete mode 100644 cpp/src/common/nvtx.cu delete mode 100644 cpp/test/sg/nvtx_test.cpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 52e0b1bc3f..e3194c0e54 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -241,8 +241,7 @@ if(BUILD_CUML_CPP_LIBRARY) # common components add_library(${CUML_CPP_TARGET} SHARED - src/common/logger.cpp - src/common/nvtx.cu) + src/common/logger.cpp) # FIL components target_sources(${CUML_CPP_TARGET} @@ -367,7 +366,6 @@ if(BUILD_CUML_CPP_LIBRARY) target_compile_definitions(${CUML_CPP_TARGET} PUBLIC - $<$:NVTX_ENABLED> DISABLE_CUSPARSE_DEPRECATED PRIVATE CUML_CPP_API @@ -407,7 +405,6 @@ if(BUILD_CUML_CPP_LIBRARY) CUDA::cudart CUDA::cusparse GPUTreeShap::GPUTreeShap - $<$:CUDA::nvToolsExt> $<$:FAISS::FAISS> $,treelite::treelite_static,treelite::treelite> $,treelite::treelite_runtime_static,treelite::treelite_runtime> diff --git a/cpp/cmake/thirdparty/get_raft.cmake b/cpp/cmake/thirdparty/get_raft.cmake index 514d4b963b..c7282899ef 100644 --- a/cpp/cmake/thirdparty/get_raft.cmake +++ b/cpp/cmake/thirdparty/get_raft.cmake @@ -34,8 +34,9 @@ function(find_and_configure_raft) GIT_TAG ${PKG_PINNED_TAG} SOURCE_SUBDIR cpp OPTIONS - "BUILD_TESTS OFF" - ) + "BUILD_TESTS OFF" + "NVTX ${NVTX}" + ) if(raft_ADDED) message(VERBOSE "CUML: Using RAFT located in ${raft_SOURCE_DIR}") @@ -58,4 +59,4 @@ set(CUML_BRANCH_VERSION_raft "${CUML_VERSION_MAJOR}.${CUML_VERSION_MINOR}") find_and_configure_raft(VERSION ${CUML_MIN_VERSION_raft} FORK rapidsai PINNED_TAG branch-${CUML_BRANCH_VERSION_raft} - ) \ No newline at end of file + ) diff --git a/cpp/src/arima/batched_arima.cu b/cpp/src/arima/batched_arima.cu index cb91831a7f..35cf3354c9 100644 --- a/cpp/src/arima/batched_arima.cu +++ b/cpp/src/arima/batched_arima.cu @@ -29,9 +29,9 @@ #include #include -#include #include #include +#include #include #include #include @@ -104,7 +104,7 @@ void predict(raft::handle_t& handle, double* d_lower, double* d_upper) { - ML::PUSH_RANGE(__func__); + raft::common::nvtx::range fun_scope(__func__); const auto stream = handle.get_stream(); bool diff = order.need_diff() && pre_diff && level == 0; @@ -245,8 +245,6 @@ void predict(raft::handle_t& handle, }); /// TODO: 2D copy kernel? } - - ML::POP_RANGE(); } /** @@ -360,7 +358,7 @@ void conditional_sum_of_squares(raft::handle_t& handle, double* d_loglike, int truncate) { - ML::PUSH_RANGE(__func__); + raft::common::nvtx::range fun_scope(__func__); auto stream = handle.get_stream(); int n_phi = order.n_phi(); @@ -393,8 +391,6 @@ void conditional_sum_of_squares(raft::handle_t& handle, start_y, start_v); CUDA_CHECK(cudaPeekAtLastError()); - - ML::POP_RANGE(); } void batched_loglike(raft::handle_t& handle, @@ -417,7 +413,7 @@ void batched_loglike(raft::handle_t& handle, double* d_lower, double* d_upper) { - ML::PUSH_RANGE(__func__); + raft::common::nvtx::range fun_scope(__func__); auto stream = handle.get_stream(); @@ -473,7 +469,6 @@ void batched_loglike(raft::handle_t& handle, /* Tranfer log-likelihood device -> host */ raft::update_host(loglike, d_loglike, batch_size, stream); } - ML::POP_RANGE(); } void batched_loglike(raft::handle_t& handle, @@ -490,7 +485,7 @@ void batched_loglike(raft::handle_t& handle, LoglikeMethod method, int truncate) { - ML::PUSH_RANGE(__func__); + raft::common::nvtx::range fun_scope(__func__); // unpack parameters auto stream = handle.get_stream(); @@ -518,8 +513,6 @@ void batched_loglike(raft::handle_t& handle, host_loglike, method, truncate); - - ML::POP_RANGE(); } void batched_loglike_grad(raft::handle_t& handle, @@ -536,7 +529,7 @@ void batched_loglike_grad(raft::handle_t& handle, LoglikeMethod method, int truncate) { - ML::PUSH_RANGE(__func__); + raft::common::nvtx::range fun_scope(__func__); auto stream = handle.get_stream(); auto counting = thrust::make_counting_iterator(0); int N = order.complexity(); @@ -597,8 +590,6 @@ void batched_loglike_grad(raft::handle_t& handle, d_x_pert[N * bid + i] = d_x[N * bid + i]; }); } - - ML::POP_RANGE(); } void information_criterion(raft::handle_t& handle, @@ -612,7 +603,7 @@ void information_criterion(raft::handle_t& handle, double* d_ic, int ic_type) { - ML::PUSH_RANGE(__func__); + raft::common::nvtx::range fun_scope(__func__); auto stream = handle.get_stream(); /* Compute log-likelihood in d_ic */ @@ -628,8 +619,6 @@ void information_criterion(raft::handle_t& handle, batch_size, n_obs - order.n_diff(), stream); - - ML::POP_RANGE(); } /** @@ -962,7 +951,7 @@ void estimate_x0(raft::handle_t& handle, const ARIMAOrder& order, bool missing) { - ML::PUSH_RANGE(__func__); + raft::common::nvtx::range fun_scope(__func__); const auto& handle_impl = handle; auto stream = handle_impl.get_stream(); auto cublas_handle = handle_impl.get_cublas_handle(); @@ -1007,7 +996,6 @@ void estimate_x0(raft::handle_t& handle, // Do the computation of the initial parameters _start_params(handle, params, bm_yd, bm_exog_diff, order); - ML::POP_RANGE(); } } // namespace ML diff --git a/cpp/src/arima/batched_kalman.cu b/cpp/src/arima/batched_kalman.cu index 1c189dde13..01306b0862 100644 --- a/cpp/src/arima/batched_kalman.cu +++ b/cpp/src/arima/batched_kalman.cu @@ -30,9 +30,9 @@ #include #include -#include #include #include +#include #include namespace ML { @@ -1283,7 +1283,7 @@ void _batched_kalman_filter(raft::handle_t& handle, MLCommon::LinAlg::Batched::b_gemm(false, true, rd, rd, 1, 1.0, RQb, Rb, 0.0, RQR); // Durbin Koopman "Time Series Analysis" pg 138 - ML::PUSH_RANGE("Init P"); + raft::common::nvtx::push_range("Init P"); MLCommon::LinAlg::Batched::Matrix P( rd, rd, batch_size, cublasHandle, arima_mem.P_batches, arima_mem.P_dense, stream, true); { @@ -1326,7 +1326,7 @@ void _batched_kalman_filter(raft::handle_t& handle, _lyapunov_wrapper(handle, arima_mem, Tb, RQR, P, rd); } } - ML::POP_RANGE(); + raft::common::nvtx::pop_range(); // Initialize the state alpha by solving (I - T*) x* = c with: // | mu | @@ -1442,7 +1442,7 @@ void init_batched_kalman_matrices(raft::handle_t& handle, double* d_R_b, double* d_T_b) { - ML::PUSH_RANGE(__func__); + raft::common::nvtx::range fun_scope(__func__); auto stream = handle.get_stream(); @@ -1535,8 +1535,6 @@ void init_batched_kalman_matrices(raft::handle_t& handle, // If rd=2 and phi_2=-1, I-TxT is singular if (rd == 2 && order.p == 2 && abs(batch_T[1] + 1) < 0.01) { batch_T[1] = -0.99; } }); - - ML::POP_RANGE(); } void batched_kalman_filter(raft::handle_t& handle, @@ -1556,7 +1554,7 @@ void batched_kalman_filter(raft::handle_t& handle, double* d_lower, double* d_upper) { - ML::PUSH_RANGE(__func__); + raft::common::nvtx::range fun_scope(__func__); auto cublasHandle = handle.get_cublas_handle(); auto stream = handle.get_stream(); @@ -1607,8 +1605,6 @@ void batched_kalman_filter(raft::handle_t& handle, level, d_lower, d_upper); - - ML::POP_RANGE(); } void batched_jones_transform(raft::handle_t& handle, diff --git a/cpp/src/common/nvtx.cu b/cpp/src/common/nvtx.cu deleted file mode 100644 index 5f778e0bec..0000000000 --- a/cpp/src/common/nvtx.cu +++ /dev/null @@ -1,175 +0,0 @@ -/* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include -#include -#include "nvtx.hpp" - -namespace ML { - -/** - * @brief An internal struct to store associated state with the color - * generator - */ -struct ColorGenState { - /** collection of all tagged colors generated so far */ - static std::unordered_map allColors; - /** mutex for accessing the above map */ - static std::mutex mapMutex; - /** saturation */ - static constexpr float S = 0.9f; - /** value */ - static constexpr float V = 0.85f; - /** golden ratio */ - static constexpr float Phi = 1.61803f; - /** inverse golden ratio */ - static constexpr float InvPhi = 1.f / Phi; -}; - -std::unordered_map ColorGenState::allColors; -std::mutex ColorGenState::mapMutex; - -// all h, s, v are in range [0, 1] -// Ref: http://en.wikipedia.org/wiki/HSL_and_HSV#Converting_to_RGB -uint32_t hsv2rgb(float h, float s, float v) -{ - uint32_t out = 0xff000000u; - if (s <= 0.0f) { return out; } - // convert hue from [0, 1] range to [0, 360] - float h_deg = h * 360.f; - if (0.f > h_deg || h_deg >= 360.f) h_deg = 0.f; - h_deg /= 60.f; - int h_range = (int)h_deg; - float h_mod = h_deg - h_range; - float x = v * (1.f - s); - float y = v * (1.f - (s * h_mod)); - float z = v * (1.f - (s * (1.f - h_mod))); - float r, g, b; - switch (h_range) { - case 0: - r = v; - g = z; - b = x; - break; - case 1: - r = y; - g = v; - b = x; - break; - case 2: - r = x; - g = v; - b = z; - break; - case 3: - r = x; - g = y; - b = v; - break; - case 4: - r = z; - g = x; - b = v; - break; - case 5: - default: - r = v; - g = x; - b = y; - break; - } - out |= (uint32_t(r * 256.f) << 16); - out |= (uint32_t(g * 256.f) << 8); - out |= uint32_t(b * 256.f); - return out; -} - -/** - * @brief Helper method to generate 'visually distinct' colors. - * Inspired from https://martin.ankerl.com/2009/12/09/how-to-create-random-colors-programmatically/ - * However, if an associated tag is passed, it will look up in its history for - * any generated color against this tag and if found, just returns it, else - * generates a new color, assigns a tag to it and stores it for future usage. - * Such a thing is very useful for nvtx markers where the ranges associated - * with a specific tag should ideally get the same color for the purpose of - * visualizing it on nsight-systems timeline. - * @param tag look for any previously generated colors with this tag or - * associate the currently generated color with it - * @return returns 32b RGB integer with alpha channel set of 0xff - */ -uint32_t generateNextColor(const std::string& tag) -{ - std::lock_guard guard(ColorGenState::mapMutex); - if (!tag.empty()) { - auto itr = ColorGenState::allColors.find(tag); - if (itr != ColorGenState::allColors.end()) { return itr->second; } - } - float h = rand() * 1.f / RAND_MAX; - h += ColorGenState::InvPhi; - if (h >= 1.f) h -= 1.f; - auto rgb = hsv2rgb(h, ColorGenState::S, ColorGenState::V); - if (!tag.empty()) { ColorGenState::allColors[tag] = rgb; } - return rgb; -} - -#ifdef NVTX_ENABLED - -#include - -nvtxDomainHandle_t domain = nvtxDomainCreateA("cuml_cpp"); - -void PUSH_RANGE(const char* name, cudaStream_t stream) -{ - CUDA_CHECK(cudaStreamSynchronize(stream)); - PUSH_RANGE(name); -} - -void POP_RANGE(cudaStream_t stream) -{ - CUDA_CHECK(cudaStreamSynchronize(stream)); - POP_RANGE(); -} - -void PUSH_RANGE(const char* name) -{ - nvtxEventAttributes_t eventAttrib = {0}; - eventAttrib.version = NVTX_VERSION; - eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; - eventAttrib.colorType = NVTX_COLOR_ARGB; - eventAttrib.color = generateNextColor(name); - eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; - eventAttrib.message.ascii = name; - nvtxDomainRangePushEx(domain, &eventAttrib); -} - -void POP_RANGE() { nvtxDomainRangePop(domain); } - -#else // NVTX_ENABLED - -void PUSH_RANGE(const char* name, cudaStream_t stream) {} - -void POP_RANGE(cudaStream_t stream) {} - -void PUSH_RANGE(const char* name) {} - -void POP_RANGE() {} - -#endif // NVTX_ENABLED - -} // end namespace ML diff --git a/cpp/src/common/nvtx.hpp b/cpp/src/common/nvtx.hpp index bf9d16ed8d..1d9a2a3336 100644 --- a/cpp/src/common/nvtx.hpp +++ b/cpp/src/common/nvtx.hpp @@ -16,7 +16,7 @@ #pragma once -#include +#include namespace ML { @@ -25,21 +25,37 @@ namespace ML { * @param name range name * @param stream stream to synchronize */ -void PUSH_RANGE(const char* name, cudaStream_t stream); +[[deprecated("Use new raft::common::nvtx::push_range from ")]] inline void +PUSH_RANGE(const char* name, cudaStream_t stream) +{ + raft::common::nvtx::push_range(name); +} /** * @brief Synchronize CUDA stream and pop the latest nvtx range * @param stream stream to synchronize */ -void POP_RANGE(cudaStream_t stream); +[[deprecated("Use new raft::common::nvtx::pop_range from ")]] inline void +POP_RANGE(cudaStream_t stream) +{ + raft::common::nvtx::pop_range(); +} /** * @brief Push a named nvtx range * @param name range name */ -void PUSH_RANGE(const char* name); +[[deprecated("Use new raft::common::nvtx::push_range from ")]] inline void +PUSH_RANGE(const char* name) +{ + raft::common::nvtx::push_range(name); +} /** Pop the latest range */ -void POP_RANGE(); +[[deprecated("Use new raft::common::nvtx::pop_range from ")]] inline void +POP_RANGE() +{ + raft::common::nvtx::pop_range(); +} } // end namespace ML diff --git a/cpp/src/dbscan/dbscan.cuh b/cpp/src/dbscan/dbscan.cuh index 9d7c1061cc..467476070e 100644 --- a/cpp/src/dbscan/dbscan.cuh +++ b/cpp/src/dbscan/dbscan.cuh @@ -18,7 +18,7 @@ #include "runner.cuh" -#include +#include #include #include @@ -108,7 +108,7 @@ void dbscanFitImpl(const raft::handle_t& handle, cudaStream_t stream, int verbosity) { - ML::PUSH_RANGE("ML::Dbscan::Fit"); + raft::common::nvtx::range fun_scope("ML::Dbscan::Fit"); ML::Logger::get().setLevel(verbosity); int algo_vd = (metric == raft::distance::Precomputed) ? 2 : 1; int algo_adj = 1; @@ -201,7 +201,6 @@ void dbscanFitImpl(const raft::handle_t& handle, workspace.data(), batch_size, stream); - ML::POP_RANGE(); } } // namespace Dbscan diff --git a/cpp/src/dbscan/mergelabels/tree_reduction.cuh b/cpp/src/dbscan/mergelabels/tree_reduction.cuh index 98ec5f8cb0..6128f9d7fd 100644 --- a/cpp/src/dbscan/mergelabels/tree_reduction.cuh +++ b/cpp/src/dbscan/mergelabels/tree_reduction.cuh @@ -18,7 +18,7 @@ #include "runner.cuh" -#include +#include #include @@ -76,9 +76,8 @@ void tree_reduction(const raft::handle_t& handle, if (receiver) { CUML_LOG_DEBUG("--> Merge labels"); - ML::PUSH_RANGE("Trace::Dbscan::MergeLabels"); + raft::common::nvtx::range fun_scope("Trace::Dbscan::MergeLabels"); MergeLabels::run(handle, labels, labels_temp, mask, work_buffer, m, N, stream); - ML::POP_RANGE(); } s *= 2; diff --git a/cpp/src/dbscan/runner.cuh b/cpp/src/dbscan/runner.cuh index 82d5c9bb14..9e0c2e6148 100644 --- a/cpp/src/dbscan/runner.cuh +++ b/cpp/src/dbscan/runner.cuh @@ -17,8 +17,8 @@ #pragma once #include -#include #include