From fe895f2a486a6b1eb5ec46734d6931994b958c40 Mon Sep 17 00:00:00 2001
From: achirkin
Date: Thu, 9 Dec 2021 15:42:56 +0100
Subject: [PATCH 1/2] Add CUML_USING_RANGE pragma for easier NVTX profiling

---
 cpp/src/common/nvtx.hpp        | 77 ++++++++++++++++++++++++++++++++++
 cpp/src/glm/ols.cuh            | 21 +++++-----
 cpp/src/glm/preprocess.cuh     | 41 ++++++++++++++-----
 cpp/src/svm/linear.cu          |  5 +--
 cpp/src_prims/linalg/lstsq.cuh |  7 ++--
 5 files changed, 124 insertions(+), 27 deletions(-)

diff --git a/cpp/src/common/nvtx.hpp b/cpp/src/common/nvtx.hpp
index bf9d16ed8d..f6c2e5a8d8 100644
--- a/cpp/src/common/nvtx.hpp
+++ b/cpp/src/common/nvtx.hpp
@@ -42,4 +42,77 @@ void PUSH_RANGE(const char* name);
 /** Pop the latest range */
 void POP_RANGE();
 
+/** Push a named nvtx range that would be popped at the end of the object lifetime. */
+class AUTO_RANGE {
+ private:
+  std::optional<rmm::cuda_stream_view> stream;
+
+  template <typename... Args>
+  void init(const char* name, Args... args)
+  {
+    if constexpr (sizeof...(args) > 0) {
+      int length = std::snprintf(nullptr, 0, name, args...);
+      assert(length >= 0);
+      auto buf = std::make_unique<char[]>(length + 1);
+      std::snprintf(buf.get(), length + 1, name, args...);
+
+      if (stream.has_value())
+        PUSH_RANGE(buf.get(), stream.value());
+      else
+        PUSH_RANGE(buf.get());
+    } else {
+      if (stream.has_value())
+        PUSH_RANGE(name, stream.value());
+      else
+        PUSH_RANGE(name);
+    }
+  }
+
+ public:
+  /**
+   * Synchronize CUDA stream and push a named nvtx range
+   * At the end of the object lifetime, synchronize again and pop the range.
+   *
+   * @param stream stream to synchronize
+   * @param name range name (accepts printf-style arguments)
+   */
+  template <typename... Args>
+  AUTO_RANGE(rmm::cuda_stream_view stream, const char* name, Args... args)
+    : stream(std::make_optional(stream))
+  {
+    init(name, args...);
+  }
+
+  /**
+   * Push a named nvtx range.
+   * At the end of the object lifetime, pop the range back.
+   *
+   * @param name range name (accepts printf-style arguments)
+   */
+  template <typename... Args>
+  AUTO_RANGE(const char* name, Args... args) : stream(std::nullopt)
+  {
+    init(name, args...);
+  }
+
+  ~AUTO_RANGE()
+  {
+    if (stream.has_value())
+      POP_RANGE(stream.value());
+    else
+      POP_RANGE();
+  }
+};
+
+/*!
+  \def CUML_USING_RANGE(...)
+  When NVTX is enabled, push a named nvtx range now and pop it at the end of the code block.
+
+  This macro initializes a dummy AUTO_RANGE variable on the stack,
+*/
+#ifdef NVTX_ENABLED
+// Two-level paste: __LINE__ has to be macro-expanded before ## is applied.
+#define CUML_NVTX_CONCAT_IMPL(a, b) a##b
+#define CUML_NVTX_CONCAT(a, b) CUML_NVTX_CONCAT_IMPL(a, b)
+#define CUML_USING_RANGE(...) ML::AUTO_RANGE CUML_NVTX_CONCAT(auto_range_, __LINE__)(__VA_ARGS__)
+#else
+#define CUML_USING_RANGE(...) (void)0
+#endif
+
 }  // end namespace ML
diff --git a/cpp/src/glm/ols.cuh b/cpp/src/glm/ols.cuh
index 0334f72906..1124435f4a 100644
--- a/cpp/src/glm/ols.cuh
+++ b/cpp/src/glm/ols.cuh
@@ -94,17 +94,18 @@ void olsFit(const raft::handle_t& handle,
   int selectedAlgo = algo;
   if (n_cols > n_rows || n_cols == 1) selectedAlgo = 0;
 
-  ML::PUSH_RANGE("Trace::MLCommon::LinAlg::ols-lstsq*", stream);
-  switch (selectedAlgo) {
-    case 0: LinAlg::lstsqSvdJacobi(handle, input, n_rows, n_cols, labels, coef, stream); break;
-    case 1: LinAlg::lstsqEig(handle, input, n_rows, n_cols, labels, coef, stream); break;
-    case 2: LinAlg::lstsqQR(handle, input, n_rows, n_cols, labels, coef, stream); break;
-    case 3: LinAlg::lstsqSvdQR(handle, input, n_rows, n_cols, labels, coef, stream); break;
-    default:
-      ASSERT(false, "olsFit: no algorithm with this id (%d) has been implemented", algo);
-      break;
+  {
+    CUML_USING_RANGE(stream, "ML::GLM::olsFit::impl-%d", selectedAlgo);
+    switch (selectedAlgo) {
+      case 0: LinAlg::lstsqSvdJacobi(handle, input, n_rows, n_cols, labels, coef, stream); break;
+      case 1: LinAlg::lstsqEig(handle, input, n_rows, n_cols, labels, coef, stream); break;
+      case 2: LinAlg::lstsqQR(handle, input, n_rows, n_cols, labels, coef, stream); break;
+      case 3: LinAlg::lstsqSvdQR(handle, input, n_rows, n_cols, labels, coef, stream); break;
+      default:
+        ASSERT(false, "olsFit: no algorithm with this id (%d) has been implemented", algo);
+        break;
+    }
   }
-  ML::POP_RANGE(stream);
 
   if (fit_intercept) {
     postProcessData(handle,
diff --git a/cpp/src/glm/preprocess.cuh b/cpp/src/glm/preprocess.cuh
index 8ee77966c2..db173643ad 100644
--- a/cpp/src/glm/preprocess.cuh
+++ b/cpp/src/glm/preprocess.cuh
@@ -17,6 +17,7 @@
 #pragma once
 
 #include
+#include
 #include
 #include
 #include
@@ -46,17 +47,22 @@ void preProcessData(const raft::handle_t& handle,
                     bool normalize,
                     cudaStream_t stream)
 {
+  CUML_USING_RANGE("ML::GLM::preProcessData-%d-%d", n_rows, n_cols);
   ASSERT(n_cols > 0, "Parameter n_cols: number of columns cannot be less than one");
   ASSERT(n_rows > 1, "Parameter n_rows: number of rows cannot be less than two");
 
   if (fit_intercept) {
-    raft::stats::mean(mu_input, input, n_cols, n_rows, false, false, stream);
-    raft::stats::meanCenter(input, input, mu_input, n_cols, n_rows, false, true, stream);
+    {
+      CUML_USING_RANGE(stream, "ML::GLM::preProcessData-mean");
+      raft::stats::mean(mu_input, input, n_cols, n_rows, false, false, stream);
+      raft::stats::meanCenter(input, input, mu_input, n_cols, n_rows, false, true, stream);
 
-    raft::stats::mean(mu_labels, labels, 1, n_rows, false, false, stream);
-    raft::stats::meanCenter(labels, labels, mu_labels, 1, n_rows, false, true, stream);
+      raft::stats::mean(mu_labels, labels, 1, n_rows, false, false, stream);
+      raft::stats::meanCenter(labels, labels, mu_labels, 1, n_rows, false, true, stream);
+    }
 
     if (normalize) {
+      CUML_USING_RANGE(stream, "ML::GLM::preProcessData-normalize");
       raft::linalg::colNorm(norm2_input,
                             input,
                             n_cols,
@@ -86,6 +92,7 @@ void postProcessData(const raft::handle_t& handle,
                      bool normalize,
                      cudaStream_t stream)
 {
+  CUML_USING_RANGE("ML::GLM::postProcessData-%d-%d", n_rows, n_cols);
   ASSERT(n_cols > 0, "Parameter n_cols: number of columns cannot be less than one");
   ASSERT(n_rows > 1, "Parameter n_rows: number of rows cannot be less than two");
 
@@ -93,19 +100,31 @@ void postProcessData(const raft::handle_t& handle,
   rmm::device_scalar d_intercept(stream);
 
   if (normalize) {
+    CUML_USING_RANGE(stream, "ML::GLM::postProcessData-denormalize");
     raft::matrix::matrixVectorBinaryMult(input, norm2_input, n_rows, n_cols, false, true, stream);
     raft::matrix::matrixVectorBinaryDivSkipZero(
       coef, norm2_input, 1, n_cols, false, true, stream, true);
   }
 
-  raft::linalg::gemm(
-    handle, mu_input, 1, n_cols, coef, d_intercept.data(), 1, 1, CUBLAS_OP_N, CUBLAS_OP_N, stream);
-
-  raft::linalg::subtract(d_intercept.data(), mu_labels, d_intercept.data(), 1, stream);
-  *intercept = d_intercept.value(stream);
+  {
+    CUML_USING_RANGE(stream, "ML::GLM::postProcessData-shift");
+    raft::linalg::gemm(handle,
+                       mu_input,
+                       1,
+                       n_cols,
+                       coef,
+                       d_intercept.data(),
+                       1,
+                       1,
+                       CUBLAS_OP_N,
+                       CUBLAS_OP_N,
+                       stream);
 
-  raft::stats::meanAdd(input, input, mu_input, n_cols, n_rows, false, true, stream);
-  raft::stats::meanAdd(labels, labels, mu_labels, 1, n_rows, false, true, stream);
+    raft::linalg::subtract(d_intercept.data(), mu_labels, d_intercept.data(), 1, stream);
+    *intercept = d_intercept.value(stream);
+    raft::stats::meanAdd(input, input, mu_input, n_cols, n_rows, false, true, stream);
+    raft::stats::meanAdd(labels, labels, mu_labels, 1, n_rows, false, true, stream);
+  }
 }
 
 };  // namespace GLM
diff --git a/cpp/src/svm/linear.cu b/cpp/src/svm/linear.cu
index be5401f211..36b7dc87b0 100644
--- a/cpp/src/svm/linear.cu
+++ b/cpp/src/svm/linear.cu
@@ -362,6 +362,8 @@ LinearSVMModel LinearSVMModel::fit(const raft::handle_t& handle,
                                    const T* y,
                                    const T* sampleWeight)
 {
+  CUML_USING_RANGE("ML::SVM::LinearSVMModel-%d-%d", nRows, nCols);
+
   cudaStream_t stream = handle.get_stream();
   rmm::device_uvector classesBuf(0, stream);
   const std::size_t nClasses =
@@ -376,8 +378,6 @@ LinearSVMModel LinearSVMModel::fit(const raft::handle_t& handle,
   const int coefCols         = narrowDown(model.coefCols());
   const std::size_t coefRows = model.coefRows;
 
-  ML::PUSH_RANGE("Trace::LinearSVMModel::fit");
-
   auto nCols1 = nCols + int(params.fit_intercept && params.penalized_intercept);
   T iC        = params.C > 0 ? (1.0 / params.C) : 1.0;
 
@@ -504,7 +504,6 @@ LinearSVMModel LinearSVMModel::fit(const raft::handle_t& handle,
     raft::linalg::transpose(handle, ps1, model.probScale, 2, coefCols, stream);
   }
 
-  ML::POP_RANGE();
   return model;
 }
 
diff --git a/cpp/src_prims/linalg/lstsq.cuh b/cpp/src_prims/linalg/lstsq.cuh
index b920fa084c..b946bda1c0 100644
--- a/cpp/src_prims/linalg/lstsq.cuh
+++ b/cpp/src_prims/linalg/lstsq.cuh
@@ -301,9 +301,10 @@ void lstsqEig(const raft::handle_t& handle,
   multAbDone.record(multAbStream);
 
   // Q S Q* <- covA
-  ML::PUSH_RANGE("Trace::MLCommon::LinAlg::lstsq::eigDC", mainStream);
-  raft::linalg::eigDC(handle, covA, n_cols, n_cols, Q, S, mainStream);
-  ML::POP_RANGE(mainStream);
+  {
+    CUML_USING_RANGE(mainStream, "raft::linalg::eigDC");
+    raft::linalg::eigDC(handle, covA, n_cols, n_cols, Q, S, mainStream);
+  }
 
   // QS <- Q invS
   raft::linalg::matrixVectorOp(

From 99e2bf5e162c12d18952846216eba4b43ac991c3 Mon Sep 17 00:00:00 2001
From: achirkin
Date: Thu, 9 Dec 2021 16:05:32 +0100
Subject: [PATCH 2/2] Fix the docs

---
 cpp/src/common/nvtx.hpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/cpp/src/common/nvtx.hpp b/cpp/src/common/nvtx.hpp
index f6c2e5a8d8..fda1cd660f 100644
--- a/cpp/src/common/nvtx.hpp
+++ b/cpp/src/common/nvtx.hpp
@@ -75,6 +75,7 @@ class AUTO_RANGE {
    *
    * @param stream stream to synchronize
    * @param name range name (accepts printf-style arguments)
+   * @param args the arguments for the printf-style formatting
    */
   template <typename... Args>
   AUTO_RANGE(rmm::cuda_stream_view stream, const char* name, Args... args)
@@ -88,6 +89,7 @@ class AUTO_RANGE {
    * At the end of the object lifetime, pop the range back.
    *
    * @param name range name (accepts printf-style arguments)
+   * @param args the arguments for the printf-style formatting
    */
   template <typename... Args>
   AUTO_RANGE(const char* name, Args... args) : stream(std::nullopt)
@@ -106,9 +108,10 @@ class AUTO_RANGE {
 
 /*!
   \def CUML_USING_RANGE(...)
-  When NVTX is enabled, push a named nvtx range now and pop it at the end of the code block.
+  When NVTX is enabled, push a named nvtx range and pop it at the end of the enclosing code block.
 
   This macro initializes a dummy AUTO_RANGE variable on the stack,
+  which pushes the range in its constructor and pops it in the destructor.
 */
 #ifdef NVTX_ENABLED
 // Two-level paste: __LINE__ has to be macro-expanded before ## is applied.