Add CUML_USING_RANGE macro for easier NVTX profiling #4436

Closed
77 changes: 77 additions & 0 deletions cpp/src/common/nvtx.hpp
@@ -42,4 +42,81 @@ void PUSH_RANGE(const char* name);
/** Pop the latest range */
void POP_RANGE();

/** Push a named nvtx range that is popped at the end of the object's lifetime. */
class AUTO_RANGE {
Review thread on class AUTO_RANGE:

Member: I think this can be used more generally, both inside RAFT primitives and across different projects. Is there any particular reason we shouldn't push this whole file to RAFT?

Member: (I asked the same question in the gtests for your current RAFT PR, but I haven't finished the review yet, so I figured I'd ask here in a comment.)

Contributor: That would be fantastic! I'd definitely use it in Triton backends if it were available in RAFT.

Contributor Author (@achirkin, Dec 9, 2021): With your permission, I'd love to move this to raft :) My PR rapidsai/raft#401 would benefit from that indeed.

Member: Absolutely, I think that's a great idea!

private:
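  // When set, the stream-aware PUSH_RANGE/POP_RANGE overloads are used, which also synchronize the stream.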
std::optional<rmm::cuda_stream_view> stream;

template <typename... Args>
void init(const char* name, Args... args)
{
if constexpr (sizeof...(args) > 0) {
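      // Format in two passes: the first snprintf computes the required length, the second fills the buffer.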
int length = std::snprintf(nullptr, 0, name, args...);
assert(length >= 0);
auto buf = std::make_unique<char[]>(length + 1);
std::snprintf(buf.get(), length + 1, name, args...);

if (stream.has_value())
PUSH_RANGE(buf.get(), stream.value());
else
PUSH_RANGE(buf.get());
} else {
if (stream.has_value())
PUSH_RANGE(name, stream.value());
else
PUSH_RANGE(name);
}
}

public:
/**
   * Synchronize the CUDA stream and push a named nvtx range.
   * At the end of the object's lifetime, synchronize again and pop the range.
*
* @param stream stream to synchronize
* @param name range name (accepts printf-style arguments)
* @param args the arguments for the printf-style formatting
*/
template <typename... Args>
AUTO_RANGE(rmm::cuda_stream_view stream, const char* name, Args... args)
: stream(std::make_optional(stream))
{
init(name, args...);
}

/**
* Push a named nvtx range.
   * At the end of the object's lifetime, pop the range.
*
* @param name range name (accepts printf-style arguments)
* @param args the arguments for the printf-style formatting
*/
template <typename... Args>
AUTO_RANGE(const char* name, Args... args) : stream(std::nullopt)
{
init(name, args...);
}

~AUTO_RANGE()
{
if (stream.has_value())
POP_RANGE(stream.value());
else
POP_RANGE();
}
};

/*!
\def CUML_USING_RANGE(...)
When NVTX is enabled, push a named nvtx range and pop it at the end of the enclosing code block.

This macro initializes a dummy AUTO_RANGE variable on the stack,
which pushes the range in its constructor and pops it in the destructor.
*/
#ifdef NVTX_ENABLED
// Two-level concatenation is needed so that __LINE__ expands before token pasting;
// pasting __LINE__ directly would always produce the same identifier name.
#define CUML_USING_RANGE_CAT_(a, b) a##b
#define CUML_USING_RANGE_CAT(a, b)  CUML_USING_RANGE_CAT_(a, b)
#define CUML_USING_RANGE(...) \
  ML::AUTO_RANGE CUML_USING_RANGE_CAT(_AUTO_RANGE_, __LINE__)(__VA_ARGS__)
#else
#define CUML_USING_RANGE(...) (void)0
#endif

} // end namespace ML
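A minimal usage sketch of the macro above; the function, stream, and range names here are hypothetical, and only the two constructor forms declared in nvtx.hpp (name-only with printf-style formatting, and stream-first) are assumed, along with the header paths that appear elsewhere in this diff:

#include <common/nvtx.hpp>
#include <raft/handle.hpp>

void fooFit(const raft::handle_t& handle, int n_rows, int n_cols)
{
  // Name-only form: pushes the range now, pops it when fooFit returns.
  CUML_USING_RANGE("ML::foo::fooFit-%d-%d", n_rows, n_cols);

  cudaStream_t stream = handle.get_stream();
  {
    // Stream form: synchronizes the stream on push and again on pop,
    // so the nested range brackets the work submitted inside this block.
    CUML_USING_RANGE(stream, "ML::foo::fooFit::solve");
    // ... enqueue kernels on `stream` ...
  }  // nested range popped here, at the end of the enclosing block
}

Compared with the explicit ML::PUSH_RANGE / ML::POP_RANGE pairs this PR removes below, the destructor-based pop also runs on early returns and during exception unwinding.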
21 changes: 11 additions & 10 deletions cpp/src/glm/ols.cuh
@@ -94,17 +94,18 @@ void olsFit(const raft::handle_t& handle,
int selectedAlgo = algo;
if (n_cols > n_rows || n_cols == 1) selectedAlgo = 0;

ML::PUSH_RANGE("Trace::MLCommon::LinAlg::ols-lstsq*", stream);
switch (selectedAlgo) {
case 0: LinAlg::lstsqSvdJacobi(handle, input, n_rows, n_cols, labels, coef, stream); break;
case 1: LinAlg::lstsqEig(handle, input, n_rows, n_cols, labels, coef, stream); break;
case 2: LinAlg::lstsqQR(handle, input, n_rows, n_cols, labels, coef, stream); break;
case 3: LinAlg::lstsqSvdQR(handle, input, n_rows, n_cols, labels, coef, stream); break;
default:
ASSERT(false, "olsFit: no algorithm with this id (%d) has been implemented", algo);
break;
{
CUML_USING_RANGE(stream, "ML::GLM::olsFit::impl-%d", selectedAlgo);
switch (selectedAlgo) {
case 0: LinAlg::lstsqSvdJacobi(handle, input, n_rows, n_cols, labels, coef, stream); break;
case 1: LinAlg::lstsqEig(handle, input, n_rows, n_cols, labels, coef, stream); break;
case 2: LinAlg::lstsqQR(handle, input, n_rows, n_cols, labels, coef, stream); break;
case 3: LinAlg::lstsqSvdQR(handle, input, n_rows, n_cols, labels, coef, stream); break;
default:
ASSERT(false, "olsFit: no algorithm with this id (%d) has been implemented", algo);
break;
}
}
ML::POP_RANGE(stream);

if (fit_intercept) {
postProcessData(handle,
41 changes: 30 additions & 11 deletions cpp/src/glm/preprocess.cuh
@@ -17,6 +17,7 @@
#pragma once

#include <raft/cudart_utils.h>
+#include <common/nvtx.hpp>
#include <raft/linalg/gemm.cuh>
#include <raft/linalg/norm.cuh>
#include <raft/matrix/math.hpp>
@@ -46,17 +47,22 @@ void preProcessData(const raft::handle_t& handle,
bool normalize,
cudaStream_t stream)
{
CUML_USING_RANGE("ML::GLM::preProcessData-%d-%d", n_rows, n_cols);
ASSERT(n_cols > 0, "Parameter n_cols: number of columns cannot be less than one");
ASSERT(n_rows > 1, "Parameter n_rows: number of rows cannot be less than two");

if (fit_intercept) {
-    raft::stats::mean(mu_input, input, n_cols, n_rows, false, false, stream);
-    raft::stats::meanCenter(input, input, mu_input, n_cols, n_rows, false, true, stream);
+    {
+      CUML_USING_RANGE(stream, "ML::GLM::preProcessData-mean");
+      raft::stats::mean(mu_input, input, n_cols, n_rows, false, false, stream);
+      raft::stats::meanCenter(input, input, mu_input, n_cols, n_rows, false, true, stream);

-    raft::stats::mean(mu_labels, labels, 1, n_rows, false, false, stream);
-    raft::stats::meanCenter(labels, labels, mu_labels, 1, n_rows, false, true, stream);
+      raft::stats::mean(mu_labels, labels, 1, n_rows, false, false, stream);
+      raft::stats::meanCenter(labels, labels, mu_labels, 1, n_rows, false, true, stream);
+    }

if (normalize) {
CUML_USING_RANGE(stream, "ML::GLM::preProcessData-normalize");
raft::linalg::colNorm(norm2_input,
input,
n_cols,
@@ -86,26 +92,39 @@ void postProcessData(const raft::handle_t& handle,
bool normalize,
cudaStream_t stream)
{
CUML_USING_RANGE("ML::GLM::postProcessData-%d-%d", n_rows, n_cols);
ASSERT(n_cols > 0, "Parameter n_cols: number of columns cannot be less than one");
ASSERT(n_rows > 1, "Parameter n_rows: number of rows cannot be less than two");

cublasHandle_t cublas_handle = handle.get_cublas_handle();
rmm::device_scalar<math_t> d_intercept(stream);

if (normalize) {
CUML_USING_RANGE(stream, "ML::GLM::postProcessData-denormalize");
raft::matrix::matrixVectorBinaryMult(input, norm2_input, n_rows, n_cols, false, true, stream);
raft::matrix::matrixVectorBinaryDivSkipZero(
coef, norm2_input, 1, n_cols, false, true, stream, true);
}

-  raft::linalg::gemm(
-    handle, mu_input, 1, n_cols, coef, d_intercept.data(), 1, 1, CUBLAS_OP_N, CUBLAS_OP_N, stream);
-
-  raft::linalg::subtract(d_intercept.data(), mu_labels, d_intercept.data(), 1, stream);
-  *intercept = d_intercept.value(stream);
-
-  raft::stats::meanAdd(input, input, mu_input, n_cols, n_rows, false, true, stream);
-  raft::stats::meanAdd(labels, labels, mu_labels, 1, n_rows, false, true, stream);
+  {
+    CUML_USING_RANGE(stream, "ML::GLM::postProcessData-shift");
+    raft::linalg::gemm(handle,
+                       mu_input,
+                       1,
+                       n_cols,
+                       coef,
+                       d_intercept.data(),
+                       1,
+                       1,
+                       CUBLAS_OP_N,
+                       CUBLAS_OP_N,
+                       stream);
+
+    raft::linalg::subtract(d_intercept.data(), mu_labels, d_intercept.data(), 1, stream);
+    *intercept = d_intercept.value(stream);
+    raft::stats::meanAdd(input, input, mu_input, n_cols, n_rows, false, true, stream);
+    raft::stats::meanAdd(labels, labels, mu_labels, 1, n_rows, false, true, stream);
+  }
}

}; // namespace GLM
5 changes: 2 additions & 3 deletions cpp/src/svm/linear.cu
@@ -362,6 +362,8 @@ LinearSVMModel<T> LinearSVMModel<T>::fit(const raft::handle_t& handle,
const T* y,
const T* sampleWeight)
{
CUML_USING_RANGE("ML::SVM::LinearSVMModel-%d-%d", nRows, nCols);

cudaStream_t stream = handle.get_stream();
rmm::device_uvector<T> classesBuf(0, stream);
const std::size_t nClasses =
@@ -376,8 +378,6 @@ LinearSVMModel<T> LinearSVMModel<T>::fit(const raft::handle_t& handle,
const int coefCols = narrowDown(model.coefCols());
const std::size_t coefRows = model.coefRows;

ML::PUSH_RANGE("Trace::LinearSVMModel::fit");

auto nCols1 = nCols + int(params.fit_intercept && params.penalized_intercept);
T iC = params.C > 0 ? (1.0 / params.C) : 1.0;

@@ -504,7 +504,6 @@ LinearSVMModel<T> LinearSVMModel<T>::fit(const raft::handle_t& handle,
raft::linalg::transpose(handle, ps1, model.probScale, 2, coefCols, stream);
}

-  ML::POP_RANGE();
return model;
}

7 changes: 4 additions & 3 deletions cpp/src_prims/linalg/lstsq.cuh
@@ -301,9 +301,10 @@ void lstsqEig(const raft::handle_t& handle,
multAbDone.record(multAbStream);

// Q S Q* <- covA
ML::PUSH_RANGE("Trace::MLCommon::LinAlg::lstsq::eigDC", mainStream);
raft::linalg::eigDC(handle, covA, n_cols, n_cols, Q, S, mainStream);
ML::POP_RANGE(mainStream);
{
CUML_USING_RANGE("raft::linalg::eigDC", mainStream);
raft::linalg::eigDC(handle, covA, n_cols, n_cols, Q, S, mainStream);
}

// QS <- Q invS
raft::linalg::matrixVectorOp(