Skip to content

Commit

Permalink
Add NVTX ranges to all CUB algorithms
Browse files Browse the repository at this point in the history
Fixes: #719

Co-authored-by: Michael Schellenberger Costa <[email protected]>
  • Loading branch information
bernhardmgruber and miscco committed Apr 22, 2024
1 parent ac49021 commit a74fc2d
Show file tree
Hide file tree
Showing 2 changed files with 124 additions and 0 deletions.
107 changes: 107 additions & 0 deletions cub/cub/detail/nvtx.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
/******************************************************************************
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/

#pragma once

#include <cub/config.cuh>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
# pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
# pragma system_header
#endif // no system header

#if __has_include(<source_location>)
# include <source_location>
#endif // __has_include(<source_location>)

// NVTX documentation: https://nvidia.github.io/NVTX/

#include <nvtx3/nvToolsExt.h>
#include <nvtx3/nvToolsExtCuda.h>
// TODO(bgruber): #include <nvtx3/nvToolsExt.h> does not work, but exists in the docs
// TODO(bgruber): try: NVTX3_FUNC_RANGE and scoped_range

CUB_NAMESPACE_BEGIN

namespace detail
{

struct NVTXGlobal
{
nvtxDomainHandle_t domain;

NVTXGlobal()
{
domain = nvtxDomainCreate("CUB");
}

~NVTXGlobal()
{
nvtxDomainDestroy(domain);
}
};

// Global setup and teardown of the NVTX domain for CUB.
inline NVTXGlobal nvtxGlobal;

// A scope guard to create an named NVTX range inside the CUB domain.
struct NVTXGuard
{
CUB_RUNTIME_FUNCTION NVTXGuard(const char* name)
{
NV_IF_TARGET(
NV_IS_HOST,
// TODO(bgruber): documentation mentions a nvtxDomainRangePushA, but it does not exist
nvtxEventAttributes_t eventAttrib{};
eventAttrib.version = NVTX_VERSION;
eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
eventAttrib.message.ascii = name;
nvtxDomainRangePushEx(nvtxGlobal.domain, &eventAttrib););
}

#ifdef __cpp_lib_source_location
NVTXGuard(const std::source_location& loc = std::source_location::current())
: NVTXGuard(loc.function_name())
{}
#endif // __cpp_lib_source_location

CUB_RUNTIME_FUNCTION ~NVTXGuard()
{
NV_IF_TARGET(NV_IS_HOST, nvtxDomainRangePop(nvtxGlobal.domain););
}
};
} // namespace detail

// Inserts a NVTX range starting here until the end of the current function scope
// TODO(bgruber): replace this by NVTX3_FUNC_RANGE from <nvtx3/nvtx3.hpp>, when available. Wasn't available in CTK 12.4.
#define CUB_BEGIN_NVTX_RANGE_SCOPE [[maybe_unused]] ::cub::detail::NVTXGuard youShallNotGuessThisVariableName(__func__)

CUB_NAMESPACE_END
17 changes: 17 additions & 0 deletions cub/cub/device/device_reduce.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
#endif // no system header

#include <cub/detail/choose_offset.cuh>
#include <cub/detail/nvtx.cuh>
#include <cub/device/dispatch/dispatch_reduce.cuh>
#include <cub/device/dispatch/dispatch_reduce_by_key.cuh>
#include <cub/iterator/arg_index_input_iterator.cuh>
Expand Down Expand Up @@ -195,6 +196,8 @@ struct DeviceReduce
T init,
cudaStream_t stream = 0)
{
CUB_BEGIN_NVTX_RANGE_SCOPE;

// Signed integer type for global offsets
using OffsetT = detail::choose_offset_t<NumItemsT>;

Expand Down Expand Up @@ -304,6 +307,8 @@ struct DeviceReduce
NumItemsT num_items,
cudaStream_t stream = 0)
{
CUB_BEGIN_NVTX_RANGE_SCOPE;

// Signed integer type for global offsets
using OffsetT = detail::choose_offset_t<NumItemsT>;

Expand Down Expand Up @@ -423,6 +428,8 @@ struct DeviceReduce
NumItemsT num_items,
cudaStream_t stream = 0)
{
CUB_BEGIN_NVTX_RANGE_SCOPE;

// Signed integer type for global offsets
using OffsetT = detail::choose_offset_t<NumItemsT>;

Expand Down Expand Up @@ -547,6 +554,8 @@ struct DeviceReduce
int num_items,
cudaStream_t stream = 0)
{
CUB_BEGIN_NVTX_RANGE_SCOPE;

// Signed integer type for global offsets
using OffsetT = int;

Expand Down Expand Up @@ -673,6 +682,8 @@ struct DeviceReduce
NumItemsT num_items,
cudaStream_t stream = 0)
{
CUB_BEGIN_NVTX_RANGE_SCOPE;

// Signed integer type for global offsets
using OffsetT = detail::choose_offset_t<NumItemsT>;

Expand Down Expand Up @@ -802,6 +813,8 @@ struct DeviceReduce
int num_items,
cudaStream_t stream = 0)
{
CUB_BEGIN_NVTX_RANGE_SCOPE;

// Signed integer type for global offsets
using OffsetT = int;

Expand Down Expand Up @@ -968,6 +981,8 @@ struct DeviceReduce
T init,
cudaStream_t stream = 0)
{
CUB_BEGIN_NVTX_RANGE_SCOPE;

using OffsetT = detail::choose_offset_t<NumItemsT>;

return DispatchTransformReduce<InputIteratorT, OutputIteratorT, OffsetT, ReductionOpT, TransformOpT, T>::Dispatch(
Expand Down Expand Up @@ -1135,6 +1150,8 @@ struct DeviceReduce
NumItemsT num_items,
cudaStream_t stream = 0)
{
CUB_BEGIN_NVTX_RANGE_SCOPE;

// Signed integer type for global offsets
using OffsetT = detail::choose_offset_t<NumItemsT>;

Expand Down

0 comments on commit a74fc2d

Please sign in to comment.