Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[hip][cuda] Added finer grained tracing options to hip. #18180

Merged
merged 5 commits into from
Aug 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 6 additions & 7 deletions runtime/src/iree/hal/drivers/cuda/api.h
Original file line number Diff line number Diff line change
Expand Up @@ -77,15 +77,14 @@ typedef struct iree_hal_cuda_device_params_t {
// Specifies how command buffers are recorded and executed.
iree_hal_cuda_command_buffer_mode_t command_buffer_mode;

// Enables tracing of command buffers when IREE tracing is enabled.
// May take advantage of additional extensions for more accurate timing or
// hardware-specific performance counters.
// Controls the verbosity of command buffer tracing when IREE tracing is
// enabled.
//
// NOTE: tracing has a non-trivial overhead and will skew the timing of
// submissions and introduce false barriers between dispatches. Use this to
// identify slow dispatches and refine from there; be wary of whole-program
// tracing with this enabled.
bool stream_tracing;
// submissions and may introduce false barriers between dispatches.
// Use this to identify slow dispatches and command buffers and refine
// from there; be wary of whole-program tracing with this enabled.
int32_t stream_tracing;

// Whether to use async allocations even if reported as available by the
// device. Defaults to true when the device supports it.
Expand Down
13 changes: 11 additions & 2 deletions runtime/src/iree/hal/drivers/cuda/cuda_device.c
Original file line number Diff line number Diff line change
Expand Up @@ -277,7 +277,7 @@ IREE_API_EXPORT void iree_hal_cuda_device_params_initialize(
out_params->event_pool_capacity = 32;
out_params->queue_count = 1;
out_params->command_buffer_mode = IREE_HAL_CUDA_COMMAND_BUFFER_MODE_GRAPH;
out_params->stream_tracing = false;
out_params->stream_tracing = 0;
out_params->async_allocations = true;
}

Expand Down Expand Up @@ -346,9 +346,18 @@ static iree_status_t iree_hal_cuda_device_create_internal(

// Enable tracing for the (currently only) stream - no-op if disabled.
if (iree_status_is_ok(status) && device->params.stream_tracing) {
if (device->params.stream_tracing >= IREE_HAL_CUDA_TRACING_VERBOSITY_MAX ||
device->params.stream_tracing < IREE_HAL_CUDA_TRACING_VERBOSITY_OFF) {
return iree_make_status(
IREE_STATUS_INVALID_ARGUMENT,
"invalid stream_tracing argument: expected to be between %d and %d",
IREE_HAL_CUDA_TRACING_VERBOSITY_OFF,
IREE_HAL_CUDA_TRACING_VERBOSITY_MAX);
}
status = iree_hal_cuda_tracing_context_allocate(
device->cuda_symbols, device->identifier, dispatch_stream,
&device->block_pool, host_allocator, &device->tracing_context);
device->params.stream_tracing, &device->block_pool, host_allocator,
&device->tracing_context);
}

// Memory pool support is conditional.
Expand Down
102 changes: 59 additions & 43 deletions runtime/src/iree/hal/drivers/cuda/graph_command_buffer.c
Original file line number Diff line number Diff line change
Expand Up @@ -82,9 +82,10 @@ iree_hal_cuda_graph_command_buffer_cast(iree_hal_command_buffer_t* base_value) {
#if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION_DEVICE

static void iree_cuda_graph_command_buffer_trace_zone_begin_external(
iree_hal_cuda_graph_command_buffer_t* command_buffer, const char* file_name,
size_t file_name_length, uint32_t line, const char* function_name,
size_t function_name_length, const char* name, size_t name_length) {
iree_hal_cuda_graph_command_buffer_t* command_buffer, int32_t verbosity,
const char* file_name, size_t file_name_length, uint32_t line,
const char* function_name, size_t function_name_length, const char* name,
size_t name_length) {
// Make sure there are no new nodes after the last barrier.
// Work should start after the event.
if (IREE_UNLIKELY(command_buffer->graph_node_count != 0)) {
Expand All @@ -97,7 +98,7 @@ static void iree_cuda_graph_command_buffer_trace_zone_begin_external(
size_t dependency_count = command_buffer->cu_barrier_node ? 1 : 0;
IREE_CUDA_GRAPH_TRACE_ZONE_BEGIN_EXTERNAL(
command_buffer->tracing_context, &command_buffer->tracing_event_list,
tracing_event_node, command_buffer->cu_graph,
tracing_event_node, command_buffer->cu_graph, verbosity,
&command_buffer->cu_barrier_node, dependency_count, file_name,
file_name_length, line, function_name, function_name_length, name,
name_length);
Expand All @@ -109,7 +110,7 @@ static void iree_cuda_graph_command_buffer_trace_zone_begin_external(
}

static void iree_cuda_graph_command_buffer_trace_zone_end(
iree_hal_cuda_graph_command_buffer_t* command_buffer) {
iree_hal_cuda_graph_command_buffer_t* command_buffer, int32_t verbosity) {
// Make sure there are no new nodes after the last barrier.
// Prior work should end before the tracing event is recorded.
if (IREE_UNLIKELY(command_buffer->graph_node_count != 0)) {
Expand All @@ -124,35 +125,37 @@ static void iree_cuda_graph_command_buffer_trace_zone_end(
"ending a zone should at least depend on the beginning");
IREE_CUDA_GRAPH_TRACE_ZONE_END(
command_buffer->tracing_context, &command_buffer->tracing_event_list,
tracing_event_node, command_buffer->cu_graph,
tracing_event_node, command_buffer->cu_graph, verbosity,
&command_buffer->cu_barrier_node, dependency_count);

// We need to wait on the tracing end before other work starts.
// GPU tracing zones are first-in, last-out.
command_buffer->cu_barrier_node = *tracing_event_node;
}

#define IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN_EXTERNAL( \
command_buffer, file_name, file_name_length, line, function_name, \
function_name_length, name, name_length) \
iree_cuda_graph_command_buffer_trace_zone_begin_external( \
command_buffer, file_name, file_name_length, line, function_name, \
function_name_length, name, name_length)
#define IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN(command_buffer) \
#define IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN_EXTERNAL( \
command_buffer, verbosity, file_name, file_name_length, line, \
function_name, function_name_length, name, name_length) \
iree_cuda_graph_command_buffer_trace_zone_begin_external( \
command_buffer, verbosity, file_name, file_name_length, line, \
function_name, function_name_length, name, name_length)
#define IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN(command_buffer, \
verbosity) \
IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN_EXTERNAL( \
command_buffer, /*file_name=*/NULL, 0, /*line=*/0, __FUNCTION__, \
strlen(__FUNCTION__), /*name=*/NULL, 0)
#define IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(command_buffer) \
iree_cuda_graph_command_buffer_trace_zone_end(command_buffer)
command_buffer, verbosity, /*file_name=*/NULL, 0, /*line=*/0, \
__FUNCTION__, strlen(__FUNCTION__), /*name=*/NULL, 0)
#define IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(command_buffer, \
verbosity) \
iree_cuda_graph_command_buffer_trace_zone_end(command_buffer, verbosity)

#else // IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION_DEVICE

#define IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN_EXTERNAL( \
command_buffer, file_name, file_name_length, line, function_name, \
function_name_length, name, name_length)
#define IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN(command_buffer)
#define IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(command_buffer)

#define IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN_EXTERNAL( \
command_buffer, verbosity, file_name, file_name_length, line, \
function_name, function_name_length, name, name_length)
#define IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN(command_buffer, \
verbosity)
#define IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(command_buffer, verbosity)
#endif // IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION_DEVICE

iree_status_t iree_hal_cuda_graph_command_buffer_create(
Expand Down Expand Up @@ -335,7 +338,8 @@ static iree_status_t iree_hal_cuda_graph_command_buffer_begin(
command_buffer->symbols,
cuGraphCreate(&command_buffer->cu_graph, /*flags=*/0), "cuGraphCreate");

IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN(command_buffer);
IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN(
command_buffer, IREE_HAL_CUDA_TRACING_VERBOSITY_COARSE);

return iree_ok_status();
}
Expand All @@ -349,7 +353,8 @@ static iree_status_t iree_hal_cuda_graph_command_buffer_end(
IREE_RETURN_IF_ERROR(
iree_hal_cuda_graph_command_buffer_flush_collectives(command_buffer));

IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(command_buffer);
IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(
command_buffer, IREE_HAL_CUDA_TRACING_VERBOSITY_COARSE);

// Reset state used during recording.
command_buffer->cu_barrier_node = NULL;
Expand Down Expand Up @@ -384,8 +389,9 @@ static void iree_hal_cuda_graph_command_buffer_begin_debug_group(

(void)command_buffer;
IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN_EXTERNAL(
command_buffer, location ? location->file.data : NULL,
location ? location->file.size : 0, location ? location->line : 0,
command_buffer, IREE_HAL_CUDA_TRACING_VERBOSITY_COARSE,
location ? location->file.data : NULL, location ? location->file.size : 0,
location ? location->line : 0,
/*func_name=*/NULL, 0, label.data, label.size);
}

Expand All @@ -394,7 +400,8 @@ static void iree_hal_cuda_graph_command_buffer_end_debug_group(
iree_hal_cuda_graph_command_buffer_t* command_buffer =
iree_hal_cuda_graph_command_buffer_cast(base_command_buffer);
(void)command_buffer;
IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(command_buffer);
IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(
command_buffer, IREE_HAL_CUDA_TRACING_VERBOSITY_COARSE);
}

static iree_status_t
Expand Down Expand Up @@ -507,7 +514,8 @@ static iree_status_t iree_hal_cuda_graph_command_buffer_fill_buffer(
iree_hal_cuda_graph_command_buffer_t* command_buffer =
iree_hal_cuda_graph_command_buffer_cast(base_command_buffer);
IREE_TRACE_ZONE_BEGIN(z0);
IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN(command_buffer);
IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN(
command_buffer, IREE_HAL_CUDA_TRACING_VERBOSITY_FINE);

IREE_RETURN_AND_END_ZONE_IF_ERROR(
z0, iree_hal_cuda_graph_command_buffer_flush_collectives(command_buffer));
Expand Down Expand Up @@ -546,7 +554,8 @@ static iree_status_t iree_hal_cuda_graph_command_buffer_fill_buffer(
dependency_count, &params, command_buffer->cu_context),
"cuGraphAddMemsetNode");

IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(command_buffer);
IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(
command_buffer, IREE_HAL_CUDA_TRACING_VERBOSITY_FINE);
IREE_TRACE_ZONE_END(z0);
return iree_ok_status();
}
Expand All @@ -557,7 +566,8 @@ static iree_status_t iree_hal_cuda_graph_command_buffer_update_buffer(
iree_hal_cuda_graph_command_buffer_t* command_buffer =
iree_hal_cuda_graph_command_buffer_cast(base_command_buffer);
IREE_TRACE_ZONE_BEGIN(z0);
IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN(command_buffer);
IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN(
command_buffer, IREE_HAL_CUDA_TRACING_VERBOSITY_FINE);

IREE_RETURN_AND_END_ZONE_IF_ERROR(
z0, iree_hal_cuda_graph_command_buffer_flush_collectives(command_buffer));
Expand Down Expand Up @@ -608,7 +618,8 @@ static iree_status_t iree_hal_cuda_graph_command_buffer_update_buffer(
dependency_count, &params, command_buffer->cu_context),
"cuGraphAddMemcpyNode");

IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(command_buffer);
IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(
command_buffer, IREE_HAL_CUDA_TRACING_VERBOSITY_FINE);
IREE_TRACE_ZONE_END(z0);
return iree_ok_status();
}
Expand All @@ -619,7 +630,8 @@ static iree_status_t iree_hal_cuda_graph_command_buffer_copy_buffer(
iree_hal_cuda_graph_command_buffer_t* command_buffer =
iree_hal_cuda_graph_command_buffer_cast(base_command_buffer);
IREE_TRACE_ZONE_BEGIN(z0);
IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN(command_buffer);
IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN(
command_buffer, IREE_HAL_CUDA_TRACING_VERBOSITY_FINE);

IREE_RETURN_AND_END_ZONE_IF_ERROR(
z0, iree_hal_cuda_graph_command_buffer_flush_collectives(command_buffer));
Expand Down Expand Up @@ -666,7 +678,8 @@ static iree_status_t iree_hal_cuda_graph_command_buffer_copy_buffer(
dependency_count, &params, command_buffer->cu_context),
"cuGraphAddMemcpyNode");

IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(command_buffer);
IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(
command_buffer, IREE_HAL_CUDA_TRACING_VERBOSITY_FINE);
IREE_TRACE_ZONE_END(z0);
return iree_ok_status();
}
Expand Down Expand Up @@ -763,9 +776,10 @@ static iree_status_t iree_hal_cuda_graph_command_buffer_dispatch(
executable, entry_point, &kernel_info));

IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN_EXTERNAL(
command_buffer, kernel_info.source_filename.data,
kernel_info.source_filename.size, kernel_info.source_line,
kernel_info.function_name.data, kernel_info.function_name.size,
command_buffer, IREE_HAL_CUDA_TRACING_VERBOSITY_FINE,
kernel_info.source_filename.data, kernel_info.source_filename.size,
kernel_info.source_line, kernel_info.function_name.data,
kernel_info.function_name.size,
/*name=*/NULL, 0);

IREE_RETURN_AND_END_ZONE_IF_ERROR(
Expand Down Expand Up @@ -865,7 +879,8 @@ static iree_status_t iree_hal_cuda_graph_command_buffer_dispatch(
dependency_count, &params),
"cuGraphAddKernelNode");

IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(command_buffer);
IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(
command_buffer, IREE_HAL_CUDA_TRACING_VERBOSITY_FINE);
IREE_TRACE_ZONE_END(z0);
return iree_ok_status();
}
Expand Down Expand Up @@ -898,10 +913,10 @@ static iree_status_t iree_hal_cuda_graph_command_buffer_dispatch2(
executable, entry_point, &kernel_info));

IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN_EXTERNAL(
command_buffer, kernel_info.source_filename.data,
kernel_info.source_filename.size, kernel_info.source_line,
kernel_info.function_name.data, kernel_info.function_name.size,
/*name=*/NULL, 0);
command_buffer, IREE_HAL_CUDA_TRACING_VERBOSITY_FINE,
kernel_info.source_filename.data, kernel_info.source_filename.size,
kernel_info.source_line, kernel_info.function_name.data,
kernel_info.function_name.size, /*name=*/NULL, 0);

IREE_RETURN_AND_END_ZONE_IF_ERROR(
z0, iree_hal_resource_set_insert(command_buffer->resource_set, 1,
Expand Down Expand Up @@ -990,7 +1005,8 @@ static iree_status_t iree_hal_cuda_graph_command_buffer_dispatch2(
dependency_count, &params),
"cuGraphAddKernelNode");

IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(command_buffer);
IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(
command_buffer, IREE_HAL_CUDA_TRACING_VERBOSITY_FINE);
IREE_TRACE_ZONE_END(z0);
return iree_ok_status();
}
Expand Down
7 changes: 4 additions & 3 deletions runtime/src/iree/hal/drivers/cuda/nccl_channel.c
Original file line number Diff line number Diff line change
Expand Up @@ -559,7 +559,8 @@ iree_status_t iree_hal_cuda_nccl_submit_batch(
iree_string_view_t collective_str =
iree_hal_collective_op_format(&entry->op, &string_temp);
IREE_CUDA_STREAM_TRACE_ZONE_BEGIN_EXTERNAL(
tracing_context, tracing_event_list, stream, __FILE__, strlen(__FILE__),
tracing_context, tracing_event_list, stream,
IREE_HAL_CUDA_TRACING_VERBOSITY_FINE, __FILE__, strlen(__FILE__),
(uint32_t)__LINE__, __FUNCTION__, strlen(__FUNCTION__),
collective_str.data, collective_str.size);
}
Expand All @@ -578,8 +579,8 @@ iree_status_t iree_hal_cuda_nccl_submit_batch(
// End all zones we began above - note that these are just simply nested so
// order doesn't matter so long as we end the right number of zones.
for (iree_host_size_t i = 0; i < batch->count; ++i) {
IREE_CUDA_STREAM_TRACE_ZONE_END(tracing_context, tracing_event_list,
stream);
IREE_CUDA_STREAM_TRACE_ZONE_END(tracing_context, tracing_event_list, stream,
IREE_HAL_CUDA_TRACING_VERBOSITY_FINE);
}
#endif // IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION_DEVICE

Expand Down
12 changes: 8 additions & 4 deletions runtime/src/iree/hal/drivers/cuda/registration/driver_module.c
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,14 @@ IREE_FLAG(
"Enables CUDA asynchronous stream-ordered allocations when supported.");

IREE_FLAG(
bool, cuda_tracing, true,
"Enables tracing of stream events when Tracy instrumentation is enabled.\n"
"Severely impacts benchmark timings and should only be used when\n"
"analyzing dispatch timings.");
int32_t, cuda_tracing, 2,
"Controls the verbosity of tracing when Tracy instrumentation is enabled.\n"
"The impact to benchmark timing becomes more severe as the verbosity\n"
"increases, and thus should be only enabled when needed.\n"
"Permissible values are:\n"
" 0 : stream tracing disabled.\n"
" 1 : coarse command buffer level tracing enabled.\n"
" 2 : fine-grained kernel level tracing enabled.\n");

IREE_FLAG(int32_t, cuda_default_index, 0,
"Specifies the index of the default CUDA device to use");
Expand Down
Loading
Loading