From b7a284210ea16480b01ac040be08bdf0c0268c6f Mon Sep 17 00:00:00 2001 From: Andrew Woloszyn Date: Fri, 9 Aug 2024 10:25:48 -0500 Subject: [PATCH 1/5] [hip] Added finer grained tracing options to hip. This allows us to specify the verbosity at which we want device-side tracing. Command-buffer level tracing has a significantly lower overhead if we do not need individual kernel timing. Signed-off-by: Andrew Woloszyn --- runtime/src/iree/hal/drivers/hip/api.h | 11 +-- .../hal/drivers/hip/graph_command_buffer.c | 72 ++++++++++--------- runtime/src/iree/hal/drivers/hip/hip_device.c | 5 +- .../src/iree/hal/drivers/hip/rccl_channel.c | 8 +-- .../drivers/hip/registration/driver_module.c | 20 ++++-- .../hal/drivers/hip/stream_command_buffer.c | 12 ++-- runtime/src/iree/hal/drivers/hip/tracing.c | 26 ++++--- runtime/src/iree/hal/drivers/hip/tracing.h | 66 +++++++++-------- 8 files changed, 126 insertions(+), 94 deletions(-) diff --git a/runtime/src/iree/hal/drivers/hip/api.h b/runtime/src/iree/hal/drivers/hip/api.h index a5055357d1e7..2218b19f2aa3 100644 --- a/runtime/src/iree/hal/drivers/hip/api.h +++ b/runtime/src/iree/hal/drivers/hip/api.h @@ -76,15 +76,16 @@ typedef struct iree_hal_hip_device_params_t { // Specifies how command buffers are recorded and executed. iree_hal_hip_command_buffer_mode_t command_buffer_mode; - // Enables tracing of command buffers when IREE tracing is enabled. + // Controls the verbosity of command buffer tracing when IREE + // tracing is enabled. // May take advantage of additional extensions for more accurate timing or // hardware-specific performance counters. // // NOTE: tracing has a non-trivial overhead and will skew the timing of - // submissions and introduce false barriers between dispatches. Use this to - // identify slow dispatches and refine from there; be wary of whole-program - // tracing with this enabled. - bool stream_tracing; + // submissions and may introduce false barriers between dispatches. 
+ // Use this to identify slow dispatches and command buffers and refine + // from there; be wary of whole-program tracing with this enabled. + int32_t stream_tracing; // Whether to use async allocations even if reported as available by the // device. Defaults to true when the device supports it. diff --git a/runtime/src/iree/hal/drivers/hip/graph_command_buffer.c b/runtime/src/iree/hal/drivers/hip/graph_command_buffer.c index 99b3538caf77..c3de559a7385 100644 --- a/runtime/src/iree/hal/drivers/hip/graph_command_buffer.c +++ b/runtime/src/iree/hal/drivers/hip/graph_command_buffer.c @@ -83,9 +83,10 @@ iree_hal_hip_graph_command_buffer_cast(iree_hal_command_buffer_t* base_value) { #if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION_DEVICE static void iree_hip_graph_command_buffer_trace_zone_begin_external( - iree_hal_hip_graph_command_buffer_t* command_buffer, const char* file_name, - size_t file_name_length, uint32_t line, const char* function_name, - size_t function_name_length, const char* name, size_t name_length) { + iree_hal_hip_graph_command_buffer_t* command_buffer, int32_t verbosity, + const char* file_name, size_t file_name_length, uint32_t line, + const char* function_name, size_t function_name_length, const char* name, + size_t name_length) { // Make sure there are no new nodes after the last barrier. // Work should start after the event. if (IREE_UNLIKELY(command_buffer->graph_node_count != 0)) { @@ -98,7 +99,7 @@ static void iree_hip_graph_command_buffer_trace_zone_begin_external( size_t dependency_count = command_buffer->hip_barrier_node ? 
1 : 0; IREE_HIP_GRAPH_TRACE_ZONE_BEGIN_EXTERNAL( command_buffer->tracing_context, &command_buffer->tracing_event_list, - tracing_event_node, command_buffer->hip_graph, + tracing_event_node, command_buffer->hip_graph, verbosity, &command_buffer->hip_barrier_node, dependency_count, file_name, file_name_length, line, function_name, function_name_length, name, name_length); @@ -110,7 +111,7 @@ static void iree_hip_graph_command_buffer_trace_zone_begin_external( } static void iree_hip_graph_command_buffer_trace_zone_end( - iree_hal_hip_graph_command_buffer_t* command_buffer) { + iree_hal_hip_graph_command_buffer_t* command_buffer, int32_t verbosity) { // Make sure there are no new nodes after the last barrier. // Prior work should end before the tracing event is recorded. if (IREE_UNLIKELY(command_buffer->graph_node_count != 0)) { @@ -125,7 +126,7 @@ static void iree_hip_graph_command_buffer_trace_zone_end( "ending a zone should at least depend on the beginning"); IREE_HIP_GRAPH_TRACE_ZONE_END( command_buffer->tracing_context, &command_buffer->tracing_event_list, - tracing_event_node, command_buffer->hip_graph, + tracing_event_node, command_buffer->hip_graph, verbosity, &command_buffer->hip_barrier_node, dependency_count); // We need to wait on the tracing end before other work starts. 
@@ -133,26 +134,29 @@ static void iree_hip_graph_command_buffer_trace_zone_end( command_buffer->hip_barrier_node = *tracing_event_node; } -#define IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN_EXTERNAL( \ - command_buffer, file_name, file_name_length, line, function_name, \ - function_name_length, name, name_length) \ - iree_hip_graph_command_buffer_trace_zone_begin_external( \ - command_buffer, file_name, file_name_length, line, function_name, \ - function_name_length, name, name_length) -#define IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN(command_buffer) \ +#define IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN_EXTERNAL( \ + command_buffer, verbosity, file_name, file_name_length, line, \ + function_name, function_name_length, name, name_length) \ + iree_hip_graph_command_buffer_trace_zone_begin_external( \ + command_buffer, verbosity, file_name, file_name_length, line, \ + function_name, function_name_length, name, name_length) +#define IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN(command_buffer, \ + verbosity) \ IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN_EXTERNAL( \ - command_buffer, /*file_name=*/NULL, 0, /*line=*/0, __FUNCTION__, \ - strlen(__FUNCTION__), /*name=*/NULL, 0) -#define IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(command_buffer) \ - iree_hip_graph_command_buffer_trace_zone_end(command_buffer) + command_buffer, verbosity, /*file_name=*/NULL, 0, /*line=*/0, \ + __FUNCTION__, strlen(__FUNCTION__), /*name=*/NULL, 0) +#define IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(command_buffer, \ + verbosity) \ + iree_hip_graph_command_buffer_trace_zone_end(command_buffer, verbosity) #else // IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION_DEVICE -#define IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN_EXTERNAL( \ - command_buffer, file_name, file_name_length, line, function_name, \ - function_name_length, name, name_length) -#define IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN(command_buffer) -#define 
IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(command_buffer) +#define IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN_EXTERNAL( \ + command_buffer, verbosity, file_name, file_name_length, line, \ + function_name, function_name_length, name, name_length) +#define IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN(command_buffer, \ + verbosity) +#define IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(command_buffer, verbosity) #endif // IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION_DEVICE @@ -340,7 +344,7 @@ static iree_status_t iree_hal_hip_graph_command_buffer_begin( "hipGraphCreate"); IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN_EXTERNAL( - command_buffer, + command_buffer, 1, /*file_name=*/NULL, 0, /*line=*/0, "iree_hal_hip_graph_command_buffer", strlen("iree_hal_hip_graph_command_buffer"), /*name=*/NULL, 0); @@ -357,7 +361,7 @@ static iree_status_t iree_hal_hip_graph_command_buffer_end( IREE_RETURN_IF_ERROR( iree_hal_hip_graph_command_buffer_flush_collectives(command_buffer)); - IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(command_buffer); + IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(command_buffer, 1); // Reset state used during recording. command_buffer->hip_barrier_node = NULL; @@ -392,7 +396,7 @@ static void iree_hal_hip_graph_command_buffer_begin_debug_group( (void)command_buffer; IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN_EXTERNAL( - command_buffer, location ? location->file.data : NULL, + command_buffer, 1, location ? location->file.data : NULL, location ? location->file.size : 0, location ? 
location->line : 0, /*func_name=*/NULL, 0, label.data, label.size); } @@ -402,7 +406,7 @@ static void iree_hal_hip_graph_command_buffer_end_debug_group( iree_hal_hip_graph_command_buffer_t* command_buffer = iree_hal_hip_graph_command_buffer_cast(base_command_buffer); (void)command_buffer; - IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(command_buffer); + IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(command_buffer, 1); } static iree_status_t @@ -515,7 +519,7 @@ static iree_status_t iree_hal_hip_graph_command_buffer_fill_buffer( iree_hal_hip_graph_command_buffer_t* command_buffer = iree_hal_hip_graph_command_buffer_cast(base_command_buffer); IREE_TRACE_ZONE_BEGIN(z0); - IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN(command_buffer); + IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN(command_buffer, 2); IREE_RETURN_AND_END_ZONE_IF_ERROR( z0, iree_hal_hip_graph_command_buffer_flush_collectives(command_buffer)); @@ -553,7 +557,7 @@ static iree_status_t iree_hal_hip_graph_command_buffer_fill_buffer( dependency_count, &params), "hipGraphAddMemsetNode"); - IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(command_buffer); + IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(command_buffer, 2); IREE_TRACE_ZONE_END(z0); return iree_ok_status(); } @@ -569,7 +573,7 @@ static iree_status_t iree_hal_hip_graph_command_buffer_update_buffer( "cannot use graph-based command buffer"); } IREE_TRACE_ZONE_BEGIN(z0); - IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN(command_buffer); + IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN(command_buffer, 2); IREE_RETURN_AND_END_ZONE_IF_ERROR( z0, iree_hal_hip_graph_command_buffer_flush_collectives(command_buffer)); @@ -621,7 +625,7 @@ static iree_status_t iree_hal_hip_graph_command_buffer_update_buffer( dependency_count, &params, command_buffer->hip_context), "hipDrvGraphAddMemcpyNode"); - IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(command_buffer); + IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(command_buffer, 2); IREE_TRACE_ZONE_END(z0); return iree_ok_status(); } 
@@ -637,7 +641,7 @@ static iree_status_t iree_hal_hip_graph_command_buffer_copy_buffer( "cannot use graph-based command buffer"); } IREE_TRACE_ZONE_BEGIN(z0); - IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN(command_buffer); + IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN(command_buffer, 2); IREE_RETURN_AND_END_ZONE_IF_ERROR( z0, iree_hal_hip_graph_command_buffer_flush_collectives(command_buffer)); @@ -683,7 +687,7 @@ static iree_status_t iree_hal_hip_graph_command_buffer_copy_buffer( dependency_count, &params, command_buffer->hip_context), "hipDrvGraphAddMemcpyNode"); - IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(command_buffer); + IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(command_buffer, 2); IREE_TRACE_ZONE_END(z0); return iree_ok_status(); } @@ -787,7 +791,7 @@ static iree_status_t iree_hal_hip_graph_command_buffer_dispatch( executable, entry_point, &kernel_info)); IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN_EXTERNAL( - command_buffer, kernel_info.source_filename.data, + command_buffer, 2, kernel_info.source_filename.data, kernel_info.source_filename.size, kernel_info.source_line, kernel_info.function_name.data, kernel_info.function_name.size, /*name=*/NULL, 0); @@ -874,7 +878,7 @@ static iree_status_t iree_hal_hip_graph_command_buffer_dispatch( dependency_count, &params), "hipGraphAddKernelNode"); - IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(command_buffer); + IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(command_buffer, 2); IREE_TRACE_ZONE_END(z0); return iree_ok_status(); } diff --git a/runtime/src/iree/hal/drivers/hip/hip_device.c b/runtime/src/iree/hal/drivers/hip/hip_device.c index 133d3f5de4c2..385727469405 100644 --- a/runtime/src/iree/hal/drivers/hip/hip_device.c +++ b/runtime/src/iree/hal/drivers/hip/hip_device.c @@ -275,7 +275,7 @@ IREE_API_EXPORT void iree_hal_hip_device_params_initialize( out_params->event_pool_capacity = 32; out_params->queue_count = 1; out_params->command_buffer_mode = IREE_HAL_HIP_COMMAND_BUFFER_MODE_STREAM; - 
out_params->stream_tracing = false; + out_params->stream_tracing = 0; out_params->async_allocations = true; out_params->allow_inline_execution = false; } @@ -346,7 +346,8 @@ static iree_status_t iree_hal_hip_device_create_internal( if (iree_status_is_ok(status) && device->params.stream_tracing) { status = iree_hal_hip_tracing_context_allocate( device->hip_symbols, device->identifier, dispatch_stream, - &device->block_pool, host_allocator, &device->tracing_context); + device->params.stream_tracing, &device->block_pool, host_allocator, + &device->tracing_context); } // Memory pool support is conditional. diff --git a/runtime/src/iree/hal/drivers/hip/rccl_channel.c b/runtime/src/iree/hal/drivers/hip/rccl_channel.c index e3c38a21bf7c..578af202917e 100644 --- a/runtime/src/iree/hal/drivers/hip/rccl_channel.c +++ b/runtime/src/iree/hal/drivers/hip/rccl_channel.c @@ -593,9 +593,9 @@ iree_status_t iree_hal_hip_nccl_submit_batch( iree_string_view_t collective_str = iree_hal_collective_op_format(&entry->op, &string_temp); IREE_HIP_STREAM_TRACE_ZONE_BEGIN_EXTERNAL( - tracing_context, tracing_event_list, stream, __FILE__, strlen(__FILE__), - (uint32_t)__LINE__, __FUNCTION__, strlen(__FUNCTION__), - collective_str.data, collective_str.size); + tracing_context, tracing_event_list, stream, 2, __FILE__, + strlen(__FILE__), (uint32_t)__LINE__, __FUNCTION__, + strlen(__FUNCTION__), collective_str.data, collective_str.size); } #endif // IREE_TRACING_FEATURE_INSTRUMENTATION_DEVICE @@ -613,7 +613,7 @@ iree_status_t iree_hal_hip_nccl_submit_batch( IREE_TRACE({ for (iree_host_size_t i = 0; i < batch->count; ++i) { IREE_HIP_STREAM_TRACE_ZONE_END(tracing_context, tracing_event_list, - stream); + stream, 2); } }); diff --git a/runtime/src/iree/hal/drivers/hip/registration/driver_module.c b/runtime/src/iree/hal/drivers/hip/registration/driver_module.c index cabb1dadb3db..27a5d57dd120 100644 --- a/runtime/src/iree/hal/drivers/hip/registration/driver_module.c +++ 
b/runtime/src/iree/hal/drivers/hip/registration/driver_module.c @@ -36,10 +36,14 @@ IREE_FLAG( "Enables HIP asynchronous stream-ordered allocations when supported."); IREE_FLAG( - bool, hip_tracing, true, - "Enables tracing of stream events when Tracy instrumentation is enabled.\n" - "Severely impacts benchmark timings and should only be used when\n" - "analyzing dispatch timings."); + int32_t, hip_tracing, 2, + "Controls the verbosity of tracing when Tracy instrumentation is enabled.\n" + "The impact to benchmark timing becomes more severe as the verbosity\n" + "increases, and thus should be only enabled when needed.\n" + "Permissible values are:\n" + " 0 : stream tracing disabled.\n" + " 1 : coarse command buffer level tracing enabled.\n" + " 2 : fine-grained kernel level tracing enabled.\n"); IREE_FLAG(int32_t, hip_default_index, 0, "Specifies the index of the default HIP device to use"); @@ -181,7 +185,13 @@ static iree_status_t iree_hal_hip_driver_populate_options( "Option 'hip_tracing' expected to be int. Got: '%.*s'", (int)value.size, value.data); } - device_params->stream_tracing = ivalue ? true : false; + if (!(ivalue >= 0 && ivalue <= 3)) { + return iree_make_status( + IREE_STATUS_FAILED_PRECONDITION, + "Option 'hip_tracing' expected to be 0, 1, 2. 
Got: '%.*s'", + (int)value.size, value.data); + } + device_params->stream_tracing = ivalue; } else if (iree_string_view_equal(key, key_hip_default_index)) { if (!iree_string_view_atoi_int32(value, &ivalue)) { return iree_make_status( diff --git a/runtime/src/iree/hal/drivers/hip/stream_command_buffer.c b/runtime/src/iree/hal/drivers/hip/stream_command_buffer.c index e4ffac2200a9..35920fe0a818 100644 --- a/runtime/src/iree/hal/drivers/hip/stream_command_buffer.c +++ b/runtime/src/iree/hal/drivers/hip/stream_command_buffer.c @@ -183,7 +183,7 @@ static iree_status_t iree_hal_hip_stream_command_buffer_begin( IREE_HIP_STREAM_TRACE_ZONE_BEGIN_EXTERNAL( command_buffer->tracing_context, &command_buffer->tracing_event_list, - command_buffer->hip_stream, + command_buffer->hip_stream, 1, /*file_name=*/NULL, 0, /*line=*/0, "iree_hal_hip_stream_command_buffer", strlen("iree_hal_hip_stream_command_buffer"), /*name=*/NULL, 0); @@ -214,7 +214,7 @@ static iree_status_t iree_hal_hip_stream_command_buffer_end( IREE_HIP_STREAM_TRACE_ZONE_END(command_buffer->tracing_context, &command_buffer->tracing_event_list, - command_buffer->hip_stream); + command_buffer->hip_stream, 1); IREE_TRACE_ZONE_END(z0); return iree_ok_status(); @@ -230,7 +230,7 @@ static void iree_hal_hip_stream_command_buffer_begin_debug_group( IREE_HIP_STREAM_TRACE_ZONE_BEGIN_EXTERNAL( command_buffer->tracing_context, &command_buffer->tracing_event_list, - command_buffer->hip_stream, location ? location->file.data : NULL, + command_buffer->hip_stream, 1, location ? location->file.data : NULL, location ? location->file.size : 0, location ? 
location->line : 0, /*func_name=*/NULL, 0, label.data, label.size); } @@ -243,7 +243,7 @@ static void iree_hal_hip_stream_command_buffer_end_debug_group( IREE_HIP_STREAM_TRACE_ZONE_END(command_buffer->tracing_context, &command_buffer->tracing_event_list, - command_buffer->hip_stream); + command_buffer->hip_stream, 1); } static iree_status_t iree_hal_hip_stream_command_buffer_execution_barrier( @@ -542,7 +542,7 @@ static iree_status_t iree_hal_hip_stream_command_buffer_dispatch( IREE_HIP_STREAM_TRACE_ZONE_BEGIN_EXTERNAL( command_buffer->tracing_context, &command_buffer->tracing_event_list, - command_buffer->hip_stream, kernel_info.source_filename.data, + command_buffer->hip_stream, 2, kernel_info.source_filename.data, kernel_info.source_filename.size, kernel_info.source_line, kernel_info.function_name.data, kernel_info.function_name.size, /*name=*/NULL, 0); @@ -617,7 +617,7 @@ static iree_status_t iree_hal_hip_stream_command_buffer_dispatch( IREE_HIP_STREAM_TRACE_ZONE_END(command_buffer->tracing_context, &command_buffer->tracing_event_list, - command_buffer->hip_stream); + command_buffer->hip_stream, 2); IREE_TRACE_ZONE_END(z0); return status; diff --git a/runtime/src/iree/hal/drivers/hip/tracing.c b/runtime/src/iree/hal/drivers/hip/tracing.c index f1a7007d7a4c..a18f240fb21f 100644 --- a/runtime/src/iree/hal/drivers/hip/tracing.c +++ b/runtime/src/iree/hal/drivers/hip/tracing.c @@ -67,6 +67,8 @@ struct iree_hal_hip_tracing_context_t { // Submitted events iree_hal_hip_tracing_context_event_list_t submitted_event_list; + int32_t verbosity; + uint32_t query_capacity; // Event pool reused to capture tracing timestamps. 
@@ -119,7 +121,8 @@ static iree_status_t iree_hal_hip_tracing_context_initial_calibration( iree_status_t iree_hal_hip_tracing_context_allocate( const iree_hal_hip_dynamic_symbols_t* symbols, iree_string_view_t queue_name, hipStream_t stream, - iree_arena_block_pool_t* block_pool, iree_allocator_t host_allocator, + int32_t stream_tracing_verbosity, iree_arena_block_pool_t* block_pool, + iree_allocator_t host_allocator, iree_hal_hip_tracing_context_t** out_context) { IREE_TRACE_ZONE_BEGIN(z0); IREE_ASSERT_ARGUMENT(symbols); @@ -139,6 +142,7 @@ iree_status_t iree_hal_hip_tracing_context_allocate( context->query_capacity = IREE_ARRAYSIZE(context->event_pool); context->submitted_event_list.head = NULL; context->submitted_event_list.tail = NULL; + context->verbosity = stream_tracing_verbosity; iree_slim_mutex_initialize(&context->event_mutex); } @@ -425,8 +429,9 @@ static uint16_t iree_hal_hip_graph_tracing_context_insert_query( void iree_hal_hip_stream_tracing_zone_begin_impl( iree_hal_hip_tracing_context_t* context, iree_hal_hip_tracing_context_event_list_t* event_list, hipStream_t stream, - const iree_tracing_location_t* src_loc) { + int32_t verbosity, const iree_tracing_location_t* src_loc) { if (!context) return; + if (verbosity > context->verbosity) return; uint16_t query_id = iree_hal_hip_stream_tracing_context_insert_query( context, event_list, stream); iree_tracing_gpu_zone_begin(context->id, query_id, src_loc); @@ -435,10 +440,11 @@ void iree_hal_hip_stream_tracing_zone_begin_impl( void iree_hal_hip_stream_tracing_zone_begin_external_impl( iree_hal_hip_tracing_context_t* context, iree_hal_hip_tracing_context_event_list_t* event_list, hipStream_t stream, - const char* file_name, size_t file_name_length, uint32_t line, - const char* function_name, size_t function_name_length, const char* name, - size_t name_length) { + int32_t verbosity, const char* file_name, size_t file_name_length, + uint32_t line, const char* function_name, size_t function_name_length, + 
const char* name, size_t name_length) { if (!context) return; + if (verbosity > context->verbosity) return; uint16_t query_id = iree_hal_hip_stream_tracing_context_insert_query( context, event_list, stream); iree_tracing_gpu_zone_begin_external(context->id, query_id, file_name, @@ -449,12 +455,13 @@ void iree_hal_hip_stream_tracing_zone_begin_external_impl( void iree_hal_hip_graph_tracing_zone_begin_external_impl( iree_hal_hip_tracing_context_t* context, iree_hal_hip_tracing_context_event_list_t* event_list, - hipGraphNode_t* out_node, hipGraph_t graph, + hipGraphNode_t* out_node, hipGraph_t graph, int32_t verbosity, hipGraphNode_t* dependency_nodes, size_t dependency_nodes_count, const char* file_name, size_t file_name_length, uint32_t line, const char* function_name, size_t function_name_length, const char* name, size_t name_length) { if (!context) return; + if (verbosity > context->verbosity) return; uint16_t query_id = iree_hal_hip_graph_tracing_context_insert_query( context, event_list, out_node, graph, dependency_nodes, dependency_nodes_count); @@ -465,8 +472,10 @@ void iree_hal_hip_graph_tracing_zone_begin_external_impl( void iree_hal_hip_stream_tracing_zone_end_impl( iree_hal_hip_tracing_context_t* context, - iree_hal_hip_tracing_context_event_list_t* event_list, hipStream_t stream) { + iree_hal_hip_tracing_context_event_list_t* event_list, hipStream_t stream, + int32_t verbosity) { if (!context) return; + if (verbosity > context->verbosity) return; uint16_t query_id = iree_hal_hip_stream_tracing_context_insert_query( context, event_list, stream); iree_tracing_gpu_zone_end(context->id, query_id); @@ -475,9 +484,10 @@ void iree_hal_hip_stream_tracing_zone_end_impl( void iree_hal_hip_graph_tracing_zone_end_impl( iree_hal_hip_tracing_context_t* context, iree_hal_hip_tracing_context_event_list_t* event_list, - hipGraphNode_t* out_node, hipGraph_t graph, + hipGraphNode_t* out_node, hipGraph_t graph, int32_t verbosity, hipGraphNode_t* dependency_nodes, size_t 
dependency_nodes_count) { if (!context) return; + if (verbosity > context->verbosity) return; uint16_t query_id = iree_hal_hip_graph_tracing_context_insert_query( context, event_list, out_node, graph, dependency_nodes, dependency_nodes_count); diff --git a/runtime/src/iree/hal/drivers/hip/tracing.h b/runtime/src/iree/hal/drivers/hip/tracing.h index 24e12b8f3a83..7d294254fc12 100644 --- a/runtime/src/iree/hal/drivers/hip/tracing.h +++ b/runtime/src/iree/hal/drivers/hip/tracing.h @@ -57,7 +57,8 @@ typedef struct iree_hal_hip_tracing_context_event_list_t { iree_status_t iree_hal_hip_tracing_context_allocate( const iree_hal_hip_dynamic_symbols_t* symbols, iree_string_view_t queue_name, hipStream_t stream, - iree_arena_block_pool_t* block_pool, iree_allocator_t host_allocator, + int32_t stream_tracing_verbosity, iree_arena_block_pool_t* block_pool, + iree_allocator_t host_allocator, iree_hal_hip_tracing_context_t** out_context); // Frees a tracing context and all associated HIP resources. @@ -87,21 +88,21 @@ void iree_hal_hip_tracing_free( void iree_hal_hip_stream_tracing_zone_begin_impl( iree_hal_hip_tracing_context_t* context, iree_hal_hip_tracing_context_event_list_t* event_list, hipStream_t stream, - const iree_tracing_location_t* src_loc); + int32_t verbosity, const iree_tracing_location_t* src_loc); // Begins an external zone using the given source information. // The provided strings will be copied into the tracy buffer. 
void iree_hal_hip_stream_tracing_zone_begin_external_impl( iree_hal_hip_tracing_context_t* context, iree_hal_hip_tracing_context_event_list_t* event_list, hipStream_t stream, - const char* file_name, size_t file_name_length, uint32_t line, - const char* function_name, size_t function_name_length, const char* name, - size_t name_length); + int32_t verbosity, const char* file_name, size_t file_name_length, + uint32_t line, const char* function_name, size_t function_name_length, + const char* name, size_t name_length); void iree_hal_hip_graph_tracing_zone_begin_external_impl( iree_hal_hip_tracing_context_t* context, iree_hal_hip_tracing_context_event_list_t* event_list, - hipGraphNode_t* out_node, hipGraph_t graph, + hipGraphNode_t* out_node, hipGraph_t graph, int32_t verbosity, hipGraphNode_t* dependency_nodes, size_t dependency_nodes_count, const char* file_name, size_t file_name_length, uint32_t line, const char* function_name, size_t function_name_length, const char* name, @@ -109,61 +110,66 @@ void iree_hal_hip_graph_tracing_zone_begin_external_impl( void iree_hal_hip_stream_tracing_zone_end_impl( iree_hal_hip_tracing_context_t* context, - iree_hal_hip_tracing_context_event_list_t* event_list, hipStream_t stream); + iree_hal_hip_tracing_context_event_list_t* event_list, hipStream_t stream, + int32_t verbosity); void iree_hal_hip_graph_tracing_zone_end_impl( iree_hal_hip_tracing_context_t* context, iree_hal_hip_tracing_context_event_list_t* event_list, - hipGraphNode_t* out_node, hipGraph_t graph, + hipGraphNode_t* out_node, hipGraph_t graph, int32_t verbosity, hipGraphNode_t* dependency_nodes, size_t dependency_nodes_count); // Begins a new zone with the parent function name. 
-#define IREE_HIP_STREAM_TRACE_ZONE_BEGIN(context, event_list, stream) \ +#define IREE_HIP_STREAM_TRACE_ZONE_BEGIN(context, event_list, stream, \ + verbosity) \ static const iree_tracing_location_t TracyConcat( \ __tracy_source_location, __LINE__) = {NULL, __FUNCTION__, __FILE__, \ (uint32_t)__LINE__, 0}; \ iree_hal_hip_stream_tracing_zone_begin_impl( \ - context, event_list, stream, \ + context, event_list, stream, verbosity, \ &TracyConcat(__tracy_source_location, __LINE__)); // Begins an externally defined zone with a dynamic source location. // The |file_name|, |function_name|, and optional |name| strings will be copied // into the trace buffer and do not need to persist. -#define IREE_HIP_STREAM_TRACE_ZONE_BEGIN_EXTERNAL( \ - context, event_list, stream, file_name, file_name_length, line, \ - function_name, function_name_length, name, name_length) \ - iree_hal_hip_stream_tracing_zone_begin_external_impl( \ - context, event_list, stream, file_name, file_name_length, line, \ - function_name, function_name_length, name, name_length) +#define IREE_HIP_STREAM_TRACE_ZONE_BEGIN_EXTERNAL( \ + context, event_list, stream, verbosity, file_name, file_name_length, line, \ + function_name, function_name_length, name, name_length) \ + iree_hal_hip_stream_tracing_zone_begin_external_impl( \ + context, event_list, stream, verbosity, file_name, file_name_length, \ + line, function_name, function_name_length, name, name_length) + #define IREE_HIP_GRAPH_TRACE_ZONE_BEGIN_EXTERNAL( \ - context, event_list, out_node, graph, dependency_nodes, \ + context, event_list, out_node, graph, verbosity, dependency_nodes, \ dependency_nodes_count, file_name, file_name_length, line, function_name, \ function_name_length, name, name_length) \ iree_hal_hip_graph_tracing_zone_begin_external_impl( \ - context, event_list, out_node, graph, dependency_nodes, \ + context, event_list, out_node, graph, verbosity, dependency_nodes, \ dependency_nodes_count, file_name, file_name_length, line, \ 
function_name, function_name_length, name, name_length) -#define IREE_HIP_STREAM_TRACE_ZONE_END(context, event_list, stream) \ - iree_hal_hip_stream_tracing_zone_end_impl(context, event_list, stream) -#define IREE_HIP_GRAPH_TRACE_ZONE_END(context, event_list, out_node, graph, \ - dependency_nodes, \ - dependency_nodes_count) \ - iree_hal_hip_graph_tracing_zone_end_impl(context, event_list, out_node, \ - graph, dependency_nodes, \ +#define IREE_HIP_STREAM_TRACE_ZONE_END(context, event_list, stream, verbosity) \ + iree_hal_hip_stream_tracing_zone_end_impl(context, event_list, stream, \ + verbosity) + +#define IREE_HIP_GRAPH_TRACE_ZONE_END(context, event_list, out_node, graph, \ + verbosity, dependency_nodes, \ + dependency_nodes_count) \ + iree_hal_hip_graph_tracing_zone_end_impl(context, event_list, out_node, \ + graph, verbosity, dependency_nodes, \ dependency_nodes_count) #else -#define IREE_HIP_STREAM_TRACE_ZONE_BEGIN(context, event_list, stream) -#define IREE_HIP_STREAM_TRACE_ZONE_BEGIN_EXTERNAL( \ - context, event_list, stream, file_name, file_name_length, line, \ +#define IREE_HIP_STREAM_TRACE_ZONE_BEGIN(context, event_list, stream, verbosity) +#define IREE_HIP_STREAM_TRACE_ZONE_BEGIN_EXTERNAL( \ + context, event_list, stream, verbosity, file_name, file_name_length, line, \ function_name, function_name_length, name, name_length) #define IREE_HIP_GRAPH_TRACE_ZONE_BEGIN_EXTERNAL( \ - context, event_list, out_node, graph, dependency_nodes, \ + context, event_list, out_node, graph, verbosity, dependency_nodes, \ dependency_nodes_count, file_name, file_name_length, line, function_name, \ function_name_length, name, name_length) #define IREE_HIP_STREAM_TRACE_ZONE_END(context, evnet_list, stream) #define IREE_HIP_GRAPH_TRACE_ZONE_END(context, event_list, out_node, graph, \ - dependency_nodes, \ + verbosity, dependency_nodes, \ dependency_nodes_count) #endif // IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION_DEVICE From 
8a6f6bbfa28d4b3ca40b76315f5dd41352643f92 Mon Sep 17 00:00:00 2001 From: Andrew Woloszyn Date: Fri, 9 Aug 2024 11:26:19 -0500 Subject: [PATCH 2/5] Replaced raw integer with enum for verbosity. Signed-off-by: Andrew Woloszyn --- .../hal/drivers/hip/graph_command_buffer.c | 52 ++++++++++++------- runtime/src/iree/hal/drivers/hip/hip_device.c | 8 +++ .../src/iree/hal/drivers/hip/rccl_channel.c | 10 ++-- .../drivers/hip/registration/driver_module.c | 6 --- .../hal/drivers/hip/stream_command_buffer.c | 32 ++++++------ runtime/src/iree/hal/drivers/hip/tracing.c | 21 ++++---- runtime/src/iree/hal/drivers/hip/tracing.h | 28 ++++++---- 7 files changed, 94 insertions(+), 63 deletions(-) diff --git a/runtime/src/iree/hal/drivers/hip/graph_command_buffer.c b/runtime/src/iree/hal/drivers/hip/graph_command_buffer.c index c3de559a7385..a946a89faba9 100644 --- a/runtime/src/iree/hal/drivers/hip/graph_command_buffer.c +++ b/runtime/src/iree/hal/drivers/hip/graph_command_buffer.c @@ -83,10 +83,10 @@ iree_hal_hip_graph_command_buffer_cast(iree_hal_command_buffer_t* base_value) { #if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION_DEVICE static void iree_hip_graph_command_buffer_trace_zone_begin_external( - iree_hal_hip_graph_command_buffer_t* command_buffer, int32_t verbosity, - const char* file_name, size_t file_name_length, uint32_t line, - const char* function_name, size_t function_name_length, const char* name, - size_t name_length) { + iree_hal_hip_graph_command_buffer_t* command_buffer, + iree_hal_hip_tracing_verbosity_t verbosity, const char* file_name, + size_t file_name_length, uint32_t line, const char* function_name, + size_t function_name_length, const char* name, size_t name_length) { // Make sure there are no new nodes after the last barrier. // Work should start after the event. 
if (IREE_UNLIKELY(command_buffer->graph_node_count != 0)) { @@ -111,7 +111,8 @@ static void iree_hip_graph_command_buffer_trace_zone_begin_external( } static void iree_hip_graph_command_buffer_trace_zone_end( - iree_hal_hip_graph_command_buffer_t* command_buffer, int32_t verbosity) { + iree_hal_hip_graph_command_buffer_t* command_buffer, + iree_hal_hip_tracing_verbosity_t verbosity) { // Make sure there are no new nodes after the last barrier. // Prior work should end before the tracing event is recorded. if (IREE_UNLIKELY(command_buffer->graph_node_count != 0)) { @@ -344,7 +345,7 @@ static iree_status_t iree_hal_hip_graph_command_buffer_begin( "hipGraphCreate"); IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN_EXTERNAL( - command_buffer, 1, + command_buffer, IREE_HAL_HIP_TRACING_VERBOSITY_COARSE, /*file_name=*/NULL, 0, /*line=*/0, "iree_hal_hip_graph_command_buffer", strlen("iree_hal_hip_graph_command_buffer"), /*name=*/NULL, 0); @@ -361,7 +362,8 @@ static iree_status_t iree_hal_hip_graph_command_buffer_end( IREE_RETURN_IF_ERROR( iree_hal_hip_graph_command_buffer_flush_collectives(command_buffer)); - IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(command_buffer, 1); + IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END( + command_buffer, IREE_HAL_HIP_TRACING_VERBOSITY_COARSE); // Reset state used during recording. command_buffer->hip_barrier_node = NULL; @@ -396,8 +398,9 @@ static void iree_hal_hip_graph_command_buffer_begin_debug_group( (void)command_buffer; IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN_EXTERNAL( - command_buffer, 1, location ? location->file.data : NULL, - location ? location->file.size : 0, location ? location->line : 0, + command_buffer, IREE_HAL_HIP_TRACING_VERBOSITY_COARSE, + location ? location->file.data : NULL, location ? location->file.size : 0, + location ? 
location->line : 0, /*func_name=*/NULL, 0, label.data, label.size); } @@ -406,7 +409,8 @@ static void iree_hal_hip_graph_command_buffer_end_debug_group( iree_hal_hip_graph_command_buffer_t* command_buffer = iree_hal_hip_graph_command_buffer_cast(base_command_buffer); (void)command_buffer; - IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(command_buffer, 1); + IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END( + command_buffer, IREE_HAL_HIP_TRACING_VERBOSITY_COARSE); } static iree_status_t @@ -519,7 +523,8 @@ static iree_status_t iree_hal_hip_graph_command_buffer_fill_buffer( iree_hal_hip_graph_command_buffer_t* command_buffer = iree_hal_hip_graph_command_buffer_cast(base_command_buffer); IREE_TRACE_ZONE_BEGIN(z0); - IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN(command_buffer, 2); + IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN( + command_buffer, IREE_HAL_HIP_TRACING_VERBOSITY_FINE); IREE_RETURN_AND_END_ZONE_IF_ERROR( z0, iree_hal_hip_graph_command_buffer_flush_collectives(command_buffer)); @@ -557,7 +562,8 @@ static iree_status_t iree_hal_hip_graph_command_buffer_fill_buffer( dependency_count, &params), "hipGraphAddMemsetNode"); - IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(command_buffer, 2); + IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END( + command_buffer, IREE_HAL_HIP_TRACING_VERBOSITY_FINE); IREE_TRACE_ZONE_END(z0); return iree_ok_status(); } @@ -573,7 +579,8 @@ static iree_status_t iree_hal_hip_graph_command_buffer_update_buffer( "cannot use graph-based command buffer"); } IREE_TRACE_ZONE_BEGIN(z0); - IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN(command_buffer, 2); + IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN( + command_buffer, IREE_HAL_HIP_TRACING_VERBOSITY_FINE); IREE_RETURN_AND_END_ZONE_IF_ERROR( z0, iree_hal_hip_graph_command_buffer_flush_collectives(command_buffer)); @@ -625,7 +632,8 @@ static iree_status_t iree_hal_hip_graph_command_buffer_update_buffer( dependency_count, &params, command_buffer->hip_context), "hipDrvGraphAddMemcpyNode"); -
IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(command_buffer, 2); + IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END( + command_buffer, IREE_HAL_HIP_TRACING_VERBOSITY_FINE); IREE_TRACE_ZONE_END(z0); return iree_ok_status(); } @@ -641,7 +649,8 @@ static iree_status_t iree_hal_hip_graph_command_buffer_copy_buffer( "cannot use graph-based command buffer"); } IREE_TRACE_ZONE_BEGIN(z0); - IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN(command_buffer, 2); + IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN( + command_buffer, IREE_HAL_HIP_TRACING_VERBOSITY_FINE); IREE_RETURN_AND_END_ZONE_IF_ERROR( z0, iree_hal_hip_graph_command_buffer_flush_collectives(command_buffer)); @@ -687,7 +696,8 @@ static iree_status_t iree_hal_hip_graph_command_buffer_copy_buffer( dependency_count, &params, command_buffer->hip_context), "hipDrvGraphAddMemcpyNode"); - IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(command_buffer, 2); + IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END( + command_buffer, IREE_HAL_HIP_TRACING_VERBOSITY_FINE); IREE_TRACE_ZONE_END(z0); return iree_ok_status(); } @@ -791,9 +801,10 @@ static iree_status_t iree_hal_hip_graph_command_buffer_dispatch( executable, entry_point, &kernel_info)); IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN_EXTERNAL( - command_buffer, 2, kernel_info.source_filename.data, - kernel_info.source_filename.size, kernel_info.source_line, - kernel_info.function_name.data, kernel_info.function_name.size, + command_buffer, IREE_HAL_HIP_TRACING_VERBOSITY_FINE, + kernel_info.source_filename.data, kernel_info.source_filename.size, + kernel_info.source_line, kernel_info.function_name.data, + kernel_info.function_name.size, /*name=*/NULL, 0); IREE_RETURN_AND_END_ZONE_IF_ERROR( @@ -878,7 +889,8 @@ static iree_status_t iree_hal_hip_graph_command_buffer_dispatch( dependency_count, &params), "hipGraphAddKernelNode"); - IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(command_buffer, 2); + IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END( + command_buffer,
IREE_HAL_HIP_TRACING_VERBOSITY_FINE); IREE_TRACE_ZONE_END(z0); return iree_ok_status(); } diff --git a/runtime/src/iree/hal/drivers/hip/hip_device.c b/runtime/src/iree/hal/drivers/hip/hip_device.c index 385727469405..f92c784338e2 100644 --- a/runtime/src/iree/hal/drivers/hip/hip_device.c +++ b/runtime/src/iree/hal/drivers/hip/hip_device.c @@ -344,6 +344,14 @@ static iree_status_t iree_hal_hip_device_create_internal( // Enable tracing for the (currently only) stream - no-op if disabled. if (iree_status_is_ok(status) && device->params.stream_tracing) { + if (device->params.stream_tracing >= IREE_HAL_HIP_TRACING_VERBOSITY_MAX || + device->params.stream_tracing < IREE_HAL_HIP_TRACING_VERBOSITY_OFF) { + return iree_make_status( + IREE_STATUS_INVALID_ARGUMENT, + "invalid stream_tracing argument: expected to be between %d and %d", + IREE_HAL_HIP_TRACING_VERBOSITY_OFF, + IREE_HAL_HIP_TRACING_VERBOSITY_MAX); + } status = iree_hal_hip_tracing_context_allocate( device->hip_symbols, device->identifier, dispatch_stream, device->params.stream_tracing, &device->block_pool, host_allocator, diff --git a/runtime/src/iree/hal/drivers/hip/rccl_channel.c b/runtime/src/iree/hal/drivers/hip/rccl_channel.c index 578af202917e..84e592cb9ff1 100644 --- a/runtime/src/iree/hal/drivers/hip/rccl_channel.c +++ b/runtime/src/iree/hal/drivers/hip/rccl_channel.c @@ -593,9 +593,10 @@ iree_status_t iree_hal_hip_nccl_submit_batch( iree_string_view_t collective_str = iree_hal_collective_op_format(&entry->op, &string_temp); IREE_HIP_STREAM_TRACE_ZONE_BEGIN_EXTERNAL( - tracing_context, tracing_event_list, stream, 2, __FILE__, - strlen(__FILE__), (uint32_t)__LINE__, __FUNCTION__, - strlen(__FUNCTION__), collective_str.data, collective_str.size); + tracing_context, tracing_event_list, stream, + IREE_HAL_HIP_TRACING_VERBOSITY_FINE, __FILE__, strlen(__FILE__), + (uint32_t)__LINE__, __FUNCTION__, strlen(__FUNCTION__), + collective_str.data, collective_str.size); } #endif // 
IREE_TRACING_FEATURE_INSTRUMENTATION_DEVICE @@ -613,7 +614,8 @@ iree_status_t iree_hal_hip_nccl_submit_batch( IREE_TRACE({ for (iree_host_size_t i = 0; i < batch->count; ++i) { IREE_HIP_STREAM_TRACE_ZONE_END(tracing_context, tracing_event_list, - stream, 2); + stream, + IREE_HAL_HIP_TRACING_VERBOSITY_FINE); } }); diff --git a/runtime/src/iree/hal/drivers/hip/registration/driver_module.c b/runtime/src/iree/hal/drivers/hip/registration/driver_module.c index 27a5d57dd120..1998cfcd846c 100644 --- a/runtime/src/iree/hal/drivers/hip/registration/driver_module.c +++ b/runtime/src/iree/hal/drivers/hip/registration/driver_module.c @@ -185,12 +185,6 @@ static iree_status_t iree_hal_hip_driver_populate_options( "Option 'hip_tracing' expected to be int. Got: '%.*s'", (int)value.size, value.data); } - if (!(ivalue >= 0 && ivalue <= 3)) { - return iree_make_status( - IREE_STATUS_FAILED_PRECONDITION, - "Option 'hip_tracing' expected to be 0, 1, 2. Got: '%.*s'", - (int)value.size, value.data); - } device_params->stream_tracing = ivalue; } else if (iree_string_view_equal(key, key_hip_default_index)) { if (!iree_string_view_atoi_int32(value, &ivalue)) { diff --git a/runtime/src/iree/hal/drivers/hip/stream_command_buffer.c b/runtime/src/iree/hal/drivers/hip/stream_command_buffer.c index 35920fe0a818..8924ed4068fa 100644 --- a/runtime/src/iree/hal/drivers/hip/stream_command_buffer.c +++ b/runtime/src/iree/hal/drivers/hip/stream_command_buffer.c @@ -183,7 +183,7 @@ static iree_status_t iree_hal_hip_stream_command_buffer_begin( IREE_HIP_STREAM_TRACE_ZONE_BEGIN_EXTERNAL( command_buffer->tracing_context, &command_buffer->tracing_event_list, - command_buffer->hip_stream, 1, + command_buffer->hip_stream, IREE_HAL_HIP_TRACING_VERBOSITY_COARSE, /*file_name=*/NULL, 0, /*line=*/0, "iree_hal_hip_stream_command_buffer", strlen("iree_hal_hip_stream_command_buffer"), /*name=*/NULL, 0); @@ -212,9 +212,9 @@ static iree_status_t iree_hal_hip_stream_command_buffer_end( z0, 
iree_hal_resource_set_allocate(command_buffer->arena.block_pool, &command_buffer->resource_set)); - IREE_HIP_STREAM_TRACE_ZONE_END(command_buffer->tracing_context, - &command_buffer->tracing_event_list, - command_buffer->hip_stream, 1); + IREE_HIP_STREAM_TRACE_ZONE_END( + command_buffer->tracing_context, &command_buffer->tracing_event_list, + command_buffer->hip_stream, IREE_HAL_HIP_TRACING_VERBOSITY_COARSE); IREE_TRACE_ZONE_END(z0); return iree_ok_status(); @@ -230,8 +230,9 @@ static void iree_hal_hip_stream_command_buffer_begin_debug_group( IREE_HIP_STREAM_TRACE_ZONE_BEGIN_EXTERNAL( command_buffer->tracing_context, &command_buffer->tracing_event_list, - command_buffer->hip_stream, 1, location ? location->file.data : NULL, - location ? location->file.size : 0, location ? location->line : 0, + command_buffer->hip_stream, IREE_HAL_HIP_TRACING_VERBOSITY_COARSE, + location ? location->file.data : NULL, location ? location->file.size : 0, + location ? location->line : 0, /*func_name=*/NULL, 0, label.data, label.size); } @@ -241,9 +242,9 @@ static void iree_hal_hip_stream_command_buffer_end_debug_group( iree_hal_hip_stream_command_buffer_cast(base_command_buffer); (void)command_buffer; - IREE_HIP_STREAM_TRACE_ZONE_END(command_buffer->tracing_context, - &command_buffer->tracing_event_list, - command_buffer->hip_stream, 1); + IREE_HIP_STREAM_TRACE_ZONE_END( + command_buffer->tracing_context, &command_buffer->tracing_event_list, + command_buffer->hip_stream, IREE_HAL_HIP_TRACING_VERBOSITY_COARSE); } static iree_status_t iree_hal_hip_stream_command_buffer_execution_barrier( @@ -542,9 +543,10 @@ static iree_status_t iree_hal_hip_stream_command_buffer_dispatch( IREE_HIP_STREAM_TRACE_ZONE_BEGIN_EXTERNAL( command_buffer->tracing_context, &command_buffer->tracing_event_list, - command_buffer->hip_stream, 2, kernel_info.source_filename.data, - kernel_info.source_filename.size, kernel_info.source_line, - kernel_info.function_name.data, kernel_info.function_name.size, + 
command_buffer->hip_stream, IREE_HAL_HIP_TRACING_VERBOSITY_FINE, + kernel_info.source_filename.data, kernel_info.source_filename.size, + kernel_info.source_line, kernel_info.function_name.data, + kernel_info.function_name.size, /*name=*/NULL, 0); IREE_RETURN_AND_END_ZONE_IF_ERROR( @@ -615,9 +617,9 @@ static iree_status_t iree_hal_hip_stream_command_buffer_dispatch( command_buffer->hip_stream, params_ptr, NULL), "hipModuleLaunchKernel"); - IREE_HIP_STREAM_TRACE_ZONE_END(command_buffer->tracing_context, - &command_buffer->tracing_event_list, - command_buffer->hip_stream, 2); + IREE_HIP_STREAM_TRACE_ZONE_END( + command_buffer->tracing_context, &command_buffer->tracing_event_list, + command_buffer->hip_stream, IREE_HAL_HIP_TRACING_VERBOSITY_FINE); IREE_TRACE_ZONE_END(z0); return status; diff --git a/runtime/src/iree/hal/drivers/hip/tracing.c b/runtime/src/iree/hal/drivers/hip/tracing.c index a18f240fb21f..96dba1f77d98 100644 --- a/runtime/src/iree/hal/drivers/hip/tracing.c +++ b/runtime/src/iree/hal/drivers/hip/tracing.c @@ -121,8 +121,8 @@ static iree_status_t iree_hal_hip_tracing_context_initial_calibration( iree_status_t iree_hal_hip_tracing_context_allocate( const iree_hal_hip_dynamic_symbols_t* symbols, iree_string_view_t queue_name, hipStream_t stream, - int32_t stream_tracing_verbosity, iree_arena_block_pool_t* block_pool, - iree_allocator_t host_allocator, + iree_hal_hip_tracing_verbosity_t stream_tracing_verbosity, + iree_arena_block_pool_t* block_pool, iree_allocator_t host_allocator, iree_hal_hip_tracing_context_t** out_context) { IREE_TRACE_ZONE_BEGIN(z0); IREE_ASSERT_ARGUMENT(symbols); @@ -429,7 +429,8 @@ static uint16_t iree_hal_hip_graph_tracing_context_insert_query( void iree_hal_hip_stream_tracing_zone_begin_impl( iree_hal_hip_tracing_context_t* context, iree_hal_hip_tracing_context_event_list_t* event_list, hipStream_t stream, - int32_t verbosity, const iree_tracing_location_t* src_loc) { + iree_hal_hip_tracing_verbosity_t verbosity, + const 
iree_tracing_location_t* src_loc) { if (!context) return; if (verbosity > context->verbosity) return; uint16_t query_id = iree_hal_hip_stream_tracing_context_insert_query( @@ -440,9 +441,9 @@ void iree_hal_hip_stream_tracing_zone_begin_impl( void iree_hal_hip_stream_tracing_zone_begin_external_impl( iree_hal_hip_tracing_context_t* context, iree_hal_hip_tracing_context_event_list_t* event_list, hipStream_t stream, - int32_t verbosity, const char* file_name, size_t file_name_length, - uint32_t line, const char* function_name, size_t function_name_length, - const char* name, size_t name_length) { + iree_hal_hip_tracing_verbosity_t verbosity, const char* file_name, + size_t file_name_length, uint32_t line, const char* function_name, + size_t function_name_length, const char* name, size_t name_length) { if (!context) return; if (verbosity > context->verbosity) return; uint16_t query_id = iree_hal_hip_stream_tracing_context_insert_query( @@ -455,7 +456,8 @@ void iree_hal_hip_stream_tracing_zone_begin_external_impl( void iree_hal_hip_graph_tracing_zone_begin_external_impl( iree_hal_hip_tracing_context_t* context, iree_hal_hip_tracing_context_event_list_t* event_list, - hipGraphNode_t* out_node, hipGraph_t graph, int32_t verbosity, + hipGraphNode_t* out_node, hipGraph_t graph, + iree_hal_hip_tracing_verbosity_t verbosity, hipGraphNode_t* dependency_nodes, size_t dependency_nodes_count, const char* file_name, size_t file_name_length, uint32_t line, const char* function_name, size_t function_name_length, const char* name, @@ -473,7 +475,7 @@ void iree_hal_hip_graph_tracing_zone_begin_external_impl( void iree_hal_hip_stream_tracing_zone_end_impl( iree_hal_hip_tracing_context_t* context, iree_hal_hip_tracing_context_event_list_t* event_list, hipStream_t stream, - int32_t verbosity) { + iree_hal_hip_tracing_verbosity_t verbosity) { if (!context) return; if (verbosity > context->verbosity) return; uint16_t query_id = iree_hal_hip_stream_tracing_context_insert_query( @@ -484,7 
+486,8 @@ void iree_hal_hip_stream_tracing_zone_end_impl( void iree_hal_hip_graph_tracing_zone_end_impl( iree_hal_hip_tracing_context_t* context, iree_hal_hip_tracing_context_event_list_t* event_list, - hipGraphNode_t* out_node, hipGraph_t graph, int32_t verbosity, + hipGraphNode_t* out_node, hipGraph_t graph, + iree_hal_hip_tracing_verbosity_t verbosity, hipGraphNode_t* dependency_nodes, size_t dependency_nodes_count) { if (!context) return; if (verbosity > context->verbosity) return; diff --git a/runtime/src/iree/hal/drivers/hip/tracing.h b/runtime/src/iree/hal/drivers/hip/tracing.h index 7d294254fc12..baca80e568ca 100644 --- a/runtime/src/iree/hal/drivers/hip/tracing.h +++ b/runtime/src/iree/hal/drivers/hip/tracing.h @@ -52,13 +52,20 @@ typedef struct iree_hal_hip_tracing_context_event_list_t { iree_hal_hip_tracing_context_event_t* tail; } iree_hal_hip_tracing_context_event_list_t; +typedef enum iree_hal_hip_tracing_verbosity_e { + IREE_HAL_HIP_TRACING_VERBOSITY_OFF = 0, + IREE_HAL_HIP_TRACING_VERBOSITY_COARSE, + IREE_HAL_HIP_TRACING_VERBOSITY_FINE, + IREE_HAL_HIP_TRACING_VERBOSITY_MAX +} iree_hal_hip_tracing_verbosity_t; + // Allocates a tracing context for the given HIP |stream|. // Each context must only be used with the stream it was created for. iree_status_t iree_hal_hip_tracing_context_allocate( const iree_hal_hip_dynamic_symbols_t* symbols, iree_string_view_t queue_name, hipStream_t stream, - int32_t stream_tracing_verbosity, iree_arena_block_pool_t* block_pool, - iree_allocator_t host_allocator, + iree_hal_hip_tracing_verbosity_t stream_tracing_verbosity, + iree_arena_block_pool_t* block_pool, iree_allocator_t host_allocator, iree_hal_hip_tracing_context_t** out_context); // Frees a tracing context and all associated HIP resources. 
@@ -88,21 +95,23 @@ void iree_hal_hip_tracing_free( void iree_hal_hip_stream_tracing_zone_begin_impl( iree_hal_hip_tracing_context_t* context, iree_hal_hip_tracing_context_event_list_t* event_list, hipStream_t stream, - int32_t verbosity, const iree_tracing_location_t* src_loc); + iree_hal_hip_tracing_verbosity_t verbosity, + const iree_tracing_location_t* src_loc); // Begins an external zone using the given source information. // The provided strings will be copied into the tracy buffer. void iree_hal_hip_stream_tracing_zone_begin_external_impl( iree_hal_hip_tracing_context_t* context, iree_hal_hip_tracing_context_event_list_t* event_list, hipStream_t stream, - int32_t verbosity, const char* file_name, size_t file_name_length, - uint32_t line, const char* function_name, size_t function_name_length, - const char* name, size_t name_length); + iree_hal_hip_tracing_verbosity_t verbosity, const char* file_name, + size_t file_name_length, uint32_t line, const char* function_name, + size_t function_name_length, const char* name, size_t name_length); void iree_hal_hip_graph_tracing_zone_begin_external_impl( iree_hal_hip_tracing_context_t* context, iree_hal_hip_tracing_context_event_list_t* event_list, - hipGraphNode_t* out_node, hipGraph_t graph, int32_t verbosity, + hipGraphNode_t* out_node, hipGraph_t graph, + iree_hal_hip_tracing_verbosity_t verbosity, hipGraphNode_t* dependency_nodes, size_t dependency_nodes_count, const char* file_name, size_t file_name_length, uint32_t line, const char* function_name, size_t function_name_length, const char* name, @@ -111,11 +120,12 @@ void iree_hal_hip_graph_tracing_zone_begin_external_impl( void iree_hal_hip_stream_tracing_zone_end_impl( iree_hal_hip_tracing_context_t* context, iree_hal_hip_tracing_context_event_list_t* event_list, hipStream_t stream, - int32_t verbosity); + iree_hal_hip_tracing_verbosity_t verbosity); void iree_hal_hip_graph_tracing_zone_end_impl( iree_hal_hip_tracing_context_t* context, 
iree_hal_hip_tracing_context_event_list_t* event_list, - hipGraphNode_t* out_node, hipGraph_t graph, int32_t verbosity, + hipGraphNode_t* out_node, hipGraph_t graph, + iree_hal_hip_tracing_verbosity_t verbosity, hipGraphNode_t* dependency_nodes, size_t dependency_nodes_count); // Begins a new zone with the parent function name. From 3eb4a66d0e73d35cc1ea15321b942c40732a5182 Mon Sep 17 00:00:00 2001 From: Andrew Woloszyn Date: Fri, 9 Aug 2024 12:05:27 -0500 Subject: [PATCH 3/5] [cuda] merge fine-grained tracing options from hip. Bring over all of our fine-grained tracing options from our hip implementation. Signed-off-by: Andrew Woloszyn --- runtime/src/iree/hal/drivers/cuda/api.h | 13 ++- .../src/iree/hal/drivers/cuda/cuda_device.c | 13 ++- .../hal/drivers/cuda/graph_command_buffer.c | 91 +++++++++++-------- .../src/iree/hal/drivers/cuda/nccl_channel.c | 7 +- .../drivers/cuda/registration/driver_module.c | 12 ++- .../hal/drivers/cuda/stream_command_buffer.c | 32 ++++--- runtime/src/iree/hal/drivers/cuda/tracing.c | 43 ++++++--- runtime/src/iree/hal/drivers/cuda/tracing.h | 79 +++++++++------- 8 files changed, 176 insertions(+), 114 deletions(-) diff --git a/runtime/src/iree/hal/drivers/cuda/api.h b/runtime/src/iree/hal/drivers/cuda/api.h index a53ada06cccf..d40242263103 100644 --- a/runtime/src/iree/hal/drivers/cuda/api.h +++ b/runtime/src/iree/hal/drivers/cuda/api.h @@ -77,15 +77,14 @@ typedef struct iree_hal_cuda_device_params_t { // Specifies how command buffers are recorded and executed. iree_hal_cuda_command_buffer_mode_t command_buffer_mode; - // Enables tracing of command buffers when IREE tracing is enabled. - // May take advantage of additional extensions for more accurate timing or - // hardware-specific performance counters. + // Controls the verbosity of command buffer tracing when IREE + // tracing is enabled.
// // NOTE: tracing has a non-trivial overhead and will skew the timing of - // submissions and introduce false barriers between dispatches. Use this to - // identify slow dispatches and refine from there; be wary of whole-program - // tracing with this enabled. - bool stream_tracing; + // submissions and may introduce false barriers between dispatches. + // Use this to identify slow dispatches and command buffers and refine + // from there; be wary of whole-program tracing with this enabled. + int32_t stream_tracing; // Whether to use async allocations even if reported as available by the // device. Defaults to true when the device supports it. diff --git a/runtime/src/iree/hal/drivers/cuda/cuda_device.c b/runtime/src/iree/hal/drivers/cuda/cuda_device.c index a53f3818e610..30cccca36ff9 100644 --- a/runtime/src/iree/hal/drivers/cuda/cuda_device.c +++ b/runtime/src/iree/hal/drivers/cuda/cuda_device.c @@ -277,7 +277,7 @@ IREE_API_EXPORT void iree_hal_cuda_device_params_initialize( out_params->event_pool_capacity = 32; out_params->queue_count = 1; out_params->command_buffer_mode = IREE_HAL_CUDA_COMMAND_BUFFER_MODE_GRAPH; - out_params->stream_tracing = false; + out_params->stream_tracing = 0; out_params->async_allocations = true; } @@ -346,9 +346,18 @@ static iree_status_t iree_hal_cuda_device_create_internal( // Enable tracing for the (currently only) stream - no-op if disabled. 
if (iree_status_is_ok(status) && device->params.stream_tracing) { + if (device->params.stream_tracing >= IREE_HAL_CUDA_TRACING_VERBOSITY_MAX || + device->params.stream_tracing < IREE_HAL_CUDA_TRACING_VERBOSITY_OFF) { + return iree_make_status( + IREE_STATUS_INVALID_ARGUMENT, + "invalid stream_tracing argument: expected to be between %d and %d", + IREE_HAL_CUDA_TRACING_VERBOSITY_OFF, + IREE_HAL_CUDA_TRACING_VERBOSITY_MAX); + } status = iree_hal_cuda_tracing_context_allocate( device->cuda_symbols, device->identifier, dispatch_stream, - &device->block_pool, host_allocator, &device->tracing_context); + device->params.stream_tracing, &device->block_pool, host_allocator, + &device->tracing_context); } // Memory pool support is conditional. diff --git a/runtime/src/iree/hal/drivers/cuda/graph_command_buffer.c b/runtime/src/iree/hal/drivers/cuda/graph_command_buffer.c index 68d4d34668bb..a8526acd34df 100644 --- a/runtime/src/iree/hal/drivers/cuda/graph_command_buffer.c +++ b/runtime/src/iree/hal/drivers/cuda/graph_command_buffer.c @@ -82,9 +82,10 @@ iree_hal_cuda_graph_command_buffer_cast(iree_hal_command_buffer_t* base_value) { #if IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION_DEVICE static void iree_cuda_graph_command_buffer_trace_zone_begin_external( - iree_hal_cuda_graph_command_buffer_t* command_buffer, const char* file_name, - size_t file_name_length, uint32_t line, const char* function_name, - size_t function_name_length, const char* name, size_t name_length) { + iree_hal_cuda_graph_command_buffer_t* command_buffer, int32_t verbosity, + const char* file_name, size_t file_name_length, uint32_t line, + const char* function_name, size_t function_name_length, const char* name, + size_t name_length) { // Make sure there are no new nodes after the last barrier. // Work should start after the event. 
if (IREE_UNLIKELY(command_buffer->graph_node_count != 0)) { @@ -97,7 +98,7 @@ static void iree_cuda_graph_command_buffer_trace_zone_begin_external( size_t dependency_count = command_buffer->cu_barrier_node ? 1 : 0; IREE_CUDA_GRAPH_TRACE_ZONE_BEGIN_EXTERNAL( command_buffer->tracing_context, &command_buffer->tracing_event_list, - tracing_event_node, command_buffer->cu_graph, + tracing_event_node, command_buffer->cu_graph, verbosity, &command_buffer->cu_barrier_node, dependency_count, file_name, file_name_length, line, function_name, function_name_length, name, name_length); @@ -109,7 +110,7 @@ static void iree_cuda_graph_command_buffer_trace_zone_begin_external( } static void iree_cuda_graph_command_buffer_trace_zone_end( - iree_hal_cuda_graph_command_buffer_t* command_buffer) { + iree_hal_cuda_graph_command_buffer_t* command_buffer, int32_t verbosity) { // Make sure there are no new nodes after the last barrier. // Prior work should end before the tracing event is recorded. if (IREE_UNLIKELY(command_buffer->graph_node_count != 0)) { @@ -124,7 +125,7 @@ static void iree_cuda_graph_command_buffer_trace_zone_end( "ending a zone should at least depend on the beginning"); IREE_CUDA_GRAPH_TRACE_ZONE_END( command_buffer->tracing_context, &command_buffer->tracing_event_list, - tracing_event_node, command_buffer->cu_graph, + tracing_event_node, command_buffer->cu_graph, verbosity, &command_buffer->cu_barrier_node, dependency_count); // We need to wait on the tracing end before other work starts. 
@@ -132,27 +133,29 @@ static void iree_cuda_graph_command_buffer_trace_zone_end( command_buffer->cu_barrier_node = *tracing_event_node; } -#define IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN_EXTERNAL( \ - command_buffer, file_name, file_name_length, line, function_name, \ - function_name_length, name, name_length) \ - iree_cuda_graph_command_buffer_trace_zone_begin_external( \ - command_buffer, file_name, file_name_length, line, function_name, \ - function_name_length, name, name_length) -#define IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN(command_buffer) \ +#define IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN_EXTERNAL( \ + command_buffer, verbosity, file_name, file_name_length, line, \ + function_name, function_name_length, name, name_length) \ + iree_cuda_graph_command_buffer_trace_zone_begin_external( \ + command_buffer, verbosity, file_name, file_name_length, line, \ + function_name, function_name_length, name, name_length) +#define IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN(command_buffer, \ + verbosity) \ IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN_EXTERNAL( \ - command_buffer, /*file_name=*/NULL, 0, /*line=*/0, __FUNCTION__, \ - strlen(__FUNCTION__), /*name=*/NULL, 0) -#define IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(command_buffer) \ - iree_cuda_graph_command_buffer_trace_zone_end(command_buffer) + command_buffer, verbosity, /*file_name=*/NULL, 0, /*line=*/0, \ + __FUNCTION__, strlen(__FUNCTION__), /*name=*/NULL, 0) +#define IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(command_buffer, \ + verbosity) \ + iree_cuda_graph_command_buffer_trace_zone_end(command_buffer, verbosity) #else // IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION_DEVICE -#define IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN_EXTERNAL( \ - command_buffer, file_name, file_name_length, line, function_name, \ - function_name_length, name, name_length) -#define IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN(command_buffer) -#define 
IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(command_buffer) - +#define IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN_EXTERNAL( \ + command_buffer, verbosity, file_name, file_name_length, line, \ + function_name, function_name_length, name, name_length) +#define IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN(command_buffer, \ + verbosity) +#define IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(command_buffer, verbosity) #endif // IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION_DEVICE iree_status_t iree_hal_cuda_graph_command_buffer_create( @@ -335,7 +338,8 @@ static iree_status_t iree_hal_cuda_graph_command_buffer_begin( command_buffer->symbols, cuGraphCreate(&command_buffer->cu_graph, /*flags=*/0), "cuGraphCreate"); - IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN(command_buffer); + IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN( + command_buffer, IREE_HAL_CUDA_TRACING_VERBOSITY_COARSE); return iree_ok_status(); } @@ -349,7 +353,8 @@ static iree_status_t iree_hal_cuda_graph_command_buffer_end( IREE_RETURN_IF_ERROR( iree_hal_cuda_graph_command_buffer_flush_collectives(command_buffer)); - IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(command_buffer); + IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END( + command_buffer, IREE_HAL_CUDA_TRACING_VERBOSITY_COARSE); // Reset state used during recording. command_buffer->cu_barrier_node = NULL; @@ -384,8 +389,9 @@ static void iree_hal_cuda_graph_command_buffer_begin_debug_group( (void)command_buffer; IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN_EXTERNAL( - command_buffer, location ? location->file.data : NULL, - location ? location->file.size : 0, location ? location->line : 0, + command_buffer, IREE_HAL_CUDA_TRACING_VERBOSITY_COARSE, + location ? location->file.data : NULL, location ? location->file.size : 0, + location ? 
location->line : 0, /*func_name=*/NULL, 0, label.data, label.size); } @@ -394,7 +400,8 @@ static void iree_hal_cuda_graph_command_buffer_end_debug_group( iree_hal_cuda_graph_command_buffer_t* command_buffer = iree_hal_cuda_graph_command_buffer_cast(base_command_buffer); (void)command_buffer; - IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(command_buffer); + IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END( + command_buffer, IREE_HAL_CUDA_TRACING_VERBOSITY_COARSE); } static iree_status_t @@ -507,7 +514,8 @@ static iree_status_t iree_hal_cuda_graph_command_buffer_fill_buffer( iree_hal_cuda_graph_command_buffer_t* command_buffer = iree_hal_cuda_graph_command_buffer_cast(base_command_buffer); IREE_TRACE_ZONE_BEGIN(z0); - IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN(command_buffer); + IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN( + command_buffer, IREE_HAL_CUDA_TRACING_VERBOSITY_FINE); IREE_RETURN_AND_END_ZONE_IF_ERROR( z0, iree_hal_cuda_graph_command_buffer_flush_collectives(command_buffer)); @@ -546,7 +554,8 @@ static iree_status_t iree_hal_cuda_graph_command_buffer_fill_buffer( dependency_count, &params, command_buffer->cu_context), "cuGraphAddMemsetNode"); - IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(command_buffer); + IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END( + command_buffer, IREE_HAL_CUDA_TRACING_VERBOSITY_FINE); IREE_TRACE_ZONE_END(z0); return iree_ok_status(); } @@ -557,7 +566,8 @@ static iree_status_t iree_hal_cuda_graph_command_buffer_update_buffer( iree_hal_cuda_graph_command_buffer_t* command_buffer = iree_hal_cuda_graph_command_buffer_cast(base_command_buffer); IREE_TRACE_ZONE_BEGIN(z0); - IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN(command_buffer); + IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN( + command_buffer, IREE_HAL_CUDA_TRACING_VERBOSITY_FINE); IREE_RETURN_AND_END_ZONE_IF_ERROR( z0, iree_hal_cuda_graph_command_buffer_flush_collectives(command_buffer)); @@ -608,7 +618,8 @@ static iree_status_t
iree_hal_cuda_graph_command_buffer_update_buffer( dependency_count, &params, command_buffer->cu_context), "cuGraphAddMemcpyNode"); - IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(command_buffer); + IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END( + command_buffer, IREE_HAL_CUDA_TRACING_VERBOSITY_FINE); IREE_TRACE_ZONE_END(z0); return iree_ok_status(); } @@ -619,7 +630,8 @@ static iree_status_t iree_hal_cuda_graph_command_buffer_copy_buffer( iree_hal_cuda_graph_command_buffer_t* command_buffer = iree_hal_cuda_graph_command_buffer_cast(base_command_buffer); IREE_TRACE_ZONE_BEGIN(z0); - IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN(command_buffer); + IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN( + command_buffer, IREE_HAL_CUDA_TRACING_VERBOSITY_FINE); IREE_RETURN_AND_END_ZONE_IF_ERROR( z0, iree_hal_cuda_graph_command_buffer_flush_collectives(command_buffer)); @@ -666,7 +678,8 @@ static iree_status_t iree_hal_cuda_graph_command_buffer_copy_buffer( dependency_count, &params, command_buffer->cu_context), "cuGraphAddMemcpyNode"); - IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(command_buffer); + IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END( + command_buffer, IREE_HAL_CUDA_TRACING_VERBOSITY_FINE); IREE_TRACE_ZONE_END(z0); return iree_ok_status(); } @@ -763,9 +776,10 @@ static iree_status_t iree_hal_cuda_graph_command_buffer_dispatch( executable, entry_point, &kernel_info)); IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN_EXTERNAL( - command_buffer, kernel_info.source_filename.data, - kernel_info.source_filename.size, kernel_info.source_line, - kernel_info.function_name.data, kernel_info.function_name.size, + command_buffer, IREE_HAL_CUDA_TRACING_VERBOSITY_FINE, + kernel_info.source_filename.data, kernel_info.source_filename.size, + kernel_info.source_line, kernel_info.function_name.data, + kernel_info.function_name.size, /*name=*/NULL, 0); IREE_RETURN_AND_END_ZONE_IF_ERROR( @@ -865,7 +879,8 @@ static iree_status_t iree_hal_cuda_graph_command_buffer_dispatch( dependency_count,
&params), "cuGraphAddKernelNode"); - IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(command_buffer); + IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END( + command_buffer, IREE_HAL_CUDA_TRACING_VERBOSITY_FINE); IREE_TRACE_ZONE_END(z0); return iree_ok_status(); } diff --git a/runtime/src/iree/hal/drivers/cuda/nccl_channel.c b/runtime/src/iree/hal/drivers/cuda/nccl_channel.c index e3eb31c16d3b..2f6eb3fe93c6 100644 --- a/runtime/src/iree/hal/drivers/cuda/nccl_channel.c +++ b/runtime/src/iree/hal/drivers/cuda/nccl_channel.c @@ -559,7 +559,8 @@ iree_status_t iree_hal_cuda_nccl_submit_batch( iree_string_view_t collective_str = iree_hal_collective_op_format(&entry->op, &string_temp); IREE_CUDA_STREAM_TRACE_ZONE_BEGIN_EXTERNAL( - tracing_context, tracing_event_list, stream, __FILE__, strlen(__FILE__), + tracing_context, tracing_event_list, stream, + IREE_HAL_CUDA_TRACING_VERBOSITY_FINE, __FILE__, strlen(__FILE__), (uint32_t)__LINE__, __FUNCTION__, strlen(__FUNCTION__), collective_str.data, collective_str.size); } @@ -578,8 +579,8 @@ iree_status_t iree_hal_cuda_nccl_submit_batch( // End all zones we began above - note that these are just simply nested so // order doesn't matter so long as we end the right number of zones.
for (iree_host_size_t i = 0; i < batch->count; ++i) { - IREE_CUDA_STREAM_TRACE_ZONE_END(tracing_context, tracing_event_list, - stream); + IREE_CUDA_STREAM_TRACE_ZONE_END(tracing_context, tracing_event_list, stream, + IREE_HAL_CUDA_TRACING_VERBOSITY_FINE); } #endif // IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION_DEVICE diff --git a/runtime/src/iree/hal/drivers/cuda/registration/driver_module.c b/runtime/src/iree/hal/drivers/cuda/registration/driver_module.c index bea81bc8176b..2e5bcff6c7c8 100644 --- a/runtime/src/iree/hal/drivers/cuda/registration/driver_module.c +++ b/runtime/src/iree/hal/drivers/cuda/registration/driver_module.c @@ -26,10 +26,14 @@ IREE_FLAG( "Enables CUDA asynchronous stream-ordered allocations when supported."); IREE_FLAG( - bool, cuda_tracing, true, - "Enables tracing of stream events when Tracy instrumentation is enabled.\n" - "Severely impacts benchmark timings and should only be used when\n" - "analyzing dispatch timings."); + int32_t, cuda_tracing, 2, + "Controls the verbosity of tracing when Tracy instrumentation is enabled.\n" + "The impact to benchmark timing becomes more severe as the verbosity\n" + "increases, and thus should be only enabled when needed.\n" + "Permissible values are:\n" + " 0 : stream tracing disabled.\n" + " 1 : coarse command buffer level tracing enabled.\n" + " 2 : fine-grained kernel level tracing enabled.\n"); IREE_FLAG(int32_t, cuda_default_index, 0, "Specifies the index of the default CUDA device to use"); diff --git a/runtime/src/iree/hal/drivers/cuda/stream_command_buffer.c b/runtime/src/iree/hal/drivers/cuda/stream_command_buffer.c index a9b50fc19f4a..2f5c0974ef1e 100644 --- a/runtime/src/iree/hal/drivers/cuda/stream_command_buffer.c +++ b/runtime/src/iree/hal/drivers/cuda/stream_command_buffer.c @@ -182,7 +182,7 @@ static iree_status_t iree_hal_cuda_stream_command_buffer_begin( IREE_CUDA_STREAM_TRACE_ZONE_BEGIN_EXTERNAL( command_buffer->tracing_context, &command_buffer->tracing_event_list, - 
command_buffer->cu_stream, + command_buffer->cu_stream, IREE_HAL_CUDA_TRACING_VERBOSITY_COARSE, /*file_name=*/NULL, 0, /*line=*/0, "iree_hal_cuda_stream_command_buffer", strlen("iree_hal_cuda_stream_command_buffer"), /*name=*/NULL, 0); @@ -217,9 +217,9 @@ static iree_status_t iree_hal_cuda_stream_command_buffer_end( command_buffer->resource_set, &command_buffer->collective_batch); - IREE_CUDA_STREAM_TRACE_ZONE_END(command_buffer->tracing_context, - &command_buffer->tracing_event_list, - command_buffer->cu_stream); + IREE_CUDA_STREAM_TRACE_ZONE_END( + command_buffer->tracing_context, &command_buffer->tracing_event_list, + command_buffer->cu_stream, IREE_HAL_CUDA_TRACING_VERBOSITY_COARSE); IREE_TRACE_ZONE_END(z0); return iree_ok_status(); @@ -235,8 +235,9 @@ static void iree_hal_cuda_stream_command_buffer_begin_debug_group( IREE_CUDA_STREAM_TRACE_ZONE_BEGIN_EXTERNAL( command_buffer->tracing_context, &command_buffer->tracing_event_list, - command_buffer->cu_stream, location ? location->file.data : NULL, - location ? location->file.size : 0, location ? location->line : 0, + command_buffer->cu_stream, IREE_HAL_CUDA_TRACING_VERBOSITY_COARSE, + location ? location->file.data : NULL, location ? location->file.size : 0, + location ? location->line : 0, /*func_name=*/NULL, 0, label.data, label.size); // TODO: pass along to CUPTI if available. @@ -250,9 +251,9 @@ static void iree_hal_cuda_stream_command_buffer_end_debug_group( // TODO: pass along to CUPTI if available. 
- IREE_CUDA_STREAM_TRACE_ZONE_END(command_buffer->tracing_context, - &command_buffer->tracing_event_list, - command_buffer->cu_stream); + IREE_CUDA_STREAM_TRACE_ZONE_END( + command_buffer->tracing_context, &command_buffer->tracing_event_list, + command_buffer->cu_stream, IREE_HAL_CUDA_TRACING_VERBOSITY_COARSE); } static iree_status_t iree_hal_cuda_stream_command_buffer_execution_barrier( @@ -550,9 +551,10 @@ static iree_status_t iree_hal_cuda_stream_command_buffer_dispatch( IREE_CUDA_STREAM_TRACE_ZONE_BEGIN_EXTERNAL( command_buffer->tracing_context, &command_buffer->tracing_event_list, - command_buffer->cu_stream, kernel_info.source_filename.data, - kernel_info.source_filename.size, kernel_info.source_line, - kernel_info.function_name.data, kernel_info.function_name.size, + command_buffer->cu_stream, IREE_HAL_CUDA_TRACING_VERBOSITY_FINE, + kernel_info.source_filename.data, kernel_info.source_filename.size, + kernel_info.source_line, kernel_info.function_name.data, + kernel_info.function_name.size, /*name=*/NULL, 0); IREE_RETURN_AND_END_ZONE_IF_ERROR( @@ -634,9 +636,9 @@ static iree_status_t iree_hal_cuda_stream_command_buffer_dispatch( params_ptr, NULL), "cuLaunchKernel"); - IREE_CUDA_STREAM_TRACE_ZONE_END(command_buffer->tracing_context, - &command_buffer->tracing_event_list, - command_buffer->cu_stream); + IREE_CUDA_STREAM_TRACE_ZONE_END( + command_buffer->tracing_context, &command_buffer->tracing_event_list, + command_buffer->cu_stream, IREE_HAL_CUDA_TRACING_VERBOSITY_FINE); IREE_TRACE_ZONE_END(z0); return iree_ok_status(); diff --git a/runtime/src/iree/hal/drivers/cuda/tracing.c b/runtime/src/iree/hal/drivers/cuda/tracing.c index 913ca5dfd2e1..057fddad9b02 100644 --- a/runtime/src/iree/hal/drivers/cuda/tracing.c +++ b/runtime/src/iree/hal/drivers/cuda/tracing.c @@ -69,6 +69,8 @@ struct iree_hal_cuda_tracing_context_t { uint32_t query_capacity; + iree_hal_cuda_tracing_verbosity_t verbosity; + // Event pool reused to capture tracing timestamps. 
// The lifetime of the events are as follows. // 1) All events are allocated when the tracing context is created. @@ -118,6 +120,7 @@ static iree_status_t iree_hal_cuda_tracing_context_initial_calibration( iree_status_t iree_hal_cuda_tracing_context_allocate( const iree_hal_cuda_dynamic_symbols_t* symbols, iree_string_view_t queue_name, CUstream stream, + iree_hal_cuda_tracing_verbosity_t stream_tracing_verbosity, iree_arena_block_pool_t* block_pool, iree_allocator_t host_allocator, iree_hal_cuda_tracing_context_t** out_context) { IREE_TRACE_ZONE_BEGIN(z0); @@ -138,6 +141,7 @@ iree_status_t iree_hal_cuda_tracing_context_allocate( context->query_capacity = IREE_ARRAYSIZE(context->event_pool); context->submitted_event_list.head = NULL; context->submitted_event_list.tail = NULL; + context->verbosity = stream_tracing_verbosity; iree_slim_mutex_initialize(&context->event_mutex); } @@ -364,7 +368,8 @@ static void iree_hal_cuda_tracing_context_event_list_append_event( // event. static uint16_t iree_hal_cuda_stream_tracing_context_insert_query( iree_hal_cuda_tracing_context_t* context, - iree_hal_cuda_tracing_context_event_list_t* event_list, CUstream stream) { + iree_hal_cuda_tracing_context_event_list_t* event_list, CUstream stream, + iree_hal_cuda_tracing_verbosity_t verbosity) { iree_slim_mutex_lock(&context->event_mutex); IREE_ASSERT_ARGUMENT(event_list); @@ -392,7 +397,8 @@ static uint16_t iree_hal_cuda_stream_tracing_context_insert_query( static uint16_t iree_hal_cuda_graph_tracing_context_insert_query( iree_hal_cuda_tracing_context_t* context, iree_hal_cuda_tracing_context_event_list_t* event_list, - CUgraphNode* out_node, CUgraph graph, CUgraphNode* dependency_nodes, + CUgraphNode* out_node, CUgraph graph, + iree_hal_cuda_tracing_verbosity_t verbosity, CUgraphNode* dependency_nodes, size_t dependency_nodes_count) { IREE_ASSERT_ARGUMENT(event_list); iree_slim_mutex_lock(&context->event_mutex); @@ -426,22 +432,26 @@ static uint16_t 
iree_hal_cuda_graph_tracing_context_insert_query( void iree_hal_cuda_stream_tracing_zone_begin_impl( iree_hal_cuda_tracing_context_t* context, iree_hal_cuda_tracing_context_event_list_t* event_list, CUstream stream, + iree_hal_cuda_tracing_verbosity_t verbosity, const iree_tracing_location_t* src_loc) { if (!context) return; + if (verbosity > context->verbosity) return; + uint16_t query_id = iree_hal_cuda_stream_tracing_context_insert_query( - context, event_list, stream); + context, event_list, stream, verbosity); iree_tracing_gpu_zone_begin(context->id, query_id, src_loc); } void iree_hal_cuda_stream_tracing_zone_begin_external_impl( iree_hal_cuda_tracing_context_t* context, iree_hal_cuda_tracing_context_event_list_t* event_list, CUstream stream, - const char* file_name, size_t file_name_length, uint32_t line, - const char* function_name, size_t function_name_length, const char* name, - size_t name_length) { + iree_hal_cuda_tracing_verbosity_t verbosity, const char* file_name, + size_t file_name_length, uint32_t line, const char* function_name, + size_t function_name_length, const char* name, size_t name_length) { if (!context) return; + if (verbosity > context->verbosity) return; uint16_t query_id = iree_hal_cuda_stream_tracing_context_insert_query( - context, event_list, stream); + context, event_list, stream, verbosity); iree_tracing_gpu_zone_begin_external(context->id, query_id, file_name, file_name_length, line, function_name, function_name_length, name, name_length); @@ -450,13 +460,15 @@ void iree_hal_cuda_stream_tracing_zone_begin_external_impl( void iree_hal_cuda_graph_tracing_zone_begin_external_impl( iree_hal_cuda_tracing_context_t* context, iree_hal_cuda_tracing_context_event_list_t* event_list, - CUgraphNode* out_node, CUgraph graph, CUgraphNode* dependency_nodes, + CUgraphNode* out_node, CUgraph graph, + iree_hal_cuda_tracing_verbosity_t verbosity, CUgraphNode* dependency_nodes, size_t dependency_nodes_count, const char* file_name, size_t 
file_name_length, uint32_t line, const char* function_name, size_t function_name_length, const char* name, size_t name_length) { if (!context) return; + if (verbosity > context->verbosity) return; uint16_t query_id = iree_hal_cuda_graph_tracing_context_insert_query( - context, event_list, out_node, graph, dependency_nodes, + context, event_list, out_node, graph, verbosity, dependency_nodes, dependency_nodes_count); iree_tracing_gpu_zone_begin_external(context->id, query_id, file_name, file_name_length, line, function_name, @@ -465,21 +477,25 @@ void iree_hal_cuda_graph_tracing_zone_begin_external_impl( void iree_hal_cuda_stream_tracing_zone_end_impl( iree_hal_cuda_tracing_context_t* context, - iree_hal_cuda_tracing_context_event_list_t* event_list, CUstream stream) { + iree_hal_cuda_tracing_context_event_list_t* event_list, CUstream stream, + iree_hal_cuda_tracing_verbosity_t verbosity) { if (!context) return; + if (verbosity > context->verbosity) return; uint16_t query_id = iree_hal_cuda_stream_tracing_context_insert_query( - context, event_list, stream); + context, event_list, stream, verbosity); iree_tracing_gpu_zone_end(context->id, query_id); } void iree_hal_cuda_graph_tracing_zone_end_impl( iree_hal_cuda_tracing_context_t* context, iree_hal_cuda_tracing_context_event_list_t* event_list, - CUgraphNode* out_node, CUgraph graph, CUgraphNode* dependency_nodes, + CUgraphNode* out_node, CUgraph graph, + iree_hal_cuda_tracing_verbosity_t verbosity, CUgraphNode* dependency_nodes, size_t dependency_nodes_count) { if (!context) return; + if (verbosity > context->verbosity) return; uint16_t query_id = iree_hal_cuda_graph_tracing_context_insert_query( - context, event_list, out_node, graph, dependency_nodes, + context, event_list, out_node, graph, verbosity, dependency_nodes, dependency_nodes_count); iree_tracing_gpu_zone_end(context->id, query_id); } @@ -489,6 +505,7 @@ void iree_hal_cuda_graph_tracing_zone_end_impl( iree_status_t iree_hal_cuda_tracing_context_allocate( 
const iree_hal_cuda_dynamic_symbols_t* symbols, iree_string_view_t queue_name, CUstream stream, + iree_hal_cuda_tracing_verbosity_t stream_tracing_verbosity, iree_arena_block_pool_t* block_pool, iree_allocator_t host_allocator, iree_hal_cuda_tracing_context_t** out_context) { *out_context = NULL; diff --git a/runtime/src/iree/hal/drivers/cuda/tracing.h b/runtime/src/iree/hal/drivers/cuda/tracing.h index abe468f1c389..1174f778337c 100644 --- a/runtime/src/iree/hal/drivers/cuda/tracing.h +++ b/runtime/src/iree/hal/drivers/cuda/tracing.h @@ -52,11 +52,19 @@ typedef struct iree_hal_cuda_tracing_context_event_list_t { iree_hal_cuda_tracing_context_event_t* tail; } iree_hal_cuda_tracing_context_event_list_t; +typedef enum iree_hal_cuda_tracing_verbosity_e { + IREE_HAL_CUDA_TRACING_VERBOSITY_OFF = 0, + IREE_HAL_CUDA_TRACING_VERBOSITY_COARSE, + IREE_HAL_CUDA_TRACING_VERBOSITY_FINE, + IREE_HAL_CUDA_TRACING_VERBOSITY_MAX +} iree_hal_cuda_tracing_verbosity_t; + // Allocates a tracing context for the given CUDA |stream|. // Each context must only be used with the stream it was created for. iree_status_t iree_hal_cuda_tracing_context_allocate( const iree_hal_cuda_dynamic_symbols_t* symbols, iree_string_view_t queue_name, CUstream stream, + iree_hal_cuda_tracing_verbosity_t stream_tracing_verbosity, iree_arena_block_pool_t* block_pool, iree_allocator_t host_allocator, iree_hal_cuda_tracing_context_t** out_context); @@ -88,6 +96,7 @@ void iree_hal_cuda_tracing_free( void iree_hal_cuda_stream_tracing_zone_begin_impl( iree_hal_cuda_tracing_context_t* context, iree_hal_cuda_tracing_context_event_list_t* event_list, CUstream stream, + iree_hal_cuda_tracing_verbosity_t verbosity, const iree_tracing_location_t* src_loc); // Begins an external zone using the given source information. 
@@ -95,74 +104,80 @@ void iree_hal_cuda_stream_tracing_zone_begin_impl( void iree_hal_cuda_stream_tracing_zone_begin_external_impl( iree_hal_cuda_tracing_context_t* context, iree_hal_cuda_tracing_context_event_list_t* event_list, CUstream stream, - const char* file_name, size_t file_name_length, uint32_t line, - const char* function_name, size_t function_name_length, const char* name, - size_t name_length); + iree_hal_cuda_tracing_verbosity_t verbosity, const char* file_name, + size_t file_name_length, uint32_t line, const char* function_name, + size_t function_name_length, const char* name, size_t name_length); void iree_hal_cuda_graph_tracing_zone_begin_external_impl( iree_hal_cuda_tracing_context_t* context, iree_hal_cuda_tracing_context_event_list_t* event_list, - CUgraphNode* out_node, CUgraph graph, CUgraphNode* dependency_nodes, + CUgraphNode* out_node, CUgraph graph, + iree_hal_cuda_tracing_verbosity_t verbosity, CUgraphNode* dependency_nodes, size_t dependency_nodes_count, const char* file_name, size_t file_name_length, uint32_t line, const char* function_name, size_t function_name_length, const char* name, size_t name_length); void iree_hal_cuda_stream_tracing_zone_end_impl( iree_hal_cuda_tracing_context_t* context, - iree_hal_cuda_tracing_context_event_list_t* event_list, CUstream stream); + iree_hal_cuda_tracing_context_event_list_t* event_list, CUstream stream, + iree_hal_cuda_tracing_verbosity_t verbosity); void iree_hal_cuda_graph_tracing_zone_end_impl( iree_hal_cuda_tracing_context_t* context, iree_hal_cuda_tracing_context_event_list_t* event_list, - CUgraphNode* out_node, CUgraph graph, CUgraphNode* dependency_nodes, + CUgraphNode* out_node, CUgraph graph, + iree_hal_cuda_tracing_verbosity_t verbosity, CUgraphNode* dependency_nodes, size_t dependency_nodes_count); // Begins a new zone with the parent function name. 
-#define IREE_CUDA_STREAM_TRACE_ZONE_BEGIN(context, event_list_begin, \ - event_list_end, stream) \ - static const iree_tracing_location_t TracyConcat( \ - __tracy_source_location, __LINE__) = {NULL, __FUNCTION__, __FILE__, \ - (uint32_t)__LINE__, 0}; \ - iree_hal_cuda_stream_tracing_zone_begin_impl( \ - context, event_list_begin, event_list_end, stream, \ +#define IREE_CUDA_STREAM_TRACE_ZONE_BEGIN(context, event_list_begin, \ + event_list_end, stream, verbosity) \ + static const iree_tracing_location_t TracyConcat( \ + __tracy_source_location, __LINE__) = {NULL, __FUNCTION__, __FILE__, \ + (uint32_t)__LINE__, 0}; \ + iree_hal_cuda_stream_tracing_zone_begin_impl( \ + context, event_list_begin, event_list_end, stream, verbosity, \ &TracyConcat(__tracy_source_location, __LINE__)); // Begins an externally defined zone with a dynamic source location. // The |file_name|, |function_name|, and optional |name| strings will be copied // into the trace buffer and do not need to persist. -#define IREE_CUDA_STREAM_TRACE_ZONE_BEGIN_EXTERNAL( \ - context, event_list, stream, file_name, file_name_length, line, \ - function_name, function_name_length, name, name_length) \ - iree_hal_cuda_stream_tracing_zone_begin_external_impl( \ - context, event_list, stream, file_name, file_name_length, line, \ - function_name, function_name_length, name, name_length) +#define IREE_CUDA_STREAM_TRACE_ZONE_BEGIN_EXTERNAL( \ + context, event_list, stream, verbosity, file_name, file_name_length, line, \ + function_name, function_name_length, name, name_length) \ + iree_hal_cuda_stream_tracing_zone_begin_external_impl( \ + context, event_list, stream, verbosity, file_name, file_name_length, \ + line, function_name, function_name_length, name, name_length) #define IREE_CUDA_GRAPH_TRACE_ZONE_BEGIN_EXTERNAL( \ - context, event_list, out_node, graph, dependency_nodes, \ + context, event_list, out_node, graph, verbosity, dependency_nodes, \ dependency_nodes_count, file_name, file_name_length, line, 
function_name, \ function_name_length, name, name_length) \ iree_hal_cuda_graph_tracing_zone_begin_external_impl( \ - context, event_list, out_node, graph, dependency_nodes, \ + context, event_list, out_node, graph, verbosity, dependency_nodes, \ dependency_nodes_count, file_name, file_name_length, line, \ function_name, function_name_length, name, name_length) -#define IREE_CUDA_STREAM_TRACE_ZONE_END(context, event_list, stream) \ - iree_hal_cuda_stream_tracing_zone_end_impl(context, event_list, stream) +#define IREE_CUDA_STREAM_TRACE_ZONE_END(context, event_list, stream, \ + verbosity) \ + iree_hal_cuda_stream_tracing_zone_end_impl(context, event_list, stream, \ + verbosity) #define IREE_CUDA_GRAPH_TRACE_ZONE_END(context, event_list, out_node, graph, \ - dependency_nodes, \ + verbosity, dependency_nodes, \ dependency_nodes_count) \ - iree_hal_cuda_graph_tracing_zone_end_impl(context, event_list, out_node, \ - graph, dependency_nodes, \ - dependency_nodes_count) + iree_hal_cuda_graph_tracing_zone_end_impl( \ + context, event_list, out_node, graph, verbosity, dependency_nodes, \ + dependency_nodes_count) #else -#define IREE_CUDA_STREAM_TRACE_ZONE_BEGIN(context, event_list, stream) -#define IREE_CUDA_STREAM_TRACE_ZONE_BEGIN_EXTERNAL( \ - context, event_list, stream, file_name, file_name_length, line, \ +#define IREE_CUDA_STREAM_TRACE_ZONE_BEGIN(context, event_list, stream, \ + verbosity) +#define IREE_CUDA_STREAM_TRACE_ZONE_BEGIN_EXTERNAL( \ + context, event_list, stream, verbosity, file_name, file_name_length, line, \ function_name, function_name_length, name, name_length) #define IREE_CUDA_GRAPH_TRACE_ZONE_BEGIN_EXTERNAL( \ - context, event_list, out_node, graph, dependency_nodes, \ + context, event_list, out_node, graph, verbosity, dependency_nodes, \ dependency_nodes_count, file_name, file_name_length, line, function_name, \ function_name_length, name, name_length) -#define IREE_CUDA_STREAM_TRACE_ZONE_END(context, event_list, stream) +#define 
IREE_CUDA_STREAM_TRACE_ZONE_END(context, event_list, stream, verbosity) #endif // IREE_TRACING_FEATURES & IREE_TRACING_FEATURE_INSTRUMENTATION_DEVICE From 6df5803a09537053eb7813c52d5b6063430bc9ec Mon Sep 17 00:00:00 2001 From: Andrew Woloszyn Date: Fri, 9 Aug 2024 13:08:02 -0500 Subject: [PATCH 4/5] Fix some minor compilation issues. Signed-off-by: Andrew Woloszyn --- runtime/src/iree/hal/drivers/hip/tracing.c | 1 + runtime/src/iree/hal/drivers/hip/tracing.h | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/runtime/src/iree/hal/drivers/hip/tracing.c b/runtime/src/iree/hal/drivers/hip/tracing.c index 96dba1f77d98..62b15effa71b 100644 --- a/runtime/src/iree/hal/drivers/hip/tracing.c +++ b/runtime/src/iree/hal/drivers/hip/tracing.c @@ -502,6 +502,7 @@ void iree_hal_hip_graph_tracing_zone_end_impl( iree_status_t iree_hal_hip_tracing_context_allocate( const iree_hal_hip_dynamic_symbols_t* symbols, iree_string_view_t queue_name, hipStream_t stream, + iree_hal_hip_tracing_verbosity_t stream_tracing_verbosity, iree_arena_block_pool_t* block_pool, iree_allocator_t host_allocator, iree_hal_hip_tracing_context_t** out_context) { *out_context = NULL; diff --git a/runtime/src/iree/hal/drivers/hip/tracing.h b/runtime/src/iree/hal/drivers/hip/tracing.h index baca80e568ca..8323fd768b5c 100644 --- a/runtime/src/iree/hal/drivers/hip/tracing.h +++ b/runtime/src/iree/hal/drivers/hip/tracing.h @@ -177,7 +177,7 @@ void iree_hal_hip_graph_tracing_zone_end_impl( context, event_list, out_node, graph, verbosity, dependency_nodes, \ dependency_nodes_count, file_name, file_name_length, line, function_name, \ function_name_length, name, name_length) -#define IREE_HIP_STREAM_TRACE_ZONE_END(context, evnet_list, stream) +#define IREE_HIP_STREAM_TRACE_ZONE_END(context, evnet_list, stream, verbosity) #define IREE_HIP_GRAPH_TRACE_ZONE_END(context, event_list, out_node, graph, \ verbosity, dependency_nodes, \ dependency_nodes_count) From f7e860c6f7b8cd08587c3b77bc66bda560bf4a0d Mon 
Sep 17 00:00:00 2001 Date: Thu, 15 Aug 2024 11:10:07 -0500 Subject: [PATCH 5/5] Fix compilation post-merge. Signed-off-by: Andrew Woloszyn --- .../iree/hal/drivers/cuda/graph_command_buffer.c | 11 ++++++----- .../iree/hal/drivers/cuda/stream_command_buffer.c | 14 +++++++------- .../iree/hal/drivers/hip/graph_command_buffer.c | 11 ++++++----- .../iree/hal/drivers/hip/stream_command_buffer.c | 14 +++++++------- 4 files changed, 26 insertions(+), 24 deletions(-) diff --git a/runtime/src/iree/hal/drivers/cuda/graph_command_buffer.c b/runtime/src/iree/hal/drivers/cuda/graph_command_buffer.c index a8526acd34df..e5b88df0286f 100644 --- a/runtime/src/iree/hal/drivers/cuda/graph_command_buffer.c +++ b/runtime/src/iree/hal/drivers/cuda/graph_command_buffer.c @@ -913,10 +913,10 @@ static iree_status_t iree_hal_cuda_graph_command_buffer_dispatch2( executable, entry_point, &kernel_info)); IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN_EXTERNAL( - command_buffer, kernel_info.source_filename.data, - kernel_info.source_filename.size, kernel_info.source_line, - kernel_info.function_name.data, kernel_info.function_name.size, - /*name=*/NULL, 0); + command_buffer, IREE_HAL_CUDA_TRACING_VERBOSITY_FINE, + kernel_info.source_filename.data, kernel_info.source_filename.size, + kernel_info.source_line, kernel_info.function_name.data, + kernel_info.function_name.size, /*name=*/NULL, 0); IREE_RETURN_AND_END_ZONE_IF_ERROR( z0, iree_hal_resource_set_insert(command_buffer->resource_set, 1, @@ -1005,7 +1005,8 @@ static iree_status_t iree_hal_cuda_graph_command_buffer_dispatch2( dependency_count, &params), "cuGraphAddKernelNode"); - IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(command_buffer); + IREE_CUDA_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END( + command_buffer, IREE_HAL_CUDA_TRACING_VERBOSITY_FINE); IREE_TRACE_ZONE_END(z0); return iree_ok_status(); } diff --git a/runtime/src/iree/hal/drivers/cuda/stream_command_buffer.c b/runtime/src/iree/hal/drivers/cuda/stream_command_buffer.c 
index 2f5c0974ef1e..4b8a0b106f8f 100644 --- a/runtime/src/iree/hal/drivers/cuda/stream_command_buffer.c +++ b/runtime/src/iree/hal/drivers/cuda/stream_command_buffer.c @@ -674,10 +674,10 @@ static iree_status_t iree_hal_cuda_stream_command_buffer_dispatch2( IREE_CUDA_STREAM_TRACE_ZONE_BEGIN_EXTERNAL( command_buffer->tracing_context, &command_buffer->tracing_event_list, - command_buffer->cu_stream, kernel_info.source_filename.data, - kernel_info.source_filename.size, kernel_info.source_line, - kernel_info.function_name.data, kernel_info.function_name.size, - /*name=*/NULL, 0); + command_buffer->cu_stream, IREE_HAL_CUDA_TRACING_VERBOSITY_FINE, + kernel_info.source_filename.data, kernel_info.source_filename.size, + kernel_info.source_line, kernel_info.function_name.data, + kernel_info.function_name.size, /*name=*/NULL, 0); IREE_RETURN_AND_END_ZONE_IF_ERROR( z0, iree_hal_resource_set_insert(command_buffer->resource_set, 1, @@ -749,9 +749,9 @@ static iree_status_t iree_hal_cuda_stream_command_buffer_dispatch2( command_buffer->cu_stream, params_ptr, NULL), "cuLaunchKernel"); - IREE_CUDA_STREAM_TRACE_ZONE_END(command_buffer->tracing_context, - &command_buffer->tracing_event_list, - command_buffer->cu_stream); + IREE_CUDA_STREAM_TRACE_ZONE_END( + command_buffer->tracing_context, &command_buffer->tracing_event_list, + command_buffer->cu_stream, IREE_HAL_CUDA_TRACING_VERBOSITY_FINE); IREE_TRACE_ZONE_END(z0); return iree_ok_status(); diff --git a/runtime/src/iree/hal/drivers/hip/graph_command_buffer.c b/runtime/src/iree/hal/drivers/hip/graph_command_buffer.c index a946a89faba9..afade26d4be0 100644 --- a/runtime/src/iree/hal/drivers/hip/graph_command_buffer.c +++ b/runtime/src/iree/hal/drivers/hip/graph_command_buffer.c @@ -923,10 +923,10 @@ static iree_status_t iree_hal_hip_graph_command_buffer_dispatch2( executable, entry_point, &kernel_info)); IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_BEGIN_EXTERNAL( - command_buffer, kernel_info.source_filename.data, - 
kernel_info.source_filename.size, kernel_info.source_line, - kernel_info.function_name.data, kernel_info.function_name.size, - /*name=*/NULL, 0); + command_buffer, IREE_HAL_HIP_TRACING_VERBOSITY_FINE, + kernel_info.source_filename.data, kernel_info.source_filename.size, + kernel_info.source_line, kernel_info.function_name.data, + kernel_info.function_name.size, /*name=*/NULL, 0); IREE_RETURN_AND_END_ZONE_IF_ERROR( z0, iree_hal_resource_set_insert(command_buffer->resource_set, 1, @@ -1006,7 +1006,8 @@ static iree_status_t iree_hal_hip_graph_command_buffer_dispatch2( dependency_count, &params), "hipGraphAddKernelNode"); - IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END(command_buffer); + IREE_HIP_GRAPH_COMMAND_BUFFER_TRACE_ZONE_END( + command_buffer, IREE_HAL_HIP_TRACING_VERBOSITY_FINE); IREE_TRACE_ZONE_END(z0); return iree_ok_status(); } diff --git a/runtime/src/iree/hal/drivers/hip/stream_command_buffer.c b/runtime/src/iree/hal/drivers/hip/stream_command_buffer.c index 8924ed4068fa..1b8b6b665289 100644 --- a/runtime/src/iree/hal/drivers/hip/stream_command_buffer.c +++ b/runtime/src/iree/hal/drivers/hip/stream_command_buffer.c @@ -654,10 +654,10 @@ static iree_status_t iree_hal_hip_stream_command_buffer_dispatch2( IREE_HIP_STREAM_TRACE_ZONE_BEGIN_EXTERNAL( command_buffer->tracing_context, &command_buffer->tracing_event_list, - command_buffer->hip_stream, kernel_info.source_filename.data, - kernel_info.source_filename.size, kernel_info.source_line, - kernel_info.function_name.data, kernel_info.function_name.size, - /*name=*/NULL, 0); + command_buffer->hip_stream, IREE_HAL_HIP_TRACING_VERBOSITY_FINE, + kernel_info.source_filename.data, kernel_info.source_filename.size, + kernel_info.source_line, kernel_info.function_name.data, + kernel_info.function_name.size, /*name=*/NULL, 0); IREE_RETURN_AND_END_ZONE_IF_ERROR( z0, iree_hal_resource_set_insert(command_buffer->resource_set, 1, @@ -720,9 +720,9 @@ static iree_status_t iree_hal_hip_stream_command_buffer_dispatch2( params_ptr, 
NULL), "hipModuleLaunchKernel"); - IREE_HIP_STREAM_TRACE_ZONE_END(command_buffer->tracing_context, - &command_buffer->tracing_event_list, - command_buffer->hip_stream); + IREE_HIP_STREAM_TRACE_ZONE_END( + command_buffer->tracing_context, &command_buffer->tracing_event_list, + command_buffer->hip_stream, IREE_HAL_HIP_TRACING_VERBOSITY_FINE); IREE_TRACE_ZONE_END(z0); return status;